feat: connection pooling via urllib3 + batch OG fetching

Replace per-request SOCKS5+TLS handshakes with urllib3 SOCKSProxyManager
connection pool (20 pools, 4 conns/host). Batch _fetch_og calls via
ThreadPoolExecutor to parallelize OG tag enrichment in alert polling.
Cache flaskpaste SSL context at module level.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-17 20:52:22 +01:00
parent e11994f320
commit 94f563d55a
8 changed files with 291 additions and 20 deletions

View File

@@ -2,7 +2,8 @@ FROM python:3.13-alpine
WORKDIR /app WORKDIR /app
RUN pip install --no-cache-dir maxminddb>=2.0 PySocks>=1.7.1 COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH=/app/src ENV PYTHONPATH=/app/src
ENV PYTHONUNBUFFERED=1 ENV PYTHONUNBUFFERED=1

View File

@@ -1,6 +1,17 @@
# derp - Tasks # derp - Tasks
## Current Sprint -- v1.2.1 Performance + Polish (2026-02-17) ## Current Sprint -- v1.2.2 Connection Pooling + Batch OG (2026-02-17)
| Pri | Status | Task |
|-----|--------|------|
| P0 | [x] | Batch `_fetch_og` calls via ThreadPoolExecutor (alert.py) |
| P0 | [x] | Connection pooling via `urllib3[socks]` SOCKSProxyManager (http.py) |
| P1 | [x] | Cache FlaskPaste `_ssl_context()` at module level |
| P1 | [x] | Backward-compat `urllib.error.HTTPError` for 4xx/5xx in pooled path |
| P1 | [x] | Legacy opener fallback for `context=` callers (username.py) |
| P2 | [x] | Containerfile uses requirements.txt for deps |
## Previous Sprint -- v1.2.1 Performance + Polish (2026-02-17)
| Pri | Status | Task | | Pri | Status | Task |
|-----|--------|------| |-----|--------|------|

View File

@@ -330,6 +330,23 @@ def _fetch_og(url: str) -> tuple[str, str, str]:
return "", "", "" return "", "", ""
def _fetch_og_batch(urls: list[str]) -> dict[str, tuple[str, str, str]]:
"""Fetch OG tags for multiple URLs concurrently.
Returns {url: (og_title, og_description, date)} for each input URL.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
if not urls:
return {}
results: dict[str, tuple[str, str, str]] = {}
with ThreadPoolExecutor(max_workers=min(len(urls), 8)) as pool:
futures = {pool.submit(_fetch_og, url): url for url in urls}
for fut in as_completed(futures):
results[futures[fut]] = fut.result()
return results
# -- YouTube InnerTube search (blocking) ------------------------------------ # -- YouTube InnerTube search (blocking) ------------------------------------
def _extract_videos(obj: object, depth: int = 0) -> list[dict]: def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
@@ -1753,26 +1770,41 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
# Filter: only announce results that actually contain the keyword # Filter: only announce results that actually contain the keyword
# Check title/URL first, then fall back to og:title/og:description # Check title/URL first, then fall back to og:title/og:description
kw_lower = keyword.lower() kw_lower = keyword.lower()
# Collect URLs that need OG enrichment (batch fetch)
urls_needing_og: set[str] = set()
for item in new_items:
title_l = item.get("title", "").lower()
url_l = item.get("url", "").lower()
if kw_lower in title_l or kw_lower in url_l:
# Title/URL match -- only need OG for date enrichment
if not item.get("date") and item.get("url"):
urls_needing_og.add(item["url"])
elif item.get("url"):
# No title/URL match -- need OG for keyword fallback
urls_needing_og.add(item["url"])
og_cache: dict[str, tuple[str, str, str]] = {}
if urls_needing_og:
og_cache = await loop.run_in_executor(
None, _fetch_og_batch, list(urls_needing_og),
)
matched = [] matched = []
for item in new_items: for item in new_items:
title_l = item.get("title", "").lower() title_l = item.get("title", "").lower()
url_l = item.get("url", "").lower() url_l = item.get("url", "").lower()
if kw_lower in title_l or kw_lower in url_l: if kw_lower in title_l or kw_lower in url_l:
# Fetch OG tags for date if backend didn't provide one
if not item.get("date") and item.get("url"): if not item.get("date") and item.get("url"):
_, _, og_date = await loop.run_in_executor( _, _, og_date = og_cache.get(item["url"], ("", "", ""))
None, _fetch_og, item["url"],
)
if og_date: if og_date:
item["date"] = og_date item["date"] = og_date
matched.append(item) matched.append(item)
continue continue
# Fetch OG tags for items that didn't match on title/URL # Check OG tags for keyword match
item_url = item.get("url", "") item_url = item.get("url", "")
if item_url: if item_url:
og_title, og_desc, og_date = await loop.run_in_executor( og_title, og_desc, og_date = og_cache.get(item_url, ("", "", ""))
None, _fetch_og, item_url,
)
if (kw_lower in og_title.lower() if (kw_lower in og_title.lower()
or kw_lower in og_desc.lower()): or kw_lower in og_desc.lower()):
if og_title and len(og_title) > len(item.get("title", "")): if og_title and len(og_title) > len(item.get("title", "")):

View File

@@ -34,14 +34,23 @@ def _has_client_cert() -> bool:
return (_CERT_DIR / "derp.crt").exists() and (_CERT_DIR / "derp.key").exists() return (_CERT_DIR / "derp.crt").exists() and (_CERT_DIR / "derp.key").exists()
_cached_ssl_ctx: ssl.SSLContext | None = None


def _ssl_context() -> ssl.SSLContext:
    """Build SSL context, loading client cert for mTLS if available.

    Cached at module level -- cert files are static at runtime, so the
    context is constructed at most once per process.
    """
    global _cached_ssl_ctx
    if _cached_ssl_ctx is None:
        ctx = ssl.create_default_context()
        cert_path = _CERT_DIR / "derp.crt"
        key_path = _CERT_DIR / "derp.key"
        # mTLS is optional: attach the client cert only when both halves exist.
        if cert_path.exists() and key_path.exists():
            ctx.load_cert_chain(str(cert_path), str(key_path))
        _cached_ssl_ctx = ctx
    return _cached_ssl_ctx
def _solve_pow(nonce: str, difficulty: int) -> int: def _solve_pow(nonce: str, difficulty: int) -> int:

View File

@@ -11,6 +11,7 @@ license = "MIT"
dependencies = [ dependencies = [
"maxminddb>=2.0", "maxminddb>=2.0",
"PySocks>=1.7.1", "PySocks>=1.7.1",
"urllib3[socks]>=2.0",
] ]
[project.scripts] [project.scripts]

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
maxminddb>=2.0
PySocks>=1.7.1
urllib3[socks]>=2.0

View File

@@ -5,19 +5,50 @@ import logging
import socket import socket
import ssl import ssl
import time import time
import urllib.error
import urllib.request import urllib.request
import socks import socks
import urllib3
from socks import SOCKS5 from socks import SOCKS5
from sockshandler import SocksiPyConnectionS, SocksiPyHandler from sockshandler import SocksiPyConnectionS, SocksiPyHandler
from urllib3.contrib.socks import SOCKSProxyManager
_PROXY_ADDR = "127.0.0.1" _PROXY_ADDR = "127.0.0.1"
_PROXY_PORT = 1080 _PROXY_PORT = 1080
_MAX_RETRIES = 3 _MAX_RETRIES = 3
_RETRY_ERRORS = (ssl.SSLError, ConnectionError, TimeoutError, OSError) _RETRY_ERRORS = (
ssl.SSLError, ConnectionError, TimeoutError, OSError,
urllib3.exceptions.HTTPError,
)
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
# -- Connection pool (urllib3) ------------------------------------------------

_pool: SOCKSProxyManager | None = None

# Allow redirects but no urllib3-level retries (we retry ourselves).
_POOL_RETRIES = urllib3.Retry(
    total=10, connect=0, read=0, redirect=10, status=0, other=0,
)


def _get_pool() -> SOCKSProxyManager:
    """Return the process-wide SOCKS5 connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    proxy_url = f"socks5h://{_PROXY_ADDR}:{_PROXY_PORT}/"
    _pool = SOCKSProxyManager(
        proxy_url,
        num_pools=20,
        maxsize=4,
        retries=_POOL_RETRIES,
    )
    return _pool
# -- Legacy opener (for build_opener / context= callers) ---------------------
_default_opener: urllib.request.OpenerDirector | None = None _default_opener: urllib.request.OpenerDirector | None = None
@@ -52,12 +83,66 @@ class _ProxyHandler(SocksiPyHandler, urllib.request.HTTPSHandler):
return self.do_open(build, req) return self.do_open(build, req)
# -- Public HTTP interface ---------------------------------------------------
def urlopen(req, *, timeout=None, context=None, retries=None):
    """Proxy-aware drop-in for urllib.request.urlopen.

    Uses connection pooling via urllib3 for default requests and falls
    back to the legacy opener when a custom SSL context is supplied.
    Retries on transient SSL/connection errors with exponential backoff.

    Args:
        req: URL string or urllib.request.Request.
        timeout: total request timeout in seconds (defaults to 30).
        context: custom ssl.SSLContext; forces the legacy opener path.
        retries: max attempts for transient errors (default _MAX_RETRIES).

    Raises:
        urllib.error.HTTPError: for 4xx/5xx responses (backward compat
            with the urllib-based implementation).
    """
    import io  # local: only needed for the HTTPError body shim

    max_retries = retries if retries is not None else _MAX_RETRIES
    # Custom SSL context -> fall back to opener (rare: username.py only)
    if context is not None:
        return _urlopen_legacy(req, timeout=timeout, context=context, retries=max_retries)
    # Default path: pooled urllib3
    pool = _get_pool()
    if isinstance(req, str):
        url, headers, body, method = req, {}, None, "GET"
    else:
        url = req.full_url
        headers = dict(req.header_items())
        body = req.data
        method = req.get_method()
    # `is not None` so an explicit timeout=0 is honored instead of being
    # silently replaced by the 30s default (0 is falsy).
    to = urllib3.Timeout(total=timeout if timeout is not None else 30)
    for attempt in range(max_retries):
        try:
            resp = pool.request(
                method, url,
                headers=headers,
                body=body,
                timeout=to,
                preload_content=False,
            )
            if resp.status >= 400:
                # Drain the body so the connection returns to the pool,
                # then raise urllib.error.HTTPError for backward compat.
                # Attach the drained body so callers can still exc.read().
                payload = resp.read()
                raise urllib.error.HTTPError(
                    url, resp.status, resp.reason or "",
                    resp.headers, io.BytesIO(payload),
                )
            return resp
        except urllib.error.HTTPError:
            raise  # application-level error: never retried
        except _RETRY_ERRORS as exc:
            if attempt + 1 >= max_retries:
                raise
            delay = 2 ** attempt
            _log.debug("urlopen retry %d/%d after %s: %s",
                       attempt + 1, max_retries, type(exc).__name__, exc)
            time.sleep(delay)
def _urlopen_legacy(req, *, timeout=None, context=None, retries=None):
"""Open URL through legacy opener (custom SSL context)."""
max_retries = retries if retries is not None else _MAX_RETRIES
opener = _get_opener(context) opener = _get_opener(context)
kwargs = {} kwargs = {}
if timeout is not None: if timeout is not None:
@@ -82,6 +167,8 @@ def build_opener(*handlers, context=None):
return urllib.request.build_opener(proxy, *handlers) return urllib.request.build_opener(proxy, *handlers)
# -- Raw TCP helpers (unchanged) ---------------------------------------------
def create_connection(address, *, timeout=None): def create_connection(address, *, timeout=None):
"""SOCKS5-proxied drop-in for socket.create_connection. """SOCKS5-proxied drop-in for socket.create_connection.

View File

@@ -1,6 +1,7 @@
"""Tests for the SOCKS5 proxy HTTP/TCP module.""" """Tests for the SOCKS5 proxy HTTP/TCP module."""
import ssl import ssl
import urllib.error
import urllib.request import urllib.request
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
@@ -12,20 +13,46 @@ from derp.http import (
_PROXY_ADDR, _PROXY_ADDR,
_PROXY_PORT, _PROXY_PORT,
_get_opener, _get_opener,
_get_pool,
_ProxyHandler, _ProxyHandler,
build_opener, build_opener,
create_connection, create_connection,
urlopen,
) )
@pytest.fixture(autouse=True)
def _reset_caches():
    """Clear cached opener and pool between tests."""
    derp.http._default_opener = None
    derp.http._pool = None
    yield
    # Tear down as well so a test that populated the caches can't leak
    # state into non-test importers of derp.http.
    derp.http._default_opener = None
    derp.http._pool = None
# -- Connection pool ---------------------------------------------------------
class TestConnectionPool:
    """The SOCKS5 pool is created lazily and cached on the module."""

    def test_pool_lazy_init(self):
        # No pool exists before first use; first call creates and caches it.
        assert derp.http._pool is None
        created = _get_pool()
        assert created is not None
        assert derp.http._pool is created

    def test_pool_cached(self):
        first = _get_pool()
        second = _get_pool()
        assert first is second

    def test_pool_is_socks_manager(self):
        from urllib3.contrib.socks import SOCKSProxyManager

        assert isinstance(_get_pool(), SOCKSProxyManager)
# -- Legacy opener -----------------------------------------------------------
class TestProxyHandler: class TestProxyHandler:
def test_uses_socks5(self): def test_uses_socks5(self):
handler = _ProxyHandler() handler = _ProxyHandler()
@@ -103,6 +130,106 @@ class TestOpenerCache:
assert a is not b assert a is not b
# -- urlopen (pooled path) --------------------------------------------------
class TestUrlopen:
    """Pooled urlopen path: field extraction, status handling, fallback."""

    @staticmethod
    def _pool_returning(status, reason=None):
        """Build a MagicMock pool whose request() yields a mock response."""
        fake_pool = MagicMock()
        fake_resp = MagicMock()
        fake_resp.status = status
        if reason is not None:
            fake_resp.reason = reason
            fake_resp.headers = {}
            fake_resp.read.return_value = b""
        fake_pool.request.return_value = fake_resp
        return fake_pool, fake_resp

    @patch.object(derp.http, "_get_pool")
    def test_extracts_request_fields(self, mock_pool_fn):
        fake_pool, _ = self._pool_returning(200)
        mock_pool_fn.return_value = fake_pool
        request = urllib.request.Request(
            "https://example.com/test",
            headers={"X-Custom": "val"},
            method="POST",
        )
        request.data = b"body"
        urlopen(request, timeout=10)
        fake_pool.request.assert_called_once()
        call = fake_pool.request.call_args
        assert call.args[0] == "POST"
        assert call.args[1] == "https://example.com/test"
        assert call.kwargs["body"] == b"body"

    @patch.object(derp.http, "_get_pool")
    def test_string_url(self, mock_pool_fn):
        fake_pool, _ = self._pool_returning(200)
        mock_pool_fn.return_value = fake_pool
        urlopen("https://example.com/")
        assert fake_pool.request.call_args.args == ("GET", "https://example.com/")

    @patch.object(derp.http, "_get_pool")
    def test_raises_http_error_on_4xx(self, mock_pool_fn):
        fake_pool, _ = self._pool_returning(404, "Not Found")
        mock_pool_fn.return_value = fake_pool
        with pytest.raises(urllib.error.HTTPError) as exc_info:
            urlopen("https://example.com/missing")
        assert exc_info.value.code == 404

    @patch.object(derp.http, "_get_pool")
    def test_raises_http_error_on_5xx(self, mock_pool_fn):
        fake_pool, _ = self._pool_returning(500, "Internal Server Error")
        mock_pool_fn.return_value = fake_pool
        with pytest.raises(urllib.error.HTTPError) as exc_info:
            urlopen("https://example.com/error")
        assert exc_info.value.code == 500

    @patch.object(derp.http, "_get_pool")
    def test_returns_response_on_2xx(self, mock_pool_fn):
        fake_pool, fake_resp = self._pool_returning(200)
        mock_pool_fn.return_value = fake_pool
        assert urlopen("https://example.com/") is fake_resp

    @patch.object(derp.http, "_get_pool")
    def test_context_falls_back_to_opener(self, mock_pool_fn):
        """Custom SSL context should use legacy opener, not pool."""
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        with patch.object(derp.http, "_get_opener") as mock_opener_fn:
            fake_opener = MagicMock()
            fake_resp = MagicMock()
            fake_opener.open.return_value = fake_resp
            mock_opener_fn.return_value = fake_opener
            result = urlopen("https://example.com/", context=ctx)
        mock_pool_fn.assert_not_called()
        mock_opener_fn.assert_called_once_with(ctx)
        assert result is fake_resp
# -- create_connection -------------------------------------------------------
class TestCreateConnection: class TestCreateConnection:
@patch("derp.http.socks.socksocket") @patch("derp.http.socks.socksocket")
def test_sets_socks5_proxy(self, mock_cls): def test_sets_socks5_proxy(self, mock_cls):