watchd: add target health tracking for all target pools

Generalizes JudgeStats into TargetStats, adding cooldown-based filtering
for head targets, SSL targets, and IRC servers. Targets that repeatedly
block (HTTP block responses or DNS resolution failures) are temporarily
avoided, preventing unfair proxy failures when a target blocks or becomes
unresolvable; plain target failures are counted but do not trigger a
cooldown. Exposes per-pool health via /api/stats.
This commit is contained in:
Username
2026-02-18 18:21:53 +01:00
parent 3e5c486e7e
commit e985f52fe6
2 changed files with 125 additions and 70 deletions

View File

@@ -44,7 +44,7 @@ import dns
from misc import _log, categorize_error, tor_proxy_url, is_ssl_protocol_error from misc import _log, categorize_error, tor_proxy_url, is_ssl_protocol_error
import rocksock import rocksock
import connection_pool import connection_pool
from stats import JudgeStats, Stats, regexes, ssl_targets, try_div from stats import TargetStats, JudgeStats, Stats, regexes, ssl_targets, try_div
from mitm import MITMCertStats, extract_cert_info, get_mitm_certificate from mitm import MITMCertStats, extract_cert_info, get_mitm_certificate
from dns import socks4_resolve from dns import socks4_resolve
from job import PriorityJobQueue, calculate_priority from job import PriorityJobQueue, calculate_priority
@@ -164,9 +164,9 @@ DEAD_PROXY = -1
# Error categories that indicate proxy is definitely dead (not temporary failure) # Error categories that indicate proxy is definitely dead (not temporary failure)
FATAL_ERROR_CATEGORIES = ('refused', 'unreachable', 'auth') FATAL_ERROR_CATEGORIES = ('refused', 'unreachable', 'auth')
# Patterns indicating judge is blocking the proxy (not a proxy failure) # Patterns indicating HTTP target is blocking the proxy (not a proxy failure)
# These should NOT count as proxy failures - retry with different judge # These should NOT count as proxy failures - applies to judges and head targets
JUDGE_BLOCK_PATTERNS = [ HTTP_BLOCK_PATTERNS = [
r'HTTP/1\.[01] 403', # Forbidden r'HTTP/1\.[01] 403', # Forbidden
r'HTTP/1\.[01] 429', # Too Many Requests r'HTTP/1\.[01] 429', # Too Many Requests
r'HTTP/1\.[01] 503', # Service Unavailable r'HTTP/1\.[01] 503', # Service Unavailable
@@ -179,7 +179,7 @@ JUDGE_BLOCK_PATTERNS = [
r'blocked', # Explicit block r'blocked', # Explicit block
r'Checking your browser', # Cloudflare JS challenge r'Checking your browser', # Cloudflare JS challenge
] ]
JUDGE_BLOCK_RE = re.compile('|'.join(JUDGE_BLOCK_PATTERNS), re.IGNORECASE) HTTP_BLOCK_RE = re.compile('|'.join(HTTP_BLOCK_PATTERNS), re.IGNORECASE)
# Check types: irc, http (header match), judges (body match), ssl (TLS handshake) # Check types: irc, http (header match), judges (body match), ssl (TLS handshake)
# Judge services - return IP in body (plain text, JSON, or HTML) # Judge services - return IP in body (plain text, JSON, or HTML)
@@ -213,6 +213,9 @@ judges = {
# Global instances # Global instances
judge_stats = JudgeStats() judge_stats = JudgeStats()
head_target_stats = TargetStats(cooldown_seconds=300, block_threshold=3)
ssl_target_stats = TargetStats(cooldown_seconds=300, block_threshold=3)
irc_target_stats = TargetStats(cooldown_seconds=300, block_threshold=3)
mitm_cert_stats = MITMCertStats() mitm_cert_stats = MITMCertStats()
@@ -411,18 +414,19 @@ class ProxyTestState(object):
self.evaluated = True self.evaluated = True
self.checktime = int(time.time()) self.checktime = int(time.time())
# Filter out judge_block results (inconclusive, neither pass nor fail) # Filter out target_block results (inconclusive, neither pass nor fail)
real_results = [r for r in self.results if r.get('category') != 'judge_block'] block_cats = ('judge_block', 'target_block')
real_results = [r for r in self.results if r.get('category') not in block_cats]
successes = [r for r in real_results if r['success']] successes = [r for r in real_results if r['success']]
failures = [r for r in real_results if not r['success']] failures = [r for r in real_results if not r['success']]
num_success = len(successes) num_success = len(successes)
judge_blocks = len(self.results) - len(real_results) target_blocks = len(self.results) - len(real_results)
_dbg('evaluate: %d success, %d fail, %d judge_block, results=%d' % ( _dbg('evaluate: %d success, %d fail, %d target_block, results=%d' % (
num_success, len(failures), judge_blocks, len(self.results)), self.proxy) num_success, len(failures), target_blocks, len(self.results)), self.proxy)
# All results were judge blocks: inconclusive, preserve current state # All results were target blocks: inconclusive, preserve current state
if not real_results and self.results: if not real_results and self.results:
_dbg('all results inconclusive (judge_block), no state change', self.proxy) _dbg('all results inconclusive (target_block), no state change', self.proxy)
self.failcount = self.original_failcount self.failcount = self.original_failcount
return (self.original_failcount == 0, None) return (self.original_failcount == 0, None)
@@ -617,6 +621,10 @@ class TargetTestJob(object):
reveals_headers = bool(re.search(HEADER_REVEAL_PATTERN, recv, re.IGNORECASE)) reveals_headers = bool(re.search(HEADER_REVEAL_PATTERN, recv, re.IGNORECASE))
# Record successful judge # Record successful judge
judge_stats.record_success(srv) judge_stats.record_success(srv)
elif self.checktype == 'head':
head_target_stats.record_success(srv)
elif self.checktype == 'irc':
irc_target_stats.record_success(srv)
self.proxy_state.record_result( self.proxy_state.record_result(
True, proto=proto, duration=elapsed, True, proto=proto, duration=elapsed,
srv=srv, tor=tor, ssl=is_ssl, exit_ip=exit_ip, srv=srv, tor=tor, ssl=is_ssl, exit_ip=exit_ip,
@@ -624,22 +632,28 @@ class TargetTestJob(object):
) )
else: else:
_dbg('regex NO MATCH, recv[:100]=%r' % recv[:100], self.proxy_state.proxy) _dbg('regex NO MATCH, recv[:100]=%r' % recv[:100], self.proxy_state.proxy)
# Check if judge is blocking us (not a proxy failure) # Check if HTTP target is blocking us (not a proxy failure)
if self.checktype == 'judges' and JUDGE_BLOCK_RE.search(recv): if self.checktype in ('judges', 'head') and HTTP_BLOCK_RE.search(recv):
if self.checktype == 'judges':
judge_stats.record_block(srv) judge_stats.record_block(srv)
# Judge block = inconclusive, not a pass or fail else:
_dbg('judge BLOCK detected, skipping (neutral)', self.proxy_state.proxy) head_target_stats.record_block(srv)
_dbg('target BLOCK detected, skipping (neutral)', self.proxy_state.proxy)
self.proxy_state.record_result( self.proxy_state.record_result(
False, category='judge_block', proto=proto, False, category='target_block', proto=proto,
srv=srv, tor=tor, ssl=is_ssl srv=srv, tor=tor, ssl=is_ssl
) )
if config.watchd.debug: if config.watchd.debug:
_log('judge %s challenged proxy %s (neutral, skipped)' % ( _log('%s %s challenged proxy %s (neutral, skipped)' % (
srv, self.proxy_state.proxy), 'debug') self.checktype, srv, self.proxy_state.proxy), 'debug')
else: else:
_dbg('FAIL: no match, no block', self.proxy_state.proxy) _dbg('FAIL: no match, no block', self.proxy_state.proxy)
if self.checktype == 'judges': if self.checktype == 'judges':
judge_stats.record_failure(srv) judge_stats.record_failure(srv)
elif self.checktype == 'head':
head_target_stats.record_failure(srv)
elif self.checktype == 'irc':
irc_target_stats.record_failure(srv)
self.proxy_state.record_result(False, category='other') self.proxy_state.record_result(False, category='other')
except KeyboardInterrupt as e: except KeyboardInterrupt as e:
@@ -823,7 +837,8 @@ class TargetTestJob(object):
string (for secondary check SSL/plain decision). string (for secondary check SSL/plain decision).
""" """
ps = self.proxy_state ps = self.proxy_state
ssl_target = random.choice(ssl_targets) available_ssl = ssl_target_stats.get_available(ssl_targets) or ssl_targets
ssl_target = random.choice(available_ssl)
last_error_category = None last_error_category = None
last_ssl_reason = None last_ssl_reason = None
@@ -866,6 +881,7 @@ class TargetTestJob(object):
elapsed = time.time() - duration elapsed = time.time() - duration
if pool: if pool:
pool.record_success(torhost, elapsed) pool.record_success(torhost, elapsed)
ssl_target_stats.record_success(ssl_target)
sock.disconnect() sock.disconnect()
_dbg('SSL handshake OK', ps.proxy) _dbg('SSL handshake OK', ps.proxy)
return (None, proto, duration, torhost, ssl_target, 0, 1, 'ssl_ok'), None return (None, proto, duration, torhost, ssl_target, 0, 1, 'ssl_ok'), None
@@ -901,9 +917,17 @@ class TargetTestJob(object):
# Check for Tor connection issues # Check for Tor connection issues
if et == rocksock.RS_ET_OWN: if et == rocksock.RS_ET_OWN:
if e.get_failedproxy() == 0 and err == rocksock.RS_E_TARGET_CONN_REFUSED: fp = e.get_failedproxy()
if fp == 0 and err == rocksock.RS_E_TARGET_CONN_REFUSED:
if pool: if pool:
pool.record_failure(torhost) pool.record_failure(torhost)
elif fp == 1 and (err == rocksock.RS_E_REMOTE_DISCONNECTED or
err == rocksock.RS_E_HIT_TIMEOUT):
# Target-side failure
ssl_target_stats.record_failure(ssl_target)
elif et == rocksock.RS_ET_GAI:
# DNS failure -- target unresolvable
ssl_target_stats.record_block(ssl_target)
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
@@ -1025,6 +1049,11 @@ class TargetTestJob(object):
if et == rocksock.RS_ET_OWN: if et == rocksock.RS_ET_OWN:
if fp == 1 and (err == rocksock.RS_E_REMOTE_DISCONNECTED or if fp == 1 and (err == rocksock.RS_E_REMOTE_DISCONNECTED or
err == rocksock.RS_E_HIT_TIMEOUT): err == rocksock.RS_E_HIT_TIMEOUT):
# Target-side failure -- proxy reached target but it's down
if self.checktype == 'head':
head_target_stats.record_failure(srvname)
elif self.checktype == 'irc':
irc_target_stats.record_failure(srvname)
break break
elif fp == 0 and err == rocksock.RS_E_TARGET_CONN_REFUSED: elif fp == 0 and err == rocksock.RS_E_TARGET_CONN_REFUSED:
# Tor connection failed - record in pool # Tor connection failed - record in pool
@@ -1034,6 +1063,11 @@ class TargetTestJob(object):
_log("could not connect to tor, sleep 5s", "ERROR") _log("could not connect to tor, sleep 5s", "ERROR")
time.sleep(5) time.sleep(5)
elif et == rocksock.RS_ET_GAI: elif et == rocksock.RS_ET_GAI:
# DNS failure -- target hostname unresolvable (hard failure)
if self.checktype == 'head':
head_target_stats.record_block(connect_host)
elif self.checktype == 'irc':
irc_target_stats.record_block(srvname)
_log("could not resolve connection target %s" % connect_host, "ERROR") _log("could not resolve connection target %s" % connect_host, "ERROR")
break break
elif et == rocksock.RS_ET_SSL and err == rocksock.RS_E_SSL_CERTIFICATE_ERROR: elif et == rocksock.RS_ET_SSL and err == rocksock.RS_E_SSL_CERTIFICATE_ERROR:
@@ -1506,7 +1540,7 @@ class Proxywatchd():
_dbg('prepare_jobs: %d rows, checktypes=%s' % (len(rows), config.watchd.checktypes)) _dbg('prepare_jobs: %d rows, checktypes=%s' % (len(rows), config.watchd.checktypes))
checktypes = config.watchd.checktypes checktypes = config.watchd.checktypes
# Build target pools for each checktype # Build target pools for each checktype (filter out targets in cooldown)
target_pools = {} target_pools = {}
for ct in checktypes: for ct in checktypes:
if ct == 'none': if ct == 'none':
@@ -1514,19 +1548,23 @@ class Proxywatchd():
target_pools[ct] = ssl_targets target_pools[ct] = ssl_targets
_dbg('target_pool[none]: SSL-only mode, %d ssl targets' % len(ssl_targets)) _dbg('target_pool[none]: SSL-only mode, %d ssl targets' % len(ssl_targets))
elif ct == 'irc': elif ct == 'irc':
target_pools[ct] = config.servers all_servers = config.servers
_dbg('target_pool[irc]: %d servers' % len(config.servers)) available = irc_target_stats.get_available(all_servers)
target_pools[ct] = available if available else all_servers
_dbg('target_pool[irc]: %d/%d servers available' % (len(target_pools[ct]), len(all_servers)))
elif ct == 'judges': elif ct == 'judges':
# Filter out judges in cooldown (blocked/rate-limited)
all_judges = list(judges.keys()) all_judges = list(judges.keys())
available = judge_stats.get_available_judges(all_judges) available = judge_stats.get_available(all_judges)
target_pools[ct] = available if available else all_judges target_pools[ct] = available if available else all_judges
elif ct == 'ssl': elif ct == 'ssl':
target_pools[ct] = ssl_targets available = ssl_target_stats.get_available(ssl_targets)
_dbg('target_pool[ssl]: %d targets' % len(ssl_targets)) target_pools[ct] = available if available else ssl_targets
_dbg('target_pool[ssl]: %d/%d targets available' % (len(target_pools[ct]), len(ssl_targets)))
else: # head else: # head
target_pools[ct] = list(regexes.keys()) all_targets = list(regexes.keys())
_dbg('target_pool[%s]: %d targets' % (ct, len(regexes))) available = head_target_stats.get_available(all_targets)
target_pools[ct] = available if available else all_targets
_dbg('target_pool[%s]: %d/%d targets available' % (ct, len(target_pools[ct]), len(all_targets)))
# create all jobs first, then shuffle for interleaving # create all jobs first, then shuffle for interleaving
all_jobs = [] all_jobs = []
@@ -1852,15 +1890,25 @@ class Proxywatchd():
# Judge stats (when using judges checktype) # Judge stats (when using judges checktype)
if 'judges' in config.watchd.checktypes: if 'judges' in config.watchd.checktypes:
js = judge_stats.get_stats() js = judge_stats.get_stats()
# Remap 'target' -> 'judge' for dashboard compatibility
top = [dict(j, judge=j['target']) for j in js.get('top', [])[:5]]
stats_data['judges'] = { stats_data['judges'] = {
'total': js.get('total', 0), 'total': js.get('total', 0),
'available': js.get('available', 0), 'available': js.get('available', 0),
'in_cooldown': js.get('in_cooldown', 0), 'in_cooldown': js.get('in_cooldown', 0),
'top_judges': js.get('top', [])[:5] # top 5 most successful 'top_judges': top,
} }
else: else:
stats_data['judges'] = None stats_data['judges'] = None
# Target health stats (all target pools)
stats_data['target_health'] = {
'head': head_target_stats.get_stats(),
'ssl': ssl_target_stats.get_stats(),
'irc': irc_target_stats.get_stats(),
'judges': judge_stats.get_stats(),
}
# Scraper/engine stats # Scraper/engine stats
if scraper_available: if scraper_available:
scraper_stats = scraper_module.get_scraper_stats() scraper_stats = scraper_module.get_scraper_stats()

View File

@@ -14,60 +14,64 @@ def try_div(a, b):
return 0 return 0
class JudgeStats(): class TargetStats():
"""Track per-judge success/failure rates for reliability scoring. """Track per-target success/failure rates with cooldown.
Judges that frequently block or rate-limit are temporarily avoided. Targets that frequently block or fail are temporarily avoided.
Stats decay over time to allow recovery. Block counters reset on success or cooldown expiry.
Used for all target pools: judges, head targets, SSL targets, IRC servers.
""" """
def __init__(self, cooldown_seconds=300, block_threshold=3): def __init__(self, cooldown_seconds=300, block_threshold=3):
self.lock = threading.Lock() self.lock = threading.Lock()
self.stats = {} # judge -> {'success': n, 'fail': n, 'block': n, 'last_block': timestamp} self.stats = {} # target -> {'success': n, 'fail': n, 'block': n, 'last_block': timestamp}
self.cooldown_seconds = cooldown_seconds # seconds to avoid blocked judges self.cooldown_seconds = cooldown_seconds
self.block_threshold = block_threshold # consecutive blocks before cooldown self.block_threshold = block_threshold
def record_success(self, judge): def _ensure(self, target):
"""Record successful judge response.""" if target not in self.stats:
with self.lock: self.stats[target] = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0}
if judge not in self.stats:
self.stats[judge] = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0}
self.stats[judge]['success'] += 1
# Reset block count on success
self.stats[judge]['block'] = 0
def record_failure(self, judge): def record_success(self, target):
"""Record judge failure (proxy failed, not judge block).""" """Record successful target response."""
with self.lock: with self.lock:
if judge not in self.stats: self._ensure(target)
self.stats[judge] = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0} self.stats[target]['success'] += 1
self.stats[judge]['fail'] += 1 self.stats[target]['block'] = 0
def record_block(self, judge): def record_failure(self, target):
"""Record judge blocking the proxy (403, captcha, rate-limit).""" """Record target failure (soft -- doesn't trigger cooldown)."""
with self.lock: with self.lock:
if judge not in self.stats: self._ensure(target)
self.stats[judge] = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0} self.stats[target]['fail'] += 1
self.stats[judge]['block'] += 1
self.stats[judge]['last_block'] = time.time()
def is_available(self, judge): def record_block(self, target):
"""Check if judge is available (not in cooldown).""" """Record target block (403, captcha, DNS failure, rate-limit)."""
with self.lock: with self.lock:
if judge not in self.stats: self._ensure(target)
self.stats[target]['block'] += 1
self.stats[target]['last_block'] = time.time()
def is_available(self, target):
"""Check if target is available (not in cooldown)."""
with self.lock:
if target not in self.stats:
return True return True
s = self.stats[judge] s = self.stats[target]
# Check if in cooldown period
if s['block'] >= self.block_threshold: if s['block'] >= self.block_threshold:
if (time.time() - s['last_block']) < self.cooldown_seconds: if (time.time() - s['last_block']) < self.cooldown_seconds:
return False return False
# Cooldown expired, reset block count
s['block'] = 0 s['block'] = 0
return True return True
def get_available(self, target_list):
"""Return targets not in cooldown."""
return [t for t in target_list if self.is_available(t)]
def get_available_judges(self, judge_list): def get_available_judges(self, judge_list):
"""Return list of judges not in cooldown.""" """Compat alias for get_available()."""
return [j for j in judge_list if self.is_available(j)] return self.get_available(judge_list)
def status_line(self): def status_line(self):
"""Return status summary for logging.""" """Return status summary for logging."""
@@ -76,7 +80,7 @@ class JudgeStats():
blocked = sum(1 for s in self.stats.values() blocked = sum(1 for s in self.stats.values()
if s['block'] >= self.block_threshold and if s['block'] >= self.block_threshold and
(time.time() - s['last_block']) < self.cooldown_seconds) (time.time() - s['last_block']) < self.cooldown_seconds)
return 'judges: %d total, %d in cooldown' % (total, blocked) return '%d total, %d in cooldown' % (total, blocked)
def get_stats(self): def get_stats(self):
"""Return statistics dict for API/dashboard.""" """Return statistics dict for API/dashboard."""
@@ -87,18 +91,21 @@ class JudgeStats():
if s['block'] >= self.block_threshold and if s['block'] >= self.block_threshold and
(now - s['last_block']) < self.cooldown_seconds) (now - s['last_block']) < self.cooldown_seconds)
available = total - in_cooldown available = total - in_cooldown
# Get top judges by success count
top = [] top = []
for judge, s in self.stats.items(): for target, s in self.stats.items():
total_tests = s['success'] + s['fail'] total_tests = s['success'] + s['fail']
if total_tests > 0: if total_tests > 0:
success_pct = (s['success'] * 100.0) / total_tests success_pct = (s['success'] * 100.0) / total_tests
top.append({'judge': judge, 'success': s['success'], top.append({'target': target, 'success': s['success'],
'tests': total_tests, 'rate': round(success_pct, 1)}) 'tests': total_tests, 'rate': round(success_pct, 1)})
top.sort(key=lambda x: x['success'], reverse=True) top.sort(key=lambda x: x['success'], reverse=True)
return {'total': total, 'available': available, 'in_cooldown': in_cooldown, 'top': top} return {'total': total, 'available': available, 'in_cooldown': in_cooldown, 'top': top}
# Backwards-compatible alias
JudgeStats = TargetStats
# HTTP targets - check for specific headers # HTTP targets - check for specific headers
regexes = { regexes = {
'www.facebook.com': 'X-FB-Debug', 'www.facebook.com': 'X-FB-Debug',