watchd: add target health tracking for all target pools

Generalizes JudgeStats into TargetStats with cooldown-based filtering
for head targets, SSL targets, and IRC servers. Targets that repeatedly
block or fail are temporarily avoided, preventing unfair proxy failures
when a target goes down. Exposes per-pool health via /api/stats.
This commit is contained in:
Username
2026-02-18 18:21:53 +01:00
parent 3e5c486e7e
commit e985f52fe6
2 changed files with 125 additions and 70 deletions

View File

@@ -14,60 +14,64 @@ def try_div(a, b):
return 0
class JudgeStats():
"""Track per-judge success/failure rates for reliability scoring.
class TargetStats():
"""Track per-target success/failure rates with cooldown.
Judges that frequently block or rate-limit are temporarily avoided.
Stats decay over time to allow recovery.
Targets that frequently block or fail are temporarily avoided.
Block counters reset on success or cooldown expiry.
Used for all target pools: judges, head targets, SSL targets, IRC servers.
"""
def __init__(self, cooldown_seconds=300, block_threshold=3):
    """Create an empty tracker.

    cooldown_seconds -- seconds during which a target is avoided once
        its block count has reached the threshold.
    block_threshold -- number of blocks (since the last reset) that
        puts a target into cooldown.
    """
    # Guards every read and write of self.stats.
    self.lock = threading.Lock()
    # target -> {'success': n, 'fail': n, 'block': n, 'last_block': timestamp}
    self.stats = {}
    self.cooldown_seconds = cooldown_seconds
    self.block_threshold = block_threshold
def record_success(self, target):
    """Record successful target response.

    Increments the target's success counter and clears its block
    counter, so a target that answers again is no longer counted
    towards the cooldown threshold.
    """
    with self.lock:
        self._ensure(target)
        entry = self.stats[target]
        entry['success'] += 1
        # Reset block count on success.
        entry['block'] = 0
def _ensure(self, target):
    """Create a zeroed stats entry for target unless one already exists.

    Does not take self.lock itself.
    """
    blank = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0}
    # setdefault leaves an existing entry untouched.
    self.stats.setdefault(target, blank)
def record_failure(self, target):
    """Record target failure (soft -- doesn't trigger cooldown).

    Only the 'fail' counter is incremented; the block counter and the
    last-block timestamp are left untouched.
    """
    with self.lock:
        self._ensure(target)
        self.stats[target]['fail'] += 1
def record_block(self, target):
    """Record target block (403, captcha, DNS failure, rate-limit).

    Increments the block counter and stamps 'last_block' with the
    current time.
    """
    with self.lock:
        self._ensure(target)
        entry = self.stats[target]
        entry['block'] += 1
        entry['last_block'] = time.time()
def is_available(self, target):
    """Check if target is available (not in cooldown).

    A target is in cooldown while its block count is at or above
    block_threshold and its last block is less than cooldown_seconds
    old. When the cooldown has expired the block count is reset to 0
    as a side effect of this call. Targets without any recorded stats
    are always available.
    """
    with self.lock:
        if target not in self.stats:
            return True
        s = self.stats[target]
        if s['block'] >= self.block_threshold:
            if (time.time() - s['last_block']) < self.cooldown_seconds:
                return False
            # Cooldown expired: start counting blocks from zero again.
            s['block'] = 0
        return True
def get_available(self, target_list):
    """Return the targets from target_list that are not in cooldown.

    Input order is preserved; each target is checked with
    is_available().
    """
    return list(filter(self.is_available, target_list))
def get_available_judges(self, judge_list):
    """Compat alias for get_available().

    Kept under the older judge-specific name; returns the entries of
    judge_list that are not in cooldown.
    """
    return self.get_available(judge_list)
def status_line(self):
"""Return status summary for logging."""
@@ -76,7 +80,7 @@ class JudgeStats():
blocked = sum(1 for s in self.stats.values()
if s['block'] >= self.block_threshold and
(time.time() - s['last_block']) < self.cooldown_seconds)
return 'judges: %d total, %d in cooldown' % (total, blocked)
return '%d total, %d in cooldown' % (total, blocked)
def get_stats(self):
"""Return statistics dict for API/dashboard."""
@@ -87,18 +91,21 @@ class JudgeStats():
if s['block'] >= self.block_threshold and
(now - s['last_block']) < self.cooldown_seconds)
available = total - in_cooldown
# Get top judges by success count
top = []
for judge, s in self.stats.items():
for target, s in self.stats.items():
total_tests = s['success'] + s['fail']
if total_tests > 0:
success_pct = (s['success'] * 100.0) / total_tests
top.append({'judge': judge, 'success': s['success'],
top.append({'target': target, 'success': s['success'],
'tests': total_tests, 'rate': round(success_pct, 1)})
top.sort(key=lambda x: x['success'], reverse=True)
return {'total': total, 'available': available, 'in_cooldown': in_cooldown, 'top': top}
# Backwards-compatible alias: the older JudgeStats name stays bound to the
# generalized TargetStats class, so references to JudgeStats resolve to it.
JudgeStats = TargetStats
# HTTP targets - check for specific headers
regexes = {
'www.facebook.com': 'X-FB-Debug',