refactor core modules, integrate network stats

This commit is contained in:
Username
2025-12-25 11:13:20 +01:00
parent 2201515b10
commit 269fed55ff
8 changed files with 270 additions and 219 deletions

View File

@@ -48,6 +48,7 @@ class EngineTracker(object):
self.state_file = state_file or STATE_FILE
self._save_interval = 60 # seconds between saves
self._last_save = 0
self._lock = threading.RLock() # Reentrant lock for nested calls
# Build list of (engine_instance, identifier)
self.engines = []
def get_available(self):
    """Return (engine, identifier) pairs not currently in backoff.

    An engine is available when it has no entry in ``backoff_until``
    or its backoff deadline has already passed.

    Returns:
        list of (engine_instance, identifier) tuples.
    """
    now = time.time()
    available = []
    # Iterate under the lock so concurrent mark_failure/mark_success
    # calls cannot mutate engines/backoff_until mid-loop.
    with self._lock:
        for eng, ident in self.engines:
            if ident not in self.backoff_until or now >= self.backoff_until[ident]:
                available.append((eng, ident))
    return available
def mark_success(self, ident):
    """Reset failure count for *ident* on success.

    Clears the failure counter and any pending backoff window,
    increments the success counter, then persists state.
    """
    with self._lock:
        self.failures[ident] = 0
        self.success_count[ident] = self.success_count.get(ident, 0) + 1
        # A success ends any backoff window immediately; pop() avoids
        # the separate membership test before del.
        self.backoff_until.pop(ident, None)
        # save_state re-acquires the lock; safe because it is an RLock.
        self.save_state()
def mark_failure(self, ident):
    """Increment failure count for *ident* and set exponential backoff.

    Backoff doubles with each consecutive failure, starting at
    ``base_delay`` and capped at ``max_delay``.

    Returns:
        The backoff delay in seconds that was applied.
    """
    with self._lock:
        count = self.failures.get(ident, 0) + 1
        self.failures[ident] = count
        # Exponential backoff: base, 2x, 4x, ... capped at max_delay.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[ident] = time.time() + delay
        now = time.time()
        # Throttle logging: at most one rate line per log_interval.
        if (now - self.last_rate_log) >= self.log_interval:
            name = ident.split('/')[2] if '/' in ident else ident
            avail, in_backoff, total = self.get_status()
            _log('%d/%d engines in backoff (last: %s)' % (in_backoff, total, name), 'rate')
            self.last_rate_log = now
        # save_state re-acquires the lock; safe because it is an RLock.
        self.save_state()
    return delay
def get_stats(self):
    """Return detailed engine stats for the API/dashboard.

    Returns:
        dict with aggregate counts ('available', 'in_backoff', 'total',
        'total_successes') and 'engines': per-engine dicts for the top
        20 engines sorted by success count descending.
    """
    now = time.time()
    with self._lock:
        # get_available re-acquires the lock; safe because it is an RLock.
        available = self.get_available()
        available_ids = set(ident for _, ident in available)

        engines_list = []
        for eng, ident in self.engines:
            # Shorten identifier for display
            if '/' in ident:
                name = ident.split('/')[2]  # extract domain from URL
            else:
                name = ident

            backoff_remaining = 0
            if ident in self.backoff_until:
                backoff_remaining = max(0, int(self.backoff_until[ident] - now))

            engines_list.append({
                'name': name,
                'available': ident in available_ids,
                'successes': self.success_count.get(ident, 0),
                'failures': self.failures.get(ident, 0),
                'backoff_remaining': backoff_remaining
            })
        # Aggregate while still holding the lock for a consistent snapshot.
        total_successes = sum(self.success_count.values())

    # Sort by success count descending
    engines_list.sort(key=lambda x: -x['successes'])
    return {
        'available': len(available),
        'in_backoff': len(self.engines) - len(available),
        'total': len(self.engines),
        'total_successes': total_successes,
        'engines': engines_list[:20]  # Top 20 engines
    }
def save_state(self, force=False):
    """Persist failure/backoff/success state to ``state_file`` as JSON.

    Saves are throttled to one per ``_save_interval`` seconds unless
    *force* is true. Dicts are snapshotted under the lock; the actual
    file write happens outside the lock (atomic tmp-file + rename) to
    minimize lock hold time.

    NOTE(review): the def line was outside the visible hunk — signature
    reconstructed from the ``force`` reference in the body and the
    no-argument call sites; confirm against the full file.
    """
    now = time.time()
    # Throttle: skip unless forced or the save interval has elapsed.
    if not force and (now - self._last_save) < self._save_interval:
        return
    with self._lock:
        try:
            # Ensure directory exists
            state_dir = os.path.dirname(self.state_file)
            if state_dir and not os.path.exists(state_dir):
                os.makedirs(state_dir)
            # Copy dicts under lock for thread-safe serialization
            data = {
                'failures': dict(self.failures),
                'backoff_until': dict(self.backoff_until),
                'success_count': dict(self.success_count),
                'saved_at': now
            }
            self._last_save = now
        except (IOError, OSError) as e:
            _log('failed to save scraper state: %s' % str(e), 'warn')
            return
    # File I/O outside lock to minimize lock hold time
    try:
        # Atomic write: dump to a sibling tmp file, then rename over
        # the real file so readers never see a partial state file.
        tmp_file = self.state_file + '.tmp'
        with open(tmp_file, 'w') as f:
            json.dump(data, f, indent=2)
        os.rename(tmp_file, self.state_file)
    except (IOError, OSError) as e:
        _log('failed to save scraper state: %s' % str(e), 'warn')