refactor core modules, integrate network stats
This commit is contained in:
116
scraper.py
116
scraper.py
@@ -48,6 +48,7 @@ class EngineTracker(object):
|
||||
self.state_file = state_file or STATE_FILE
|
||||
self._save_interval = 60 # seconds between saves
|
||||
self._last_save = 0
|
||||
self._lock = threading.RLock() # Reentrant lock for nested calls
|
||||
|
||||
# Build list of (engine_instance, identifier)
|
||||
self.engines = []
|
||||
def get_available(self):
    """Return engines not currently in backoff.

    Takes a single ``time.time()`` snapshot so every engine is judged
    against the same instant, then scans ``self.engines`` under
    ``self._lock`` so ``self.backoff_until`` cannot change mid-iteration
    (the lock is an RLock, so callers already holding it are safe).

    Returns:
        list of ``(engine_instance, identifier)`` tuples usable right now,
        in the same order as ``self.engines``.
    """
    now = time.time()
    available = []
    with self._lock:
        for eng, ident in self.engines:
            # Available when no backoff is recorded or it has expired.
            if ident not in self.backoff_until or now >= self.backoff_until[ident]:
                available.append((eng, ident))
    return available
def mark_success(self, ident):
    """Reset failure count on success.

    Under ``self._lock``: zeroes the failure counter for *ident*, bumps its
    success tally, and clears any pending backoff.  Then persists state via
    ``save_state()`` (throttled internally) outside the critical section —
    safe either way since the lock is reentrant.

    Args:
        ident: engine identifier (typically its URL).
    """
    with self._lock:
        self.failures[ident] = 0
        self.success_count[ident] = self.success_count.get(ident, 0) + 1
        if ident in self.backoff_until:
            del self.backoff_until[ident]
    self.save_state()
def mark_failure(self, ident):
    """Increment failure count and set exponential backoff.

    The delay doubles per consecutive failure, capped at ``max_delay``:
    ``delay = min(base_delay * 2**(count - 1), max_delay)``.

    A one-line backoff summary is logged at most once per
    ``self.log_interval`` seconds so a flapping engine cannot spam the log.

    Args:
        ident: engine identifier (typically its URL).

    Returns:
        The backoff delay in seconds applied to this engine.
    """
    with self._lock:
        count = self.failures.get(ident, 0) + 1
        self.failures[ident] = count
        # Exponential backoff, capped.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[ident] = time.time() + delay
        now = time.time()
        # Rate-limited status logging.
        if (now - self.last_rate_log) >= self.log_interval:
            # ident is usually a URL: element [2] of the '/'-split is the
            # host in 'scheme://host/...'.
            name = ident.split('/')[2] if '/' in ident else ident
            avail, in_backoff, total = self.get_status()
            _log('%d/%d engines in backoff (last: %s)' % (in_backoff, total, name), 'rate')
            self.last_rate_log = now
    self.save_state()
    return delay
def get_stats(self):
    """Return detailed stats for API/dashboard.

    Returns:
        dict with:
            'available':       count of engines not in backoff
            'in_backoff':      count of engines currently backed off
            'total':           total engine count
            'total_successes': sum of all per-engine success counters
            'engines':         up to 20 per-engine dicts (name, available,
                               successes, failures, backoff_remaining),
                               sorted by successes descending.
    """
    now = time.time()
    with self._lock:
        # get_available() re-acquires the same RLock; safe because reentrant.
        available = self.get_available()
        available_ids = set(ident for _, ident in available)

        engines_list = []
        for eng, ident in self.engines:
            # Shorten identifier for display
            if '/' in ident:
                name = ident.split('/')[2]  # extract domain from URL
            else:
                name = ident

            backoff_remaining = 0
            if ident in self.backoff_until:
                backoff_remaining = max(0, int(self.backoff_until[ident] - now))

            engines_list.append({
                'name': name,
                'available': ident in available_ids,
                'successes': self.success_count.get(ident, 0),
                'failures': self.failures.get(ident, 0),
                'backoff_remaining': backoff_remaining
            })

        # Sum once under the lock instead of recomputing in the result dict.
        total_successes = sum(self.success_count.values())

    # Sort by success count descending
    engines_list.sort(key=lambda x: -x['successes'])

    return {
        'available': len(available),
        'in_backoff': len(self.engines) - len(available),
        'total': len(self.engines),
        'total_successes': total_successes,
        'engines': engines_list[:20]  # Top 20 engines
    }
def save_state(self, force=False):
    """Persist failure/backoff/success state to ``self.state_file`` as JSON.

    Saves are throttled to one per ``self._save_interval`` seconds unless
    *force* is true.  The dicts are snapshotted (shallow-copied) under
    ``self._lock`` so serialization is thread-safe; the file I/O happens
    outside the lock to minimize lock hold time.  The write is atomic
    (temp file + ``os.rename``).  I/O errors are logged, never raised.

    NOTE(review): the method header is outside this view; ``force`` is
    inferred from the throttle check — confirm against the full file.

    Args:
        force: when true, bypass the save-interval throttle.
    """
    now = time.time()
    if not force and (now - self._last_save) < self._save_interval:
        return

    with self._lock:
        # Copy dicts under lock for thread-safe serialization
        data = {
            'failures': dict(self.failures),
            'backoff_until': dict(self.backoff_until),
            'success_count': dict(self.success_count),
            'saved_at': now
        }
        self._last_save = now

    # File I/O outside lock to minimize lock hold time
    try:
        # Ensure directory exists; exist_ok avoids the check-then-create
        # race of an os.path.exists() guard.
        state_dir = os.path.dirname(self.state_file)
        if state_dir:
            os.makedirs(state_dir, exist_ok=True)

        # Atomic write
        tmp_file = self.state_file + '.tmp'
        with open(tmp_file, 'w') as f:
            json.dump(data, f, indent=2)
        os.rename(tmp_file, self.state_file)

    except (IOError, OSError) as e:
        _log('failed to save scraper state: %s' % str(e), 'warn')
Reference in New Issue
Block a user