diff --git a/httpd.py b/httpd.py
index 2207fff..576c64b 100644
--- a/httpd.py
+++ b/httpd.py
@@ -1564,6 +1564,10 @@ class ProxyAPIServer(threading.Thread):
                 stats['db_health'] = get_db_health(db)
             except Exception as e:
                 _log('api/stats db error: %s' % e, 'warn')
+        # Add URL pipeline stats
+        url_stats = self._get_url_stats()
+        if url_stats is not None:
+            stats['urls'] = url_stats
         # Add profiling flag (from constructor or stats_provider)
         if 'profiling' not in stats:
             stats['profiling'] = self.profiling
@@ -1908,6 +1912,65 @@ class ProxyAPIServer(threading.Thread):
             _log('_get_db_stats error: %s' % e, 'warn')
         return stats
 
+    def _get_url_stats(self):
+        """Get URL pipeline statistics from the websites database.
+
+        Returns a dict of counters (or None when no URL database is
+        configured or any query fails); errors are logged, not raised,
+        so /api/stats keeps working without the URL pipeline.
+        """
+        if not self.url_database:
+            return None
+        try:
+            db = mysqlite.mysqlite(self.url_database, str)
+            stats = {}
+            now = int(time.time())
+
+            # Total URLs and health breakdown
+            row = db.execute('SELECT COUNT(*) FROM uris').fetchone()
+            stats['total'] = row[0] if row else 0
+            row = db.execute('SELECT COUNT(*) FROM uris WHERE error >= 10').fetchone()
+            stats['dead'] = row[0] if row else 0
+            row = db.execute('SELECT COUNT(*) FROM uris WHERE error > 0 AND error < 10').fetchone()
+            stats['erroring'] = row[0] if row else 0
+            row = db.execute('SELECT COUNT(*) FROM uris WHERE error = 0').fetchone()
+            stats['healthy'] = row[0] if row else 0
+
+            # Recently active (fetched in last hour)
+            row = db.execute(
+                'SELECT COUNT(*) FROM uris WHERE check_time >= ?',
+                (now - 3600,)).fetchone()
+            stats['fetched_last_hour'] = row[0] if row else 0
+
+            # Productive sources (have produced working proxies)
+            row = db.execute(
+                'SELECT COUNT(*) FROM uris WHERE working_ratio > 0'
+            ).fetchone()
+            stats['productive'] = row[0] if row else 0
+
+            # Aggregate yield; SUM() is NULL (None) on an empty table, hence "or 0"
+            row = db.execute(
+                'SELECT SUM(proxies_added), SUM(retrievals) FROM uris'
+            ).fetchone()
+            stats['total_proxies_extracted'] = (row[0] or 0) if row else 0
+            stats['total_fetches'] = (row[1] or 0) if row else 0
+
+            # Currently claimed
+            with _url_claims_lock:
+                stats['claimed'] = len(_url_claims)
+
+            # Top sources by working_ratio (productive URLs only)
+            rows = db.execute(
+                'SELECT url, working_ratio, yield_rate, proxies_added, retrievals '
+                'FROM uris WHERE working_ratio > 0 AND retrievals > 0 '
+                'ORDER BY working_ratio DESC LIMIT 10'
+            ).fetchall()
+            stats['top_sources'] = [{
+                'url': r[0], 'working_ratio': round(r[1], 3),
+                'yield_rate': round(r[2], 1), 'proxies_added': r[3],
+                'fetches': r[4],
+            } for r in rows]
+
+            return stats
+        except Exception as e:
+            _log('_get_url_stats error: %s' % e, 'warn')
+            return None
+
     def _get_workers_data(self, db):
         """Get worker status data. Used by /api/workers and /api/dashboard."""
         now = time.time()