diff --git a/httpd.py b/httpd.py
index 576c64b..ece8f91 100644
--- a/httpd.py
+++ b/httpd.py
@@ -350,6 +350,40 @@ def get_worker_test_rate(worker_id):
         return 0.0
     return total_tests / elapsed
 
+def _get_proto_boost():
+    """Calculate protocol scarcity boost for URL scoring.
+
+    Returns a value 0.0-1.0 to boost SOCKS sources when SOCKS proxies
+    are underrepresented relative to HTTP. Returns 0.0 when balanced.
+    """
+    try:
+        db = mysqlite.mysqlite(_proxy_database, str) if _proxy_database else None
+        if not db:
+            return 0.0
+        row = db.execute(
+            "SELECT "
+            " SUM(CASE WHEN proto='http' THEN 1 ELSE 0 END),"
+            " SUM(CASE WHEN proto IN ('socks4','socks5') THEN 1 ELSE 0 END)"
+            " FROM proxylist WHERE failed=0"
+        ).fetchone()
+        if not row:
+            return 0.5  # no data, default mild boost
+        http_count, socks_count = row[0] or 0, row[1] or 0
+        total = http_count + socks_count
+        if total == 0:
+            return 0.5
+        socks_ratio = float(socks_count) / total
+        # Boost SOCKS sources when socks_ratio < 40%
+        if socks_ratio >= 0.4:
+            return 0.0
+        return min((0.4 - socks_ratio) * 2.5, 1.0)  # 0.0-1.0 scale
+    except Exception:
+        return 0.0
+
+# Global reference to proxy database path (set by ProxyAPIServer.__init__)
+_proxy_database = None
+
+
 def claim_urls(url_db, worker_id, count=5):
     """Claim a batch of URLs for worker-driven fetching. Returns list of URL dicts.
@@ -360,6 +394,7 @@ def claim_urls(url_db, worker_id, count=5):
     - quality_bonus: 0-0.5 based on working_ratio
     - error_penalty: 0-2.0 based on consecutive errors
     - stale_penalty: 0-1.0 based on unchanged fetches
+    - proto_boost: 0-1.0 for SOCKS sources when SOCKS underrepresented
     """
     now = time.time()
     now_int = int(now)
@@ -385,6 +420,9 @@ def claim_urls(url_db, worker_id, count=5):
     list_max_age_seconds = _url_list_max_age_days * 86400
     min_added = now_int - list_max_age_seconds
 
+    # Boost SOCKS sources when protocol pool is imbalanced
+    proto_boost = _get_proto_boost()
+
     try:
         rows = url_db.execute(
             '''SELECT url, content_hash,
@@ -393,6 +431,8 @@ def claim_urls(url_db, worker_id, count=5):
                    + COALESCE(working_ratio, 0) * 0.5
                    - MIN(error * 0.3, 2.0)
                    - MIN(stale_count * 0.1, 1.0)
+                   + CASE WHEN LOWER(url) LIKE '%socks5%' OR LOWER(url) LIKE '%socks4%'
+                     THEN ? ELSE 0 END
                AS score
                FROM uris
                WHERE error < ?
@@ -400,7 +440,7 @@ def claim_urls(url_db, worker_id, count=5):
                  AND (added > ? OR proxies_added > 0)
                ORDER BY score DESC
                LIMIT ?''',
-            (now_int, _url_max_fail, now_int, min_added, count * 3)
+            (now_int, proto_boost, _url_max_fail, now_int, min_added, count * 3)
         ).fetchall()
     except Exception as e:
         _log('claim_urls query error: %s' % e, 'error')
@@ -1355,8 +1395,9 @@ class ProxyAPIServer(threading.Thread):
         self.stats_provider = stats_provider
         self.profiling = profiling
         self.daemon = True
-        global _url_database_path
+        global _url_database_path, _proxy_database
         _url_database_path = url_database
+        _proxy_database = database
         self.server = None
         self._stop_event = threading.Event() if not GEVENT_PATCHED else None
         # Load static library files into cache