httpd: add protocol-aware source weighting
Boost SOCKS sources in claim_urls scoring when SOCKS proxies are underrepresented (under 40% of the pool). The boost is dynamic, scaled 0.0-1.0 from the current protocol distribution.
This commit is contained in:
45
httpd.py
45
httpd.py
@@ -350,6 +350,40 @@ def get_worker_test_rate(worker_id):
|
||||
return 0.0
|
||||
return total_tests / elapsed
|
||||
|
||||
def _get_proto_boost():
    """Calculate protocol scarcity boost for URL scoring.

    Returns a value 0.0-1.0 to boost SOCKS sources when SOCKS proxies
    are underrepresented relative to HTTP. Returns 0.0 when balanced
    (SOCKS >= 40% of the working pool) or when the pool cannot be
    inspected; returns 0.5 as a mild default when the pool is empty.
    """
    try:
        # _proxy_database is set by ProxyAPIServer.__init__; without it
        # we cannot inspect the pool, so apply no boost.
        db = mysqlite.mysqlite(_proxy_database, str) if _proxy_database else None
        if not db:
            return 0.0
        row = db.execute(
            "SELECT "
            " SUM(CASE WHEN proto='http' THEN 1 ELSE 0 END),"
            " SUM(CASE WHEN proto IN ('socks4','socks5') THEN 1 ELSE 0 END)"
            " FROM proxylist WHERE failed=0"
        ).fetchone()
        # BUG FIX: the original guard was `if not row or not row[0]`,
        # which treated an all-SOCKS pool (http_count == 0, so row[0]
        # is falsy) as "no data" and returned the 0.5 default instead
        # of computing the real ratio (which would yield 0.0 boost).
        # SUM() returns NULL over zero rows, so (None, None) -> the
        # total == 0 branch below already handles the true no-data case.
        if row is None:
            return 0.5  # no data, default mild boost
        http_count = row[0] or 0
        socks_count = row[1] or 0
        total = http_count + socks_count
        if total == 0:
            return 0.5  # empty pool: default mild boost
        socks_ratio = float(socks_count) / total
        # Boost SOCKS sources only when socks_ratio < 40%; scale
        # linearly so a 0% SOCKS pool receives the full 1.0 boost.
        if socks_ratio >= 0.4:
            return 0.0
        return min((0.4 - socks_ratio) * 2.5, 1.0)  # 0.0-1.0 scale
    except Exception:
        # Scoring must never break URL claiming; fall back to no boost.
        return 0.0
|
||||
|
||||
# Global reference to proxy database path (set by ProxyAPIServer.__init__)
# so module-level helpers such as _get_proto_boost() can open the pool DB.
# Remains None until the server is constructed, in which case boosting
# is skipped.
_proxy_database = None
|
||||
|
||||
|
||||
def claim_urls(url_db, worker_id, count=5):
|
||||
"""Claim a batch of URLs for worker-driven fetching. Returns list of URL dicts.
|
||||
|
||||
@@ -360,6 +394,7 @@ def claim_urls(url_db, worker_id, count=5):
|
||||
- quality_bonus: 0-0.5 based on working_ratio
|
||||
- error_penalty: 0-2.0 based on consecutive errors
|
||||
- stale_penalty: 0-1.0 based on unchanged fetches
|
||||
- proto_boost: 0-1.0 for SOCKS sources when SOCKS underrepresented
|
||||
"""
|
||||
now = time.time()
|
||||
now_int = int(now)
|
||||
@@ -385,6 +420,9 @@ def claim_urls(url_db, worker_id, count=5):
|
||||
list_max_age_seconds = _url_list_max_age_days * 86400
|
||||
min_added = now_int - list_max_age_seconds
|
||||
|
||||
# Boost SOCKS sources when protocol pool is imbalanced
|
||||
proto_boost = _get_proto_boost()
|
||||
|
||||
try:
|
||||
rows = url_db.execute(
|
||||
'''SELECT url, content_hash,
|
||||
@@ -393,6 +431,8 @@ def claim_urls(url_db, worker_id, count=5):
|
||||
+ COALESCE(working_ratio, 0) * 0.5
|
||||
- MIN(error * 0.3, 2.0)
|
||||
- MIN(stale_count * 0.1, 1.0)
|
||||
+ CASE WHEN LOWER(url) LIKE '%socks5%' OR LOWER(url) LIKE '%socks4%'
|
||||
THEN ? ELSE 0 END
|
||||
AS score
|
||||
FROM uris
|
||||
WHERE error < ?
|
||||
@@ -400,7 +440,7 @@ def claim_urls(url_db, worker_id, count=5):
|
||||
AND (added > ? OR proxies_added > 0)
|
||||
ORDER BY score DESC
|
||||
LIMIT ?''',
|
||||
(now_int, _url_max_fail, now_int, min_added, count * 3)
|
||||
(now_int, proto_boost, _url_max_fail, now_int, min_added, count * 3)
|
||||
).fetchall()
|
||||
except Exception as e:
|
||||
_log('claim_urls query error: %s' % e, 'error')
|
||||
@@ -1355,8 +1395,9 @@ class ProxyAPIServer(threading.Thread):
|
||||
self.stats_provider = stats_provider
|
||||
self.profiling = profiling
|
||||
self.daemon = True
|
||||
global _url_database_path
|
||||
global _url_database_path, _proxy_database
|
||||
_url_database_path = url_database
|
||||
_proxy_database = database
|
||||
self.server = None
|
||||
self._stop_event = threading.Event() if not GEVENT_PATCHED else None
|
||||
# Load static library files into cache
|
||||
|
||||
Reference in New Issue
Block a user