httpd: add protocol-aware source weighting
Boost SOCKS sources in claim_urls scoring when SOCKS proxies are underrepresented (<40% of pool). Dynamic 0-1.0 boost based on current protocol distribution.
This commit is contained in:
45
httpd.py
45
httpd.py
@@ -350,6 +350,40 @@ def get_worker_test_rate(worker_id):
|
|||||||
return 0.0
|
return 0.0
|
||||||
return total_tests / elapsed
|
return total_tests / elapsed
|
||||||
|
|
||||||
|
def _get_proto_boost():
    """Calculate protocol scarcity boost for URL scoring.

    Returns a value 0.0-1.0 to boost SOCKS sources when SOCKS proxies
    are underrepresented relative to HTTP. Returns 0.0 when balanced
    (SOCKS share >= 40%), 0.5 when no data is available, and 0.0 on any
    database error (best-effort: scoring degrades gracefully rather
    than failing the claim).
    """
    try:
        # _proxy_database is a module global set by ProxyAPIServer.__init__;
        # short-circuit keeps this safe when it has not been set yet.
        db = mysqlite.mysqlite(_proxy_database, str) if _proxy_database else None
        if not db:
            return 0.0
        row = db.execute(
            "SELECT "
            " SUM(CASE WHEN proto='http' THEN 1 ELSE 0 END),"
            " SUM(CASE WHEN proto IN ('socks4','socks5') THEN 1 ELSE 0 END)"
            " FROM proxylist WHERE failed=0"
        ).fetchone()
        # SUM() yields NULL (None) when no rows match, so "no data" is both
        # columns NULL. Checking only row[0] (as `not row[0]`) would also
        # trigger on http_count == 0 and hand an all-SOCKS pool the 0.5
        # default boost instead of letting the ratio logic return 0.0.
        if not row or (row[0] is None and row[1] is None):
            return 0.5  # no data, default mild boost
        http_count, socks_count = row[0] or 0, row[1] or 0
        total = http_count + socks_count
        if total == 0:
            return 0.5
        socks_ratio = float(socks_count) / total
        # Boost SOCKS sources when socks_ratio < 40%
        if socks_ratio >= 0.4:
            return 0.0
        # Linear ramp: 0.0 at 40% share up to 1.0 at 0% share.
        return min((0.4 - socks_ratio) * 2.5, 1.0)  # 0.0-1.0 scale
    except Exception:
        # Deliberate best-effort: a broken/locked proxy DB must never
        # break URL claiming; fall back to no boost.
        return 0.0
|
||||||
|
|
||||||
|
# Global reference to proxy database path (set by ProxyAPIServer.__init__)
|
||||||
|
_proxy_database = None
|
||||||
|
|
||||||
|
|
||||||
def claim_urls(url_db, worker_id, count=5):
|
def claim_urls(url_db, worker_id, count=5):
|
||||||
"""Claim a batch of URLs for worker-driven fetching. Returns list of URL dicts.
|
"""Claim a batch of URLs for worker-driven fetching. Returns list of URL dicts.
|
||||||
|
|
||||||
@@ -360,6 +394,7 @@ def claim_urls(url_db, worker_id, count=5):
|
|||||||
- quality_bonus: 0-0.5 based on working_ratio
|
- quality_bonus: 0-0.5 based on working_ratio
|
||||||
- error_penalty: 0-2.0 based on consecutive errors
|
- error_penalty: 0-2.0 based on consecutive errors
|
||||||
- stale_penalty: 0-1.0 based on unchanged fetches
|
- stale_penalty: 0-1.0 based on unchanged fetches
|
||||||
|
- proto_boost: 0-1.0 for SOCKS sources when SOCKS underrepresented
|
||||||
"""
|
"""
|
||||||
now = time.time()
|
now = time.time()
|
||||||
now_int = int(now)
|
now_int = int(now)
|
||||||
@@ -385,6 +420,9 @@ def claim_urls(url_db, worker_id, count=5):
|
|||||||
list_max_age_seconds = _url_list_max_age_days * 86400
|
list_max_age_seconds = _url_list_max_age_days * 86400
|
||||||
min_added = now_int - list_max_age_seconds
|
min_added = now_int - list_max_age_seconds
|
||||||
|
|
||||||
|
# Boost SOCKS sources when protocol pool is imbalanced
|
||||||
|
proto_boost = _get_proto_boost()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = url_db.execute(
|
rows = url_db.execute(
|
||||||
'''SELECT url, content_hash,
|
'''SELECT url, content_hash,
|
||||||
@@ -393,6 +431,8 @@ def claim_urls(url_db, worker_id, count=5):
|
|||||||
+ COALESCE(working_ratio, 0) * 0.5
|
+ COALESCE(working_ratio, 0) * 0.5
|
||||||
- MIN(error * 0.3, 2.0)
|
- MIN(error * 0.3, 2.0)
|
||||||
- MIN(stale_count * 0.1, 1.0)
|
- MIN(stale_count * 0.1, 1.0)
|
||||||
|
+ CASE WHEN LOWER(url) LIKE '%socks5%' OR LOWER(url) LIKE '%socks4%'
|
||||||
|
THEN ? ELSE 0 END
|
||||||
AS score
|
AS score
|
||||||
FROM uris
|
FROM uris
|
||||||
WHERE error < ?
|
WHERE error < ?
|
||||||
@@ -400,7 +440,7 @@ def claim_urls(url_db, worker_id, count=5):
|
|||||||
AND (added > ? OR proxies_added > 0)
|
AND (added > ? OR proxies_added > 0)
|
||||||
ORDER BY score DESC
|
ORDER BY score DESC
|
||||||
LIMIT ?''',
|
LIMIT ?''',
|
||||||
(now_int, _url_max_fail, now_int, min_added, count * 3)
|
(now_int, proto_boost, _url_max_fail, now_int, min_added, count * 3)
|
||||||
).fetchall()
|
).fetchall()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_log('claim_urls query error: %s' % e, 'error')
|
_log('claim_urls query error: %s' % e, 'error')
|
||||||
@@ -1355,8 +1395,9 @@ class ProxyAPIServer(threading.Thread):
|
|||||||
self.stats_provider = stats_provider
|
self.stats_provider = stats_provider
|
||||||
self.profiling = profiling
|
self.profiling = profiling
|
||||||
self.daemon = True
|
self.daemon = True
|
||||||
global _url_database_path
|
global _url_database_path, _proxy_database
|
||||||
_url_database_path = url_database
|
_url_database_path = url_database
|
||||||
|
_proxy_database = database
|
||||||
self.server = None
|
self.server = None
|
||||||
self._stop_event = threading.Event() if not GEVENT_PATCHED else None
|
self._stop_event = threading.Event() if not GEVENT_PATCHED else None
|
||||||
# Load static library files into cache
|
# Load static library files into cache
|
||||||
|
|||||||
Reference in New Issue
Block a user