Compare commits

...

2 Commits

Author SHA1 Message Date
Username
d1e22a388c httpd: add ASN enrichment for worker-reported proxies
All checks were successful
CI / validate (push) Successful in 21s
Load the pyasn database in httpd and look up the ASN when workers report
working proxies. Previously, ASN was populated only by proxywatchd,
which doesn't run independently on the master node, leaving all
worker-reported proxies with asn=null.
2026-02-22 11:18:51 +01:00
Username
7ae0ac0c26 ppf: add periodic re-seeding of proxy source URLs
Seed sources that error out are permanently excluded from claiming.
Over time this starves the pipeline. Re-seed every 6 hours and reset
the error state of exhausted sources, preventing the starvation loop
that caused the previous outage.
2026-02-22 11:18:45 +01:00
3 changed files with 46 additions and 5 deletions

26
dbs.py
View File

@@ -639,10 +639,18 @@ PROXY_SOURCES = [
]
def seed_proxy_sources(sqlite):
"""Seed known proxy list sources into uris table."""
def seed_proxy_sources(sqlite, reset_errors=False):
"""Seed known proxy list sources into uris table.
Args:
sqlite: Database connection
reset_errors: If True, reset error/stale counts on existing seed
sources that have errored out, allowing them to be
retried. Safe to call periodically.
"""
timestamp = int(time.time())
added = 0
reset = 0
for url in PROXY_SOURCES:
try:
sqlite.execute(
@@ -653,11 +661,21 @@ def seed_proxy_sources(sqlite):
)
if sqlite.cursor.rowcount > 0:
added += 1
elif reset_errors:
# Reset errored-out seed sources so they get reclaimed
sqlite.execute(
'UPDATE uris SET error = 0, stale_count = 0, '
'check_interval = 3600, check_time = 0 '
'WHERE url = ? AND error >= 5',
(url,)
)
if sqlite.cursor.rowcount > 0:
reset += 1
except Exception as e:
_log('seed_urls insert error for %s: %s' % (url, e), 'warn')
sqlite.commit()
if added > 0:
_log('seeded %d proxy source URLs' % added, 'info')
if added > 0 or reset > 0:
_log('seed sources: %d new, %d reset' % (added, reset), 'info')
def save_session_state(sqlite, stats):

View File

@@ -31,6 +31,13 @@ except (ImportError, IOError, ValueError):
_geodb = None
_geolite = False
# ASN lookup (optional)
try:
import pyasn
_asndb = pyasn.pyasn(os.path.join("data", "ipasn.dat"))
except (ImportError, IOError):
_asndb = None
# Rate limiting configuration
_rate_limits = defaultdict(list)
_rate_lock = threading.Lock()
@@ -604,7 +611,7 @@ def submit_proxy_reports(db, worker_id, proxies):
''', (proxy_key, ip, port, proto, now_int, now_int, latency, now_int,
checktype, target))
# Geolocate if IP2Location available
# Geolocate and ASN lookup
if _geolite and _geodb:
try:
rec = _geodb.get_all(ip)
@@ -614,6 +621,15 @@ def submit_proxy_reports(db, worker_id, proxies):
(rec.country_short, proxy_key))
except Exception:
pass
if _asndb:
try:
asn_result = _asndb.lookup(ip)
if asn_result and asn_result[0]:
db.execute(
'UPDATE proxylist SET asn=? WHERE proxy=?',
(asn_result[0], proxy_key))
except Exception:
pass
# Track per-URL working count for working_ratio
if source_url:

7
ppf.py
View File

@@ -1045,8 +1045,15 @@ def main():
statusmsg = time.time()
list_max_age_seconds = config.ppf.list_max_age_days * 86400
last_skip_log = 0
last_reseed = time.time()
reseed_interval = 6 * 3600 # re-seed sources every 6 hours
while True:
try:
# Periodic re-seeding: reset errored-out seed sources
if time.time() - last_reseed >= reseed_interval:
dbs.seed_proxy_sources(urldb, reset_errors=True)
last_reseed = time.time()
# When ppf threads = 0, skip URL fetching (workers handle it via /api/claim-urls)
if config.ppf.threads == 0:
time.sleep(60)