fetch: add IPv6, auth proxy, and confidence scoring support

This commit is contained in:
Username
2025-12-26 19:13:36 +01:00
parent 50f49a20ff
commit 481dc514fb
2 changed files with 587 additions and 27 deletions

50
dbs.py
View File

@@ -56,6 +56,16 @@ def _migrate_geolocation_columns(sqlite):
sqlite.commit()
def _migrate_confidence_column(sqlite):
"""Add confidence column for extraction quality scoring."""
try:
sqlite.execute('SELECT confidence FROM proxylist LIMIT 1')
except Exception:
# confidence: 0-100 score indicating extraction reliability
sqlite.execute('ALTER TABLE proxylist ADD COLUMN confidence INT DEFAULT 30')
sqlite.commit()
def compute_proxy_list_hash(proxies):
"""Compute MD5 hash of sorted proxy list for change detection.
@@ -279,12 +289,14 @@ def create_table_if_not_exists(sqlite, dbname):
exit_ip TEXT,
asn INT,
latitude REAL,
longitude REAL)""")
longitude REAL,
confidence INT DEFAULT 30)""")
# Migration: add columns to existing databases (must run before creating indexes)
_migrate_latency_columns(sqlite)
_migrate_anonymity_columns(sqlite)
_migrate_asn_column(sqlite)
_migrate_geolocation_columns(sqlite)
_migrate_confidence_column(sqlite)
# Indexes for common query patterns
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
@@ -359,7 +371,10 @@ def insert_proxies(proxydb, proxies, url):
Args:
proxydb: Database connection
proxies: List of (address, proto) tuples or plain address strings
proxies: List of tuples or plain address strings
- (address, proto) - 2-tuple, default confidence
- (address, proto, confidence) - 3-tuple with score
- address string - default proto and confidence
url: Source URL for logging
"""
if not proxies:
@@ -367,17 +382,36 @@ def insert_proxies(proxydb, proxies, url):
timestamp = int(time.time())
rows = []
for p in proxies:
# Handle both tuple (address, proto) and plain string formats
# Handle tuple (address, proto[, confidence]) and plain string formats
confidence = 30 # Default confidence (CONFIDENCE_REGEX)
if isinstance(p, tuple):
addr, proto = p
if len(p) >= 3:
addr, proto, confidence = p[0], p[1], p[2]
else:
addr, proto = p[0], p[1]
else:
addr, proto = p, None
ip, port = addr.split(':')
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0))
# Parse address into ip and port
# Formats: ip:port, [ipv6]:port, user:pass@ip:port, user:pass@[ipv6]:port
addr_part = addr.split('@')[-1] # Strip auth if present
if addr_part.startswith('['):
# IPv6: [ipv6]:port
bracket_end = addr_part.find(']')
if bracket_end < 0:
continue
ip = addr_part[:bracket_end + 1] # Include brackets
port = addr_part[bracket_end + 2:] # Skip ]:
else:
# IPv4: ip:port
ip, port = addr_part.rsplit(':', 1)
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
proxydb.executemany(
'INSERT OR IGNORE INTO proxylist '
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success) '
'VALUES (?,?,?,?,?,?,?,?,?,?,?)',
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence) '
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
rows
)
proxydb.commit()