fetch: add IPv6, auth proxy, and confidence scoring support
This commit is contained in:
50
dbs.py
50
dbs.py
@@ -56,6 +56,16 @@ def _migrate_geolocation_columns(sqlite):
|
||||
sqlite.commit()
|
||||
|
||||
|
||||
def _migrate_confidence_column(sqlite):
    """Add the ``confidence`` column to ``proxylist`` if it is missing.

    The column stores a 0-100 score indicating extraction reliability and
    defaults to 30 for rows that do not set it explicitly.

    Args:
        sqlite: Database connection exposing ``execute()`` and ``commit()``.
    """
    try:
        # Probe for the column; the query fails when it does not exist yet.
        sqlite.execute('SELECT confidence FROM proxylist LIMIT 1')
    except Exception:
        # The concrete exception type depends on the connection object that
        # is passed in, so the probe failure is caught broadly here.
        # confidence: 0-100 score indicating extraction reliability
        sqlite.execute('ALTER TABLE proxylist ADD COLUMN confidence INT DEFAULT 30')
        # Commit only when the schema was actually changed.
        sqlite.commit()
|
||||
|
||||
|
||||
def compute_proxy_list_hash(proxies):
|
||||
"""Compute MD5 hash of sorted proxy list for change detection.
|
||||
|
||||
@@ -279,12 +289,14 @@ def create_table_if_not_exists(sqlite, dbname):
|
||||
exit_ip TEXT,
|
||||
asn INT,
|
||||
latitude REAL,
|
||||
longitude REAL)""")
|
||||
longitude REAL,
|
||||
confidence INT DEFAULT 30)""")
|
||||
# Migration: add columns to existing databases (must run before creating indexes)
|
||||
_migrate_latency_columns(sqlite)
|
||||
_migrate_anonymity_columns(sqlite)
|
||||
_migrate_asn_column(sqlite)
|
||||
_migrate_geolocation_columns(sqlite)
|
||||
_migrate_confidence_column(sqlite)
|
||||
# Indexes for common query patterns
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
||||
@@ -359,7 +371,10 @@ def insert_proxies(proxydb, proxies, url):
|
||||
|
||||
Args:
|
||||
proxydb: Database connection
|
||||
proxies: List of (address, proto) tuples or plain address strings
|
||||
proxies: List of tuples or plain address strings
|
||||
- (address, proto) - 2-tuple, default confidence
|
||||
- (address, proto, confidence) - 3-tuple with score
|
||||
- address string - default proto and confidence
|
||||
url: Source URL for logging
|
||||
"""
|
||||
if not proxies:
|
||||
@@ -367,17 +382,36 @@ def insert_proxies(proxydb, proxies, url):
|
||||
timestamp = int(time.time())
|
||||
rows = []
|
||||
for p in proxies:
|
||||
# Handle both tuple (address, proto) and plain string formats
|
||||
# Handle tuple (address, proto[, confidence]) and plain string formats
|
||||
confidence = 30 # Default confidence (CONFIDENCE_REGEX)
|
||||
if isinstance(p, tuple):
|
||||
addr, proto = p
|
||||
if len(p) >= 3:
|
||||
addr, proto, confidence = p[0], p[1], p[2]
|
||||
else:
|
||||
addr, proto = p[0], p[1]
|
||||
else:
|
||||
addr, proto = p, None
|
||||
ip, port = addr.split(':')
|
||||
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0))
|
||||
|
||||
# Parse address into ip and port
|
||||
# Formats: ip:port, [ipv6]:port, user:pass@ip:port, user:pass@[ipv6]:port
|
||||
addr_part = addr.split('@')[-1] # Strip auth if present
|
||||
|
||||
if addr_part.startswith('['):
|
||||
# IPv6: [ipv6]:port
|
||||
bracket_end = addr_part.find(']')
|
||||
if bracket_end < 0:
|
||||
continue
|
||||
ip = addr_part[:bracket_end + 1] # Include brackets
|
||||
port = addr_part[bracket_end + 2:] # Skip ]:
|
||||
else:
|
||||
# IPv4: ip:port
|
||||
ip, port = addr_part.rsplit(':', 1)
|
||||
|
||||
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
|
||||
proxydb.executemany(
|
||||
'INSERT OR IGNORE INTO proxylist '
|
||||
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success) '
|
||||
'VALUES (?,?,?,?,?,?,?,?,?,?,?)',
|
||||
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence) '
|
||||
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
|
||||
rows
|
||||
)
|
||||
proxydb.commit()
|
||||
|
||||
Reference in New Issue
Block a user