ppf: add content hash for duplicate proxy list detection
dbs.py | 30
@@ -37,6 +37,31 @@ def _migrate_asn_column(sqlite):
     sqlite.commit()
 
 
+def _migrate_content_hash_column(sqlite):
+    """Add content_hash column to uris table for duplicate detection."""
+    try:
+        sqlite.execute('SELECT content_hash FROM uris LIMIT 1')
+    except Exception:
+        sqlite.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT')
+        sqlite.commit()
+
+
+def compute_proxy_list_hash(proxies):
+    """Compute MD5 hash of sorted proxy list for change detection.
+
+    Args:
+        proxies: List of proxy strings (ip:port format)
+
+    Returns:
+        Hexadecimal MD5 hash string, or None if list is empty
+    """
+    if not proxies:
+        return None
+    import hashlib
+    sorted_list = '\n'.join(sorted(proxies))
+    return hashlib.md5(sorted_list.encode('utf-8') if hasattr(sorted_list, 'encode') else sorted_list).hexdigest()
+
+
 def update_proxy_latency(sqlite, proxy, latency_ms):
     """Update rolling average latency for a proxy.
 
@@ -159,7 +184,10 @@ def create_table_if_not_exists(sqlite, dbname):
             stale_count INT,
             retrievals INT,
             proxies_added INT,
-            added INT)""")
+            added INT,
+            content_hash TEXT)""")
+    # Migration for existing databases
+    _migrate_content_hash_column(sqlite)
     # Indexes for common query patterns
     sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
    sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
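A minimal usage sketch, not part of the commit: how compute_proxy_list_hash and the new content_hash column could be combined to skip sources whose proxy list has not changed since the last run. The uris.uri key column and the fetch_proxy_list helper are assumptions for illustration; only compute_proxy_list_hash and content_hash come from this diff.

    from dbs import compute_proxy_list_hash  # added in this commit

    def process_source(sqlite, url, fetch_proxy_list):
        """Sketch: re-process a source only when its proxy list changed."""
        proxies = fetch_proxy_list(url)              # e.g. ['1.2.3.4:8080', ...]
        new_hash = compute_proxy_list_hash(proxies)  # None for an empty list

        # 'uri' as the key column is an assumption; adjust to the real schema.
        row = sqlite.execute('SELECT content_hash FROM uris WHERE uri = ?',
                             (url,)).fetchone()
        if new_hash is not None and row and row[0] == new_hash:
            return False  # identical sorted list as last run: skip

        sqlite.execute('UPDATE uris SET content_hash = ? WHERE uri = ?',
                       (new_hash, url))
        sqlite.commit()
        return True

Because compute_proxy_list_hash sorts the list before hashing, the comparison is insensitive to the order in which proxies are returned by a source.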