From f382a4ab6a39bae82f5468121079255bc2bba0e0 Mon Sep 17 00:00:00 2001
From: Username
Date: Mon, 22 Dec 2025 00:03:12 +0100
Subject: [PATCH] ppf: add content hash for duplicate proxy list detection

---
 dbs.py | 30 +++++++++++++++++++++++++++++-
 ppf.py | 32 +++++++++++++++++++++++++++-----
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/dbs.py b/dbs.py
index 2cc9e1f..d9df0c4 100644
--- a/dbs.py
+++ b/dbs.py
@@ -37,6 +37,31 @@ def _migrate_asn_column(sqlite):
     sqlite.commit()
 
 
+def _migrate_content_hash_column(sqlite):
+    """Add content_hash column to uris table for duplicate detection."""
+    try:
+        sqlite.execute('SELECT content_hash FROM uris LIMIT 1')
+    except Exception:
+        sqlite.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT')
+        sqlite.commit()
+
+
+def compute_proxy_list_hash(proxies):
+    """Compute MD5 hash of sorted proxy list for change detection.
+
+    Args:
+        proxies: List of proxy strings (ip:port format)
+
+    Returns:
+        Hexadecimal MD5 hash string, or None if list is empty
+    """
+    if not proxies:
+        return None
+    import hashlib
+    sorted_list = '\n'.join(sorted(proxies))
+    return hashlib.md5(sorted_list.encode('utf-8')).hexdigest()
+
+
 def update_proxy_latency(sqlite, proxy, latency_ms):
     """Update rolling average latency for a proxy.
 
@@ -159,7 +184,10 @@ def create_table_if_not_exists(sqlite, dbname):
             stale_count INT,
             retrievals INT,
             proxies_added INT,
-            added INT)""")
+            added INT,
+            content_hash TEXT)""")
+    # Migration for existing databases
+    _migrate_content_hash_column(sqlite)
     # Indexes for common query patterns
     sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
     sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
diff --git a/ppf.py b/ppf.py
index 74e20fc..a4c6d8f 100644
--- a/ppf.py
+++ b/ppf.py
@@ -90,7 +90,7 @@ def import_proxies_from_file(proxydb, fn):
     return 1
 
 class Leechered(threading.Thread):
-    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
+    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, content_hash, proxy):
         self.status = 'nok'
         self.proxylist = []
         self.running = True
@@ -100,6 +100,9 @@ class Leechered(threading.Thread):
         self.retrievals = retrievals
         self.proxies_added = proxies_added
         self.content_type = content_type
+        self.content_hash = content_hash
+        self.new_hash = None
+        self.hash_unchanged = False
         self.proxy = proxy
         self.execute = ''
         threading.Thread.__init__(self)
@@ -132,6 +135,22 @@ class Leechered(threading.Thread):
             content = ''
 
         unique = fetch.extract_proxies(content, filter_known=False)
+
+        # Compute hash of all extracted proxies for change detection
+        self.new_hash = dbs.compute_proxy_list_hash(unique)
+
+        # Check if content unchanged (same proxies as last time)
+        if self.new_hash and self.content_hash and self.new_hash == self.content_hash:
+            self.hash_unchanged = True
+            self.proxylist = []
+            self.stale_count += 1
+            _log('%s: unchanged (hash match)' % self.url.split('/')[2], 'stale')
+            # Content unchanged - increment stale_count, update check_time
+            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
+            self.status = 'ok'
+            return
+
+        # Content changed or first fetch - proceed with normal processing
         self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
         proxy_count = len(self.proxylist)
 
@@ -203,7 +222,7 @@ def main():
     scraperd = scraper.Scraper(config)
     scraperd.start()
 
-    qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?)
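
Note (illustrative, not part of the commit): a standalone sanity check of
the new compute_proxy_list_hash helper. It assumes dbs.py is importable
from the repository root; the file name and proxy values are invented.

    # check_hash.py - hypothetical file name, for illustration only
    import dbs

    a = ['1.2.3.4:8080', '5.6.7.8:3128']
    b = ['5.6.7.8:3128', '1.2.3.4:8080']   # same proxies, shuffled

    # the helper sorts before hashing, so ordering must not matter
    assert dbs.compute_proxy_list_hash(a) == dbs.compute_proxy_list_hash(b)

    # an empty list yields None, so callers can skip the comparison
    assert dbs.compute_proxy_list_hash([]) is None

    print('ok')

Design note: sorting before hashing makes the digest order-independent, so
a source that merely reshuffles its list is still detected as unchanged.
MD5 serves purely as a change detector here, not as a security primitive,
so collision resistance is not a concern.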
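
The schema migration relies on a probe-then-ALTER pattern: try to SELECT
the new column and add it only when the query fails. The snippet below
demonstrates that pattern standalone against an in-memory SQLite database;
the table and column names match the patch, the rest is scaffolding. The
patch itself catches the broader Exception rather than
sqlite3.OperationalError.

    import sqlite3

    conn = sqlite3.connect(':memory:')
    # pre-patch schema: no content_hash column yet
    conn.execute('CREATE TABLE uris (url TEXT, added INT)')

    try:
        conn.execute('SELECT content_hash FROM uris LIMIT 1')
    except sqlite3.OperationalError:
        # old database: column is missing, add it in place
        conn.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT')
        conn.commit()

    # probing again now succeeds, so the migration is idempotent
    conn.execute('SELECT content_hash FROM uris LIMIT 1')
    print('migration ok')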