ppf: add content hash for duplicate proxy list detection

Username
2025-12-22 00:03:12 +01:00
parent 6b5eb83bf4
commit f382a4ab6a
2 changed files with 56 additions and 6 deletions

dbs.py (30 changed lines)

@@ -37,6 +37,31 @@ def _migrate_asn_column(sqlite):
         sqlite.commit()
 
+
+def _migrate_content_hash_column(sqlite):
+    """Add content_hash column to uris table for duplicate detection."""
+    try:
+        sqlite.execute('SELECT content_hash FROM uris LIMIT 1')
+    except Exception:
+        sqlite.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT')
+        sqlite.commit()
+
+
+def compute_proxy_list_hash(proxies):
+    """Compute MD5 hash of sorted proxy list for change detection.
+
+    Args:
+        proxies: List of proxy strings (ip:port format)
+
+    Returns:
+        Hexadecimal MD5 hash string, or None if list is empty
+    """
+    if not proxies:
+        return None
+    import hashlib
+    sorted_list = '\n'.join(sorted(proxies))
+    return hashlib.md5(sorted_list.encode('utf-8')).hexdigest()
+
+
 def update_proxy_latency(sqlite, proxy, latency_ms):
     """Update rolling average latency for a proxy.
@@ -159,7 +184,10 @@ def create_table_if_not_exists(sqlite, dbname):
         stale_count INT,
         retrievals INT,
         proxies_added INT,
-        added INT)""")
+        added INT,
+        content_hash TEXT)""")
+    # Migration for existing databases
+    _migrate_content_hash_column(sqlite)
     # Indexes for common query patterns
     sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
     sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')

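For reference, a minimal standalone sketch of the hashing scheme above. Sorting before joining makes the digest order-insensitive, so a source that merely reshuffles the same proxies still matches its stored content_hash (the example addresses are illustrative only):

import hashlib

def compute_proxy_list_hash(proxies):
    # Mirrors dbs.compute_proxy_list_hash above: order-insensitive MD5.
    if not proxies:
        return None
    return hashlib.md5('\n'.join(sorted(proxies)).encode('utf-8')).hexdigest()

# Same set, different order -> same digest, so the fetch counts as stale.
a = compute_proxy_list_hash(['1.2.3.4:8080', '5.6.7.8:3128'])
b = compute_proxy_list_hash(['5.6.7.8:3128', '1.2.3.4:8080'])
assert a == b

# An empty extraction hashes to None, so the comparison in ppf.py never
# short-circuits on an empty list.
assert compute_proxy_list_hash([]) is None

MD5 is used here purely for change detection, not for any security property, so its weak collision resistance is not a concern.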
ppf.py (32 changed lines)

@@ -90,7 +90,7 @@ def import_proxies_from_file(proxydb, fn):
     return 1
 
 class Leechered(threading.Thread):
-    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
+    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, content_hash, proxy):
        self.status = 'nok'
        self.proxylist = []
        self.running = True
@@ -100,6 +100,9 @@ class Leechered(threading.Thread):
         self.retrievals = retrievals
         self.proxies_added = proxies_added
         self.content_type = content_type
+        self.content_hash = content_hash
+        self.new_hash = None
+        self.hash_unchanged = False
         self.proxy = proxy
         self.execute = ''
         threading.Thread.__init__(self)
@@ -132,6 +135,22 @@ class Leechered(threading.Thread):
             content = ''
         unique = fetch.extract_proxies(content, filter_known=False)
+
+        # Compute hash of all extracted proxies for change detection
+        self.new_hash = dbs.compute_proxy_list_hash(unique)
+
+        # Check if content unchanged (same proxies as last time)
+        if self.new_hash and self.content_hash and self.new_hash == self.content_hash:
+            self.hash_unchanged = True
+            self.proxylist = []
+            self.stale_count += 1
+            _log('%s: unchanged (hash match)' % self.url.split('/')[2], 'stale')
+            # Content unchanged - increment stale_count, update check_time
+            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
+            self.status = 'ok'
+            return
+
+        # Content changed or first fetch - proceed with normal processing
         self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
         proxy_count = len(self.proxylist)
@@ -203,7 +222,7 @@ def main():
     scraperd = scraper.Scraper(config)
     scraperd.start()
 
-    qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
+    qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type,content_hash FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
     threads = []
     rows = []
     reqtime = time.time() - 3600
@@ -233,8 +252,10 @@ def main():
                 new = [ p for p in proxylist if not fetch.is_known_proxy(p) ]
                 if new:
                     fetch.add_known_proxies(new)
-                execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)
-                urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', execute)
+                # Update content_hash if we have a new one
+                new_hash = thread.new_hash
+                execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url)
+                urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=?,content_hash=? where url=?', execute)
                 urldb.commit()
                 if new: dbs.insert_proxies(proxydb, new, url)
@@ -245,7 +266,8 @@ def main():
                 urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
                 urldb.commit()
                 rows.remove(row)
-                t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], p)
+                # row: url, stale_count, error, retrievals, proxies_added, content_type, content_hash
+                t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], row[6], p)
                 threads.append(t)
                 t.start()
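One side effect of incrementing stale_count on a hash match: the scheduling query (qurl) selects a URL only when check_time + ? + ((error + stale_count) * ?) < ?, so every unchanged fetch pushes the next recheck further out. A worked example, assuming a one-hour base and a 30-minute step (hypothetical values; the real ones are bound as query parameters at runtime):

# Hypothetical base/step; mirrors the WHERE clause arithmetic in qurl.
base, step = 3600, 1800

def next_eligible(check_time, error, stale_count):
    # URL becomes eligible once: check_time + base + (error+stale_count)*step < now
    return check_time + base + (error + stale_count) * step

# Three consecutive hash matches stretch the wait from 1 h to 2.5 h.
assert next_eligible(0, 0, 3) == 3600 + 3 * 1800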