dbs: add CDN filtering and verification tables
- CDN_PREFIXES: filter Cloudflare, Fastly, Akamai, CloudFront, Google - is_cdn_ip(): check if IP belongs to known CDN ranges - insert_proxies(): skip CDN IPs with count in log message - verification tables: worker_results, verification_queue, worker_trust - queue_verification(): add proxies for manager re-testing - get_verification_stats(): queue size and trigger breakdown - get_all_worker_trust(): trust scores for all workers
This commit is contained in:
337
dbs.py
337
dbs.py
@@ -366,6 +366,37 @@ def create_table_if_not_exists(sqlite, dbname):
|
|||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# CDN IP ranges to filter out (not real proxies)
|
||||||
|
CDN_PREFIXES = (
|
||||||
|
# Cloudflare
|
||||||
|
'141.101.', '172.67.', '172.64.', '104.16.', '104.17.', '104.18.',
|
||||||
|
'104.19.', '104.20.', '104.21.', '104.22.', '104.23.', '104.24.',
|
||||||
|
'104.25.', '104.26.', '104.27.', '188.114.', '190.93.', '197.234.',
|
||||||
|
'198.41.', '173.245.', '103.21.', '103.22.', '103.31.',
|
||||||
|
# Fastly
|
||||||
|
'151.101.',
|
||||||
|
# Akamai (common ranges)
|
||||||
|
'23.32.', '23.33.', '23.34.', '23.35.', '23.36.', '23.37.',
|
||||||
|
'23.38.', '23.39.', '23.40.', '23.41.', '23.42.', '23.43.',
|
||||||
|
'23.44.', '23.45.', '23.46.', '23.47.', '23.48.', '23.49.',
|
||||||
|
'23.50.', '23.51.', '23.52.', '23.53.', '23.54.', '23.55.',
|
||||||
|
'23.56.', '23.57.', '23.58.', '23.59.', '23.60.', '23.61.',
|
||||||
|
'23.62.', '23.63.', '23.64.', '23.65.', '23.66.', '23.67.',
|
||||||
|
# Amazon CloudFront
|
||||||
|
'13.32.', '13.33.', '13.35.', '13.224.', '13.225.', '13.226.', '13.227.',
|
||||||
|
# Google
|
||||||
|
'34.64.', '34.65.', '34.66.', '34.67.', '34.68.', '34.69.', '34.70.', '34.71.',
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_cdn_ip(ip):
|
||||||
|
"""Check if IP belongs to known CDN ranges."""
|
||||||
|
for prefix in CDN_PREFIXES:
|
||||||
|
if ip.startswith(prefix):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def insert_proxies(proxydb, proxies, url):
|
def insert_proxies(proxydb, proxies, url):
|
||||||
"""Insert new proxies into database.
|
"""Insert new proxies into database.
|
||||||
|
|
||||||
@@ -381,6 +412,7 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
return
|
return
|
||||||
timestamp = int(time.time())
|
timestamp = int(time.time())
|
||||||
rows = []
|
rows = []
|
||||||
|
filtered = 0
|
||||||
for p in proxies:
|
for p in proxies:
|
||||||
# Handle tuple (address, proto[, confidence]) and plain string formats
|
# Handle tuple (address, proto[, confidence]) and plain string formats
|
||||||
confidence = 30 # Default confidence (CONFIDENCE_REGEX)
|
confidence = 30 # Default confidence (CONFIDENCE_REGEX)
|
||||||
@@ -407,6 +439,11 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
# IPv4: ip:port
|
# IPv4: ip:port
|
||||||
ip, port = addr_part.rsplit(':', 1)
|
ip, port = addr_part.rsplit(':', 1)
|
||||||
|
|
||||||
|
# Filter out CDN IPs (not real proxies)
|
||||||
|
if is_cdn_ip(ip):
|
||||||
|
filtered += 1
|
||||||
|
continue
|
||||||
|
|
||||||
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
|
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
|
||||||
proxydb.executemany(
|
proxydb.executemany(
|
||||||
'INSERT OR IGNORE INTO proxylist '
|
'INSERT OR IGNORE INTO proxylist '
|
||||||
@@ -415,7 +452,10 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
rows
|
rows
|
||||||
)
|
)
|
||||||
proxydb.commit()
|
proxydb.commit()
|
||||||
_log('+%d proxy/ies from %s' % (len(proxies), url), 'added')
|
if filtered:
|
||||||
|
_log('+%d proxy/ies from %s (filtered %d CDN IPs)' % (len(rows), url, filtered), 'added')
|
||||||
|
else:
|
||||||
|
_log('+%d proxy/ies from %s' % (len(rows), url), 'added')
|
||||||
|
|
||||||
|
|
||||||
def insert_urls(urls, search, sqlite):
|
def insert_urls(urls, search, sqlite):
|
||||||
@@ -485,8 +525,8 @@ def seed_proxy_sources(sqlite):
|
|||||||
)
|
)
|
||||||
if sqlite.cursor.rowcount > 0:
|
if sqlite.cursor.rowcount > 0:
|
||||||
added += 1
|
added += 1
|
||||||
except Exception:
|
except Exception as e:
|
||||||
pass
|
_log('seed_urls insert error for %s: %s' % (url, e), 'warn')
|
||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
if added > 0:
|
if added > 0:
|
||||||
_log('seeded %d proxy source URLs' % added, 'info')
|
_log('seeded %d proxy source URLs' % added, 'info')
|
||||||
@@ -628,6 +668,293 @@ def get_stats_history(sqlite, hours=24):
|
|||||||
return [dict(zip(cols, row)) for row in rows]
|
return [dict(zip(cols, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def create_verification_tables(sqlite):
|
||||||
|
"""Create tables for the verification system.
|
||||||
|
|
||||||
|
Tables:
|
||||||
|
proxy_results: Per-test result history for disagreement detection
|
||||||
|
verification_queue: Disputed proxies awaiting manager verification
|
||||||
|
worker_trust: Worker accuracy tracking and trust scores
|
||||||
|
"""
|
||||||
|
# Per-test result history - stores last N results per proxy per worker
|
||||||
|
sqlite.execute("""CREATE TABLE IF NOT EXISTS proxy_results (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
proxy TEXT NOT NULL,
|
||||||
|
worker_id TEXT NOT NULL,
|
||||||
|
tested INTEGER NOT NULL,
|
||||||
|
working INTEGER NOT NULL,
|
||||||
|
latency_ms INTEGER,
|
||||||
|
error_category TEXT,
|
||||||
|
target TEXT,
|
||||||
|
FOREIGN KEY (proxy) REFERENCES proxylist(proxy))""")
|
||||||
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_results_proxy ON proxy_results(proxy, tested DESC)')
|
||||||
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_results_worker ON proxy_results(worker_id, tested DESC)')
|
||||||
|
|
||||||
|
# Verification queue - proxies needing manager verification
|
||||||
|
sqlite.execute("""CREATE TABLE IF NOT EXISTS verification_queue (
|
||||||
|
proxy TEXT PRIMARY KEY,
|
||||||
|
trigger TEXT NOT NULL,
|
||||||
|
priority INTEGER DEFAULT 1,
|
||||||
|
queued_at INTEGER NOT NULL,
|
||||||
|
worker_a TEXT,
|
||||||
|
worker_b TEXT,
|
||||||
|
result_a INTEGER,
|
||||||
|
result_b INTEGER)""")
|
||||||
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_verify_priority ON verification_queue(priority DESC, queued_at)')
|
||||||
|
|
||||||
|
# Worker trust tracking
|
||||||
|
sqlite.execute("""CREATE TABLE IF NOT EXISTS worker_trust (
|
||||||
|
worker_id TEXT PRIMARY KEY,
|
||||||
|
verifications INTEGER DEFAULT 0,
|
||||||
|
correct INTEGER DEFAULT 0,
|
||||||
|
incorrect INTEGER DEFAULT 0,
|
||||||
|
trust_score REAL DEFAULT 1.0,
|
||||||
|
last_updated INTEGER)""")
|
||||||
|
|
||||||
|
sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_proxy_result(sqlite, proxy, worker_id, working, latency_ms=None,
|
||||||
|
error_category=None, target=None, max_results=10):
|
||||||
|
"""Insert a proxy test result and prune old results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
proxy: Proxy address (ip:port)
|
||||||
|
worker_id: Worker that tested the proxy
|
||||||
|
working: 1 if passed, 0 if failed
|
||||||
|
latency_ms: Response latency in milliseconds
|
||||||
|
error_category: Error type (timeout, refused, auth, etc.)
|
||||||
|
target: Target host that was tested
|
||||||
|
max_results: Maximum results to keep per proxy (default 10)
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
sqlite.execute(
|
||||||
|
'''INSERT INTO proxy_results
|
||||||
|
(proxy, worker_id, tested, working, latency_ms, error_category, target)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)''',
|
||||||
|
(proxy, worker_id, now, working, latency_ms, error_category, target)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prune old results for this proxy (keep last max_results)
|
||||||
|
sqlite.execute('''
|
||||||
|
DELETE FROM proxy_results
|
||||||
|
WHERE proxy = ? AND id NOT IN (
|
||||||
|
SELECT id FROM proxy_results
|
||||||
|
WHERE proxy = ?
|
||||||
|
ORDER BY tested DESC
|
||||||
|
LIMIT ?
|
||||||
|
)
|
||||||
|
''', (proxy, proxy, max_results))
|
||||||
|
|
||||||
|
|
||||||
|
def check_for_disagreement(sqlite, proxy, worker_id, working, window_seconds=300):
|
||||||
|
"""Check if this result disagrees with recent results from other workers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
proxy: Proxy address
|
||||||
|
worker_id: Current worker
|
||||||
|
working: Current test result (1=pass, 0=fail)
|
||||||
|
window_seconds: Time window to check (default 5 minutes)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple (disagreement_found, other_worker_id, other_result) or (False, None, None)
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
cutoff = now - window_seconds
|
||||||
|
|
||||||
|
rows = sqlite.execute('''
|
||||||
|
SELECT worker_id, working FROM proxy_results
|
||||||
|
WHERE proxy = ? AND worker_id != ? AND tested > ?
|
||||||
|
ORDER BY tested DESC LIMIT 3
|
||||||
|
''', (proxy, worker_id, cutoff)).fetchall()
|
||||||
|
|
||||||
|
for other_worker, other_result in rows:
|
||||||
|
if other_result != working:
|
||||||
|
return (True, other_worker, other_result)
|
||||||
|
|
||||||
|
return (False, None, None)
|
||||||
|
|
||||||
|
|
||||||
|
def queue_verification(sqlite, proxy, trigger, priority=1,
|
||||||
|
worker_a=None, worker_b=None, result_a=None, result_b=None):
|
||||||
|
"""Add a proxy to the verification queue.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
proxy: Proxy address
|
||||||
|
trigger: Why verification is needed (disagreement, sudden_death, resurrection, spot_check)
|
||||||
|
priority: 1=low, 2=medium, 3=high
|
||||||
|
worker_a: First worker involved (for disagreements)
|
||||||
|
worker_b: Second worker involved (for disagreements)
|
||||||
|
result_a: First worker's result
|
||||||
|
result_b: Second worker's result
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
# Check if already queued with higher priority
|
||||||
|
existing = sqlite.execute(
|
||||||
|
'SELECT priority FROM verification_queue WHERE proxy = ?', (proxy,)
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if existing and existing[0] >= priority:
|
||||||
|
return # Already queued with equal or higher priority
|
||||||
|
|
||||||
|
sqlite.execute('''
|
||||||
|
INSERT OR REPLACE INTO verification_queue
|
||||||
|
(proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
''', (proxy, trigger, priority, now, worker_a, worker_b, result_a, result_b))
|
||||||
|
|
||||||
|
|
||||||
|
def get_pending_verifications(sqlite, limit=10):
|
||||||
|
"""Get highest priority pending verifications.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
limit: Maximum number to return
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of dicts with proxy and verification details
|
||||||
|
"""
|
||||||
|
rows = sqlite.execute('''
|
||||||
|
SELECT proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b
|
||||||
|
FROM verification_queue
|
||||||
|
ORDER BY priority DESC, queued_at ASC
|
||||||
|
LIMIT ?
|
||||||
|
''', (limit,)).fetchall()
|
||||||
|
|
||||||
|
cols = ['proxy', 'trigger', 'priority', 'queued_at',
|
||||||
|
'worker_a', 'worker_b', 'result_a', 'result_b']
|
||||||
|
return [dict(zip(cols, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def remove_from_verification_queue(sqlite, proxy):
|
||||||
|
"""Remove a proxy from the verification queue after verification."""
|
||||||
|
sqlite.execute('DELETE FROM verification_queue WHERE proxy = ?', (proxy,))
|
||||||
|
|
||||||
|
|
||||||
|
def update_worker_trust(sqlite, worker_id, was_correct):
|
||||||
|
"""Update worker trust score after verification.
|
||||||
|
|
||||||
|
Uses Bayesian smoothing: trust_score = (correct + 5) / (verifications + 10)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
worker_id: Worker to update
|
||||||
|
was_correct: True if worker's result matched verification
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
|
||||||
|
# Get current stats or initialize
|
||||||
|
row = sqlite.execute(
|
||||||
|
'SELECT verifications, correct, incorrect FROM worker_trust WHERE worker_id = ?',
|
||||||
|
(worker_id,)
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
verifications = row[0] + 1
|
||||||
|
correct = row[1] + (1 if was_correct else 0)
|
||||||
|
incorrect = row[2] + (0 if was_correct else 1)
|
||||||
|
else:
|
||||||
|
verifications = 1
|
||||||
|
correct = 1 if was_correct else 0
|
||||||
|
incorrect = 0 if was_correct else 1
|
||||||
|
|
||||||
|
# Bayesian smoothing
|
||||||
|
trust_score = (correct + 5.0) / (verifications + 10.0)
|
||||||
|
|
||||||
|
sqlite.execute('''
|
||||||
|
INSERT OR REPLACE INTO worker_trust
|
||||||
|
(worker_id, verifications, correct, incorrect, trust_score, last_updated)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
|
''', (worker_id, verifications, correct, incorrect, trust_score, now))
|
||||||
|
|
||||||
|
|
||||||
|
def get_worker_trust(sqlite, worker_id):
|
||||||
|
"""Get trust score for a worker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sqlite: Database connection
|
||||||
|
worker_id: Worker to look up
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with trust stats or default values if not found
|
||||||
|
"""
|
||||||
|
row = sqlite.execute(
|
||||||
|
'SELECT verifications, correct, incorrect, trust_score, last_updated FROM worker_trust WHERE worker_id = ?',
|
||||||
|
(worker_id,)
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
return {
|
||||||
|
'verifications': row[0],
|
||||||
|
'correct': row[1],
|
||||||
|
'incorrect': row[2],
|
||||||
|
'trust_score': row[3],
|
||||||
|
'last_updated': row[4]
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
'verifications': 0,
|
||||||
|
'correct': 0,
|
||||||
|
'incorrect': 0,
|
||||||
|
'trust_score': 1.0, # New workers start trusted
|
||||||
|
'last_updated': None
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_worker_trust(sqlite):
|
||||||
|
"""Get trust scores for all workers.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping worker_id to trust stats
|
||||||
|
"""
|
||||||
|
rows = sqlite.execute(
|
||||||
|
'SELECT worker_id, verifications, correct, incorrect, trust_score, last_updated FROM worker_trust'
|
||||||
|
).fetchall()
|
||||||
|
|
||||||
|
result = {}
|
||||||
|
for row in rows:
|
||||||
|
result[row[0]] = {
|
||||||
|
'verifications': row[1],
|
||||||
|
'correct': row[2],
|
||||||
|
'incorrect': row[3],
|
||||||
|
'trust_score': row[4],
|
||||||
|
'last_updated': row[5]
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_verification_stats(sqlite):
|
||||||
|
"""Get verification system statistics.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with queue_size, verified counts, disagreement counts
|
||||||
|
"""
|
||||||
|
now = int(time.time())
|
||||||
|
today_start = (now // 86400) * 86400
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
|
||||||
|
# Queue size
|
||||||
|
row = sqlite.execute('SELECT COUNT(*) FROM verification_queue').fetchone()
|
||||||
|
stats['queue_size'] = row[0] if row else 0
|
||||||
|
|
||||||
|
# Queue by priority
|
||||||
|
rows = sqlite.execute(
|
||||||
|
'SELECT priority, COUNT(*) FROM verification_queue GROUP BY priority'
|
||||||
|
).fetchall()
|
||||||
|
stats['queue_by_priority'] = {row[0]: row[1] for row in rows}
|
||||||
|
|
||||||
|
# Queue by trigger type
|
||||||
|
rows = sqlite.execute(
|
||||||
|
'SELECT trigger, COUNT(*) FROM verification_queue GROUP BY trigger'
|
||||||
|
).fetchall()
|
||||||
|
stats['queue_by_trigger'] = {row[0]: row[1] for row in rows}
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
def analyze_database(sqlite):
|
def analyze_database(sqlite):
|
||||||
"""Run ANALYZE to update SQLite query planner statistics.
|
"""Run ANALYZE to update SQLite query planner statistics.
|
||||||
|
|
||||||
@@ -699,7 +1026,7 @@ def get_database_stats(sqlite):
|
|||||||
row = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()
|
row = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()
|
||||||
stats['uri_count'] = row[0] if row else 0
|
stats['uri_count'] = row[0] if row else 0
|
||||||
|
|
||||||
except Exception:
|
except Exception as e:
|
||||||
pass
|
_log('get_database_stats error: %s' % e, 'warn')
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
|
|||||||
Reference in New Issue
Block a user