All checks were successful
CI / validate (push) Successful in 20s
Add 21 new proxy source URLs: missing protocol variants from existing repos, 4 new GitHub repos, openproxylist.xyz and spys.me APIs, 5 web scraper targets, 2 Telegram channels.
1299 lines
48 KiB
Python
1299 lines
48 KiB
Python
#!/usr/bin/env python2
|
|
# -*- coding: utf-8 -*-
|
|
"""Database table creation and insertion utilities."""
|
|
|
|
import time
|
|
from misc import _log
|
|
|
|
|
|
def _migrate_latency_columns(sqlite):
    """Backfill latency-tracking columns on databases created before they existed."""
    try:
        # Probe query: succeeds only when the column is already present.
        sqlite.execute('SELECT avg_latency FROM proxylist LIMIT 1')
        return
    except Exception:
        pass
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN avg_latency REAL DEFAULT 0')
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN latency_samples INT DEFAULT 0')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_anonymity_columns(sqlite):
    """Backfill anonymity-detection columns on databases created before they existed."""
    try:
        # Probe query: succeeds only when the columns are already present.
        sqlite.execute('SELECT anonymity FROM proxylist LIMIT 1')
        return
    except Exception:
        pass
    # anonymity: transparent, anonymous, elite, or NULL (unknown)
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN anonymity TEXT')
    # exit_ip: the IP seen by the target server
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN exit_ip TEXT')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_asn_column(sqlite):
    """Backfill the ASN column on databases created before it existed."""
    try:
        sqlite.execute('SELECT asn FROM proxylist LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN asn INT')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_content_hash_column(sqlite):
    """Backfill content_hash on the uris table (used for duplicate detection)."""
    try:
        sqlite.execute('SELECT content_hash FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_geolocation_columns(sqlite):
    """Backfill latitude/longitude columns for precise proxy mapping."""
    try:
        # Probe query: succeeds only when the columns are already present.
        sqlite.execute('SELECT latitude FROM proxylist LIMIT 1')
        return
    except Exception:
        pass
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN latitude REAL')
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN longitude REAL')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_confidence_column(sqlite):
    """Backfill the confidence column (extraction quality scoring)."""
    try:
        sqlite.execute('SELECT confidence FROM proxylist LIMIT 1')
        return  # column already present
    except Exception:
        pass
    # confidence: 0-100 score indicating extraction reliability
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN confidence INT DEFAULT 30')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_source_proto(sqlite):
    """Backfill source_proto columns preserving scraper-detected protocol intelligence."""
    try:
        # Probe query: succeeds only when the columns are already present.
        sqlite.execute('SELECT source_proto FROM proxylist LIMIT 1')
        return
    except Exception:
        pass
    # source_proto: protocol detected by scraper (never overwritten by tests)
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN source_proto TEXT')
    # source_confidence: scraper confidence score (0-100)
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN source_confidence INT DEFAULT 0')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_protos_working(sqlite):
    """Backfill the protos_working column (multi-protocol storage)."""
    try:
        sqlite.execute('SELECT protos_working FROM proxylist LIMIT 1')
        return  # column already present
    except Exception:
        pass
    # protos_working: comma-separated list of working protos (e.g. "http,socks5")
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN protos_working TEXT')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_last_seen(sqlite):
    """Backfill the last_seen column (worker-reported proxy freshness)."""
    try:
        sqlite.execute('SELECT last_seen FROM proxylist LIMIT 1')
        return  # column already present
    except Exception:
        pass
    # last_seen: unix timestamp of most recent "working" report from any worker
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN last_seen INT DEFAULT 0')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_last_check_columns(sqlite):
    """Backfill last_check and last_target columns (test provenance tracking)."""
    for name, sqltype in (('last_check', 'TEXT'), ('last_target', 'TEXT')):
        try:
            sqlite.execute('SELECT %s FROM proxylist LIMIT 1' % name)
            continue  # column already present
        except Exception:
            pass
        sqlite.execute('ALTER TABLE proxylist ADD COLUMN %s %s' % (name, sqltype))
        sqlite.commit()
|
|
|
|
|
|
def _migrate_uri_check_interval(sqlite):
    """Backfill the adaptive check_interval column on the uris table."""
    try:
        sqlite.execute('SELECT check_interval FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN check_interval INT DEFAULT 3600')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_uri_working_ratio(sqlite):
    """Backfill the working_ratio column on the uris table (proxy quality tracking)."""
    try:
        sqlite.execute('SELECT working_ratio FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN working_ratio REAL DEFAULT 0.0')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_uri_avg_fetch_time(sqlite):
    """Backfill the avg_fetch_time column on the uris table (fetch latency EMA)."""
    try:
        sqlite.execute('SELECT avg_fetch_time FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN avg_fetch_time INT DEFAULT 0')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_uri_last_worker(sqlite):
    """Backfill the last_worker column on the uris table."""
    try:
        sqlite.execute('SELECT last_worker FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN last_worker TEXT')
    sqlite.commit()
|
|
|
|
|
|
def _migrate_uri_yield_rate(sqlite):
    """Backfill the yield_rate column on the uris table (proxy yield EMA)."""
    try:
        sqlite.execute('SELECT yield_rate FROM uris LIMIT 1')
        return  # column already present
    except Exception:
        pass
    sqlite.execute('ALTER TABLE uris ADD COLUMN yield_rate REAL DEFAULT 0.0')
    sqlite.commit()
|
|
|
|
|
|
def compute_proxy_list_hash(proxies):
    """Compute MD5 hash of sorted proxy list for change detection.

    Args:
        proxies: List of proxy strings (ip:port) or tuples (address, proto)

    Returns:
        Hexadecimal MD5 hash string, or None if list is empty
    """
    if not proxies:
        return None
    import hashlib
    # Handle both tuple (address, proto) and plain string formats
    addresses = [p[0] if isinstance(p, tuple) else p for p in proxies]
    # Sort first so the hash is independent of the input order.
    joined = '\n'.join(sorted(addresses))
    # str has .encode on both Python 2 and 3, so encode unconditionally;
    # the previous hasattr() guard was an always-true dead branch.
    return hashlib.md5(joined.encode('utf-8')).hexdigest()
|
|
|
|
|
|
def update_proxy_latency(sqlite, proxy, latency_ms):
    """Fold one latency measurement into a proxy's rolling average.

    Args:
        sqlite: Database connection
        proxy: Proxy address (ip:port)
        latency_ms: Response latency in milliseconds
    """
    current = sqlite.execute(
        'SELECT avg_latency, latency_samples FROM proxylist WHERE proxy=?',
        (proxy,)
    ).fetchone()
    if not current:
        return  # unknown proxy: nothing to update

    prev_avg = current[0] or 0
    prev_samples = current[1] or 0
    # Sample counter saturates at 100 so the EMA weight never vanishes.
    sample_count = min(prev_samples + 1, 100)
    if prev_samples:
        # Exponential moving average: recent samples weigh more.
        weight = 2.0 / (sample_count + 1)
        new_avg = weight * latency_ms + (1 - weight) * prev_avg
    else:
        new_avg = latency_ms

    sqlite.execute(
        'UPDATE proxylist SET avg_latency=?, latency_samples=? WHERE proxy=?',
        (new_avg, sample_count, proxy)
    )
|
|
|
|
|
|
def batch_update_proxy_latency(sqlite, latency_updates):
    """Batch update latency for multiple proxies.

    Fetches current values with chunked SELECTs, computes the new exponential
    moving averages in Python, then issues one executemany UPDATE. Much faster
    than calling update_proxy_latency per proxy.

    Args:
        sqlite: Database connection
        latency_updates: List of (proxy, latency_ms) tuples
    """
    if not latency_updates:
        return

    proxies = [p for p, _ in latency_updates]
    latency_map = {p: lat for p, lat in latency_updates}

    # Fetch current values. Chunk the IN clause: SQLite historically caps
    # bound parameters at 999 per statement (SQLITE_MAX_VARIABLE_NUMBER),
    # so a single giant placeholder list would fail on large batches.
    rows = []
    for start in range(0, len(proxies), 500):
        chunk = proxies[start:start + 500]
        placeholders = ','.join('?' * len(chunk))
        rows.extend(sqlite.execute(
            'SELECT proxy, avg_latency, latency_samples FROM proxylist WHERE proxy IN (%s)' % placeholders,
            chunk
        ).fetchall())

    # Compute new averages
    updates = []
    for row in rows:
        proxy, old_avg, samples = row[0], row[1] or 0, row[2] or 0
        latency_ms = latency_map.get(proxy)
        if latency_ms is None:
            continue

        # Sample counter saturates at 100 so the EMA keeps adapting.
        new_samples = min(samples + 1, 100)
        if samples == 0:
            new_avg = latency_ms
        else:
            # Weight recent samples more heavily
            alpha = 2.0 / (new_samples + 1)
            new_avg = alpha * latency_ms + (1 - alpha) * old_avg

        updates.append((new_avg, new_samples, proxy))

    # Batch update
    if updates:
        sqlite.executemany(
            'UPDATE proxylist SET avg_latency=?, latency_samples=? WHERE proxy=?',
            updates
        )
|
|
|
|
|
|
def update_proxy_anonymity(sqlite, proxy, exit_ip, proxy_ip, reveals_headers=None):
    """Classify and store a proxy's anonymity level.

    Anonymity levels:
        transparent: exit_ip == proxy_ip (proxy reveals itself)
        anonymous:   exit_ip != proxy_ip and revealing headers were added
                     (also the conservative default when headers are unknown)
        elite:       exit_ip != proxy_ip and no revealing headers

    Args:
        sqlite: Database connection
        proxy: Proxy address (ip:port)
        exit_ip: IP address seen by target server
        proxy_ip: Proxy's IP address
        reveals_headers: True if proxy adds revealing headers, False if not,
            None if unknown
    """
    if not exit_ip:
        return

    def _canonical(ip):
        # Canonical dotted-quad form (drops leading zeros); None if malformed.
        if not ip:
            return None
        octets = ip.strip().split('.')
        if len(octets) != 4:
            return None
        try:
            return '.'.join(str(int(o)) for o in octets)
        except ValueError:
            return None

    exit_ip = _canonical(exit_ip)
    proxy_ip = _canonical(proxy_ip)
    if not exit_ip:
        return

    # Classify: transparent beats everything, then header evidence decides.
    if exit_ip == proxy_ip:
        level = 'transparent'
    elif reveals_headers is True:
        level = 'anonymous'
    elif reveals_headers is False:
        level = 'elite'
    else:
        # No header check performed: assume the weaker classification.
        level = 'anonymous'

    sqlite.execute(
        'UPDATE proxylist SET anonymity=?, exit_ip=? WHERE proxy=?',
        (level, exit_ip, proxy)
    )
|
|
|
|
|
|
def batch_update_proxy_anonymity(sqlite, anonymity_updates):
    """Batch-classify and store anonymity levels for multiple proxies.

    Args:
        sqlite: Database connection
        anonymity_updates: List of (proxy, exit_ip, proxy_ip, reveals_headers) tuples
    """
    if not anonymity_updates:
        return

    def _canonical(ip):
        # Canonical dotted-quad form (drops leading zeros); None if malformed.
        if not ip:
            return None
        octets = ip.strip().split('.')
        if len(octets) != 4:
            return None
        try:
            return '.'.join(str(int(o)) for o in octets)
        except ValueError:
            return None

    pending = []
    for proxy, exit_ip, proxy_ip, reveals_headers in anonymity_updates:
        exit_ip = _canonical(exit_ip)
        proxy_ip = _canonical(proxy_ip)
        if not exit_ip:
            continue  # cannot classify without a usable exit IP

        # Classify: transparent beats everything, then header evidence decides.
        if exit_ip == proxy_ip:
            level = 'transparent'
        elif reveals_headers is True:
            level = 'anonymous'
        elif reveals_headers is False:
            level = 'elite'
        else:
            # No header check performed: assume the weaker classification.
            level = 'anonymous'

        pending.append((level, exit_ip, proxy))

    if pending:
        sqlite.executemany(
            'UPDATE proxylist SET anonymity=?, exit_ip=? WHERE proxy=?',
            pending
        )
|
|
|
|
|
|
def create_table_if_not_exists(sqlite, dbname):
    """Create database table with indexes if it doesn't exist.

    Args:
        sqlite: Database connection
        dbname: Which table to create; one of 'proxylist', 'uris',
            'stats_history', or 'session_state'. Unknown names are ignored
            (only the final commit runs).
    """
    if dbname == 'proxylist':
        # Full current schema; the _migrate_* calls below cover databases
        # created before these columns existed.
        sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist (
            proxy BLOB UNIQUE,
            country BLOB,
            added INT,
            failed INT,
            tested INT,
            dronebl INT,
            proto TEXT,
            mitm INT,
            success_count INT,
            ip TEXT,
            port INT,
            consecutive_success INT,
            total_duration INT,
            avg_latency REAL DEFAULT 0,
            latency_samples INT DEFAULT 0,
            anonymity TEXT,
            exit_ip TEXT,
            asn INT,
            latitude REAL,
            longitude REAL,
            confidence INT DEFAULT 30,
            source_proto TEXT,
            source_confidence INT DEFAULT 0,
            protos_working TEXT,
            last_seen INT DEFAULT 0,
            last_check TEXT,
            last_target TEXT)""")
        # Migration: add columns to existing databases (must run before creating indexes)
        _migrate_latency_columns(sqlite)
        _migrate_anonymity_columns(sqlite)
        _migrate_asn_column(sqlite)
        _migrate_geolocation_columns(sqlite)
        _migrate_confidence_column(sqlite)
        _migrate_source_proto(sqlite)
        _migrate_protos_working(sqlite)
        _migrate_last_seen(sqlite)
        _migrate_last_check_columns(sqlite)
        # Indexes for common query patterns
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_anonymity ON proxylist(anonymity)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_asn ON proxylist(asn)')

    elif dbname == 'uris':
        # Source-URL bookkeeping: fetch errors, staleness, and yield stats.
        sqlite.execute("""CREATE TABLE IF NOT EXISTS uris (
            url TEXT UNIQUE,
            content_type TEXT,
            check_time INT,
            error INT,
            stale_count INT,
            retrievals INT,
            proxies_added INT,
            added INT,
            content_hash TEXT)""")
        # Migration for existing databases
        _migrate_content_hash_column(sqlite)
        _migrate_uri_check_interval(sqlite)
        _migrate_uri_working_ratio(sqlite)
        _migrate_uri_avg_fetch_time(sqlite)
        _migrate_uri_last_worker(sqlite)
        _migrate_uri_yield_rate(sqlite)
        # Indexes for common query patterns
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')

    elif dbname == 'stats_history':
        # Hourly stats snapshots for historical graphs
        sqlite.execute("""CREATE TABLE IF NOT EXISTS stats_history (
            timestamp INT PRIMARY KEY,
            tested INT,
            passed INT,
            failed INT,
            success_rate REAL,
            avg_latency REAL,
            ssl_tested INT,
            ssl_passed INT,
            mitm_detected INT,
            proto_http INT,
            proto_socks4 INT,
            proto_socks5 INT)""")
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_stats_history_ts ON stats_history(timestamp)')

    elif dbname == 'session_state':
        # Single-row table for persisting session state across restarts
        sqlite.execute("""CREATE TABLE IF NOT EXISTS session_state (
            id INT PRIMARY KEY DEFAULT 1,
            tested INT,
            passed INT,
            failed INT,
            ssl_tested INT,
            ssl_passed INT,
            ssl_failed INT,
            mitm_detected INT,
            cert_errors INT,
            proto_http_tested INT,
            proto_http_passed INT,
            proto_socks4_tested INT,
            proto_socks4_passed INT,
            proto_socks5_tested INT,
            proto_socks5_passed INT,
            peak_rate REAL,
            start_time INT,
            last_save INT,
            fail_categories TEXT,
            country_passed TEXT,
            asn_passed TEXT)""")

    # Single commit covers table creation and index creation for all branches.
    sqlite.commit()
|
|
|
|
|
|
# CDN IP ranges to filter out (not real proxies).
# Prefix strings are matched against the start of a dotted-quad IP.
CDN_PREFIXES = (
    # Cloudflare
    '141.101.', '172.67.', '172.64.', '104.16.', '104.17.', '104.18.',
    '104.19.', '104.20.', '104.21.', '104.22.', '104.23.', '104.24.',
    '104.25.', '104.26.', '104.27.', '188.114.', '190.93.', '197.234.',
    '198.41.', '173.245.', '103.21.', '103.22.', '103.31.',
    # Fastly
    '151.101.',
    # Akamai (common ranges)
    '23.32.', '23.33.', '23.34.', '23.35.', '23.36.', '23.37.',
    '23.38.', '23.39.', '23.40.', '23.41.', '23.42.', '23.43.',
    '23.44.', '23.45.', '23.46.', '23.47.', '23.48.', '23.49.',
    '23.50.', '23.51.', '23.52.', '23.53.', '23.54.', '23.55.',
    '23.56.', '23.57.', '23.58.', '23.59.', '23.60.', '23.61.',
    '23.62.', '23.63.', '23.64.', '23.65.', '23.66.', '23.67.',
    # Amazon CloudFront
    '13.32.', '13.33.', '13.35.', '13.224.', '13.225.', '13.226.', '13.227.',
    # Google
    '34.64.', '34.65.', '34.66.', '34.67.', '34.68.', '34.69.', '34.70.', '34.71.',
)


def is_cdn_ip(ip):
    """Return True if *ip* starts with a known CDN prefix (not a real proxy)."""
    # str.startswith accepts a tuple of prefixes: one C-level call instead
    # of a Python loop over all prefixes.
    return ip.startswith(CDN_PREFIXES)
|
|
|
|
|
|
def insert_proxies(proxydb, proxies, url):
    """Insert new proxies into database.

    Args:
        proxydb: Database connection
        proxies: List of tuples or plain address strings
            - (address, proto) - 2-tuple, default confidence
            - (address, proto, confidence) - 3-tuple with score
            - address string - default proto and confidence
        url: Source URL for logging
    """
    if not proxies:
        return
    timestamp = int(time.time())
    rows = []
    filtered = 0
    for p in proxies:
        # Handle tuple (address, proto[, confidence]) and plain string formats
        confidence = 30  # Default confidence (CONFIDENCE_REGEX)
        if isinstance(p, tuple):
            if len(p) >= 3:
                addr, proto, confidence = p[0], p[1], p[2]
            else:
                addr, proto = p[0], p[1]
        else:
            addr, proto = p, None

        # Parse address into ip and port
        # Formats: ip:port, [ipv6]:port, user:pass@ip:port, user:pass@[ipv6]:port
        addr_part = addr.split('@')[-1]  # Strip auth if present

        if addr_part.startswith('['):
            # IPv6: [ipv6]:port
            bracket_end = addr_part.find(']')
            if bracket_end < 0:
                continue  # unterminated bracket: unparseable
            ip = addr_part[:bracket_end + 1]  # Include brackets
            port = addr_part[bracket_end + 2:]  # Skip ]:
        else:
            # IPv4: ip:port
            try:
                ip, port = addr_part.rsplit(':', 1)
            except ValueError:
                # No port separator. Skip the bad entry instead of letting
                # the ValueError abort the whole batch (previous behavior).
                continue

        # Filter out CDN IPs (not real proxies)
        if is_cdn_ip(ip):
            filtered += 1
            continue

        rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence, proto, confidence))

    proxydb.executemany(
        'INSERT OR IGNORE INTO proxylist '
        '(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence,source_proto,source_confidence) '
        'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
        rows
    )
    proxydb.commit()
    if filtered:
        _log('+%d proxy/ies from %s (filtered %d CDN IPs)' % (len(rows), url, filtered), 'added')
    else:
        _log('+%d proxy/ies from %s' % (len(rows), url), 'added')
|
|
|
|
|
|
def insert_urls(urls, search, sqlite):
    """Insert new URLs into the uris table.

    Args:
        urls: Iterable of URL strings
        search: Unused by this function; kept for caller compatibility
        sqlite: Database connection

    Returns:
        Count of URLs that were actually new (not already present).
    """
    if not urls:
        return 0
    now = int(time.time())
    # Row count before vs. after the INSERT OR IGNORE tells us how many
    # rows were genuinely new (duplicates are silently ignored).
    count_before = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()[0]
    sqlite.executemany(
        'INSERT OR IGNORE INTO uris '
        '(added,url,check_time,error,stale_count,retrievals,proxies_added) '
        'VALUES (?,?,?,?,?,?,?)',
        [(now, u, 0, 1, 0, 0, 0) for u in urls]
    )
    sqlite.commit()
    count_after = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()[0]
    return count_after - count_before
|
|
|
|
|
|
# Known proxy list sources (GitHub raw lists, APIs).
# These URLs are inserted into the uris table by seed_proxy_sources().
PROXY_SOURCES = [
    # --- GitHub raw lists (sorted by update frequency) ---

    # TheSpeedX/PROXY-List - large, hourly updates
    'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt',
    'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt',
    'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt',
    # monosans/proxy-list - hourly updates
    'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt',
    'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt',
    'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt',
    # prxchk/proxy-list - 10 min updates
    'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt',
    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt',
    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt',
    # jetkai/proxy-list - 10 min updates
    'https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies.txt',
    # hookzof/socks5_list - hourly, SOCKS5 focused
    'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
    # mmpx12/proxy-list
    'https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt',
    'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt',
    'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt',
    # ShiftyTR/Proxy-List
    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt',
    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt',
    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt',
    # roosterkid/openproxylist
    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt',
    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt',
    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt',
    # clarketm/proxy-list - curated, daily
    'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt',
    # officialputuid/KangProxy - 4-6 hour updates
    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt',
    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/https/https.txt',
    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt',
    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt',
    # iplocate/free-proxy-list - 30 min updates
    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt',
    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt',
    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt',
    # ErcinDedeworken/proxy-list - hourly
    'https://raw.githubusercontent.com/ErcinDedeworken/proxy-list/main/proxy-list/data.txt',
    # MuRongPIG/Proxy-Master - 10 min updates
    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt',
    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt',
    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt',
    # zloi-user/hideip.me - hourly
    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt',
    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt',
    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt',
    # FLAVIEN-music/proxy-list - 30 min updates
    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/http.txt',
    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks4.txt',
    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks5.txt',
    # Zaeem20/FREE_PROXIES_LIST - 30 min updates
    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/http.txt',
    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt',
    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt',
    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks5.txt',
    # r00tee/Proxy-List - hourly
    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Https.txt',
    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks4.txt',
    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks5.txt',
    # casals-ar/proxy-list
    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/http',
    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks4',
    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks5',
    # yemixzy/proxy-list
    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/http.txt',
    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks4.txt',
    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks5.txt',
    # opsxcq/proxy-list
    'https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt',
    # im-razvan/proxy_list - 10 min updates
    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/http.txt',
    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks4.txt',
    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks5.txt',
    # zevtyardt/proxy-list - daily SOCKS5
    'https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt',
    # UptimerBot/proxy-list - 15 min updates
    'https://raw.githubusercontent.com/UptimerBot/proxy-list/main/proxies/socks5.txt',
    # Anonym0usWork1221/Free-Proxies
    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/https_proxies.txt',
    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks4_proxies.txt',
    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks5_proxies.txt',
    # ErcinDedeoglu/proxies - hourly
    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/http.txt',
    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt',
    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt',
    # dinoz0rg/proxy-list - daily, all protocols
    'https://raw.githubusercontent.com/dinoz0rg/proxy-list/main/all.txt',
    # elliottophellia/proxylist - SOCKS5
    'https://raw.githubusercontent.com/elliottophellia/proxylist/master/results/socks5/global/socks5_len.txt',
    # gfpcom/free-proxy-list - SOCKS5
    'https://raw.githubusercontent.com/gfpcom/free-proxy-list/main/socks5.txt',
    # databay-labs/free-proxy-list - SOCKS5
    'https://raw.githubusercontent.com/databay-labs/free-proxy-list/master/socks5.txt',

    # --- GitHub Pages / CDN hosted ---

    # proxifly/free-proxy-list - 5 min updates (jsDelivr CDN)
    'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/http/data.txt',
    'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks4/data.txt',
    'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks5/data.txt',
    # vakhov/fresh-proxy-list - 5-20 min updates (GitHub Pages)
    'https://vakhov.github.io/fresh-proxy-list/http.txt',
    'https://vakhov.github.io/fresh-proxy-list/socks4.txt',
    'https://vakhov.github.io/fresh-proxy-list/socks5.txt',
    # sunny9577/proxy-scraper - 3 hour updates (GitHub Pages)
    'https://sunny9577.github.io/proxy-scraper/generated/http_proxies.txt',
    'https://sunny9577.github.io/proxy-scraper/generated/socks4_proxies.txt',
    'https://sunny9577.github.io/proxy-scraper/generated/socks5_proxies.txt',

    # --- API endpoints ---

    # proxyscrape
    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',

    # proxy-list.download - SOCKS5 API
    'https://www.proxy-list.download/api/v1/get?type=socks5',
    'https://www.proxy-list.download/api/v1/get?type=socks4',
    # openproxylist.xyz - plain text
    'https://api.openproxylist.xyz/http.txt',
    'https://api.openproxylist.xyz/socks4.txt',
    'https://api.openproxylist.xyz/socks5.txt',
    # spys.me - plain text, 30 min updates
    'http://spys.me/proxy.txt',
    'http://spys.me/socks.txt',

    # --- Web scrapers (HTML pages) ---

    # spys.one - mixed protocols, requires parsing
    'https://spys.one/en/free-proxy-list/',
    'https://spys.one/en/socks-proxy-list/',
    'https://spys.one/en/https-ssl-proxy/',
    # free-proxy-list.net
    'https://free-proxy-list.net/',
    'https://www.sslproxies.org/',
    'https://www.socks-proxy.net/',
    # sockslist.us - SOCKS5 focused
    'https://sockslist.us/',
    # mtpro.xyz - SOCKS5, updated every 5 min
    'https://mtpro.xyz/socks5',
    # proxy-tools.com - SOCKS5 filtered
    'https://proxy-tools.com/proxy/socks5',
    # hidemy.name - all protocols, paginated
    'https://hide.mn/en/proxy-list/',
    # advanced.name - SOCKS5 filtered
    'https://advanced.name/freeproxy?type=socks5',
    # proxynova.com - by country
    'https://www.proxynova.com/proxy-server-list/',
    # freeproxy.world - SOCKS5 filtered
    'https://www.freeproxy.world/?type=socks5',
    # proxydb.net - all protocols
    'http://proxydb.net/',
    # geonode
    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=http',
    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks4',
    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks5',
    # openproxy.space
    'https://openproxy.space/list/http',
    'https://openproxy.space/list/socks4',
    'https://openproxy.space/list/socks5',

    # --- Telegram channels (public HTML view) ---

    'https://t.me/s/spys_one',
    'https://t.me/s/proxyfree1',
    'https://t.me/s/proxylist4free',
    'https://t.me/s/proxy_lists',
    'https://t.me/s/Proxies4ForYou',
]
|
|
|
|
|
|
def seed_proxy_sources(sqlite, reset_errors=False):
    """Seed known proxy list sources into uris table.

    Args:
        sqlite: Database connection (exposes .cursor.rowcount after execute)
        reset_errors: If True, reset error/stale counts on existing seed
            sources that have errored out, allowing them to be
            retried. Safe to call periodically.
    """
    now = int(time.time())
    new_count = 0
    reset_count = 0
    for source in PROXY_SOURCES:
        try:
            sqlite.execute(
                'INSERT OR IGNORE INTO uris '
                '(added,url,check_time,error,stale_count,retrievals,proxies_added) '
                'VALUES (?,?,?,?,?,?,?)',
                (now, source, 0, 0, 0, 0, 0)
            )
            if sqlite.cursor.rowcount > 0:
                new_count += 1
                continue
            if reset_errors:
                # Reset errored-out seed sources so they get reclaimed
                sqlite.execute(
                    'UPDATE uris SET error = 0, stale_count = 0, '
                    'check_interval = 3600, check_time = 0 '
                    'WHERE url = ? AND error >= 5',
                    (source,)
                )
                if sqlite.cursor.rowcount > 0:
                    reset_count += 1
        except Exception as e:
            _log('seed_urls insert error for %s: %s' % (source, e), 'warn')
    sqlite.commit()
    if new_count > 0 or reset_count > 0:
        _log('seed sources: %d new, %d reset' % (new_count, reset_count), 'info')
|
|
|
|
|
|
def save_session_state(sqlite, stats):
    """Persist session counters into the session_state singleton row (id=1).

    Args:
        sqlite: Database connection
        stats: Stats object from proxywatchd
    """
    import json
    now = int(time.time())

    # Dict-valued stats are stored as JSON text columns.
    fail_cats_json = json.dumps(dict(stats.fail_categories))
    country_json = json.dumps(dict(stats.country_passed))
    asn_json = json.dumps(dict(stats.asn_passed))

    values = (
        stats.tested, stats.passed, stats.failed,
        stats.ssl_tested, stats.ssl_passed, stats.ssl_failed,
        stats.mitm_detected, stats.cert_errors,
        stats.proto_tested.get('http', 0), stats.proto_passed.get('http', 0),
        stats.proto_tested.get('socks4', 0), stats.proto_passed.get('socks4', 0),
        stats.proto_tested.get('socks5', 0), stats.proto_passed.get('socks5', 0),
        stats.peak_rate, int(stats.start_time), now,
        fail_cats_json, country_json, asn_json,
    )
    sqlite.execute('''INSERT OR REPLACE INTO session_state
        (id, tested, passed, failed, ssl_tested, ssl_passed, ssl_failed,
         mitm_detected, cert_errors, proto_http_tested, proto_http_passed,
         proto_socks4_tested, proto_socks4_passed, proto_socks5_tested, proto_socks5_passed,
         peak_rate, start_time, last_save, fail_categories, country_passed, asn_passed)
        VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
        values)
    sqlite.commit()
|
|
|
|
|
|
def load_session_state(sqlite):
    """Load session state from database.

    Selects columns by name rather than `SELECT *`, so the name->value
    mapping stays correct even after migrations append new columns to
    session_state (the positional zip against a fixed column list would
    otherwise silently mis-assign values).

    Args:
        sqlite: Database connection

    Returns:
        dict with state fields (JSON columns decoded to dicts), or None
        if no saved state exists or the query fails
    """
    import json
    cols = ['id', 'tested', 'passed', 'failed', 'ssl_tested', 'ssl_passed',
            'ssl_failed', 'mitm_detected', 'cert_errors',
            'proto_http_tested', 'proto_http_passed',
            'proto_socks4_tested', 'proto_socks4_passed',
            'proto_socks5_tested', 'proto_socks5_passed',
            'peak_rate', 'start_time', 'last_save',
            'fail_categories', 'country_passed', 'asn_passed']
    try:
        row = sqlite.execute(
            'SELECT %s FROM session_state WHERE id=1' % ', '.join(cols)
        ).fetchone()
        if not row:
            return None

        state = dict(zip(cols, row))

        # JSON text columns decode to dicts; NULL/empty becomes {}.
        for key in ('fail_categories', 'country_passed', 'asn_passed'):
            if state.get(key):
                state[key] = json.loads(state[key])
            else:
                state[key] = {}

        return state
    except Exception as e:
        _log('failed to load session state: %s' % str(e), 'warn')
        return None
|
|
|
|
|
|
def save_stats_snapshot(sqlite, stats):
    """Record an hourly snapshot of cumulative stats for history graphs.

    The row key is the current time rounded down to the hour, so repeated
    calls within the same hour overwrite that hour's snapshot.

    Args:
        sqlite: Database connection
        stats: Stats object from proxywatchd
    """
    # Bucket the snapshot on the hour boundary.
    hour_ts = (int(time.time()) // 3600) * 3600

    # Guard both ratios against division by zero on a fresh session.
    success_rate = (stats.passed * 100.0) / stats.tested if stats.tested > 0 else 0
    avg_latency = stats.latency_sum / stats.latency_count if stats.latency_count > 0 else 0

    sqlite.execute('''INSERT OR REPLACE INTO stats_history
        (timestamp, tested, passed, failed, success_rate, avg_latency,
        ssl_tested, ssl_passed, mitm_detected, proto_http, proto_socks4, proto_socks5)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
        (hour_ts, stats.tested, stats.passed, stats.failed,
         success_rate, avg_latency,
         stats.ssl_tested, stats.ssl_passed, stats.mitm_detected,
         stats.proto_passed.get('http', 0),
         stats.proto_passed.get('socks4', 0),
         stats.proto_passed.get('socks5', 0)))
    sqlite.commit()
|
|
|
|
|
|
def get_stats_history(sqlite, hours=24):
    """Fetch hourly snapshots recorded in the last N hours.

    Args:
        sqlite: Database connection
        hours: Number of hours of history to retrieve

    Returns:
        List of dicts with hourly stats, oldest first
    """
    cutoff = int(time.time()) - hours * 3600
    fields = ('timestamp', 'tested', 'passed', 'failed', 'success_rate',
              'avg_latency', 'ssl_tested', 'ssl_passed', 'mitm_detected',
              'proto_http', 'proto_socks4', 'proto_socks5')

    cursor = sqlite.execute(
        'SELECT * FROM stats_history WHERE timestamp >= ? ORDER BY timestamp',
        (cutoff,)
    )
    return [dict(zip(fields, record)) for record in cursor.fetchall()]
|
|
|
|
|
|
def create_verification_tables(sqlite):
    """Create the verification-system tables if they do not exist.

    Tables:
        proxy_results: Per-test result history for disagreement detection
        verification_queue: Disputed proxies awaiting manager verification
        worker_trust: Worker accuracy tracking and trust scores

    All DDL uses IF NOT EXISTS, so this is safe to call repeatedly.
    """
    # Rolling per-proxy/per-worker result history (pruned elsewhere).
    sqlite.execute("""CREATE TABLE IF NOT EXISTS proxy_results (
        id INTEGER PRIMARY KEY,
        proxy TEXT NOT NULL,
        worker_id TEXT NOT NULL,
        tested INTEGER NOT NULL,
        working INTEGER NOT NULL,
        latency_ms INTEGER,
        error_category TEXT,
        target TEXT,
        FOREIGN KEY (proxy) REFERENCES proxylist(proxy))""")

    # Disputed proxies waiting for the manager to re-test.
    sqlite.execute("""CREATE TABLE IF NOT EXISTS verification_queue (
        proxy TEXT PRIMARY KEY,
        trigger TEXT NOT NULL,
        priority INTEGER DEFAULT 1,
        queued_at INTEGER NOT NULL,
        worker_a TEXT,
        worker_b TEXT,
        result_a INTEGER,
        result_b INTEGER)""")

    # Per-worker accuracy counters and the derived trust score.
    sqlite.execute("""CREATE TABLE IF NOT EXISTS worker_trust (
        worker_id TEXT PRIMARY KEY,
        verifications INTEGER DEFAULT 0,
        correct INTEGER DEFAULT 0,
        incorrect INTEGER DEFAULT 0,
        trust_score REAL DEFAULT 1.0,
        last_updated INTEGER)""")

    # Lookup indexes; created after the tables they cover.
    for index_sql in (
        'CREATE INDEX IF NOT EXISTS idx_results_proxy ON proxy_results(proxy, tested DESC)',
        'CREATE INDEX IF NOT EXISTS idx_results_worker ON proxy_results(worker_id, tested DESC)',
        'CREATE INDEX IF NOT EXISTS idx_verify_priority ON verification_queue(priority DESC, queued_at)',
    ):
        sqlite.execute(index_sql)

    sqlite.commit()
|
|
|
|
|
|
def insert_proxy_result(sqlite, proxy, worker_id, working, latency_ms=None,
                        error_category=None, target=None, max_results=10):
    """Record one proxy test result, keeping a bounded history per proxy.

    After inserting, any rows older than the newest `max_results` entries
    for the same proxy are deleted so the table stays bounded.

    Args:
        sqlite: Database connection
        proxy: Proxy address (ip:port)
        worker_id: Worker that tested the proxy
        working: 1 if passed, 0 if failed
        latency_ms: Response latency in milliseconds
        error_category: Error type (timeout, refused, auth, etc.)
        target: Target host that was tested
        max_results: Maximum results to keep per proxy (default 10)
    """
    tested_at = int(time.time())
    sqlite.execute(
        '''INSERT INTO proxy_results
        (proxy, worker_id, tested, working, latency_ms, error_category, target)
        VALUES (?, ?, ?, ?, ?, ?, ?)''',
        (proxy, worker_id, tested_at, working, latency_ms, error_category, target)
    )

    # Drop everything for this proxy except the newest max_results rows.
    sqlite.execute('''
        DELETE FROM proxy_results
        WHERE proxy = ? AND id NOT IN (
            SELECT id FROM proxy_results
            WHERE proxy = ?
            ORDER BY tested DESC
            LIMIT ?
        )
    ''', (proxy, proxy, max_results))
|
|
|
|
|
|
def check_for_disagreement(sqlite, proxy, worker_id, working, window_seconds=300):
    """Compare this result against recent results from other workers.

    Looks at up to the 3 most recent results for the same proxy from
    different workers inside the time window and reports the first
    conflicting one.

    Args:
        sqlite: Database connection
        proxy: Proxy address
        worker_id: Current worker
        working: Current test result (1=pass, 0=fail)
        window_seconds: Time window to check (default 5 minutes)

    Returns:
        tuple (disagreement_found, other_worker_id, other_result) or (False, None, None)
    """
    cutoff = int(time.time()) - window_seconds

    recent = sqlite.execute('''
        SELECT worker_id, working FROM proxy_results
        WHERE proxy = ? AND worker_id != ? AND tested > ?
        ORDER BY tested DESC LIMIT 3
    ''', (proxy, worker_id, cutoff)).fetchall()

    # First conflicting result wins; fall back to "no disagreement".
    return next(
        ((True, other_worker, other_result)
         for other_worker, other_result in recent
         if other_result != working),
        (False, None, None)
    )
|
|
|
|
|
|
def queue_verification(sqlite, proxy, trigger, priority=1,
                       worker_a=None, worker_b=None, result_a=None, result_b=None):
    """Add a proxy to the verification queue.

    An entry already queued at equal or higher priority is left untouched;
    otherwise the row is replaced with the new trigger details.

    Args:
        sqlite: Database connection
        proxy: Proxy address
        trigger: Why verification is needed (disagreement, sudden_death, resurrection, spot_check)
        priority: 1=low, 2=medium, 3=high
        worker_a: First worker involved (for disagreements)
        worker_b: Second worker involved (for disagreements)
        result_a: First worker's result
        result_b: Second worker's result
    """
    queued_at = int(time.time())

    current = sqlite.execute(
        'SELECT priority FROM verification_queue WHERE proxy = ?', (proxy,)
    ).fetchone()

    # Never downgrade: keep the existing entry if it is at least as urgent.
    if current is not None and current[0] >= priority:
        return

    sqlite.execute('''
        INSERT OR REPLACE INTO verification_queue
        (proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ''', (proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b))
|
|
|
|
|
|
def get_pending_verifications(sqlite, limit=10):
    """Return the highest-priority pending verifications.

    Ordered by priority (high first) and, within a priority, by the time
    the entry was queued (oldest first).

    Args:
        sqlite: Database connection
        limit: Maximum number to return

    Returns:
        List of dicts with proxy and verification details
    """
    fields = ('proxy', 'trigger', 'priority', 'queued_at',
              'worker_a', 'worker_b', 'result_a', 'result_b')

    rows = sqlite.execute('''
        SELECT proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b
        FROM verification_queue
        ORDER BY priority DESC, queued_at ASC
        LIMIT ?
    ''', (limit,)).fetchall()

    return [dict(zip(fields, record)) for record in rows]
|
|
|
|
|
|
def remove_from_verification_queue(sqlite, proxy):
    """Drop *proxy* from the verification queue once verification is done.

    No-op if the proxy is not queued.
    """
    sqlite.execute('DELETE FROM verification_queue WHERE proxy = ?', (proxy,))
|
|
|
|
|
|
def update_worker_trust(sqlite, worker_id, was_correct):
    """Update worker trust score after verification.

    Uses Bayesian smoothing: trust_score = (correct + 5) / (verifications + 10)

    Args:
        sqlite: Database connection
        worker_id: Worker to update
        was_correct: True if worker's result matched verification
    """
    now = int(time.time())

    # Previous counters, or all zeros for a first-time worker.
    prev = sqlite.execute(
        'SELECT verifications, correct, incorrect FROM worker_trust WHERE worker_id = ?',
        (worker_id,)
    ).fetchone()
    base_verifications, base_correct, base_incorrect = prev if prev else (0, 0, 0)

    verifications = base_verifications + 1
    correct = base_correct + (1 if was_correct else 0)
    incorrect = base_incorrect + (0 if was_correct else 1)

    # Bayesian smoothing keeps scores from swinging on few samples.
    trust_score = (correct + 5.0) / (verifications + 10.0)

    sqlite.execute('''
        INSERT OR REPLACE INTO worker_trust
        (worker_id, verifications, correct, incorrect, trust_score, last_updated)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', (worker_id, verifications, correct, incorrect, trust_score, now))
|
|
|
|
|
|
def get_worker_trust(sqlite, worker_id):
    """Look up trust stats for a single worker.

    Args:
        sqlite: Database connection
        worker_id: Worker to look up

    Returns:
        Dict with trust stats; unknown workers get zero counters and a
        full trust_score of 1.0 (new workers start trusted)
    """
    keys = ('verifications', 'correct', 'incorrect', 'trust_score', 'last_updated')

    row = sqlite.execute(
        'SELECT verifications, correct, incorrect, trust_score, last_updated FROM worker_trust WHERE worker_id = ?',
        (worker_id,)
    ).fetchone()

    if row is None:
        row = (0, 0, 0, 1.0, None)
    return dict(zip(keys, row))
|
|
|
|
|
|
def get_all_worker_trust(sqlite):
    """Fetch trust stats for every known worker.

    Returns:
        Dict mapping worker_id to trust stats
    """
    keys = ('verifications', 'correct', 'incorrect', 'trust_score', 'last_updated')

    rows = sqlite.execute(
        'SELECT worker_id, verifications, correct, incorrect, trust_score, last_updated FROM worker_trust'
    ).fetchall()

    return {record[0]: dict(zip(keys, record[1:])) for record in rows}
|
|
|
|
|
|
def get_verification_stats(sqlite):
    """Get verification queue statistics.

    (The previous version computed `now`/`today_start` that were never
    used, and its docstring promised verified/disagreement counts it never
    returned; both removed.)

    Args:
        sqlite: Database connection

    Returns:
        Dict with queue_size plus per-priority and per-trigger breakdowns
        of the verification queue
    """
    stats = {}

    # Total queue size
    row = sqlite.execute('SELECT COUNT(*) FROM verification_queue').fetchone()
    stats['queue_size'] = row[0] if row else 0

    # Queue breakdown by priority
    rows = sqlite.execute(
        'SELECT priority, COUNT(*) FROM verification_queue GROUP BY priority'
    ).fetchall()
    stats['queue_by_priority'] = {row[0]: row[1] for row in rows}

    # Queue breakdown by trigger type
    rows = sqlite.execute(
        'SELECT trigger, COUNT(*) FROM verification_queue GROUP BY trigger'
    ).fetchall()
    stats['queue_by_trigger'] = {row[0]: row[1] for row in rows}

    return stats
|
|
|
|
|
|
def analyze_database(sqlite):
    """Run ANALYZE to refresh SQLite query-planner statistics.

    Intended to be called periodically (e.g., hourly). Failures are logged
    but never raised.

    Args:
        sqlite: Database connection
    """
    try:
        # Cap rows examined per index so ANALYZE stays cheap on big tables.
        sqlite.execute('PRAGMA analysis_limit=1000')
        sqlite.execute('ANALYZE')
        sqlite.commit()
        _log('database ANALYZE completed', 'debug')
    except Exception as e:
        _log('database ANALYZE failed: %s' % str(e), 'warn')
|
|
|
|
|
|
def vacuum_database(sqlite):
    """Run VACUUM to reclaim unused space and defragment the database.

    Expensive; intended for infrequent use (daily/weekly) and must not be
    called inside an open transaction. Failures are logged, not raised.

    Args:
        sqlite: Database connection
    """
    try:
        sqlite.execute('VACUUM')
        _log('database VACUUM completed', 'info')
    except Exception as e:
        _log('database VACUUM failed: %s' % str(e), 'warn')
|
|
|
|
|
|
def get_database_stats(sqlite):
    """Collect database size and row-count figures for monitoring.

    Args:
        sqlite: Database connection

    Returns:
        Dict with page/size statistics and table row counts; may be
        partially filled if a query fails (the error is logged)
    """
    stats = {}
    try:
        def _scalar(sql, fallback):
            # First column of the first row, or the fallback if empty.
            row = sqlite.execute(sql).fetchone()
            return row[0] if row else fallback

        stats['page_count'] = _scalar('PRAGMA page_count', 0)
        stats['page_size'] = _scalar('PRAGMA page_size', 4096)
        stats['freelist_count'] = _scalar('PRAGMA freelist_count', 0)

        # Derived byte sizes
        stats['total_size'] = stats['page_count'] * stats['page_size']
        stats['free_size'] = stats['freelist_count'] * stats['page_size']
        stats['used_size'] = stats['total_size'] - stats['free_size']

        # Table row counts
        stats['proxy_count'] = _scalar('SELECT COUNT(*) FROM proxylist', 0)
        stats['working_count'] = _scalar(
            'SELECT COUNT(*) FROM proxylist WHERE failed=0 AND tested IS NOT NULL', 0)
        stats['uri_count'] = _scalar('SELECT COUNT(*) FROM uris', 0)

    except Exception as e:
        _log('get_database_stats error: %s' % e, 'warn')

    return stats
|