#!/usr/bin/env python2 # -*- coding: utf-8 -*- """Database table creation and insertion utilities.""" import time from misc import _log def _migrate_latency_columns(sqlite): """Add latency columns to existing databases.""" try: sqlite.execute('SELECT avg_latency FROM proxylist LIMIT 1') except Exception: sqlite.execute('ALTER TABLE proxylist ADD COLUMN avg_latency REAL DEFAULT 0') sqlite.execute('ALTER TABLE proxylist ADD COLUMN latency_samples INT DEFAULT 0') sqlite.commit() def _migrate_anonymity_columns(sqlite): """Add anonymity detection columns to existing databases.""" try: sqlite.execute('SELECT anonymity FROM proxylist LIMIT 1') except Exception: # anonymity: transparent, anonymous, elite, or NULL (unknown) sqlite.execute('ALTER TABLE proxylist ADD COLUMN anonymity TEXT') # exit_ip: the IP seen by the target server sqlite.execute('ALTER TABLE proxylist ADD COLUMN exit_ip TEXT') sqlite.commit() def _migrate_asn_column(sqlite): """Add ASN column to existing databases.""" try: sqlite.execute('SELECT asn FROM proxylist LIMIT 1') except Exception: sqlite.execute('ALTER TABLE proxylist ADD COLUMN asn INT') sqlite.commit() def _migrate_content_hash_column(sqlite): """Add content_hash column to uris table for duplicate detection.""" try: sqlite.execute('SELECT content_hash FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN content_hash TEXT') sqlite.commit() def _migrate_geolocation_columns(sqlite): """Add latitude/longitude columns for precise proxy mapping.""" try: sqlite.execute('SELECT latitude FROM proxylist LIMIT 1') except Exception: sqlite.execute('ALTER TABLE proxylist ADD COLUMN latitude REAL') sqlite.execute('ALTER TABLE proxylist ADD COLUMN longitude REAL') sqlite.commit() def _migrate_confidence_column(sqlite): """Add confidence column for extraction quality scoring.""" try: sqlite.execute('SELECT confidence FROM proxylist LIMIT 1') except Exception: # confidence: 0-100 score indicating extraction reliability sqlite.execute('ALTER TABLE proxylist ADD COLUMN confidence INT DEFAULT 30') sqlite.commit() def _migrate_source_proto(sqlite): """Add source_proto columns to preserve scraper-detected protocol intelligence.""" try: sqlite.execute('SELECT source_proto FROM proxylist LIMIT 1') except Exception: # source_proto: protocol detected by scraper (never overwritten by tests) sqlite.execute('ALTER TABLE proxylist ADD COLUMN source_proto TEXT') # source_confidence: scraper confidence score (0-100) sqlite.execute('ALTER TABLE proxylist ADD COLUMN source_confidence INT DEFAULT 0') sqlite.commit() def _migrate_protos_working(sqlite): """Add protos_working column for multi-protocol storage.""" try: sqlite.execute('SELECT protos_working FROM proxylist LIMIT 1') except Exception: # protos_working: comma-separated list of working protos (e.g. "http,socks5") sqlite.execute('ALTER TABLE proxylist ADD COLUMN protos_working TEXT') sqlite.commit() def _migrate_last_seen(sqlite): """Add last_seen column for worker-reported proxy freshness tracking.""" try: sqlite.execute('SELECT last_seen FROM proxylist LIMIT 1') except Exception: # last_seen: unix timestamp of most recent "working" report from any worker sqlite.execute('ALTER TABLE proxylist ADD COLUMN last_seen INT DEFAULT 0') sqlite.commit() def _migrate_last_check_columns(sqlite): """Add last_check and last_target columns for test provenance tracking.""" for col, typedef in (('last_check', 'TEXT'), ('last_target', 'TEXT')): try: sqlite.execute('SELECT %s FROM proxylist LIMIT 1' % col) except Exception: sqlite.execute('ALTER TABLE proxylist ADD COLUMN %s %s' % (col, typedef)) sqlite.commit() def _migrate_uri_check_interval(sqlite): """Add adaptive check_interval column to uris table.""" try: sqlite.execute('SELECT check_interval FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN check_interval INT DEFAULT 3600') sqlite.commit() def _migrate_uri_working_ratio(sqlite): """Add working_ratio column to uris table for proxy quality tracking.""" try: sqlite.execute('SELECT working_ratio FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN working_ratio REAL DEFAULT 0.0') sqlite.commit() def _migrate_uri_avg_fetch_time(sqlite): """Add avg_fetch_time column to uris table for fetch latency EMA.""" try: sqlite.execute('SELECT avg_fetch_time FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN avg_fetch_time INT DEFAULT 0') sqlite.commit() def _migrate_uri_last_worker(sqlite): """Add last_worker column to uris table.""" try: sqlite.execute('SELECT last_worker FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN last_worker TEXT') sqlite.commit() def _migrate_uri_yield_rate(sqlite): """Add yield_rate column to uris table for proxy yield EMA.""" try: sqlite.execute('SELECT yield_rate FROM uris LIMIT 1') except Exception: sqlite.execute('ALTER TABLE uris ADD COLUMN yield_rate REAL DEFAULT 0.0') sqlite.commit() def compute_proxy_list_hash(proxies): """Compute MD5 hash of sorted proxy list for change detection. Args: proxies: List of proxy strings (ip:port) or tuples (address, proto) Returns: Hexadecimal MD5 hash string, or None if list is empty """ if not proxies: return None import hashlib # Handle both tuple (address, proto) and plain string formats addresses = [p[0] if isinstance(p, tuple) else p for p in proxies] sorted_list = '\n'.join(sorted(addresses)) return hashlib.md5(sorted_list.encode('utf-8') if hasattr(sorted_list, 'encode') else sorted_list).hexdigest() def update_proxy_latency(sqlite, proxy, latency_ms): """Update rolling average latency for a proxy. Args: sqlite: Database connection proxy: Proxy address (ip:port) latency_ms: Response latency in milliseconds """ row = sqlite.execute( 'SELECT avg_latency, latency_samples FROM proxylist WHERE proxy=?', (proxy,) ).fetchone() if row: old_avg, samples = row[0] or 0, row[1] or 0 # Exponential moving average, capped at 100 samples new_samples = min(samples + 1, 100) if samples == 0: new_avg = latency_ms else: # Weight recent samples more heavily alpha = 2.0 / (new_samples + 1) new_avg = alpha * latency_ms + (1 - alpha) * old_avg sqlite.execute( 'UPDATE proxylist SET avg_latency=?, latency_samples=? WHERE proxy=?', (new_avg, new_samples, proxy) ) def batch_update_proxy_latency(sqlite, latency_updates): """Batch update latency for multiple proxies. Uses a single SELECT to fetch current values, computes new averages in Python, then uses executemany for batch UPDATE. Much faster than individual calls. Args: sqlite: Database connection latency_updates: List of (proxy, latency_ms) tuples """ if not latency_updates: return # Build proxy list for IN clause proxies = [p for p, _ in latency_updates] latency_map = {p: lat for p, lat in latency_updates} # Fetch current values in single query placeholders = ','.join('?' * len(proxies)) rows = sqlite.execute( 'SELECT proxy, avg_latency, latency_samples FROM proxylist WHERE proxy IN (%s)' % placeholders, proxies ).fetchall() # Compute new averages updates = [] for row in rows: proxy, old_avg, samples = row[0], row[1] or 0, row[2] or 0 latency_ms = latency_map.get(proxy) if latency_ms is None: continue new_samples = min(samples + 1, 100) if samples == 0: new_avg = latency_ms else: alpha = 2.0 / (new_samples + 1) new_avg = alpha * latency_ms + (1 - alpha) * old_avg updates.append((new_avg, new_samples, proxy)) # Batch update if updates: sqlite.executemany( 'UPDATE proxylist SET avg_latency=?, latency_samples=? WHERE proxy=?', updates ) def update_proxy_anonymity(sqlite, proxy, exit_ip, proxy_ip, reveals_headers=None): """Update anonymity level based on exit IP and header analysis. Anonymity levels: transparent: exit_ip == proxy_ip (proxy reveals itself) anonymous: exit_ip != proxy_ip, adds X-Forwarded-For/Via headers elite: exit_ip != proxy_ip, no revealing headers Args: sqlite: Database connection proxy: Proxy address (ip:port) exit_ip: IP address seen by target server proxy_ip: Proxy's IP address reveals_headers: True if proxy adds revealing headers, False if not, None if unknown """ if not exit_ip: return # Normalize IPs (remove leading zeros) def normalize_ip(ip): if not ip: return None parts = ip.strip().split('.') if len(parts) != 4: return None try: return '.'.join(str(int(p)) for p in parts) except ValueError: return None exit_ip = normalize_ip(exit_ip) proxy_ip = normalize_ip(proxy_ip) if not exit_ip: return # Determine anonymity level if exit_ip == proxy_ip: anonymity = 'transparent' elif reveals_headers is False: anonymity = 'elite' elif reveals_headers is True: anonymity = 'anonymous' else: # No header check performed, conservative default anonymity = 'anonymous' sqlite.execute( 'UPDATE proxylist SET anonymity=?, exit_ip=? WHERE proxy=?', (anonymity, exit_ip, proxy) ) def batch_update_proxy_anonymity(sqlite, anonymity_updates): """Batch update anonymity for multiple proxies. Args: sqlite: Database connection anonymity_updates: List of (proxy, exit_ip, proxy_ip, reveals_headers) tuples """ if not anonymity_updates: return # Normalize IPs and compute anonymity levels def normalize_ip(ip): if not ip: return None parts = ip.strip().split('.') if len(parts) != 4: return None try: return '.'.join(str(int(p)) for p in parts) except ValueError: return None updates = [] for proxy, exit_ip, proxy_ip, reveals_headers in anonymity_updates: exit_ip = normalize_ip(exit_ip) proxy_ip = normalize_ip(proxy_ip) if not exit_ip: continue # Determine anonymity level if exit_ip == proxy_ip: anonymity = 'transparent' elif reveals_headers is False: anonymity = 'elite' elif reveals_headers is True: anonymity = 'anonymous' else: anonymity = 'anonymous' updates.append((anonymity, exit_ip, proxy)) # Batch update if updates: sqlite.executemany( 'UPDATE proxylist SET anonymity=?, exit_ip=? WHERE proxy=?', updates ) def create_table_if_not_exists(sqlite, dbname): """Create database table with indexes if it doesn't exist.""" if dbname == 'proxylist': sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist ( proxy BLOB UNIQUE, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, mitm INT, success_count INT, ip TEXT, port INT, consecutive_success INT, total_duration INT, avg_latency REAL DEFAULT 0, latency_samples INT DEFAULT 0, anonymity TEXT, exit_ip TEXT, asn INT, latitude REAL, longitude REAL, confidence INT DEFAULT 30, source_proto TEXT, source_confidence INT DEFAULT 0, protos_working TEXT, last_seen INT DEFAULT 0, last_check TEXT, last_target TEXT)""") # Migration: add columns to existing databases (must run before creating indexes) _migrate_latency_columns(sqlite) _migrate_anonymity_columns(sqlite) _migrate_asn_column(sqlite) _migrate_geolocation_columns(sqlite) _migrate_confidence_column(sqlite) _migrate_source_proto(sqlite) _migrate_protos_working(sqlite) _migrate_last_seen(sqlite) _migrate_last_check_columns(sqlite) # Indexes for common query patterns sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_anonymity ON proxylist(anonymity)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_asn ON proxylist(asn)') elif dbname == 'uris': sqlite.execute("""CREATE TABLE IF NOT EXISTS uris ( url TEXT UNIQUE, content_type TEXT, check_time INT, error INT, stale_count INT, retrievals INT, proxies_added INT, added INT, content_hash TEXT)""") # Migration for existing databases _migrate_content_hash_column(sqlite) _migrate_uri_check_interval(sqlite) _migrate_uri_working_ratio(sqlite) _migrate_uri_avg_fetch_time(sqlite) _migrate_uri_last_worker(sqlite) _migrate_uri_yield_rate(sqlite) # Indexes for common query patterns sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)') elif dbname == 'stats_history': # Hourly stats snapshots for historical graphs sqlite.execute("""CREATE TABLE IF NOT EXISTS stats_history ( timestamp INT PRIMARY KEY, tested INT, passed INT, failed INT, success_rate REAL, avg_latency REAL, ssl_tested INT, ssl_passed INT, mitm_detected INT, proto_http INT, proto_socks4 INT, proto_socks5 INT)""") sqlite.execute('CREATE INDEX IF NOT EXISTS idx_stats_history_ts ON stats_history(timestamp)') elif dbname == 'session_state': # Single-row table for persisting session state across restarts sqlite.execute("""CREATE TABLE IF NOT EXISTS session_state ( id INT PRIMARY KEY DEFAULT 1, tested INT, passed INT, failed INT, ssl_tested INT, ssl_passed INT, ssl_failed INT, mitm_detected INT, cert_errors INT, proto_http_tested INT, proto_http_passed INT, proto_socks4_tested INT, proto_socks4_passed INT, proto_socks5_tested INT, proto_socks5_passed INT, peak_rate REAL, start_time INT, last_save INT, fail_categories TEXT, country_passed TEXT, asn_passed TEXT)""") sqlite.commit() # CDN IP ranges to filter out (not real proxies) CDN_PREFIXES = ( # Cloudflare '141.101.', '172.67.', '172.64.', '104.16.', '104.17.', '104.18.', '104.19.', '104.20.', '104.21.', '104.22.', '104.23.', '104.24.', '104.25.', '104.26.', '104.27.', '188.114.', '190.93.', '197.234.', '198.41.', '173.245.', '103.21.', '103.22.', '103.31.', # Fastly '151.101.', # Akamai (common ranges) '23.32.', '23.33.', '23.34.', '23.35.', '23.36.', '23.37.', '23.38.', '23.39.', '23.40.', '23.41.', '23.42.', '23.43.', '23.44.', '23.45.', '23.46.', '23.47.', '23.48.', '23.49.', '23.50.', '23.51.', '23.52.', '23.53.', '23.54.', '23.55.', '23.56.', '23.57.', '23.58.', '23.59.', '23.60.', '23.61.', '23.62.', '23.63.', '23.64.', '23.65.', '23.66.', '23.67.', # Amazon CloudFront '13.32.', '13.33.', '13.35.', '13.224.', '13.225.', '13.226.', '13.227.', # Google '34.64.', '34.65.', '34.66.', '34.67.', '34.68.', '34.69.', '34.70.', '34.71.', ) def is_cdn_ip(ip): """Check if IP belongs to known CDN ranges.""" for prefix in CDN_PREFIXES: if ip.startswith(prefix): return True return False def insert_proxies(proxydb, proxies, url): """Insert new proxies into database. Args: proxydb: Database connection proxies: List of tuples or plain address strings - (address, proto) - 2-tuple, default confidence - (address, proto, confidence) - 3-tuple with score - address string - default proto and confidence url: Source URL for logging """ if not proxies: return timestamp = int(time.time()) rows = [] filtered = 0 for p in proxies: # Handle tuple (address, proto[, confidence]) and plain string formats confidence = 30 # Default confidence (CONFIDENCE_REGEX) if isinstance(p, tuple): if len(p) >= 3: addr, proto, confidence = p[0], p[1], p[2] else: addr, proto = p[0], p[1] else: addr, proto = p, None # Parse address into ip and port # Formats: ip:port, [ipv6]:port, user:pass@ip:port, user:pass@[ipv6]:port addr_part = addr.split('@')[-1] # Strip auth if present if addr_part.startswith('['): # IPv6: [ipv6]:port bracket_end = addr_part.find(']') if bracket_end < 0: continue ip = addr_part[:bracket_end + 1] # Include brackets port = addr_part[bracket_end + 2:] # Skip ]: else: # IPv4: ip:port ip, port = addr_part.rsplit(':', 1) # Filter out CDN IPs (not real proxies) if is_cdn_ip(ip): filtered += 1 continue rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence, proto, confidence)) proxydb.executemany( 'INSERT OR IGNORE INTO proxylist ' '(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence,source_proto,source_confidence) ' 'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', rows ) proxydb.commit() if filtered: _log('+%d proxy/ies from %s (filtered %d CDN IPs)' % (len(rows), url, filtered), 'added') else: _log('+%d proxy/ies from %s' % (len(rows), url), 'added') def insert_urls(urls, search, sqlite): """Insert new URLs into database. Returns count of newly inserted URLs.""" if not urls: return 0 # Count before insert before = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()[0] timestamp = int(time.time()) rows = [(timestamp, u, 0, 1, 0, 0, 0) for u in urls] sqlite.executemany( 'INSERT OR IGNORE INTO uris ' '(added,url,check_time,error,stale_count,retrievals,proxies_added) ' 'VALUES (?,?,?,?,?,?,?)', rows ) sqlite.commit() # Count after insert to determine how many were actually new after = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()[0] return after - before # Known proxy list sources (GitHub raw lists, APIs) PROXY_SOURCES = [ # --- GitHub raw lists (sorted by update frequency) --- # TheSpeedX/PROXY-List - large, hourly updates 'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt', 'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt', 'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt', # monosans/proxy-list - hourly updates 'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt', 'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt', 'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt', # prxchk/proxy-list - 10 min updates 'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt', 'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt', 'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt', # jetkai/proxy-list - 10 min updates 'https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies.txt', # hookzof/socks5_list - hourly, SOCKS5 focused 'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt', # mmpx12/proxy-list 'https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt', 'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt', 'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt', # ShiftyTR/Proxy-List 'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt', 'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt', 'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt', # roosterkid/openproxylist 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt', 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt', 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt', # clarketm/proxy-list - curated, daily 'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt', # officialputuid/KangProxy - 4-6 hour updates 'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt', 'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/https/https.txt', 'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt', 'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt', # iplocate/free-proxy-list - 30 min updates 'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt', 'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt', 'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt', # ErcinDedeworken/proxy-list - hourly 'https://raw.githubusercontent.com/ErcinDedeworken/proxy-list/main/proxy-list/data.txt', # MuRongPIG/Proxy-Master - 10 min updates 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt', 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt', 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt', # zloi-user/hideip.me - hourly 'https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt', 'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt', 'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt', # FLAVIEN-music/proxy-list - 30 min updates 'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/http.txt', 'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks4.txt', 'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks5.txt', # Zaeem20/FREE_PROXIES_LIST - 30 min updates 'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/http.txt', 'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt', 'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt', 'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks5.txt', # r00tee/Proxy-List - hourly 'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Https.txt', 'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks4.txt', 'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks5.txt', # casals-ar/proxy-list 'https://raw.githubusercontent.com/casals-ar/proxy-list/main/http', 'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks4', 'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks5', # yemixzy/proxy-list 'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/http.txt', 'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks4.txt', 'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks5.txt', # opsxcq/proxy-list 'https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt', # im-razvan/proxy_list - 10 min updates 'https://raw.githubusercontent.com/im-razvan/proxy_list/main/http.txt', 'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks4.txt', 'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks5.txt', # zevtyardt/proxy-list - daily SOCKS5 'https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt', # UptimerBot/proxy-list - 15 min updates 'https://raw.githubusercontent.com/UptimerBot/proxy-list/main/proxies/socks5.txt', # Anonym0usWork1221/Free-Proxies 'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/https_proxies.txt', 'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks4_proxies.txt', 'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks5_proxies.txt', # ErcinDedeoglu/proxies - hourly 'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/http.txt', 'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt', 'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt', # dinoz0rg/proxy-list - daily, all protocols 'https://raw.githubusercontent.com/dinoz0rg/proxy-list/main/all.txt', # elliottophellia/proxylist - SOCKS5 'https://raw.githubusercontent.com/elliottophellia/proxylist/master/results/socks5/global/socks5_len.txt', # gfpcom/free-proxy-list - SOCKS5 'https://raw.githubusercontent.com/gfpcom/free-proxy-list/main/socks5.txt', # databay-labs/free-proxy-list - SOCKS5 'https://raw.githubusercontent.com/databay-labs/free-proxy-list/master/socks5.txt', # --- GitHub Pages / CDN hosted --- # proxifly/free-proxy-list - 5 min updates (jsDelivr CDN) 'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/http/data.txt', 'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks4/data.txt', 'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks5/data.txt', # vakhov/fresh-proxy-list - 5-20 min updates (GitHub Pages) 'https://vakhov.github.io/fresh-proxy-list/http.txt', 'https://vakhov.github.io/fresh-proxy-list/socks4.txt', 'https://vakhov.github.io/fresh-proxy-list/socks5.txt', # sunny9577/proxy-scraper - 3 hour updates (GitHub Pages) 'https://sunny9577.github.io/proxy-scraper/generated/http_proxies.txt', 'https://sunny9577.github.io/proxy-scraper/generated/socks4_proxies.txt', 'https://sunny9577.github.io/proxy-scraper/generated/socks5_proxies.txt', # --- API endpoints --- # proxyscrape 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all', 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all', 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all', # proxy-list.download - SOCKS5 API 'https://www.proxy-list.download/api/v1/get?type=socks5', 'https://www.proxy-list.download/api/v1/get?type=socks4', # openproxylist.xyz - plain text 'https://api.openproxylist.xyz/http.txt', 'https://api.openproxylist.xyz/socks4.txt', 'https://api.openproxylist.xyz/socks5.txt', # spys.me - plain text, 30 min updates 'http://spys.me/proxy.txt', 'http://spys.me/socks.txt', # --- Web scrapers (HTML pages) --- # spys.one - mixed protocols, requires parsing 'https://spys.one/en/free-proxy-list/', 'https://spys.one/en/socks-proxy-list/', 'https://spys.one/en/https-ssl-proxy/', # free-proxy-list.net 'https://free-proxy-list.net/', 'https://www.sslproxies.org/', 'https://www.socks-proxy.net/', # sockslist.us - SOCKS5 focused 'https://sockslist.us/', # mtpro.xyz - SOCKS5, updated every 5 min 'https://mtpro.xyz/socks5', # proxy-tools.com - SOCKS5 filtered 'https://proxy-tools.com/proxy/socks5', # hidemy.name - all protocols, paginated 'https://hide.mn/en/proxy-list/', # advanced.name - SOCKS5 filtered 'https://advanced.name/freeproxy?type=socks5', # proxynova.com - by country 'https://www.proxynova.com/proxy-server-list/', # freeproxy.world - SOCKS5 filtered 'https://www.freeproxy.world/?type=socks5', # proxydb.net - all protocols 'http://proxydb.net/', # geonode 'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=http', 'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks4', 'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks5', # openproxy.space 'https://openproxy.space/list/http', 'https://openproxy.space/list/socks4', 'https://openproxy.space/list/socks5', # --- Telegram channels (public HTML view) --- 'https://t.me/s/spys_one', 'https://t.me/s/proxyfree1', 'https://t.me/s/proxylist4free', 'https://t.me/s/proxy_lists', 'https://t.me/s/Proxies4ForYou', ] def seed_proxy_sources(sqlite, reset_errors=False): """Seed known proxy list sources into uris table. Args: sqlite: Database connection reset_errors: If True, reset error/stale counts on existing seed sources that have errored out, allowing them to be retried. Safe to call periodically. """ timestamp = int(time.time()) added = 0 reset = 0 for url in PROXY_SOURCES: try: sqlite.execute( 'INSERT OR IGNORE INTO uris ' '(added,url,check_time,error,stale_count,retrievals,proxies_added) ' 'VALUES (?,?,?,?,?,?,?)', (timestamp, url, 0, 0, 0, 0, 0) ) if sqlite.cursor.rowcount > 0: added += 1 elif reset_errors: # Reset errored-out seed sources so they get reclaimed sqlite.execute( 'UPDATE uris SET error = 0, stale_count = 0, ' 'check_interval = 3600, check_time = 0 ' 'WHERE url = ? AND error >= 5', (url,) ) if sqlite.cursor.rowcount > 0: reset += 1 except Exception as e: _log('seed_urls insert error for %s: %s' % (url, e), 'warn') sqlite.commit() if added > 0 or reset > 0: _log('seed sources: %d new, %d reset' % (added, reset), 'info') def save_session_state(sqlite, stats): """Save session state to database for persistence across restarts. Args: sqlite: Database connection stats: Stats object from proxywatchd """ import json now = int(time.time()) # Serialize dicts as JSON fail_cats_json = json.dumps(dict(stats.fail_categories)) country_json = json.dumps(dict(stats.country_passed)) asn_json = json.dumps(dict(stats.asn_passed)) sqlite.execute('''INSERT OR REPLACE INTO session_state (id, tested, passed, failed, ssl_tested, ssl_passed, ssl_failed, mitm_detected, cert_errors, proto_http_tested, proto_http_passed, proto_socks4_tested, proto_socks4_passed, proto_socks5_tested, proto_socks5_passed, peak_rate, start_time, last_save, fail_categories, country_passed, asn_passed) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', (stats.tested, stats.passed, stats.failed, stats.ssl_tested, stats.ssl_passed, stats.ssl_failed, stats.mitm_detected, stats.cert_errors, stats.proto_tested.get('http', 0), stats.proto_passed.get('http', 0), stats.proto_tested.get('socks4', 0), stats.proto_passed.get('socks4', 0), stats.proto_tested.get('socks5', 0), stats.proto_passed.get('socks5', 0), stats.peak_rate, int(stats.start_time), now, fail_cats_json, country_json, asn_json)) sqlite.commit() def load_session_state(sqlite): """Load session state from database. Args: sqlite: Database connection Returns: dict with state fields, or None if no saved state """ import json try: row = sqlite.execute( 'SELECT * FROM session_state WHERE id=1' ).fetchone() if not row: return None # Map column names to values cols = ['id', 'tested', 'passed', 'failed', 'ssl_tested', 'ssl_passed', 'ssl_failed', 'mitm_detected', 'cert_errors', 'proto_http_tested', 'proto_http_passed', 'proto_socks4_tested', 'proto_socks4_passed', 'proto_socks5_tested', 'proto_socks5_passed', 'peak_rate', 'start_time', 'last_save', 'fail_categories', 'country_passed', 'asn_passed'] state = dict(zip(cols, row)) # Parse JSON fields if state.get('fail_categories'): state['fail_categories'] = json.loads(state['fail_categories']) else: state['fail_categories'] = {} if state.get('country_passed'): state['country_passed'] = json.loads(state['country_passed']) else: state['country_passed'] = {} if state.get('asn_passed'): state['asn_passed'] = json.loads(state['asn_passed']) else: state['asn_passed'] = {} return state except Exception as e: _log('failed to load session state: %s' % str(e), 'warn') return None def save_stats_snapshot(sqlite, stats): """Save hourly stats snapshot for historical graphs. Args: sqlite: Database connection stats: Stats object from proxywatchd """ now = int(time.time()) # Round to nearest hour hour_ts = (now // 3600) * 3600 success_rate = 0 if stats.tested > 0: success_rate = (stats.passed * 100.0) / stats.tested avg_latency = 0 if stats.latency_count > 0: avg_latency = stats.latency_sum / stats.latency_count sqlite.execute('''INSERT OR REPLACE INTO stats_history (timestamp, tested, passed, failed, success_rate, avg_latency, ssl_tested, ssl_passed, mitm_detected, proto_http, proto_socks4, proto_socks5) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', (hour_ts, stats.tested, stats.passed, stats.failed, success_rate, avg_latency, stats.ssl_tested, stats.ssl_passed, stats.mitm_detected, stats.proto_passed.get('http', 0), stats.proto_passed.get('socks4', 0), stats.proto_passed.get('socks5', 0))) sqlite.commit() def get_stats_history(sqlite, hours=24): """Get historical stats for the last N hours. Args: sqlite: Database connection hours: Number of hours of history to retrieve Returns: List of dicts with hourly stats """ now = int(time.time()) since = now - (hours * 3600) rows = sqlite.execute( 'SELECT * FROM stats_history WHERE timestamp >= ? ORDER BY timestamp', (since,) ).fetchall() cols = ['timestamp', 'tested', 'passed', 'failed', 'success_rate', 'avg_latency', 'ssl_tested', 'ssl_passed', 'mitm_detected', 'proto_http', 'proto_socks4', 'proto_socks5'] return [dict(zip(cols, row)) for row in rows] def create_verification_tables(sqlite): """Create tables for the verification system. Tables: proxy_results: Per-test result history for disagreement detection verification_queue: Disputed proxies awaiting manager verification worker_trust: Worker accuracy tracking and trust scores """ # Per-test result history - stores last N results per proxy per worker sqlite.execute("""CREATE TABLE IF NOT EXISTS proxy_results ( id INTEGER PRIMARY KEY, proxy TEXT NOT NULL, worker_id TEXT NOT NULL, tested INTEGER NOT NULL, working INTEGER NOT NULL, latency_ms INTEGER, error_category TEXT, target TEXT, FOREIGN KEY (proxy) REFERENCES proxylist(proxy))""") sqlite.execute('CREATE INDEX IF NOT EXISTS idx_results_proxy ON proxy_results(proxy, tested DESC)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_results_worker ON proxy_results(worker_id, tested DESC)') # Verification queue - proxies needing manager verification sqlite.execute("""CREATE TABLE IF NOT EXISTS verification_queue ( proxy TEXT PRIMARY KEY, trigger TEXT NOT NULL, priority INTEGER DEFAULT 1, queued_at INTEGER NOT NULL, worker_a TEXT, worker_b TEXT, result_a INTEGER, result_b INTEGER)""") sqlite.execute('CREATE INDEX IF NOT EXISTS idx_verify_priority ON verification_queue(priority DESC, queued_at)') # Worker trust tracking sqlite.execute("""CREATE TABLE IF NOT EXISTS worker_trust ( worker_id TEXT PRIMARY KEY, verifications INTEGER DEFAULT 0, correct INTEGER DEFAULT 0, incorrect INTEGER DEFAULT 0, trust_score REAL DEFAULT 1.0, last_updated INTEGER)""") sqlite.commit() def insert_proxy_result(sqlite, proxy, worker_id, working, latency_ms=None, error_category=None, target=None, max_results=10): """Insert a proxy test result and prune old results. Args: sqlite: Database connection proxy: Proxy address (ip:port) worker_id: Worker that tested the proxy working: 1 if passed, 0 if failed latency_ms: Response latency in milliseconds error_category: Error type (timeout, refused, auth, etc.) target: Target host that was tested max_results: Maximum results to keep per proxy (default 10) """ now = int(time.time()) sqlite.execute( '''INSERT INTO proxy_results (proxy, worker_id, tested, working, latency_ms, error_category, target) VALUES (?, ?, ?, ?, ?, ?, ?)''', (proxy, worker_id, now, working, latency_ms, error_category, target) ) # Prune old results for this proxy (keep last max_results) sqlite.execute(''' DELETE FROM proxy_results WHERE proxy = ? AND id NOT IN ( SELECT id FROM proxy_results WHERE proxy = ? ORDER BY tested DESC LIMIT ? ) ''', (proxy, proxy, max_results)) def check_for_disagreement(sqlite, proxy, worker_id, working, window_seconds=300): """Check if this result disagrees with recent results from other workers. Args: sqlite: Database connection proxy: Proxy address worker_id: Current worker working: Current test result (1=pass, 0=fail) window_seconds: Time window to check (default 5 minutes) Returns: tuple (disagreement_found, other_worker_id, other_result) or (False, None, None) """ now = int(time.time()) cutoff = now - window_seconds rows = sqlite.execute(''' SELECT worker_id, working FROM proxy_results WHERE proxy = ? AND worker_id != ? AND tested > ? ORDER BY tested DESC LIMIT 3 ''', (proxy, worker_id, cutoff)).fetchall() for other_worker, other_result in rows: if other_result != working: return (True, other_worker, other_result) return (False, None, None) def queue_verification(sqlite, proxy, trigger, priority=1, worker_a=None, worker_b=None, result_a=None, result_b=None): """Add a proxy to the verification queue. Args: sqlite: Database connection proxy: Proxy address trigger: Why verification is needed (disagreement, sudden_death, resurrection, spot_check) priority: 1=low, 2=medium, 3=high worker_a: First worker involved (for disagreements) worker_b: Second worker involved (for disagreements) result_a: First worker's result result_b: Second worker's result """ now = int(time.time()) # Check if already queued with higher priority existing = sqlite.execute( 'SELECT priority FROM verification_queue WHERE proxy = ?', (proxy,) ).fetchone() if existing and existing[0] >= priority: return # Already queued with equal or higher priority sqlite.execute(''' INSERT OR REPLACE INTO verification_queue (proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b) VALUES (?, ?, ?, ?, ?, ?, ?, ?) ''', (proxy, trigger, priority, now, worker_a, worker_b, result_a, result_b)) def get_pending_verifications(sqlite, limit=10): """Get highest priority pending verifications. Args: sqlite: Database connection limit: Maximum number to return Returns: List of dicts with proxy and verification details """ rows = sqlite.execute(''' SELECT proxy, trigger, priority, queued_at, worker_a, worker_b, result_a, result_b FROM verification_queue ORDER BY priority DESC, queued_at ASC LIMIT ? ''', (limit,)).fetchall() cols = ['proxy', 'trigger', 'priority', 'queued_at', 'worker_a', 'worker_b', 'result_a', 'result_b'] return [dict(zip(cols, row)) for row in rows] def remove_from_verification_queue(sqlite, proxy): """Remove a proxy from the verification queue after verification.""" sqlite.execute('DELETE FROM verification_queue WHERE proxy = ?', (proxy,)) def update_worker_trust(sqlite, worker_id, was_correct): """Update worker trust score after verification. Uses Bayesian smoothing: trust_score = (correct + 5) / (verifications + 10) Args: sqlite: Database connection worker_id: Worker to update was_correct: True if worker's result matched verification """ now = int(time.time()) # Get current stats or initialize row = sqlite.execute( 'SELECT verifications, correct, incorrect FROM worker_trust WHERE worker_id = ?', (worker_id,) ).fetchone() if row: verifications = row[0] + 1 correct = row[1] + (1 if was_correct else 0) incorrect = row[2] + (0 if was_correct else 1) else: verifications = 1 correct = 1 if was_correct else 0 incorrect = 0 if was_correct else 1 # Bayesian smoothing trust_score = (correct + 5.0) / (verifications + 10.0) sqlite.execute(''' INSERT OR REPLACE INTO worker_trust (worker_id, verifications, correct, incorrect, trust_score, last_updated) VALUES (?, ?, ?, ?, ?, ?) ''', (worker_id, verifications, correct, incorrect, trust_score, now)) def get_worker_trust(sqlite, worker_id): """Get trust score for a worker. Args: sqlite: Database connection worker_id: Worker to look up Returns: Dict with trust stats or default values if not found """ row = sqlite.execute( 'SELECT verifications, correct, incorrect, trust_score, last_updated FROM worker_trust WHERE worker_id = ?', (worker_id,) ).fetchone() if row: return { 'verifications': row[0], 'correct': row[1], 'incorrect': row[2], 'trust_score': row[3], 'last_updated': row[4] } return { 'verifications': 0, 'correct': 0, 'incorrect': 0, 'trust_score': 1.0, # New workers start trusted 'last_updated': None } def get_all_worker_trust(sqlite): """Get trust scores for all workers. Returns: Dict mapping worker_id to trust stats """ rows = sqlite.execute( 'SELECT worker_id, verifications, correct, incorrect, trust_score, last_updated FROM worker_trust' ).fetchall() result = {} for row in rows: result[row[0]] = { 'verifications': row[1], 'correct': row[2], 'incorrect': row[3], 'trust_score': row[4], 'last_updated': row[5] } return result def get_verification_stats(sqlite): """Get verification system statistics. Returns: Dict with queue_size, verified counts, disagreement counts """ now = int(time.time()) today_start = (now // 86400) * 86400 stats = {} # Queue size row = sqlite.execute('SELECT COUNT(*) FROM verification_queue').fetchone() stats['queue_size'] = row[0] if row else 0 # Queue by priority rows = sqlite.execute( 'SELECT priority, COUNT(*) FROM verification_queue GROUP BY priority' ).fetchall() stats['queue_by_priority'] = {row[0]: row[1] for row in rows} # Queue by trigger type rows = sqlite.execute( 'SELECT trigger, COUNT(*) FROM verification_queue GROUP BY trigger' ).fetchall() stats['queue_by_trigger'] = {row[0]: row[1] for row in rows} return stats def analyze_database(sqlite): """Run ANALYZE to update SQLite query planner statistics. Should be called periodically (e.g., hourly) for optimal query performance. Also enables stat4 for better index statistics on complex queries. Args: sqlite: Database connection """ try: # Enable advanced statistics (persists in database) sqlite.execute('PRAGMA analysis_limit=1000') # Run ANALYZE on all tables and indexes sqlite.execute('ANALYZE') sqlite.commit() _log('database ANALYZE completed', 'debug') except Exception as e: _log('database ANALYZE failed: %s' % str(e), 'warn') def vacuum_database(sqlite): """Run VACUUM to reclaim unused space and defragment database. Should be called infrequently (e.g., daily or weekly) as it's expensive. Requires no active transactions. Args: sqlite: Database connection """ try: sqlite.execute('VACUUM') _log('database VACUUM completed', 'info') except Exception as e: _log('database VACUUM failed: %s' % str(e), 'warn') def get_database_stats(sqlite): """Get database statistics for monitoring. Args: sqlite: Database connection Returns: Dict with database statistics """ stats = {} try: row = sqlite.execute('PRAGMA page_count').fetchone() stats['page_count'] = row[0] if row else 0 row = sqlite.execute('PRAGMA page_size').fetchone() stats['page_size'] = row[0] if row else 4096 row = sqlite.execute('PRAGMA freelist_count').fetchone() stats['freelist_count'] = row[0] if row else 0 # Calculate sizes stats['total_size'] = stats['page_count'] * stats['page_size'] stats['free_size'] = stats['freelist_count'] * stats['page_size'] stats['used_size'] = stats['total_size'] - stats['free_size'] # Table row counts row = sqlite.execute('SELECT COUNT(*) FROM proxylist').fetchone() stats['proxy_count'] = row[0] if row else 0 row = sqlite.execute('SELECT COUNT(*) FROM proxylist WHERE failed=0 AND tested IS NOT NULL').fetchone() stats['working_count'] = row[0] if row else 0 row = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone() stats['uri_count'] = row[0] if row else 0 except Exception as e: _log('get_database_stats error: %s' % e, 'warn') return stats