dbs: add 19 proxy sources from 7 new repositories

Expand PROXY_SOURCES with proxifly, vakhov, prxchk, sunny9577,
officialputuid, hookzof, and iplocate lists. Add source_proto,
source_confidence, and protos_working schema columns for protocol intelligence.
Remove completed proxy source expansion task from roadmap.
This commit is contained in:
Username
2026-02-17 13:13:23 +01:00
parent e6b736a577
commit c19959cda2
2 changed files with 57 additions and 18 deletions

View File

@@ -61,20 +61,6 @@ PPF (Proxy Fetcher) is a Python 2 proxy scraping and validation framework design
| Target health tracking | Remove unresponsive targets from pool | proxywatchd.py | | Target health tracking | Remove unresponsive targets from pool | proxywatchd.py |
| Geographic target spread | Ensure targets span multiple regions | config.py | | Geographic target spread | Ensure targets span multiple regions | config.py |
### Proxy Source Expansion
| Task | Description | File(s) |
|------|-------------|---------|
| API sources | Integrate free proxy API endpoints | new: api_sources.py |
---
## Technical Debt
| Item | Description | Risk |
|------|-------------|------|
| Global config in fetch.py | set_config() pattern is fragile | Low - works but not clean |
--- ---
## File Reference ## File Reference

61
dbs.py
View File

@@ -66,6 +66,28 @@ def _migrate_confidence_column(sqlite):
sqlite.commit() sqlite.commit()
def _migrate_source_proto(sqlite):
    """Add the source_proto and source_confidence columns to proxylist.

    Each column is probed with its own SELECT and added only when that
    probe fails. A database that holds just one of the two columns (for
    example after an interrupted migration) is therefore completed instead
    of being skipped or failing with a duplicate-column error.

    Args:
        sqlite: database handle exposing execute() and commit().
    """
    columns = (
        # source_proto: protocol detected by scraper (never overwritten by tests)
        ('source_proto', 'TEXT'),
        # source_confidence: scraper confidence score (0-100)
        ('source_confidence', 'INT DEFAULT 0'),
    )
    altered = False
    for name, decl in columns:
        try:
            sqlite.execute('SELECT %s FROM proxylist LIMIT 1' % name)
        except Exception:
            # Probe failed: treat the column as missing and add it.
            # name/decl come from the constant tuple above, never from
            # external input, so interpolating them into the SQL is safe.
            sqlite.execute(
                'ALTER TABLE proxylist ADD COLUMN %s %s' % (name, decl))
            altered = True
    # Commit only when the schema actually changed.
    if altered:
        sqlite.commit()
def _migrate_protos_working(sqlite):
    """Ensure proxylist has the protos_working column.

    The column is probed with a SELECT; when the probe raises, the column
    is added and the change committed. Otherwise nothing is done.

    Args:
        sqlite: database handle exposing execute() and commit().
    """
    probe_sql = 'SELECT protos_working FROM proxylist LIMIT 1'
    # protos_working: comma-separated list of working protos (e.g. "http,socks5")
    add_column_sql = 'ALTER TABLE proxylist ADD COLUMN protos_working TEXT'
    try:
        sqlite.execute(probe_sql)
    except Exception:
        # Probe failed: treat the column as missing and add it.
        sqlite.execute(add_column_sql)
        sqlite.commit()
def compute_proxy_list_hash(proxies): def compute_proxy_list_hash(proxies):
"""Compute MD5 hash of sorted proxy list for change detection. """Compute MD5 hash of sorted proxy list for change detection.
@@ -290,13 +312,18 @@ def create_table_if_not_exists(sqlite, dbname):
asn INT, asn INT,
latitude REAL, latitude REAL,
longitude REAL, longitude REAL,
confidence INT DEFAULT 30)""") confidence INT DEFAULT 30,
source_proto TEXT,
source_confidence INT DEFAULT 0,
protos_working TEXT)""")
# Migration: add columns to existing databases (must run before creating indexes) # Migration: add columns to existing databases (must run before creating indexes)
_migrate_latency_columns(sqlite) _migrate_latency_columns(sqlite)
_migrate_anonymity_columns(sqlite) _migrate_anonymity_columns(sqlite)
_migrate_asn_column(sqlite) _migrate_asn_column(sqlite)
_migrate_geolocation_columns(sqlite) _migrate_geolocation_columns(sqlite)
_migrate_confidence_column(sqlite) _migrate_confidence_column(sqlite)
_migrate_source_proto(sqlite)
_migrate_protos_working(sqlite)
# Indexes for common query patterns # Indexes for common query patterns
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)') sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
@@ -444,11 +471,11 @@ def insert_proxies(proxydb, proxies, url):
filtered += 1 filtered += 1
continue continue
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence)) rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence, proto, confidence))
proxydb.executemany( proxydb.executemany(
'INSERT OR IGNORE INTO proxylist ' 'INSERT OR IGNORE INTO proxylist '
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence) ' '(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence,source_proto,source_confidence) '
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?)', 'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
rows rows
) )
proxydb.commit() proxydb.commit()
@@ -508,6 +535,32 @@ PROXY_SOURCES = [
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all', 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all', 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all', 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',
# proxifly/free-proxy-list - 5 min updates (jsDelivr CDN)
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/http/data.txt',
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks4/data.txt',
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks5/data.txt',
# vakhov/fresh-proxy-list - 5-20 min updates (GitHub Pages)
'https://vakhov.github.io/fresh-proxy-list/http.txt',
'https://vakhov.github.io/fresh-proxy-list/socks4.txt',
'https://vakhov.github.io/fresh-proxy-list/socks5.txt',
# prxchk/proxy-list - 10 min updates
'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt',
'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt',
'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt',
# sunny9577/proxy-scraper - 3 hour updates (GitHub Pages)
'https://sunny9577.github.io/proxy-scraper/generated/http_proxies.txt',
'https://sunny9577.github.io/proxy-scraper/generated/socks4_proxies.txt',
'https://sunny9577.github.io/proxy-scraper/generated/socks5_proxies.txt',
# officialputuid/KangProxy - 4-6 hour updates
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt',
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt',
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt',
# hookzof/socks5_list - hourly updates
'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
# iplocate/free-proxy-list - 30 min updates
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt',
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt',
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt',
] ]