dbs: add 19 proxy sources from 7 new repositories
Expand PROXY_SOURCES with proxifly, vakhov, prxchk, sunny9577, officialputuid, hookzof, and iplocate lists. Add source_proto, source_confidence, and protos_working schema columns for protocol intelligence. Remove completed proxy source expansion task from roadmap.
This commit is contained in:
14
ROADMAP.md
14
ROADMAP.md
@@ -61,20 +61,6 @@ PPF (Proxy Fetcher) is a Python 2 proxy scraping and validation framework design
|
|||||||
| Target health tracking | Remove unresponsive targets from pool | proxywatchd.py |
|
| Target health tracking | Remove unresponsive targets from pool | proxywatchd.py |
|
||||||
| Geographic target spread | Ensure targets span multiple regions | config.py |
|
| Geographic target spread | Ensure targets span multiple regions | config.py |
|
||||||
|
|
||||||
### Proxy Source Expansion
|
|
||||||
|
|
||||||
| Task | Description | File(s) |
|
|
||||||
|------|-------------|---------|
|
|
||||||
| API sources | Integrate free proxy API endpoints | new: api_sources.py |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Technical Debt
|
|
||||||
|
|
||||||
| Item | Description | Risk |
|
|
||||||
|------|-------------|------|
|
|
||||||
| Global config in fetch.py | set_config() pattern is fragile | Low - works but not clean |
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## File Reference
|
## File Reference
|
||||||
|
|||||||
61
dbs.py
61
dbs.py
@@ -66,6 +66,28 @@ def _migrate_confidence_column(sqlite):
|
|||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_source_proto(sqlite):
    """Add source_proto and source_confidence columns to proxylist.

    These columns preserve scraper-detected protocol intelligence. Each
    column is probed and added on its own, so a database that already has
    one of them (for example after an interrupted migration) still receives
    the other instead of being left half-migrated.

    Args:
        sqlite: database handle exposing execute() and commit().
    """
    columns = (
        # source_proto: protocol detected by scraper (never overwritten by tests)
        ('source_proto', 'TEXT'),
        # source_confidence: scraper confidence score (0-100)
        ('source_confidence', 'INT DEFAULT 0'),
    )
    altered = False
    for name, decl in columns:
        try:
            # Probe: selecting a missing column raises.
            sqlite.execute('SELECT %s FROM proxylist LIMIT 1' % name)
        except Exception:
            # Names and declarations are the constants above, not user input.
            sqlite.execute(
                'ALTER TABLE proxylist ADD COLUMN %s %s' % (name, decl))
            altered = True
    if altered:
        sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_protos_working(sqlite):
    """Ensure proxylist has the protos_working column (multi-protocol storage).

    The column holds a comma-separated list of working protocols, for
    example "http,socks5". A probe SELECT is issued first; only when that
    probe fails is the column added and the change committed.
    """
    probe_sql = 'SELECT protos_working FROM proxylist LIMIT 1'
    add_sql = 'ALTER TABLE proxylist ADD COLUMN protos_working TEXT'
    try:
        sqlite.execute(probe_sql)
    except Exception:
        column_missing = True
    else:
        column_missing = False
    if column_missing:
        sqlite.execute(add_sql)
        sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
def compute_proxy_list_hash(proxies):
|
def compute_proxy_list_hash(proxies):
|
||||||
"""Compute MD5 hash of sorted proxy list for change detection.
|
"""Compute MD5 hash of sorted proxy list for change detection.
|
||||||
|
|
||||||
@@ -290,13 +312,18 @@ def create_table_if_not_exists(sqlite, dbname):
|
|||||||
asn INT,
|
asn INT,
|
||||||
latitude REAL,
|
latitude REAL,
|
||||||
longitude REAL,
|
longitude REAL,
|
||||||
confidence INT DEFAULT 30)""")
|
confidence INT DEFAULT 30,
|
||||||
|
source_proto TEXT,
|
||||||
|
source_confidence INT DEFAULT 0,
|
||||||
|
protos_working TEXT)""")
|
||||||
# Migration: add columns to existing databases (must run before creating indexes)
|
# Migration: add columns to existing databases (must run before creating indexes)
|
||||||
_migrate_latency_columns(sqlite)
|
_migrate_latency_columns(sqlite)
|
||||||
_migrate_anonymity_columns(sqlite)
|
_migrate_anonymity_columns(sqlite)
|
||||||
_migrate_asn_column(sqlite)
|
_migrate_asn_column(sqlite)
|
||||||
_migrate_geolocation_columns(sqlite)
|
_migrate_geolocation_columns(sqlite)
|
||||||
_migrate_confidence_column(sqlite)
|
_migrate_confidence_column(sqlite)
|
||||||
|
_migrate_source_proto(sqlite)
|
||||||
|
_migrate_protos_working(sqlite)
|
||||||
# Indexes for common query patterns
|
# Indexes for common query patterns
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
||||||
@@ -444,11 +471,11 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
filtered += 1
|
filtered += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
|
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence, proto, confidence))
|
||||||
proxydb.executemany(
|
proxydb.executemany(
|
||||||
'INSERT OR IGNORE INTO proxylist '
|
'INSERT OR IGNORE INTO proxylist '
|
||||||
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence) '
|
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence,source_proto,source_confidence) '
|
||||||
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
|
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
|
||||||
rows
|
rows
|
||||||
)
|
)
|
||||||
proxydb.commit()
|
proxydb.commit()
|
||||||
@@ -508,6 +535,32 @@ PROXY_SOURCES = [
|
|||||||
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
|
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
|
||||||
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
|
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
|
||||||
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',
|
'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',
|
||||||
|
# proxifly/free-proxy-list - 5 min updates (jsDelivr CDN)
|
||||||
|
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/http/data.txt',
|
||||||
|
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks4/data.txt',
|
||||||
|
'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks5/data.txt',
|
||||||
|
# vakhov/fresh-proxy-list - 5-20 min updates (GitHub Pages)
|
||||||
|
'https://vakhov.github.io/fresh-proxy-list/http.txt',
|
||||||
|
'https://vakhov.github.io/fresh-proxy-list/socks4.txt',
|
||||||
|
'https://vakhov.github.io/fresh-proxy-list/socks5.txt',
|
||||||
|
# prxchk/proxy-list - 10 min updates
|
||||||
|
'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt',
|
||||||
|
'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt',
|
||||||
|
'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt',
|
||||||
|
# sunny9577/proxy-scraper - 3 hour updates (GitHub Pages)
|
||||||
|
'https://sunny9577.github.io/proxy-scraper/generated/http_proxies.txt',
|
||||||
|
'https://sunny9577.github.io/proxy-scraper/generated/socks4_proxies.txt',
|
||||||
|
'https://sunny9577.github.io/proxy-scraper/generated/socks5_proxies.txt',
|
||||||
|
# officialputuid/KangProxy - 4-6 hour updates
|
||||||
|
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt',
|
||||||
|
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt',
|
||||||
|
'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt',
|
||||||
|
# hookzof/socks5_list - hourly updates
|
||||||
|
'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
|
||||||
|
# iplocate/free-proxy-list - 30 min updates
|
||||||
|
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt',
|
||||||
|
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt',
|
||||||
|
'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user