Compare commits: 2ea7eb41b7...master

9 commits

| Author | SHA1 | Date |
|---|---|---|
|  | 361b70ace9 |  |
|  | 9c7b7ba070 |  |
|  | 0669b38782 |  |
|  | 6130b196b1 |  |
|  | ce2d28ab07 |  |
|  | 93eb395727 |  |
|  | f9d237fe0d |  |
|  | 0f1fe981ef |  |
|  | 0a53e4457f |  |
Dockerfile.test — new file, 29 lines
@@ -0,0 +1,29 @@
+FROM python:2.7-slim
+
+WORKDIR /app
+
+RUN sed -i 's/deb.debian.org/archive.debian.org/g' /etc/apt/sources.list && \
+    sed -i 's/security.debian.org/archive.debian.org/g' /etc/apt/sources.list && \
+    sed -i '/buster-updates/d' /etc/apt/sources.list && \
+    echo 'deb http://archive.debian.org/debian-security buster/updates main' >> /etc/apt/sources.list && \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends gcc libc-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip install --upgrade "pip<21" "setuptools<45" "wheel<0.38"
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt || true
+RUN pip install pytest
+
+RUN mkdir -p /app/data && \
+    python -c "import pyasn" 2>/dev/null && \
+    pyasn_util_download.py --latest && \
+    pyasn_util_convert.py --single rib.*.bz2 /app/data/ipasn.dat && \
+    rm -f rib.*.bz2 || \
+    echo "pyasn database setup skipped"
+
+RUN apt-get purge -y gcc libc-dev && apt-get autoremove -y || true
+
+CMD ["python", "-m", "pytest", "tests/", "-v", "--tb=short"]
ROADMAP.md — 172 changed lines
@@ -1,69 +1,100 @@
-# PPF Project Roadmap
+# PPF Roadmap
 
-## Project Purpose
+## Architecture
 
-PPF (Proxy Fetcher) is a Python 2 proxy scraping and validation framework designed to:
-
-1. **Discover** proxy addresses by crawling websites and search engines
-2. **Validate** proxies through multi-target testing via Tor
-3. **Maintain** a database of working proxies with protocol detection (SOCKS4/SOCKS5/HTTP)
-
-## Architecture Overview
 
 ```
-┌─────────────────────────────────────────────────────────────────────────────┐
-│ PPF Architecture │
-├─────────────────────────────────────────────────────────────────────────────┤
-│ │
-│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
-│ │ scraper.py │ │ ppf.py │ │proxywatchd │ │
-│ │ │ │ │ │ │ │
-│ │ Searx query │───>│ URL harvest │───>│ Proxy test │ │
-│ │ URL finding │ │ Proxy extract│ │ Validation │ │
-│ └─────────────┘ └─────────────┘ └─────────────┘ │
-│ │ │ │ │
-│ v v v │
-│ ┌─────────────────────────────────────────────────────────────────┐ │
-│ │ SQLite Databases │ │
-│ │ uris.db (URLs) proxies.db (proxy list) │ │
-│ └─────────────────────────────────────────────────────────────────┘ │
-│ │
-│ ┌─────────────────────────────────────────────────────────────────┐ │
-│ │ Network Layer │ │
-│ │ rocksock.py ─── Tor SOCKS ─── Test Proxy ─── Target Server │ │
-│ └─────────────────────────────────────────────────────────────────┘ │
-│ │
-└─────────────────────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────┐
+│ Odin (Master) │
+│ httpd.py ─ API + SSL-only verification │
+│ proxywatchd.py ─ proxy recheck daemon │
+│ SQLite: proxies.db, websites.db │
+└──────────┬───────────────────────────────┘
+│ WireGuard (10.200.1.0/24)
+┌────────────────┼────────────────┐
+v v v
+┌───────────┐ ┌───────────┐ ┌───────────┐
+│ cassius │ │ edge │ │ sentinel │
+│ Worker │ │ Worker │ │ Worker │
+│ ppf.py │ │ ppf.py │ │ ppf.py │
+└───────────┘ └───────────┘ └───────────┘
 ```
 
+Workers claim URLs, extract proxies, test them, report back.
+Master verifies (SSL-only), serves API, coordinates distribution.
 
 ## Constraints
 
-- **Python 2.7** compatibility required
-- **Minimal external dependencies** (avoid adding new modules)
-- Current dependencies: beautifulsoup4, pyasn, IP2Location
-- Data files: IP2LOCATION-LITE-DB1.BIN (country), ipasn.dat (ASN)
+- Python 2.7 runtime (container-based)
+- Minimal external dependencies
+- All traffic via Tor
 
+---
 
+## Phase 1: Performance and Quality (current)
+
+Profiling-driven optimizations and source pipeline hardening.
+
+| Item | Status | Description |
+|------|--------|-------------|
+| Extraction short-circuits | done | Guard clauses in fetch.py extractors |
+| Skip shutdown on failed sockets | done | Track _connected flag, skip shutdown on dead sockets |
+| SQLite connection reuse (odin) | done | Per-greenlet cached handles via threading.local |
+| Lazy-load ASN database | done | Defer ipasn.dat parsing to first lookup |
+| Add more seed sources (100+) | done | Expanded to 120+ URLs with SOCKS5-specific sources |
+| Protocol-aware source weighting | done | Dynamic SOCKS boost in claim_urls scoring |
+| Sharpen error penalty in URL scoring | done | Reduce erroring URL claim frequency |
+
+## Phase 2: Proxy Diversity and Consumer API
+
+Address customer-reported quality gaps.
+
+| Item | Status | Description |
+|------|--------|-------------|
+| ASN diversity scoring | pending | Deprioritize over-represented ASNs in testing |
+| Graduated recheck intervals | pending | Fresh proxies rechecked more often than stale |
+| API filters (proto/country/ASN/latency) | pending | Consumer-facing query parameters on /proxies |
+| Latency-based ranking | pending | Expose latency percentiles per proxy |
+
+## Phase 3: Self-Expanding Source Pool
+
+Worker-driven link discovery from productive pages.
+
+| Item | Status | Description |
+|------|--------|-------------|
+| Link extraction from productive pages | pending | Parse HTML for links when page yields proxies |
+| Report discovered URLs to master | pending | New endpoint for worker URL submissions |
+| Conditional discovery | pending | Only extract links from confirmed-productive pages |
+
+## Phase 4: Long-Term
+
+| Item | Status | Description |
+|------|--------|-------------|
+| Python 3 migration | deferred | Unblocks modern deps, security patches, pyasn native |
+| Worker trust scoring | pending | Activate spot-check verification framework |
+| Dynamic target pool | pending | Auto-discover and rotate validation targets |
+| Geographic target spread | pending | Ensure targets span multiple regions |
 
 ---
 
 ## Completed
 
-### Target Management
-
-| Task | Description | File(s) |
-|------|-------------|---------|
-| Target health tracking | Cooldown-based health tracking for all target pools (head, SSL, IRC, judges) | stats.py, proxywatchd.py |
-| MITM field in proxy list | Expose mitm boolean in JSON proxy list endpoints | httpd.py |
-
----
-
-## Open Work
-
-### Target Management
-
-| Task | Description | File(s) |
-|------|-------------|---------|
-| Dynamic target pool | Auto-discover and rotate validation targets | proxywatchd.py |
-| Geographic target spread | Ensure targets span multiple regions | config.py |
+| Item | Date | Description |
+|------|------|-------------|
+| Sharpen URL error penalty | 2026-02-22 | error*0.5 cap 4.0 + stale*0.2 cap 1.5 |
+| SOCKS5 source expansion | 2026-02-22 | Added 10 new SOCKS5-specific sources |
+| SQLite connection reuse | 2026-02-22 | Per-greenlet cached handles via threading.local |
+| Lazy-load ASN database | 2026-02-22 | Deferred ipasn.dat to first lookup |
+| Socket shutdown skip | 2026-02-22 | _connected flag, skip shutdown on dead sockets |
+| Protocol-aware weighting | 2026-02-22 | Dynamic SOCKS boost in claim_urls scoring |
+| Seed sources expanded | 2026-02-22 | 37 -> 120+ URLs |
+| last_seen freshness fix | 2026-02-22 | Watchd updates last_seen on verification |
+| Periodic re-seeding | 2026-02-22 | Reset errored sources every 6h |
+| ASN enrichment | 2026-02-22 | Pure-Python ipasn.dat reader + backfill |
+| URL pipeline stats | 2026-02-22 | /api/stats exposes source health metrics |
+| Extraction short-circuits | 2026-02-22 | Guard clauses + precompiled table regexes |
+| Target health tracking | prior | Cooldown-based health for all target pools |
+| MITM field in proxy list | prior | Expose mitm boolean in JSON endpoints |
+| V1 worker protocol removal | prior | Cleaned up legacy --worker code path |
 
 ---
 
@@ -71,31 +102,12 @@ PPF (Proxy Fetcher) is a Python 2 proxy scraping and validation framework design
 
 | File | Purpose |
 |------|---------|
-| ppf.py | Main URL harvester daemon |
+| ppf.py | URL harvester, worker main loop |
 | proxywatchd.py | Proxy validation daemon |
-| scraper.py | Searx search integration |
-| fetch.py | HTTP fetching with proxy support |
-| dbs.py | Database schema and inserts |
-| mysqlite.py | SQLite wrapper |
-| rocksock.py | Socket/proxy abstraction (3rd party) |
-| http2.py | HTTP client implementation |
-| httpd.py | Web dashboard and REST API server |
+| fetch.py | HTTP fetching, proxy extraction |
+| httpd.py | API server, worker coordination |
+| dbs.py | Database schema, seed sources |
 | config.py | Configuration management |
-| comboparse.py | Config/arg parser framework |
-| soup_parser.py | BeautifulSoup wrapper |
-| misc.py | Utilities (timestamp, logging) |
-| export.py | Proxy export CLI tool |
-| engines.py | Search engine implementations |
-| connection_pool.py | Tor connection pooling |
-| network_stats.py | Network statistics tracking |
-| dns.py | DNS resolution with caching |
-| mitm.py | MITM certificate detection |
-| job.py | Priority job queue |
-| static/dashboard.js | Dashboard frontend logic |
-| static/dashboard.html | Dashboard HTML template |
-| tools/lib/ppf-common.sh | Shared ops library (hosts, wrappers, colors) |
-| tools/ppf-deploy | Deploy wrapper (validation + playbook) |
-| tools/ppf-logs | View container logs |
-| tools/ppf-service | Container lifecycle management |
-| tools/playbooks/deploy.yml | Ansible deploy playbook |
-| tools/playbooks/inventory.ini | Host inventory (WireGuard IPs) |
+| rocksock.py | Socket/proxy abstraction |
+| http2.py | HTTP client implementation |
+| tools/ppf-deploy | Deployment wrapper |
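Editor's note: the Phase 1 rows "Protocol-aware source weighting" and "Sharpen error penalty in URL scoring" both act on the per-URL claim score computed in claim_urls (see the httpd.py hunk later in this diff). The following is only an illustrative Python restatement of that SQL expression; the helper name claim_score and the flat argument list are not part of the repository's API.

```python
def claim_score(now, url, check_time, check_interval, yield_rate,
                working_ratio, error, stale_count, proto_boost):
    """Illustrative restatement of the claim_urls scoring expression (not project code)."""
    freshness = (now - check_time) * 1.0 / max(check_interval or 3600, 1)
    yield_bonus = min((yield_rate or 0) / 100.0, 1.0)
    quality_bonus = (working_ratio or 0) * 0.5
    error_penalty = min(error * 0.5, 4.0)        # was error * 0.3, capped at 2.0
    stale_penalty = min(stale_count * 0.2, 1.5)  # was stale_count * 0.1, capped at 1.0
    socks_boost = proto_boost if ('socks4' in url.lower() or 'socks5' in url.lower()) else 0.0
    return freshness + yield_bonus + quality_bonus - error_penalty - stale_penalty + socks_boost
```

Higher scores are claimed first; the tightened caps (0.5 per error up to 4.0, 0.2 per stale fetch up to 1.5) are what push persistently erroring or unchanging sources to the back of the queue.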
TASKLIST.md — new file, 34 lines
@@ -0,0 +1,34 @@
+# PPF Tasklist
+
+Active execution queue. Ordered by priority.
+
+---
+
+## In Progress
+
+| # | Task | File(s) | Notes |
+|---|------|---------|-------|
+
+## Queued
+
+| # | Task | File(s) | Notes |
+|---|------|---------|-------|
+| 12 | API filters on /proxies (proto/country/ASN) | httpd.py | Consumer query params |
+| 8 | Graduated recheck intervals | proxywatchd.py | Fresh proxies checked more often |
+
+## Done
+
+| # | Task | Date |
+|---|------|------|
+| - | Sharpen URL error penalty scoring | 2026-02-22 |
+| - | Add SOCKS5-specific sources (10 new) | 2026-02-22 |
+| 3 | Lazy-load ASN database | 2026-02-22 |
+| 2 | SQLite connection reuse on odin | 2026-02-22 |
+| 1 | Skip socket.shutdown on failed connections | 2026-02-22 |
+| 4 | Add more seed sources (100+) | 2026-02-22 |
+| 6 | Protocol-aware source weighting | 2026-02-22 |
+| - | Extraction short-circuits | 2026-02-22 |
+| - | last_seen freshness fix | 2026-02-22 |
+| - | Periodic re-seeding | 2026-02-22 |
+| - | ASN enrichment | 2026-02-22 |
+| - | URL pipeline stats | 2026-02-22 |
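Editor's note: queued task 8 (graduated recheck intervals) is not implemented anywhere in this diff, so the sketch below is purely hypothetical — one hedged reading of "fresh proxies checked more often", assuming a proxy row can report its age in seconds.

```python
def recheck_interval(age_seconds, base=300, ceiling=6 * 3600):
    """Hypothetical: fresh proxies get the base interval, older ones back off.

    Doubles the interval for each full day of age, capped at `ceiling`.
    """
    days_old = age_seconds // 86400
    return min(base * (2 ** days_old), ceiling)
```

Under these assumed numbers a brand-new proxy would be rechecked every 5 minutes and a three-day-old one every 40 minutes, with everything older settling at the 6-hour ceiling.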
TODO.md — 82 changed lines
@@ -1,83 +1,35 @@
 # PPF TODO
 
-## Optimization
+Intake buffer. Items refined here move to TASKLIST.md.
 
-### [ ] JSON Stats Response Caching
-
-- Cache serialized JSON response with short TTL (1-2s)
-- Only regenerate when underlying stats change
-- Use ETag/If-None-Match for client-side caching
-- Savings: ~7-9s/hour. Low priority, only matters with frequent dashboard access.
-
-### [ ] Object Pooling for Test States
-
-- Pool ProxyTestState and TargetTestJob, reset and reuse
-- Savings: ~11-15s/hour. **Not recommended** - high effort, medium risk, modest gain.
-
-### [ ] SQLite Connection Reuse
-
-- Persistent connection per thread with health checks
-- Savings: ~0.3s/hour. **Not recommended** - negligible benefit.
 
 ---
 
 ## Dashboard
 
-### [ ] Performance
-
-- Cache expensive DB queries (top countries, protocol breakdown)
-- Lazy-load historical data (only when scrolled into view)
-- WebSocket option for push updates (reduce polling overhead)
-- Configurable refresh interval via URL param or localStorage
-
-### [ ] Features
-
-- Historical graphs (24h, 7d) using stats_history table
-- Per-ASN performance analysis
-- Alert thresholds (success rate < X%, MITM detected)
-- Mobile-responsive improvements
-
----
+- [ ] Cache expensive DB queries (top countries, protocol breakdown)
+- [ ] Historical graphs (24h, 7d) using stats_history table
+- [ ] Per-ASN performance analysis
+- [ ] Alert thresholds (success rate < X%, MITM detected)
+- [ ] WebSocket push updates (reduce polling overhead)
+- [ ] Mobile-responsive improvements
 
 ## Memory
 
-- [ ] Lock consolidation - reduce per-proxy locks (260k LockType objects)
-- [ ] Leaner state objects - reduce dict/list count per job
+- [ ] Lock consolidation (260k LockType objects at scale)
+- [ ] Leaner state objects per job
 
-Memory scales linearly with queue (~4.5 KB/job). No leaks detected.
-Optimize only if memory becomes a constraint.
+Memory scales ~4.5 KB/job. No leaks detected. Optimize only if constrained.
 
----
-
-## Deprecation
-
-### [x] Remove V1 worker protocol
-
-Completed. Removed `--worker` flag, `worker_main()`, `claim_work()`,
-`submit_results()`, `/api/work`, `/api/results`, and related config
-options. `--worker` now routes to the URL-driven protocol.
-
----
+## Source Pipeline
+
+- [ ] PasteBin/GitHub API scrapers for proxy lists
+- [ ] Telegram channel scrapers (beyond t.me/s/ HTML)
+- [ ] Source quality decay tracking (flag sources going stale)
+- [ ] Deduplication of sources across different URL forms
 
 ## Known Issues
 
 ### [!] Podman Container Metadata Disappears
 
-`podman ps -a` shows empty even though process is running. Service functions
-correctly despite missing metadata. Monitor via `ss -tlnp`, `ps aux`, or
-`curl localhost:8081/health`. Low impact.
+`podman ps -a` shows empty even though process is running.
+Monitor via `ss -tlnp`, `ps aux`, or `curl localhost:8081/health`.
-
----
-
-## Container Debugging Checklist
-
-```
-1. Check for orphans: ps aux | grep -E "[p]rocess_name"
-2. Check port conflicts: ss -tlnp | grep PORT
-3. Run foreground: podman run --rm (no -d) to see output
-4. Check podman state: podman ps -a
-5. Clean stale: pkill -9 -f "pattern" && podman rm -f -a
-6. Verify deps: config files, data dirs, volumes exist
-7. Check logs: podman logs container_name 2>&1 | tail -50
-8. Health check: curl -sf http://localhost:PORT/health
-```
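Editor's note: the new Source Pipeline item "Deduplication of sources across different URL forms" is still open. Below is only a sketch of the idea, assuming Python 2.7 (the project runtime) and hypothetical normalization rules the project has not settled on.

```python
from urlparse import urlparse  # Python 2.7 stdlib


def canonical_source(url):
    """Hypothetical normalization: host case, default ports, trailing slash.

    Drops the scheme so http and https forms of the same list collapse.
    """
    p = urlparse(url.strip())
    host = (p.hostname or '').lower()
    port = ':%d' % p.port if p.port and p.port not in (80, 443) else ''
    path = p.path.rstrip('/') or '/'
    query = '?' + p.query if p.query else ''
    return '%s%s%s%s' % (host, port, path, query)
```

Keying PROXY_SOURCES entries on canonical_source() would treat http/https and trailing-slash variants as one source; whether that is the right equivalence is exactly what the TODO item leaves open.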
compose.test.yml — new file, 18 lines
@@ -0,0 +1,18 @@
+# PPF test runner (Python 2.7, production deps + pytest)
+#
+# Mounts source and tests as volumes so no rebuild needed between runs.
+#
+# Usage:
+#   podman-compose -f compose.test.yml run --rm test
+#   podman-compose -f compose.test.yml run --rm test python -m pytest tests/test_fetch.py -v
+
+services:
+  test:
+    container_name: ppf-test
+    build:
+      context: .
+      dockerfile: Dockerfile.test
+    volumes:
+      - .:/app:ro,Z
+    working_dir: /app
+    command: python -m pytest tests/ -v --tb=short
dbs.py — 176 changed lines
@@ -582,34 +582,107 @@ def insert_urls(urls, search, sqlite):
 
 # Known proxy list sources (GitHub raw lists, APIs)
 PROXY_SOURCES = [
+    # --- GitHub raw lists (sorted by update frequency) ---
 
     # TheSpeedX/PROXY-List - large, hourly updates
     'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt',
     'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt',
     'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt',
-    # clarketm/proxy-list - curated, daily
-    'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt',
     # monosans/proxy-list - hourly updates
     'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt',
     'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt',
     'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt',
+    # prxchk/proxy-list - 10 min updates
+    'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt',
+    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt',
+    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt',
     # jetkai/proxy-list - 10 min updates
     'https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies.txt',
-    # roosterkid/openproxylist
-    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt',
-    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt',
-    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt',
-    # ShiftyTR/Proxy-List
-    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt',
-    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt',
-    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt',
+    # hookzof/socks5_list - hourly, SOCKS5 focused
+    'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
     # mmpx12/proxy-list
     'https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt',
     'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt',
     'https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt',
-    # proxyscrape API
-    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
-    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
-    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',
+    # ShiftyTR/Proxy-List
+    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt',
+    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt',
+    'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt',
+    # roosterkid/openproxylist
+    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt',
+    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt',
+    'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt',
+    # clarketm/proxy-list - curated, daily
+    'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt',
+    # officialputuid/KangProxy - 4-6 hour updates
+    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt',
+    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/https/https.txt',
+    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt',
+    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt',
+    # iplocate/free-proxy-list - 30 min updates
+    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt',
+    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt',
+    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt',
+    # ErcinDedeworken/proxy-list - hourly
+    'https://raw.githubusercontent.com/ErcinDedeworken/proxy-list/main/proxy-list/data.txt',
+    # MuRongPIG/Proxy-Master - 10 min updates
+    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt',
+    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt',
+    'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt',
+    # zloi-user/hideip.me - hourly
+    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt',
+    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt',
+    'https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt',
+    # FLAVIEN-music/proxy-list - 30 min updates
+    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/http.txt',
+    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks4.txt',
+    'https://raw.githubusercontent.com/FLAVIEN-music/proxy-list/main/proxies/socks5.txt',
+    # Zaeem20/FREE_PROXIES_LIST - 30 min updates
+    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/http.txt',
+    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt',
+    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt',
+    'https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks5.txt',
+    # r00tee/Proxy-List - hourly
+    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Https.txt',
+    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks4.txt',
+    'https://raw.githubusercontent.com/r00tee/Proxy-List/main/Socks5.txt',
+    # casals-ar/proxy-list
+    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/http',
+    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks4',
+    'https://raw.githubusercontent.com/casals-ar/proxy-list/main/socks5',
+    # yemixzy/proxy-list
+    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/http.txt',
+    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks4.txt',
+    'https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/socks5.txt',
+    # opsxcq/proxy-list
+    'https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt',
+    # im-razvan/proxy_list - 10 min updates
+    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/http.txt',
+    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks4.txt',
+    'https://raw.githubusercontent.com/im-razvan/proxy_list/main/socks5.txt',
+    # zevtyardt/proxy-list - daily SOCKS5
+    'https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt',
+    # UptimerBot/proxy-list - 15 min updates
+    'https://raw.githubusercontent.com/UptimerBot/proxy-list/main/proxies/socks5.txt',
+    # Anonym0usWork1221/Free-Proxies
+    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/https_proxies.txt',
+    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks4_proxies.txt',
+    'https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/socks5_proxies.txt',
+    # ErcinDedeoglu/proxies - hourly
+    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/http.txt',
+    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt',
+    'https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt',
+    # dinoz0rg/proxy-list - daily, all protocols
+    'https://raw.githubusercontent.com/dinoz0rg/proxy-list/main/all.txt',
+    # elliottophellia/proxylist - SOCKS5
+    'https://raw.githubusercontent.com/elliottophellia/proxylist/master/results/socks5/global/socks5_len.txt',
+    # gfpcom/free-proxy-list - SOCKS5
+    'https://raw.githubusercontent.com/gfpcom/free-proxy-list/main/socks5.txt',
+    # databay-labs/free-proxy-list - SOCKS5
+    'https://raw.githubusercontent.com/databay-labs/free-proxy-list/master/socks5.txt',
 
+    # --- GitHub Pages / CDN hosted ---
 
     # proxifly/free-proxy-list - 5 min updates (jsDelivr CDN)
     'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/http/data.txt',
     'https://cdn.jsdelivr.net/gh/proxifly/free-proxy-list@main/proxies/protocols/socks4/data.txt',
@@ -618,24 +691,71 @@ PROXY_SOURCES = [
     'https://vakhov.github.io/fresh-proxy-list/http.txt',
     'https://vakhov.github.io/fresh-proxy-list/socks4.txt',
     'https://vakhov.github.io/fresh-proxy-list/socks5.txt',
-    # prxchk/proxy-list - 10 min updates
-    'https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt',
-    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt',
-    'https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt',
     # sunny9577/proxy-scraper - 3 hour updates (GitHub Pages)
     'https://sunny9577.github.io/proxy-scraper/generated/http_proxies.txt',
     'https://sunny9577.github.io/proxy-scraper/generated/socks4_proxies.txt',
     'https://sunny9577.github.io/proxy-scraper/generated/socks5_proxies.txt',
-    # officialputuid/KangProxy - 4-6 hour updates
-    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt',
-    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks4/socks4.txt',
-    'https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/socks5/socks5.txt',
-    # hookzof/socks5_list - hourly updates
-    'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
-    # iplocate/free-proxy-list - 30 min updates
-    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/http.txt',
-    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks4.txt',
-    'https://raw.githubusercontent.com/iplocate/free-proxy-list/main/protocols/socks5.txt',
+    # --- API endpoints ---
+
+    # proxyscrape
+    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all',
+    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all',
+    'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all',
+    # proxy-list.download - SOCKS5 API
+    'https://www.proxy-list.download/api/v1/get?type=socks5',
+    'https://www.proxy-list.download/api/v1/get?type=socks4',
+    # openproxylist.xyz - plain text
+    'https://api.openproxylist.xyz/http.txt',
+    'https://api.openproxylist.xyz/socks4.txt',
+    'https://api.openproxylist.xyz/socks5.txt',
+    # spys.me - plain text, 30 min updates
+    'http://spys.me/proxy.txt',
+    'http://spys.me/socks.txt',
 
+    # --- Web scrapers (HTML pages) ---
 
+    # spys.one - mixed protocols, requires parsing
+    'https://spys.one/en/free-proxy-list/',
+    'https://spys.one/en/socks-proxy-list/',
+    'https://spys.one/en/https-ssl-proxy/',
+    # free-proxy-list.net
+    'https://free-proxy-list.net/',
+    'https://www.sslproxies.org/',
+    'https://www.socks-proxy.net/',
+    # sockslist.us - SOCKS5 focused
+    'https://sockslist.us/',
+    # mtpro.xyz - SOCKS5, updated every 5 min
+    'https://mtpro.xyz/socks5',
+    # proxy-tools.com - SOCKS5 filtered
+    'https://proxy-tools.com/proxy/socks5',
+    # hidemy.name - all protocols, paginated
+    'https://hide.mn/en/proxy-list/',
+    # advanced.name - SOCKS5 filtered
+    'https://advanced.name/freeproxy?type=socks5',
+    # proxynova.com - by country
+    'https://www.proxynova.com/proxy-server-list/',
+    # freeproxy.world - SOCKS5 filtered
+    'https://www.freeproxy.world/?type=socks5',
+    # proxydb.net - all protocols
+    'http://proxydb.net/',
+    # geonode
+    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=http',
+    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks4',
+    'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks5',
+    # openproxy.space
+    'https://openproxy.space/list/http',
+    'https://openproxy.space/list/socks4',
+    'https://openproxy.space/list/socks5',
 
+    # --- Telegram channels (public HTML view) ---
 
+    'https://t.me/s/spys_one',
+    'https://t.me/s/proxyfree1',
+    'https://t.me/s/proxylist4free',
+    'https://t.me/s/proxy_lists',
+    'https://t.me/s/Proxies4ForYou',
 ]
 
httpd.py — 158 changed lines
@@ -31,20 +31,17 @@ except (ImportError, IOError, ValueError):
 _geodb = None
 _geolite = False
 
-# ASN lookup (optional) - try pyasn first, fall back to pure-Python reader
+# ASN lookup (optional, lazy-loaded on first use)
+# Defers ~3.6s startup cost of parsing ipasn.dat until first ASN lookup.
 _asndb = None
+_asndb_loaded = False
 _asn_dat_path = os.path.join("data", "ipasn.dat")
-try:
-    import pyasn
-    _asndb = pyasn.pyasn(_asn_dat_path)
-except (ImportError, IOError):
-    pass
-
-if _asndb is None and os.path.exists(_asn_dat_path):
 import socket
 import struct
 import bisect
 
 
 class _AsnLookup(object):
     """Pure-Python ASN lookup using ipasn.dat (CIDR/ASN text format)."""
 
@@ -76,10 +73,25 @@ if _asndb is None and os.path.exists(_asn_dat_path):
             return (asn, None)
         return (None, None)
 
 
+def _get_asndb():
+    """Lazy-load ASN database on first call. Returns db instance or None."""
+    global _asndb, _asndb_loaded
+    if _asndb_loaded:
+        return _asndb
+    _asndb_loaded = True
+    try:
+        import pyasn
+        _asndb = pyasn.pyasn(_asn_dat_path)
+        return _asndb
+    except (ImportError, IOError):
+        pass
+    if os.path.exists(_asn_dat_path):
         try:
             _asndb = _AsnLookup(_asn_dat_path)
         except Exception as e:
             _log('asn: failed to load %s: %s' % (_asn_dat_path, e), 'warn')
+    return _asndb
 
 # Rate limiting configuration
 _rate_limits = defaultdict(list)
@@ -157,6 +169,30 @@ _fail_retry_interval = 60 # retry interval for failing proxies
 _fail_retry_backoff = True  # True=linear backoff (60,120,180...), False=fixed (60,60,60...)
 _max_fail = 5  # failures before proxy considered dead
 
+# Per-greenlet (or per-thread) SQLite connection cache
+# Under gevent, threading.local() is monkey-patched to greenlet-local storage.
+# Connections are reused across requests handled by the same greenlet, eliminating
+# redundant sqlite3.connect() + PRAGMA calls (~0.5ms each, ~2.7k/session on odin).
+_local = threading.local()
+
+
+def _get_db(path):
+    """Get a cached SQLite connection for the proxy database."""
+    db = getattr(_local, 'proxy_db', None)
+    if db is None:
+        db = mysqlite.mysqlite(path, str)
+        _local.proxy_db = db
+    return db
+
+
+def _get_url_db(path):
+    """Get a cached SQLite connection for the URL database."""
+    db = getattr(_local, 'url_db', None)
+    if db is None:
+        db = mysqlite.mysqlite(path, str)
+        _local.url_db = db
+    return db
+
+
 def configure_schedule(working_checktime, fail_retry_interval, fail_retry_backoff, max_fail):
     """Set testing schedule parameters from config."""
@@ -350,6 +386,42 @@ def get_worker_test_rate(worker_id):
         return 0.0
     return total_tests / elapsed
 
 
+def _get_proto_boost():
+    """Calculate protocol scarcity boost for URL scoring.
+
+    Returns a value 0.0-1.0 to boost SOCKS sources when SOCKS proxies
+    are underrepresented relative to HTTP. Returns 0.0 when balanced.
+    """
+    try:
+        if not _proxy_database:
+            return 0.0
+        db = _get_db(_proxy_database)
+        if not db:
+            return 0.0
+        row = db.execute(
+            "SELECT "
+            " SUM(CASE WHEN proto='http' THEN 1 ELSE 0 END),"
+            " SUM(CASE WHEN proto IN ('socks4','socks5') THEN 1 ELSE 0 END)"
+            " FROM proxylist WHERE failed=0"
+        ).fetchone()
+        if not row or not row[0]:
+            return 0.5  # no data, default mild boost
+        http_count, socks_count = row[0] or 0, row[1] or 0
+        total = http_count + socks_count
+        if total == 0:
+            return 0.5
+        socks_ratio = float(socks_count) / total
+        # Boost SOCKS sources when socks_ratio < 40%
+        if socks_ratio >= 0.4:
+            return 0.0
+        return min((0.4 - socks_ratio) * 2.5, 1.0)  # 0.0-1.0 scale
+    except Exception:
+        return 0.0
+
+
+# Global reference to proxy database path (set by ProxyAPIServer.__init__)
+_proxy_database = None
+
+
 def claim_urls(url_db, worker_id, count=5):
     """Claim a batch of URLs for worker-driven fetching. Returns list of URL dicts.
 
@@ -360,6 +432,7 @@ def claim_urls(url_db, worker_id, count=5):
     - quality_bonus: 0-0.5 based on working_ratio
     - error_penalty: 0-2.0 based on consecutive errors
     - stale_penalty: 0-1.0 based on unchanged fetches
+    - proto_boost: 0-1.0 for SOCKS sources when SOCKS underrepresented
     """
     now = time.time()
     now_int = int(now)
@@ -385,14 +458,19 @@ def claim_urls(url_db, worker_id, count=5):
     list_max_age_seconds = _url_list_max_age_days * 86400
     min_added = now_int - list_max_age_seconds
 
+    # Boost SOCKS sources when protocol pool is imbalanced
+    proto_boost = _get_proto_boost()
+
     try:
         rows = url_db.execute(
             '''SELECT url, content_hash,
                (? - check_time) * 1.0 / MAX(COALESCE(check_interval, 3600), 1)
                + MIN(COALESCE(yield_rate, 0) / 100.0, 1.0)
                + COALESCE(working_ratio, 0) * 0.5
-               - MIN(error * 0.3, 2.0)
+               - MIN(error * 0.5, 4.0)
-               - MIN(stale_count * 0.1, 1.0)
+               - MIN(stale_count * 0.2, 1.5)
+               + CASE WHEN LOWER(url) LIKE '%socks5%' OR LOWER(url) LIKE '%socks4%'
+                      THEN ? ELSE 0 END
                AS score
                FROM uris
                WHERE error < ?
@@ -400,7 +478,7 @@ def claim_urls(url_db, worker_id, count=5):
                AND (added > ? OR proxies_added > 0)
                ORDER BY score DESC
                LIMIT ?''',
-            (now_int, _url_max_fail, now_int, min_added, count * 3)
+            (now_int, proto_boost, _url_max_fail, now_int, min_added, count * 3)
         ).fetchall()
     except Exception as e:
         _log('claim_urls query error: %s' % e, 'error')
@@ -576,7 +654,7 @@ def _update_url_working_ratios(url_working_counts):
     pending_snapshot = dict(_url_pending_counts)
 
     try:
-        url_db = mysqlite.mysqlite(_url_database_path, str)
+        url_db = _get_url_db(_url_database_path)
         for url, working_count in url_working_counts.items():
             pending = pending_snapshot.get(url)
             if not pending or pending['total'] <= 0:
@@ -597,7 +675,6 @@ def _update_url_working_ratios(url_working_counts):
             settled.append(url)
 
         url_db.commit()
-        url_db.close()
     except Exception as e:
         _log('_update_url_working_ratios error: %s' % e, 'error')
 
@@ -664,9 +741,10 @@ def submit_proxy_reports(db, worker_id, proxies):
                               (rec.country_short, proxy_key))
             except Exception:
                 pass
-        if _asndb:
+        asndb = _get_asndb()
+        if asndb:
             try:
-                asn_result = _asndb.lookup(ip)
+                asn_result = asndb.lookup(ip)
                 if asn_result and asn_result[0]:
                     db.execute(
                         'UPDATE proxylist SET asn=? WHERE proxy=?',
@@ -1109,7 +1187,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     def handle_countries(self):
         """Return all countries with proxy counts."""
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             rows = db.execute(
                 'SELECT country, COUNT(*) as c FROM proxylist WHERE failed=0 AND country IS NOT NULL '
                 'GROUP BY country ORDER BY c DESC'
@@ -1128,7 +1206,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     def get_db_stats(self):
         """Get statistics from database."""
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             stats = {}
 
             # Total counts
@@ -1176,7 +1254,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
 
         # Add database stats
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             stats['db'] = self.get_db_stats()
             stats['db_health'] = get_db_health(db)
         except Exception as e:
@@ -1263,7 +1341,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
             args.append(limit)
 
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             rows = db.execute(sql, args).fetchall()
 
             if fmt == 'plain':
@@ -1312,7 +1390,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         sql += ' ORDER BY avg_latency ASC, tested DESC'
 
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             rows = db.execute(sql, args).fetchall()
 
             if fmt == 'plain':
@@ -1329,7 +1407,7 @@ class ProxyAPIHandler(BaseHTTPServer.BaseHTTPRequestHandler):
 
     def handle_count(self):
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             row = db.execute('SELECT COUNT(*) FROM proxylist WHERE failed=0 AND proto IS NOT NULL').fetchone()
             self.send_json({'count': row[0] if row else 0})
         except Exception as e:
@@ -1355,8 +1433,9 @@ class ProxyAPIServer(threading.Thread):
         self.stats_provider = stats_provider
         self.profiling = profiling
         self.daemon = True
-        global _url_database_path
+        global _url_database_path, _proxy_database
         _url_database_path = url_database
+        _proxy_database = database
         self.server = None
         self._stop_event = threading.Event() if not GEVENT_PATCHED else None
         # Load static library files into cache
@@ -1365,12 +1444,12 @@ class ProxyAPIServer(threading.Thread):
         load_static_files(THEME)
         # Load worker registry from disk
         load_workers()
-        # Backfill ASN for existing proxies missing it
-        if _asndb:
+        # Backfill ASN for existing proxies missing it (triggers lazy-load)
+        if _get_asndb():
             self._backfill_asn()
         # Create verification tables if they don't exist
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             create_verification_tables(db)
             _log('verification tables initialized', 'debug')
         except Exception as e:
@@ -1379,7 +1458,7 @@ class ProxyAPIServer(threading.Thread):
     def _backfill_asn(self):
         """One-time backfill of ASN for proxies that have ip but no ASN."""
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             rows = db.execute(
                 'SELECT proxy, ip FROM proxylist WHERE asn IS NULL AND ip IS NOT NULL'
             ).fetchall()
@@ -1388,7 +1467,7 @@ class ProxyAPIServer(threading.Thread):
             updated = 0
             for proxy_key, ip in rows:
                 try:
-                    result = _asndb.lookup(ip)
+                    result = _get_asndb().lookup(ip)
                     if result and result[0]:
                         db.execute('UPDATE proxylist SET asn=? WHERE proxy=?',
                                    (result[0], proxy_key))
@@ -1559,7 +1638,7 @@ class ProxyAPIServer(threading.Thread):
         stats['system'] = get_system_stats()
         # Add database stats
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             stats['db'] = self._get_db_stats(db)
             stats['db_health'] = get_db_health(db)
         except Exception as e:
@@ -1592,7 +1671,7 @@ class ProxyAPIServer(threading.Thread):
 
         # 2. Database stats and health
         try:
-            db = mysqlite.mysqlite(self.database, str)
+            db = _get_db(self.database)
             result['stats']['db'] = self._get_db_stats(db)
             result['stats']['db_health'] = get_db_health(db)
 
@@ -1605,8 +1684,6 @@ class ProxyAPIServer(threading.Thread):
 
             # 4. Workers (same as /api/workers)
             result['workers'] = self._get_workers_data(db)
 
-            db.close()
         except Exception as e:
             _log('api/dashboard db error: %s' % e, 'warn')
             result['countries'] = {}
@@ -1625,7 +1702,7 @@ class ProxyAPIServer(threading.Thread):
                 return json.dumps({'error': 'stats not available'}), 'application/json', 500
         elif path == '/api/countries':
             try:
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 rows = db.execute(
                     'SELECT country, COUNT(*) as c FROM proxylist WHERE failed=0 AND country IS NOT NULL '
                     'GROUP BY country ORDER BY c DESC'
@@ -1637,7 +1714,7 @@ class ProxyAPIServer(threading.Thread):
         elif path == '/api/locations':
             # Return proxy locations aggregated by lat/lon grid (0.5 degree cells)
             try:
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 rows = db.execute(
                     'SELECT ROUND(latitude, 1) as lat, ROUND(longitude, 1) as lon, '
                     'country, anonymity, COUNT(*) as c FROM proxylist '
@@ -1675,7 +1752,7 @@ class ProxyAPIServer(threading.Thread):
                 sql += ' ORDER BY avg_latency ASC, tested DESC LIMIT ?'
                 args.append(limit)
 
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 rows = db.execute(sql, args).fetchall()
 
                 if fmt == 'plain':
@@ -1714,7 +1791,7 @@ class ProxyAPIServer(threading.Thread):
                     sql += ' AND mitm=1'
                 sql += ' ORDER BY avg_latency ASC, tested DESC'
 
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 rows = db.execute(sql, args).fetchall()
 
                 if fmt == 'plain':
@@ -1736,7 +1813,7 @@ class ProxyAPIServer(threading.Thread):
                     sql += ' AND mitm=0'
                 elif mitm_filter == '1':
                     sql += ' AND mitm=1'
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 row = db.execute(sql).fetchone()
                 return json.dumps({'count': row[0] if row else 0}), 'application/json', 200
             except Exception as e:
@@ -1792,9 +1869,8 @@ class ProxyAPIServer(threading.Thread):
         elif path == '/api/workers':
             # List connected workers
             try:
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 workers_data = self._get_workers_data(db)
-                db.close()
                 return json.dumps(workers_data, indent=2), 'application/json', 200
             except Exception as e:
                 _log('api/workers error: %s' % e, 'warn')
@@ -1812,7 +1888,7 @@ class ProxyAPIServer(threading.Thread):
                 return json.dumps({'error': 'url database not configured'}), 'application/json', 500
             count = min(int(query_params.get('count', 5)), 20)
             try:
-                url_db = mysqlite.mysqlite(self.url_database, str)
+                url_db = _get_url_db(self.url_database)
                 urls = claim_urls(url_db, worker_id, count)
                 update_worker_heartbeat(worker_id)
                 return json.dumps({
@@ -1839,7 +1915,7 @@ class ProxyAPIServer(threading.Thread):
             if not reports:
                 return json.dumps({'error': 'no reports provided'}), 'application/json', 400
             try:
-                url_db = mysqlite.mysqlite(self.url_database, str)
+                url_db = _get_url_db(self.url_database)
                 processed = submit_url_reports(url_db, worker_id, reports)
                 update_worker_heartbeat(worker_id)
                 return json.dumps({
@@ -1863,7 +1939,7 @@ class ProxyAPIServer(threading.Thread):
             if not proxies:
                 return json.dumps({'error': 'no proxies provided'}), 'application/json', 400
             try:
-                db = mysqlite.mysqlite(self.database, str)
+                db = _get_db(self.database)
                 processed = submit_proxy_reports(db, worker_id, proxies)
                 update_worker_heartbeat(worker_id)
                 return json.dumps({
@@ -1917,7 +1993,7 @@ class ProxyAPIServer(threading.Thread):
         if not self.url_database:
             return None
         try:
-            db = mysqlite.mysqlite(self.url_database, str)
+            db = _get_url_db(self.url_database)
             stats = {}
             now = int(time.time())
 
rocksock.py
@@ -242,6 +242,7 @@ class Rocksock():
         target = RocksockProxy(host, port, RS_PT_NONE)
         self.proxychain.append(target)
         self.sock = None
+        self._connected = False
         self.timeout = timeout
 
     def _translate_socket_error(self, e, pnum):
@@ -302,15 +303,18 @@ class Rocksock():
         select.select([], [self.sock], [])
         """
 
+        self._connected = True
 
     def disconnect(self):
         if self.sock is None: return
+        if self._connected:
             try:
                 self.sock.shutdown(socket.SHUT_RDWR)
             except socket.error:
                 pass
         self.sock.close()
         self.sock = None
+        self._connected = False
 
     def canread(self):
         return select.select([self.sock], [], [], 0)[0]