fetch: add IPv6, auth proxy, and confidence scoring support
This commit is contained in:
50
dbs.py
50
dbs.py
@@ -56,6 +56,16 @@ def _migrate_geolocation_columns(sqlite):
|
|||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_confidence_column(sqlite):
    """Ensure the ``proxylist`` table has a ``confidence`` column.

    The column is probed with a one-row SELECT.  If the probe raises, the
    column is treated as missing: it is added as ``INT DEFAULT 30`` and the
    change is committed.  If the probe succeeds nothing is modified.

    Args:
        sqlite: Database handle exposing ``execute()`` and ``commit()``.
    """
    try:
        sqlite.execute('SELECT confidence FROM proxylist LIMIT 1')
        return
    except Exception:
        # Probe failed: fall through and add the column.
        pass

    # confidence: 0-100 score indicating extraction reliability
    sqlite.execute('ALTER TABLE proxylist ADD COLUMN confidence INT DEFAULT 30')
    sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
def compute_proxy_list_hash(proxies):
|
def compute_proxy_list_hash(proxies):
|
||||||
"""Compute MD5 hash of sorted proxy list for change detection.
|
"""Compute MD5 hash of sorted proxy list for change detection.
|
||||||
|
|
||||||
@@ -279,12 +289,14 @@ def create_table_if_not_exists(sqlite, dbname):
|
|||||||
exit_ip TEXT,
|
exit_ip TEXT,
|
||||||
asn INT,
|
asn INT,
|
||||||
latitude REAL,
|
latitude REAL,
|
||||||
longitude REAL)""")
|
longitude REAL,
|
||||||
|
confidence INT DEFAULT 30)""")
|
||||||
# Migration: add columns to existing databases (must run before creating indexes)
|
# Migration: add columns to existing databases (must run before creating indexes)
|
||||||
_migrate_latency_columns(sqlite)
|
_migrate_latency_columns(sqlite)
|
||||||
_migrate_anonymity_columns(sqlite)
|
_migrate_anonymity_columns(sqlite)
|
||||||
_migrate_asn_column(sqlite)
|
_migrate_asn_column(sqlite)
|
||||||
_migrate_geolocation_columns(sqlite)
|
_migrate_geolocation_columns(sqlite)
|
||||||
|
_migrate_confidence_column(sqlite)
|
||||||
# Indexes for common query patterns
|
# Indexes for common query patterns
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
||||||
@@ -359,7 +371,10 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
proxydb: Database connection
|
proxydb: Database connection
|
||||||
proxies: List of (address, proto) tuples or plain address strings
|
proxies: List of tuples or plain address strings
|
||||||
|
- (address, proto) - 2-tuple, default confidence
|
||||||
|
- (address, proto, confidence) - 3-tuple with score
|
||||||
|
- address string - default proto and confidence
|
||||||
url: Source URL for logging
|
url: Source URL for logging
|
||||||
"""
|
"""
|
||||||
if not proxies:
|
if not proxies:
|
||||||
@@ -367,17 +382,36 @@ def insert_proxies(proxydb, proxies, url):
|
|||||||
timestamp = int(time.time())
|
timestamp = int(time.time())
|
||||||
rows = []
|
rows = []
|
||||||
for p in proxies:
|
for p in proxies:
|
||||||
# Handle both tuple (address, proto) and plain string formats
|
# Handle tuple (address, proto[, confidence]) and plain string formats
|
||||||
|
confidence = 30 # Default confidence (CONFIDENCE_REGEX)
|
||||||
if isinstance(p, tuple):
|
if isinstance(p, tuple):
|
||||||
addr, proto = p
|
if len(p) >= 3:
|
||||||
|
addr, proto, confidence = p[0], p[1], p[2]
|
||||||
|
else:
|
||||||
|
addr, proto = p[0], p[1]
|
||||||
else:
|
else:
|
||||||
addr, proto = p, None
|
addr, proto = p, None
|
||||||
ip, port = addr.split(':')
|
|
||||||
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0))
|
# Parse address into ip and port
|
||||||
|
# Formats: ip:port, [ipv6]:port, user:pass@ip:port, user:pass@[ipv6]:port
|
||||||
|
addr_part = addr.split('@')[-1] # Strip auth if present
|
||||||
|
|
||||||
|
if addr_part.startswith('['):
|
||||||
|
# IPv6: [ipv6]:port
|
||||||
|
bracket_end = addr_part.find(']')
|
||||||
|
if bracket_end < 0:
|
||||||
|
continue
|
||||||
|
ip = addr_part[:bracket_end + 1] # Include brackets
|
||||||
|
port = addr_part[bracket_end + 2:] # Skip ]:
|
||||||
|
else:
|
||||||
|
# IPv4: ip:port
|
||||||
|
ip, port = addr_part.rsplit(':', 1)
|
||||||
|
|
||||||
|
rows.append((timestamp, addr, ip, port, proto, 1, 0, 0, 0, 0, 0, confidence))
|
||||||
proxydb.executemany(
|
proxydb.executemany(
|
||||||
'INSERT OR IGNORE INTO proxylist '
|
'INSERT OR IGNORE INTO proxylist '
|
||||||
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success) '
|
'(added,proxy,ip,port,proto,failed,tested,success_count,total_duration,mitm,consecutive_success,confidence) '
|
||||||
'VALUES (?,?,?,?,?,?,?,?,?,?,?)',
|
'VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
|
||||||
rows
|
rows
|
||||||
)
|
)
|
||||||
proxydb.commit()
|
proxydb.commit()
|
||||||
|
|||||||
564
fetch.py
564
fetch.py
@@ -1,4 +1,5 @@
|
|||||||
import re, random, time, string
|
import re, random, time, string
|
||||||
|
import json
|
||||||
import threading
|
import threading
|
||||||
import rocksock
|
import rocksock
|
||||||
import network_stats
|
import network_stats
|
||||||
@@ -134,6 +135,339 @@ cleanhtml_re = [
|
|||||||
# Proxy extraction pattern: IP:PORT followed by non-digit or end
|
# Proxy extraction pattern: IP:PORT followed by non-digit or end
|
||||||
# Pattern: 1-3 digits, dot, repeated 3 times, colon, 2-5 digit port
|
# Pattern: 1-3 digits, dot, repeated 3 times, colon, 2-5 digit port
|
||||||
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]')
|
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]')
|
||||||
|
|
||||||
|
# IPv6 proxy pattern: [ipv6]:port
# IPv6 can contain hex digits and colons, enclosed in brackets for URL format
# Captures: (ipv6, port)
IPV6_PROXY_PATTERN = re.compile(
    r'\[([0-9a-fA-F:]+)\]:([0-9]{2,5})'
)

# Auth proxy pattern: user:pass@IP:PORT or proto://user:pass@IP:PORT
# Captures: (proto, user, pass, ip, port); proto is None when no scheme given.
# 'https?' already covers plain 'http', so no separate alternative is needed.
AUTH_PROXY_PATTERN = re.compile(
    r'(?:(socks5|socks4a?|https?|ssl|tor)://)?'     # optional protocol
    r'([a-zA-Z0-9._-]+):([a-zA-Z0-9._-]+)@'         # user:pass@
    r'([0-9]+(?:\.[0-9]+){3}):([0-9]{2,5})',        # ip:port
    re.IGNORECASE
)

# IPv6 auth pattern: user:pass@[ipv6]:port
# Captures: (proto, user, pass, ipv6, port)
AUTH_IPV6_PATTERN = re.compile(
    r'(?:(socks5|socks4a?|https?|ssl|tor)://)?'     # optional protocol
    r'([a-zA-Z0-9._-]+):([a-zA-Z0-9._-]+)@'         # user:pass@
    r'\[([0-9a-fA-F:]+)\]:([0-9]{2,5})',            # [ipv6]:port
    re.IGNORECASE
)

# Protocol hint patterns - look for protocol keywords near IP:PORT
# Captures: (proto, ip:port)
# The lookbehind stops a keyword from matching as the tail of a longer word
# (e.g. the "tor" in "Editor: 1.2.3.4:8080" is not a protocol hint).
PROTO_HINT_PATTERN = re.compile(
    r'(?<![a-z0-9])'
    r'(socks5|socks4a?|https?|connect|ssl|tor)[:\-_\s]*'
    r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})',
    re.IGNORECASE
)
# Captures: (ip:port, proto)
# The lookahead stops a keyword from matching as the head of a longer word
# (e.g. the "tor" in "1.2.3.4:8080 torrent" is not a protocol hint).
PROTO_HINT_REVERSE = re.compile(
    r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[:\-_\|,\s]*'
    r'(socks5|socks4a?|https?|connect|ssl|tor)'
    r'(?![a-z0-9])',
    re.IGNORECASE
)
|
||||||
|
|
||||||
|
# Keys under which JSON proxy feeds commonly publish each attribute.
# Lookups against these tuples are done on lower-cased key names.
JSON_IP_FIELDS = (
    'ip', 'host', 'address', 'addr', 'server', 'proxy_address',
)
JSON_PORT_FIELDS = (
    'port', 'proxy_port',
)
JSON_PROTO_FIELDS = (
    'type', 'protocol', 'proto', 'scheme', 'proxy_type',
)
JSON_USER_FIELDS = (
    'user', 'username', 'login', 'usr',
)
JSON_PASS_FIELDS = (
    'pass', 'password', 'pwd', 'passwd',
)

# Base confidence per extraction method (higher = more reliable source).
CONFIDENCE_AUTH = 90       # Authenticated proxy (usually paid sources)
CONFIDENCE_JSON = 80       # JSON API with structured fields
CONFIDENCE_TABLE = 70      # HTML table with columns
CONFIDENCE_HINT = 60       # Protocol hint in surrounding text
CONFIDENCE_URL_PROTO = 50  # Protocol inferred from URL path
CONFIDENCE_REGEX = 30      # Raw regex extraction

# Bonus added on top of the base score when a protocol is known.
CONFIDENCE_PROTO_EXPLICIT = 15  # Protocol explicitly stated
CONFIDENCE_PROTO_INFERRED = 5   # Protocol from URL path
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_proto(proto_str):
    """Map a protocol keyword onto 'socks5', 'socks4' or 'http'.

    Matching ignores case and surrounding whitespace.

    Args:
        proto_str: Protocol keyword as found in the source (may be empty/None).

    Returns:
        The canonical protocol name, or None when the input is empty or the
        keyword is not recognised.
    """
    if not proto_str:
        return None

    aliases = {
        'socks5': 'socks5',
        's5': 'socks5',
        'tor': 'socks5',
        'socks4': 'socks4',
        'socks4a': 'socks4',
        's4': 'socks4',
        'http': 'http',
        'https': 'http',
        'connect': 'http',
        'ssl': 'http',
    }
    return aliases.get(proto_str.lower().strip())
|
||||||
|
|
||||||
|
|
||||||
|
def extract_auth_proxies(content):
    """Collect credential-bearing proxies found in ``content``.

    Recognised forms:
      - user:pass@1.2.3.4:8080
      - socks5://user:pass@1.2.3.4:8080
      - http://user:pass@1.2.3.4:8080
      - user:pass@[2001:db8::1]:8080 (IPv6)

    IPv4 octets and the port are re-rendered as integers, which drops
    leading zeros.  IPv6 candidates are kept only when ``is_valid_ipv6``
    accepts the address.  All IPv4 results precede all IPv6 results.

    Returns:
        List of (address, proto) tuples; address is ``user:pass@ip:port``
        or ``user:pass@[ipv6]:port`` and proto is the normalized scheme
        (None when no scheme prefix was present).
    """
    found = []

    # IPv4 candidates
    for hit in AUTH_PROXY_PATTERN.finditer(content):
        scheme, user, passwd, host, port_text = hit.groups()
        proto = _normalize_proto(scheme)
        octets = [str(int(part)) for part in host.split('.')]
        address = '%s:%s@%s:%d' % (user, passwd, '.'.join(octets), int(port_text))
        found.append((address, proto))

    # IPv6 candidates
    for hit in AUTH_IPV6_PATTERN.finditer(content):
        scheme, user, passwd, host, port_text = hit.groups()
        proto = _normalize_proto(scheme)
        port = int(port_text)
        if is_valid_ipv6(host):
            found.append(('%s:%s@[%s]:%d' % (user, passwd, host, port), proto))

    return found
|
||||||
|
|
||||||
|
|
||||||
|
# Keywords looked for (as substrings of the lower-cased header text) when
# deciding which table column holds the IP, the port and the protocol.
TABLE_IP_HEADERS = (
    'ip', 'address', 'host', 'server', 'proxy',
)
TABLE_PORT_HEADERS = (
    'port',
)
TABLE_PROTO_HEADERS = (
    'type', 'protocol', 'proto', 'scheme',
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_proxies_from_table(content):
|
||||||
|
"""Extract proxies from HTML tables with IP/Port/Protocol columns.
|
||||||
|
|
||||||
|
Handles tables like:
|
||||||
|
| IP Address | Port | Type |
|
||||||
|
|------------|------|---------|
|
||||||
|
| 1.2.3.4 | 8080 | SOCKS5 |
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of (address, proto) tuples
|
||||||
|
"""
|
||||||
|
proxies = []
|
||||||
|
|
||||||
|
# Simple regex-based table parsing (works without BeautifulSoup)
|
||||||
|
# Find all tables
|
||||||
|
table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
|
||||||
|
row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
||||||
|
cell_pattern = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
|
||||||
|
tag_strip = re.compile(r'<[^>]+>')
|
||||||
|
|
||||||
|
for table_match in table_pattern.finditer(content):
|
||||||
|
table_html = table_match.group(1)
|
||||||
|
rows = row_pattern.findall(table_html)
|
||||||
|
if not rows:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse header row to find column indices
|
||||||
|
ip_col = port_col = proto_col = -1
|
||||||
|
header_row = rows[0]
|
||||||
|
headers = cell_pattern.findall(header_row)
|
||||||
|
|
||||||
|
for i, cell in enumerate(headers):
|
||||||
|
cell_text = tag_strip.sub('', cell).strip().lower()
|
||||||
|
if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
|
||||||
|
ip_col = i
|
||||||
|
elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
|
||||||
|
port_col = i
|
||||||
|
elif proto_col < 0 and any(h in cell_text for h in TABLE_PROTO_HEADERS):
|
||||||
|
proto_col = i
|
||||||
|
|
||||||
|
# Need at least IP column (port might be in same cell)
|
||||||
|
if ip_col < 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse data rows
|
||||||
|
for row in rows[1:]:
|
||||||
|
cells = cell_pattern.findall(row)
|
||||||
|
if len(cells) <= ip_col:
|
||||||
|
continue
|
||||||
|
|
||||||
|
ip_cell = tag_strip.sub('', cells[ip_col]).strip()
|
||||||
|
|
||||||
|
# Check if IP cell contains port (ip:port format)
|
||||||
|
if ':' in ip_cell and port_col < 0:
|
||||||
|
match = re.match(r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)', ip_cell)
|
||||||
|
if match:
|
||||||
|
ip, port = match.groups()
|
||||||
|
proto = None
|
||||||
|
if proto_col >= 0 and len(cells) > proto_col:
|
||||||
|
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
||||||
|
addr = '%s:%s' % (ip, port)
|
||||||
|
if is_usable_proxy(addr):
|
||||||
|
proxies.append((addr, proto))
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Separate IP and Port columns
|
||||||
|
if port_col >= 0 and len(cells) > port_col:
|
||||||
|
port_cell = tag_strip.sub('', cells[port_col]).strip()
|
||||||
|
try:
|
||||||
|
port = int(port_cell)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Validate IP format
|
||||||
|
if not re.match(r'^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$', ip_cell):
|
||||||
|
continue
|
||||||
|
|
||||||
|
proto = None
|
||||||
|
if proto_col >= 0 and len(cells) > proto_col:
|
||||||
|
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
||||||
|
|
||||||
|
addr = '%s:%d' % (ip_cell, port)
|
||||||
|
if is_usable_proxy(addr):
|
||||||
|
proxies.append((addr, proto))
|
||||||
|
|
||||||
|
return proxies
|
||||||
|
|
||||||
|
|
||||||
|
def extract_proxies_from_json(content):
    """Extract proxies from JSON content.

    Handles common JSON formats, whether the content is pure JSON or JSON
    embedded in other text (e.g. inside an HTML page):
      - Array of objects: [{"ip": "1.2.3.4", "port": 8080}, ...]
      - Array of strings: ["1.2.3.4:8080", ...]
      - Object with data array: {"data": [...], "proxies": [...]}
      - Nested structures with ip/host/port/protocol fields

    Every '[' or '{' is tried as the start of a JSON value using
    ``JSONDecoder.raw_decode``, which consumes exactly one complete value.
    Nested arrays and objects are therefore decoded as a whole, and a
    successfully decoded span is not scanned again (its inner values are
    walked by ``_extract_from_json_data``).  When decoding fails, scanning
    resumes at the next character so inner fragments are still tried.

    Args:
        content: Text that may contain JSON.

    Returns:
        List of (address, proto) tuples
    """
    proxies = []
    decoder = json.JSONDecoder()
    container_start = re.compile(r'[\[{]')

    pos = 0
    while True:
        opener = container_start.search(content, pos)
        if opener is None:
            break
        start = opener.start()

        try:
            data, end = decoder.raw_decode(content, start)
        except (ValueError, RuntimeError):
            # ValueError: not valid JSON from this position.
            # RuntimeError: covers recursion-depth errors on pathologically
            # deep nesting.  Either way, try the next candidate start.
            pos = start + 1
            continue

        # Skip past the decoded value; its children are handled recursively.
        pos = end
        try:
            proxies.extend(_extract_from_json_data(data))
        except (ValueError, TypeError):
            # Best effort: a malformed record must not abort the whole scan.
            continue

    return proxies
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_from_json_data(data, parent_proto=None):
    """Recursively extract proxies from parsed JSON data.

    Lists are walked element by element: nested lists/dicts are recursed
    into, and strings of the form ``ip:port`` or ``user:pass@ip:port`` are
    taken as proxies with the inherited protocol.

    Dicts are inspected for ip/port/protocol/user/password fields (key names
    are matched case-insensitively against the JSON_*_FIELDS tuples, in the
    order those tuples list them, so the result does not depend on dict
    ordering).  A protocol field only overrides the inherited protocol when
    its value is a recognised protocol; values such as an anonymity level in
    a "type" field leave the inherited protocol untouched.  Every list/dict
    value of the dict is then recursed into with the resulting protocol.

    Args:
        data: Parsed JSON value (any type; non-containers yield nothing).
        parent_proto: Protocol inherited from an enclosing object, or None.

    Returns:
        List of (address, proto) tuples where address may include auth
        credentials as user:pass@ip:port.  A bare IPv6 host is wrapped in
        brackets so the port stays unambiguous.
    """
    try:
        text_types = basestring  # Python 2: covers str and unicode
    except NameError:
        text_types = str  # Python 3

    proxies = []

    if isinstance(data, list):
        for item in data:
            if isinstance(item, (dict, list)):
                proxies.extend(_extract_from_json_data(item, parent_proto))
            elif isinstance(item, text_types):
                # Try to parse as IP:PORT or user:pass@IP:PORT string
                item = item.strip()
                if re.match(r'^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+$', item):
                    proxies.append((item, parent_proto))
                elif re.match(r'^[^:]+:[^@]+@[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+$', item):
                    proxies.append((item, parent_proto))
        return proxies

    if not isinstance(data, dict):
        return proxies

    # Index values by lower-cased key for case-insensitive field lookup.
    fields = {}
    for key, value in data.items():
        if isinstance(key, text_types):
            fields[key.lower()] = value

    def first_text(names):
        # First non-blank string value among the candidate field names.
        for name in names:
            value = fields.get(name)
            if isinstance(value, text_types) and value.strip():
                return value.strip()
        return None

    ip = first_text(JSON_IP_FIELDS)
    user = first_text(JSON_USER_FIELDS)
    passwd = first_text(JSON_PASS_FIELDS)

    port = None
    for name in JSON_PORT_FIELDS:
        if name not in fields:
            continue
        try:
            port = int(fields[name])
            break
        except (ValueError, TypeError, OverflowError):
            # Unparseable port (text, null, Infinity): try the next field.
            continue

    proto = parent_proto
    for name in JSON_PROTO_FIELDS:
        value = fields.get(name)
        if isinstance(value, text_types):
            detected = _normalize_proto(value)
            if detected:
                proto = detected
                break

    if ip and port:
        # A bare IPv6 literal needs brackets, otherwise its colons cannot be
        # told apart from the port separator.
        if ':' in ip and '.' not in ip and not ip.startswith('['):
            ip = '[%s]' % ip
        if user and passwd:
            addr = '%s:%s@%s:%d' % (user, passwd, ip, port)
        else:
            addr = '%s:%d' % (ip, port)
        proxies.append((addr, proto))

    # Check for nested containers (data, proxies, list, items, etc.)
    for value in data.values():
        if isinstance(value, (list, dict)):
            proxies.extend(_extract_from_json_data(value, proto))

    return proxies
|
||||||
|
|
||||||
|
|
||||||
|
def extract_proxies_with_hints(content):
    """Map IP:PORT strings to a protocol named next to them in the text.

    Examples of what is recognised:
      - "socks5 1.2.3.4:8080"
      - "1.2.3.4:8080 (http)"
      - "SOCKS5: 1.2.3.4:8080"
      - Table rows with protocol in adjacent column

    A keyword placed before the address takes precedence over one placed
    after it; among several "before" hits for the same address the last one
    wins, among several "after" hits the first one wins.

    Returns:
        Dict mapping address -> normalized proto.  Addresses are the raw
        matched text; addresses without a usable hint are not included.
    """
    proto_by_addr = {}

    # Keyword in front of the address.
    for hit in PROTO_HINT_PATTERN.finditer(content):
        keyword, address = hit.groups()
        normalized = _normalize_proto(keyword)
        if normalized:
            proto_by_addr[address] = normalized

    # Keyword behind the address; never overrides an existing entry.
    for hit in PROTO_HINT_REVERSE.finditer(content):
        address, keyword = hit.groups()
        normalized = _normalize_proto(keyword)
        if normalized:
            proto_by_addr.setdefault(address, normalized)

    return proto_by_addr
|
||||||
|
|
||||||
|
|
||||||
def cleanhtml(raw_html):
|
def cleanhtml(raw_html):
|
||||||
html = raw_html.replace(' ', ' ')
|
html = raw_html.replace(' ', ' ')
|
||||||
html = re.sub(cleanhtml_re[0], ':', html)
|
html = re.sub(cleanhtml_re[0], ':', html)
|
||||||
@@ -219,25 +553,90 @@ def valid_port(port):
|
|||||||
return port >= 1 and port <= 65535
|
return port >= 1 and port <= 65535
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_ipv6_groups(addr):
    """Return the eight 16-bit groups of a hex-only IPv6 literal, or None."""
    if '::' in addr:
        parts = addr.split('::')
        if len(parts) != 2:
            return None  # more than one '::'
        head = parts[0].split(':') if parts[0] else []
        tail = parts[1].split(':') if parts[1] else []
        missing = 8 - (len(head) + len(tail))
        if missing < 1:
            return None  # '::' must stand for at least one group
        groups = head + ['0'] * missing + tail
    else:
        groups = addr.split(':')
        if len(groups) != 8:
            return None

    values = []
    for group in groups:
        # Each group is 1-4 hex digits; empty groups come from stray colons.
        if not 1 <= len(group) <= 4:
            return None
        values.append(int(group, 16))
    return values


def is_valid_ipv6(addr):
    """Validate an unbracketed IPv6 address and reject unusable ranges.

    Only the plain hexadecimal notation is accepted (no zone ids, no dotted
    IPv4 suffix).  The address is actually parsed into its eight groups, so
    the range checks work on numeric values rather than on text prefixes
    and apply to every spelling of an address (e.g. ``0::1`` is loopback).

    Rejects:
      - Malformed addresses (bad characters, wrong group count, groups
        longer than four hex digits, more than one '::')
      - Loopback (::1)
      - Link-local (fe80::/10)
      - Unique local (fc00::/7)
      - Multicast (ff00::/8)
      - Unspecified (::)

    Args:
        addr: IPv6 address text without brackets.

    Returns:
        True when the address is well-formed and outside the ranges above.
    """
    # Basic format check - only hex digits and colons
    if not re.match(r'^[0-9a-fA-F:]+$', addr):
        return False

    groups = _parse_ipv6_groups(addr)
    if groups is None:
        return False

    # Unspecified (::) and loopback (::1)
    if not any(groups[:7]) and groups[7] in (0, 1):
        return False

    first = groups[0]

    # Link-local fe80::/10
    if first & 0xffc0 == 0xfe80:
        return False

    # Unique local fc00::/7 (fc00:: - fdff::)
    if first & 0xfe00 == 0xfc00:
        return False

    # Multicast ff00::/8
    if first & 0xff00 == 0xff00:
        return False

    return True
|
||||||
|
|
||||||
|
|
||||||
def is_usable_proxy(proxy):
|
def is_usable_proxy(proxy):
|
||||||
"""Validate proxy string format and reject unusable addresses.
|
"""Validate proxy string format and reject unusable addresses.
|
||||||
|
|
||||||
|
Accepts formats:
|
||||||
|
- ip:port (IPv4)
|
||||||
|
- [ipv6]:port (IPv6)
|
||||||
|
- user:pass@ip:port
|
||||||
|
- user:pass@[ipv6]:port
|
||||||
|
|
||||||
Rejects:
|
Rejects:
|
||||||
- Malformed strings (not ip:port format)
|
- Malformed strings
|
||||||
- Invalid port (0, >65535)
|
- Invalid port (0, >65535)
|
||||||
- Invalid IP octets (>255)
|
- Private/reserved ranges
|
||||||
- Private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
|
|
||||||
- Loopback: 127.0.0.0/8
|
|
||||||
- Link-local: 169.254.0.0/16
|
|
||||||
- CGNAT: 100.64.0.0/10
|
|
||||||
- Multicast: 224.0.0.0/4
|
|
||||||
- Reserved: 240.0.0.0/4
|
|
||||||
- Unspecified: 0.0.0.0
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if ':' not in proxy:
|
if ':' not in proxy:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Strip auth credentials if present (user:pass@ip:port -> ip:port)
|
||||||
|
if '@' in proxy:
|
||||||
|
proxy = proxy.split('@', 1)[1]
|
||||||
|
|
||||||
|
# Check for IPv6 format: [ipv6]:port
|
||||||
|
if proxy.startswith('['):
|
||||||
|
match = re.match(r'^\[([^\]]+)\]:(\d+)$', proxy)
|
||||||
|
if not match:
|
||||||
|
return False
|
||||||
|
ipv6_addr, port_str = match.groups()
|
||||||
|
port = int(port_str)
|
||||||
|
if not valid_port(port):
|
||||||
|
return False
|
||||||
|
return is_valid_ipv6(ipv6_addr)
|
||||||
|
|
||||||
|
# IPv4 format: ip:port
|
||||||
ip, port_str = proxy.rsplit(':', 1)
|
ip, port_str = proxy.rsplit(':', 1)
|
||||||
port = int(port_str)
|
port = int(port_str)
|
||||||
|
|
||||||
@@ -348,33 +747,160 @@ def detect_proto_from_path(url):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_proxy_addr(addr):
    """Normalize proxy address, handling auth and IPv6 formats.

    Formats:
      - ip:port
      - user:pass@ip:port
      - [ipv6]:port
      - user:pass@[ipv6]:port

    Leading zeros are removed from IPv4 octets and from the port.  The
    credentials (everything before the last '@') and the IPv6 text are
    passed through unchanged.

    Args:
        addr: Proxy address in one of the formats above.

    Returns:
        The normalized address, or None if the input is invalid: no port,
        unbalanced IPv6 brackets, an IPv4 host that is not four decimal
        octets in 0-255, or a port that is not a decimal number <= 65535.
    """
    auth_prefix = ''
    if '@' in addr:
        # Split on the LAST '@' so a password may itself contain '@'.
        auth_prefix, addr = addr.rsplit('@', 1)
        auth_prefix += '@'

    if ':' not in addr:
        return None

    # Plain ASCII digits only; int() alone would also accept signs,
    # surrounding whitespace and similar non-address text.
    digits_only = re.compile(r'[0-9]+\Z').match

    try:
        if addr.startswith('['):
            # IPv6 format: [ipv6]:port
            match = re.match(r'^\[([^\]]+)\]:(\d+)$', addr)
            if not match:
                return None
            ipv6, port_text = match.groups()
            host = '[%s]' % ipv6
        else:
            # IPv4 format: ip:port
            host_text, port_text = addr.rsplit(':', 1)
            octets = host_text.split('.')
            if len(octets) != 4:
                return None
            if not all(digits_only(octet) for octet in octets):
                return None
            values = [int(octet) for octet in octets]
            if max(values) > 255:
                return None
            host = '.'.join(str(value) for value in values)

        if not digits_only(port_text):
            return None
        port = int(port_text)
    except ValueError:
        # e.g. an absurdly long digit string that int() refuses to convert
        return None

    if port > 65535:
        return None

    return '%s%s:%d' % (auth_prefix, host, port)
|
||||||
|
|
||||||
|
|
||||||
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
||||||
"""Extract and normalize proxy addresses from content.
|
"""Extract and normalize proxy addresses from content.
|
||||||
|
|
||||||
|
Uses multiple extraction methods (in priority order):
|
||||||
|
1. Authenticated proxy patterns (user:pass@ip:port)
|
||||||
|
2. JSON parsing for API responses
|
||||||
|
3. HTML table parsing with IP/Port/Protocol columns
|
||||||
|
4. Protocol hints from surrounding text
|
||||||
|
5. Regex extraction for raw IP:PORT patterns
|
||||||
|
6. IPv6 regex extraction
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
content: HTML/text content to parse
|
content: HTML/text content to parse
|
||||||
proxydb: Database connection for known proxy lookup (optional)
|
proxydb: Database connection for known proxy lookup (optional)
|
||||||
filter_known: If True, filter out known proxies and return new only
|
filter_known: If True, filter out known proxies and return new only
|
||||||
proto: Protocol to assign to all extracted proxies (from source URL)
|
proto: Protocol from source URL (fallback if not detected)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
If filter_known: (unique_count, new_proxies) tuple
|
If filter_known: (unique_count, new_proxies) tuple
|
||||||
new_proxies is list of (address, proto) tuples
|
new_proxies is list of (address, proto, confidence) tuples
|
||||||
If not filter_known: list of (address, proto) tuples
|
If not filter_known: list of (address, proto, confidence) tuples
|
||||||
"""
|
"""
|
||||||
matches = PROXY_PATTERN.findall(cleanhtml(content))
|
# Dict: address -> (protocol, confidence)
|
||||||
|
# Higher confidence wins; explicit proto upgrades confidence
|
||||||
|
found = {}
|
||||||
|
|
||||||
uniques_dict = {}
|
# 1. Extract authenticated proxies first (highest confidence)
|
||||||
|
auth_proxies = extract_auth_proxies(content)
|
||||||
|
for addr, detected_proto in auth_proxies:
|
||||||
|
if is_usable_proxy(addr):
|
||||||
|
addr = _normalize_proxy_addr(addr)
|
||||||
|
if addr:
|
||||||
|
conf = CONFIDENCE_AUTH
|
||||||
|
if detected_proto:
|
||||||
|
conf += CONFIDENCE_PROTO_EXPLICIT
|
||||||
|
if addr not in found or conf > found[addr][1]:
|
||||||
|
found[addr] = (detected_proto, conf)
|
||||||
|
|
||||||
|
# 2. Try JSON extraction (reliable for protocol info)
|
||||||
|
json_proxies = extract_proxies_from_json(content)
|
||||||
|
for addr, detected_proto in json_proxies:
|
||||||
|
if is_usable_proxy(addr):
|
||||||
|
addr = _normalize_proxy_addr(addr)
|
||||||
|
if addr:
|
||||||
|
conf = CONFIDENCE_JSON
|
||||||
|
if detected_proto:
|
||||||
|
conf += CONFIDENCE_PROTO_EXPLICIT
|
||||||
|
if addr not in found or conf > found[addr][1]:
|
||||||
|
found[addr] = (detected_proto, conf)
|
||||||
|
|
||||||
|
# 3. Try HTML table extraction (structured data with protocol columns)
|
||||||
|
table_proxies = extract_proxies_from_table(content)
|
||||||
|
for addr, detected_proto in table_proxies:
|
||||||
|
if is_usable_proxy(addr):
|
||||||
|
addr = _normalize_proxy_addr(addr)
|
||||||
|
if addr:
|
||||||
|
conf = CONFIDENCE_TABLE
|
||||||
|
if detected_proto:
|
||||||
|
conf += CONFIDENCE_PROTO_EXPLICIT
|
||||||
|
if addr not in found or conf > found[addr][1]:
|
||||||
|
found[addr] = (detected_proto, conf)
|
||||||
|
|
||||||
|
# 4. Get protocol hints from content
|
||||||
|
hints = extract_proxies_with_hints(content)
|
||||||
|
|
||||||
|
# 5. Regex extraction for remaining IPv4 proxies (no auth)
|
||||||
|
matches = PROXY_PATTERN.findall(cleanhtml(content))
|
||||||
for p in matches:
|
for p in matches:
|
||||||
ip, port = p.split(':')
|
ip, port = p.split(':')
|
||||||
# Normalize IP (remove leading zeros from octets)
|
# Normalize IP (remove leading zeros from octets)
|
||||||
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
||||||
# Normalize port (remove leading zeros, handle empty case)
|
# Normalize port (remove leading zeros, handle empty case)
|
||||||
port = int(port.lstrip('0') or '0')
|
port = int(port.lstrip('0') or '0')
|
||||||
p = '%s:%s' % (ip, port)
|
addr = '%s:%d' % (ip, port)
|
||||||
uniques_dict[p] = True
|
|
||||||
|
|
||||||
uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]
|
if not is_usable_proxy(addr):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if addr not in found:
|
||||||
|
# Check for protocol hint
|
||||||
|
detected_proto = hints.get(p) or hints.get(addr)
|
||||||
|
if detected_proto:
|
||||||
|
conf = CONFIDENCE_HINT + CONFIDENCE_PROTO_EXPLICIT
|
||||||
|
else:
|
||||||
|
conf = CONFIDENCE_REGEX
|
||||||
|
found[addr] = (detected_proto, conf)
|
||||||
|
|
||||||
|
# 6. Regex extraction for IPv6 proxies [ipv6]:port
|
||||||
|
for match in IPV6_PROXY_PATTERN.finditer(content):
|
||||||
|
ipv6, port = match.groups()
|
||||||
|
port = int(port)
|
||||||
|
|
||||||
|
if not is_valid_ipv6(ipv6):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not valid_port(port):
|
||||||
|
continue
|
||||||
|
|
||||||
|
addr = '[%s]:%d' % (ipv6, port)
|
||||||
|
if addr not in found:
|
||||||
|
found[addr] = (None, CONFIDENCE_REGEX)
|
||||||
|
|
||||||
|
# Build result list with protocol and confidence
|
||||||
|
# Protocol priority: detected > URL-based > None
|
||||||
|
uniques = []
|
||||||
|
for addr in found:
|
||||||
|
detected_proto, conf = found[addr]
|
||||||
|
final_proto = detected_proto if detected_proto else proto
|
||||||
|
# Add URL proto bonus if proto was inferred from path
|
||||||
|
if not detected_proto and proto:
|
||||||
|
conf += CONFIDENCE_PROTO_INFERRED
|
||||||
|
uniques.append((addr, final_proto, conf))
|
||||||
|
|
||||||
if not filter_known:
|
if not filter_known:
|
||||||
return uniques
|
return uniques
|
||||||
@@ -384,9 +910,9 @@ def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
|||||||
init_known_proxies(proxydb)
|
init_known_proxies(proxydb)
|
||||||
|
|
||||||
new = []
|
new = []
|
||||||
for p, pr in uniques:
|
for p, pr, conf in uniques:
|
||||||
if not is_known_proxy(p):
|
if not is_known_proxy(p):
|
||||||
new.append((p, pr))
|
new.append((p, pr, conf))
|
||||||
add_known_proxies([p])
|
add_known_proxies([p])
|
||||||
|
|
||||||
return len(uniques), new
|
return len(uniques), new
|
||||||
|
|||||||
Reference in New Issue
Block a user