fetch: detect proxy protocol from source URL path
- detect_proto_from_path() infers socks4/socks5/http from the source URL
- extract_proxies() now returns (address, proto) tuples
- ppf.py updated to handle protocol-tagged proxies
- profiler: signal handler for SIGTERM stats dump
This commit is contained in:
40
fetch.py
40
fetch.py
@@ -191,17 +191,47 @@ def is_known_proxy(proxy):
|
||||
"""Check if proxy is in known cache."""
|
||||
return proxy in _known_proxies
|
||||
|
||||
def extract_proxies(content, proxydb=None, filter_known=True):
|
||||
def detect_proto_from_path(url):
    """Detect proxy protocol from a source URL path.

    Many proxy lists indicate their protocol in the URL path itself:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Note: the ``http`` markers are anchored with a separator character
    ('/', '-', '_', '.') so the ``http://`` scheme prefix of an ordinary
    URL does not by itself classify the list as HTTP proxies.

    Args:
        url: Source URL path or full URL.

    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not detected.
    """
    url_lower = url.lower()
    # socks5 is checked before socks4 so 'socks5' never falls through
    # to the 'socks4' substring test.
    if 'socks5' in url_lower:
        return 'socks5'
    # 'socks4' also matches socks4a, which is intentional.
    if 'socks4' in url_lower:
        return 'socks4'
    # http / https / ssl / connect indicators, each bounded by a
    # path or word separator to avoid matching the URL scheme.
    if any(x in url_lower for x in ('/http', 'http-', 'http_', 'http.', '/https', '/ssl', '/connect')):
        return 'http'
    return None
|
||||
|
||||
|
||||
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
||||
"""Extract and normalize proxy addresses from content.
|
||||
|
||||
Args:
|
||||
content: HTML/text content to parse
|
||||
proxydb: Database connection for known proxy lookup (optional)
|
||||
filter_known: If True, filter out known proxies and return new only
|
||||
proto: Protocol to assign to all extracted proxies (from source URL)
|
||||
|
||||
Returns:
|
||||
If filter_known: (unique_count, new_proxies) tuple
|
||||
If not filter_known: list of all unique valid proxies
|
||||
new_proxies is list of (address, proto) tuples
|
||||
If not filter_known: list of (address, proto) tuples
|
||||
"""
|
||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
||||
|
||||
@@ -215,7 +245,7 @@ def extract_proxies(content, proxydb=None, filter_known=True):
|
||||
p = '%s:%s' % (ip, port)
|
||||
uniques_dict[p] = True
|
||||
|
||||
uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
|
||||
uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]
|
||||
|
||||
if not filter_known:
|
||||
return uniques
|
||||
@@ -225,9 +255,9 @@ def extract_proxies(content, proxydb=None, filter_known=True):
|
||||
init_known_proxies(proxydb)
|
||||
|
||||
new = []
|
||||
for p in uniques:
|
||||
for p, pr in uniques:
|
||||
if not is_known_proxy(p):
|
||||
new.append(p)
|
||||
new.append((p, pr))
|
||||
add_known_proxies([p])
|
||||
|
||||
return len(uniques), new
|
||||
|
||||
Reference in New Issue
Block a user