import re, random, time, string
import json
import threading
from collections import OrderedDict

import rocksock
import network_stats

from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log, tor_proxy_url

# Module configuration object; populated by set_config() before use.
config = None

# LRU cache for is_usable_proxy() - avoids repeated validation of same proxy strings.
# Uses OrderedDict to maintain insertion order; oldest entries evicted when full.
_proxy_valid_cache = OrderedDict()
_proxy_valid_cache_max = 10000
_proxy_valid_cache_lock = threading.Lock()


class FetchSession(object):
    """Reusable fetch session with persistent Tor circuit.

    Maintains HTTP connection and Tor credentials across multiple requests.
    Call cycle() to get a new Tor circuit when blocked.
    """

    def __init__(self):
        self.http = None
        self.current_host = None
        self.current_port = None
        self.current_ssl = None
        self.tor_url = None
        self._new_circuit()

    def _new_circuit(self):
        """Generate new Tor credentials for a fresh circuit."""
        if config and config.torhosts:
            torhost = random.choice(config.torhosts)
            self.tor_url = tor_proxy_url(torhost)

    def cycle(self):
        """Cycle to a new Tor circuit (call when blocked)."""
        self.close()
        self._new_circuit()

    def close(self):
        """Close current connection (best-effort; never raises)."""
        if self.http:
            try:
                self.http.disconnect()
            except Exception:
                pass
            self.http = None
        self.current_host = None

    def _request(self, uri, head):
        """Issue a HEAD or GET on the already-connected self.http.

        Returns the header (head=True) or the response body (head=False).
        Propagates any exception from the underlying connection so the
        caller can decide whether to reconnect.
        """
        headers = [
            'Accept-Language: en-US,en;q=0.8',
            'Cache-Control: max-age=0',
        ]
        if head:
            return self.http.head(uri, headers)
        hdr, res = self.http.get(uri, headers)
        # Py2: normalize unicode bodies to a utf-8 encoded byte string.
        return res.encode('utf-8') if isinstance(res, unicode) else res

    def fetch(self, url, head=False):
        """Fetch URL, reusing the existing connection when host/port/ssl match.

        Returns the response body (or header if head=True), or None on
        connection failure.
        """
        network_stats.set_category('scraper')
        host, port, ssl, uri = _parse_url(url)

        # Reuse existing connection only when all connection parameters match.
        if (self.http and self.current_host == host and
                self.current_port == port and self.current_ssl == ssl):
            try:
                return self._request(uri, head)
            except Exception:
                # Connection died; fall through and reconnect below.
                self.close()

        # Need a new connection.
        self.close()
        if not self.tor_url:
            self._new_circuit()

        proxies = [rocksock.RocksockProxyFromURL(self.tor_url)]
        self.http = RsHttp(
            host, ssl=ssl, port=port, keep_alive=True,
            timeout=config.ppf.timeout,
            max_tries=config.ppf.http_retries,
            follow_redirects=True, auto_set_cookies=True,
            proxies=proxies,
            user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
            log_errors=False
        )
        if not self.http.connect():
            self.close()
            return None
        self.current_host = host
        self.current_port = port
        self.current_ssl = ssl

        try:
            return self._request(uri, head)
        except Exception:
            self.close()
            return None


# Rate-limit state for connection-failure logging (see _fetch_contents).
_last_fail_log = 0
_fail_log_interval = 60


def set_config(cfg):
    """Install the module-wide configuration object."""
    global config
    config = cfg


# Pre-compiled regex patterns (compiled once at module load).
cleanhtml_re = [
    re.compile(r'<.*?>'),
    re.compile(r'\s+'),
    re.compile(r'::+'),
]

# Proxy extraction pattern: IP:PORT followed by a non-digit or end of string.
# Pattern: 1-3 digits, dot, repeated 3 times, colon, 2-5 digit port.
# BUGFIX: the original trailing [\D$] was a character class (non-digit OR a
# literal '$') and never matched end-of-string, dropping an ip:port that ends
# the content; a lookahead matches both cases without consuming input.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?=\D|$)')

# IPv6 proxy pattern: [ipv6]:port
# IPv6 can contain hex digits and colons, enclosed in brackets for URL format.
IPV6_PROXY_PATTERN = re.compile(
    r'\[([0-9a-fA-F:]+)\]:([0-9]{2,5})'
)

# Auth proxy pattern: user:pass@IP:PORT or proto://user:pass@IP:PORT
# Captures: (proto, user, pass, ip, port)
AUTH_PROXY_PATTERN = re.compile(
    r'(?:(socks5|socks4a?|https?|http|ssl|tor)://)?'  # optional protocol
    r'([a-zA-Z0-9._-]+):([a-zA-Z0-9._-]+)@'           # user:pass@
    r'([0-9]+(?:\.[0-9]+){3}):([0-9]{2,5})',          # ip:port
    re.IGNORECASE
)

# IPv6 auth pattern: user:pass@[ipv6]:port
AUTH_IPV6_PATTERN = re.compile(
    r'(?:(socks5|socks4a?|https?|http|ssl|tor)://)?'  # optional protocol
    r'([a-zA-Z0-9._-]+):([a-zA-Z0-9._-]+)@'           # user:pass@
    r'\[([0-9a-fA-F:]+)\]:([0-9]{2,5})',              # [ipv6]:port
    re.IGNORECASE
)

# Protocol hint patterns - look for protocol keywords near IP:PORT.
PROTO_HINT_PATTERN = re.compile(
    r'(socks5|socks4a?|https?|connect|ssl|tor)\s*[:\-_\s]*'
    r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})',
    re.IGNORECASE
)

PROTO_HINT_REVERSE = re.compile(
    r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})\s*[:\-_\|,\s]*'
    r'(socks5|socks4a?|https?|http|connect|ssl|tor)',
    re.IGNORECASE
)

# JSON field names commonly used for proxy data.
JSON_IP_FIELDS = ('ip', 'host', 'address', 'addr', 'server', 'proxy_address')
JSON_PORT_FIELDS = ('port', 'proxy_port')
JSON_PROTO_FIELDS = ('type', 'protocol', 'proto', 'scheme', 'proxy_type')
JSON_USER_FIELDS = ('user', 'username', 'login', 'usr')
JSON_PASS_FIELDS = ('pass', 'password', 'pwd', 'passwd')

# Confidence scoring for extraction methods.
# Higher scores indicate more reliable extraction.
CONFIDENCE_AUTH = 90       # Authenticated proxy (usually paid sources)
CONFIDENCE_JSON = 80       # JSON API with structured fields
CONFIDENCE_TABLE = 70      # HTML table with columns
CONFIDENCE_HINT = 60       # Protocol hint in surrounding text
CONFIDENCE_URL_PROTO = 50  # Protocol inferred from URL path
CONFIDENCE_REGEX = 30      # Raw regex extraction

# Bonus for protocol detection.
CONFIDENCE_PROTO_EXPLICIT = 15  # Protocol explicitly stated
CONFIDENCE_PROTO_INFERRED = 5   # Protocol from URL path


def _normalize_proto(proto_str):
    """Normalize a protocol string to 'socks5', 'socks4', 'http' or None."""
    if not proto_str:
        return None
    p = proto_str.lower().strip()
    if p in ('socks5', 's5', 'tor'):
        return 'socks5'
    if p in ('socks4', 'socks4a', 's4'):
        return 'socks4'
    if p in ('http', 'https', 'connect', 'ssl'):
        return 'http'
    return None


def extract_auth_proxies(content):
    """Extract authenticated proxies from content.

    Matches patterns like:
    - user:pass@1.2.3.4:8080
    - socks5://user:pass@1.2.3.4:8080
    - http://user:pass@1.2.3.4:8080
    - user:pass@[2001:db8::1]:8080 (IPv6)

    Returns:
        List of (address, proto) tuples where address is user:pass@ip:port
    """
    proxies = []

    # IPv4 auth proxies
    for match in AUTH_PROXY_PATTERN.finditer(content):
        proto_str, user, passwd, ip, port = match.groups()
        proto = _normalize_proto(proto_str)
        # Normalize IP (remove leading zeros)
        ip = '.'.join(str(int(o)) for o in ip.split('.'))
        port = int(port)
        # Build address with auth
        addr = '%s:%s@%s:%d' % (user, passwd, ip, port)
        proxies.append((addr, proto))

    # IPv6 auth proxies
    for match in AUTH_IPV6_PATTERN.finditer(content):
        proto_str, user, passwd, ipv6, port = match.groups()
        proto = _normalize_proto(proto_str)
        port = int(port)
        if not is_valid_ipv6(ipv6):
            continue
        # Build address with auth and bracketed IPv6
        addr = '%s:%s@[%s]:%d' % (user, passwd, ipv6, port)
        proxies.append((addr, proto))

    return proxies


# Table column header patterns for identifying proxy data columns.
TABLE_IP_HEADERS = ('ip', 'address', 'host', 'server', 'proxy')
TABLE_PORT_HEADERS = ('port',)
TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')


def extract_proxies_from_table(content):
    """Extract proxies from HTML tables with IP/Port/Protocol columns.

    Handles tables like:
        | IP Address | Port | Type   |
        |------------|------|--------|
        | 1.2.3.4    | 8080 | SOCKS5 |

    Returns:
        List of (address, proto) tuples
    """
    proxies = []

    # Simple regex-based table parsing (works without BeautifulSoup).
    # NOTE(review): the original tag patterns were corrupted in transit
    # (leading '<table[^' etc. were stripped); reconstructed here from the
    # function's documented behavior - confirm against upstream history.
    table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
    row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
    cell_pattern = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
    tag_strip = re.compile(r'<[^>]+>')

    for table_match in table_pattern.finditer(content):
        table_html = table_match.group(1)
        rows = row_pattern.findall(table_html)
        if not rows:
            continue

        # Parse header row to find column indices.
        ip_col = port_col = proto_col = -1
        header_row = rows[0]
        headers = cell_pattern.findall(header_row)
        for i, cell in enumerate(headers):
            cell_text = tag_strip.sub('', cell).strip().lower()
            if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
                ip_col = i
            elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
                port_col = i
            elif proto_col < 0 and any(h in cell_text for h in TABLE_PROTO_HEADERS):
                proto_col = i

        # Need at least IP column (port might be in same cell).
        if ip_col < 0:
            continue

        # Parse data rows.
        for row in rows[1:]:
            cells = cell_pattern.findall(row)
            if len(cells) <= ip_col:
                continue
            ip_cell = tag_strip.sub('', cells[ip_col]).strip()

            # Check if IP cell contains port (ip:port format).
            if ':' in ip_cell and port_col < 0:
                match = re.match(r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)', ip_cell)
                if match:
                    ip, port = match.groups()
                    proto = None
                    if proto_col >= 0 and len(cells) > proto_col:
                        proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
                    addr = '%s:%s' % (ip, port)
                    if is_usable_proxy(addr):
                        proxies.append((addr, proto))
                continue

            # Separate IP and Port columns.
            if port_col >= 0 and len(cells) > port_col:
                port_cell = tag_strip.sub('', cells[port_col]).strip()
                try:
                    port = int(port_cell)
                except ValueError:
                    continue
                # Validate IP format.
                if not re.match(r'^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$', ip_cell):
                    continue
                proto = None
                if proto_col >= 0 and len(cells) > proto_col:
                    proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
                addr = '%s:%d' % (ip_cell, port)
                if is_usable_proxy(addr):
                    proxies.append((addr, proto))

    return proxies


def extract_proxies_from_json(content):
    """Extract proxies from JSON content.

    Handles common JSON formats:
    - Array of objects: [{"ip": "1.2.3.4", "port": 8080}, ...]
    - Array of strings: ["1.2.3.4:8080", ...]
    - Object with data array: {"data": [...], "proxies": [...]}
    - Nested structures with ip/host/port/protocol fields

    Returns:
        List of (address, proto) tuples
    """
    proxies = []

    # Try to find JSON in content (may be embedded in HTML).
    json_matches = []

    # Look for JSON arrays (non-greedy: deeply nested arrays may truncate).
    for match in re.finditer(r'\[[\s\S]*?\]', content):
        json_matches.append(match.group())

    # Look for JSON objects.
    for match in re.finditer(r'\{[\s\S]*?\}', content):
        json_matches.append(match.group())

    for json_str in json_matches:
        try:
            data = json.loads(json_str)
            proxies.extend(_extract_from_json_data(data))
        except (ValueError, TypeError):
            continue

    return proxies


def _extract_from_json_data(data, parent_proto=None):
    """Recursively extract proxies from parsed JSON data.

    Returns list of (address, proto) tuples where address may include auth
    credentials as user:pass@ip:port.
    """
    proxies = []

    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                proxies.extend(_extract_from_json_data(item, parent_proto))
            elif isinstance(item, basestring):
                # Try to parse as IP:PORT or user:pass@IP:PORT string.
                item = item.strip()
                if re.match(r'^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+$', item):
                    proxies.append((item, parent_proto))
                elif re.match(r'^[^:]+:[^@]+@[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+$', item):
                    proxies.append((item, parent_proto))

    elif isinstance(data, dict):
        # Look for ip/port/user/pass fields.
        ip = None
        port = None
        proto = parent_proto
        user = None
        passwd = None

        for key, value in data.items():
            key_lower = key.lower()
            if key_lower in JSON_IP_FIELDS and isinstance(value, basestring):
                ip = value.strip()
            elif key_lower in JSON_PORT_FIELDS:
                try:
                    port = int(value)
                except (ValueError, TypeError):
                    pass
            elif key_lower in JSON_PROTO_FIELDS and isinstance(value, basestring):
                proto = _normalize_proto(value)
            elif key_lower in JSON_USER_FIELDS and isinstance(value, basestring):
                user = value.strip()
            elif key_lower in JSON_PASS_FIELDS and isinstance(value, basestring):
                passwd = value.strip()

        if ip and port:
            if user and passwd:
                addr = '%s:%s@%s:%d' % (user, passwd, ip, port)
            else:
                addr = '%s:%d' % (ip, port)
            proxies.append((addr, proto))

        # Check for nested arrays (data, proxies, list, items, etc.)
        for key, value in data.items():
            if isinstance(value, (list, dict)):
                proxies.extend(_extract_from_json_data(value, proto))

    return proxies


def extract_proxies_with_hints(content):
    """Extract proxies with protocol hints from surrounding context.

    Looks for patterns like:
    - "socks5 1.2.3.4:8080"
    - "1.2.3.4:8080 (http)"
    - "SOCKS5: 1.2.3.4:8080"
    - Table rows with protocol in adjacent column

    Returns:
        Dict mapping address -> proto (or None if no hint)
    """
    hints = {}

    # Pattern: protocol before IP:PORT.
    for match in PROTO_HINT_PATTERN.finditer(content):
        proto = _normalize_proto(match.group(1))
        addr = match.group(2)
        if proto:
            hints[addr] = proto

    # Pattern: IP:PORT before protocol (first hint wins).
    for match in PROTO_HINT_REVERSE.finditer(content):
        addr = match.group(1)
        proto = _normalize_proto(match.group(2))
        if proto and addr not in hints:
            hints[addr] = proto

    return hints


def cleanhtml(raw_html):
    """Strip tags and collapse whitespace into ':' separators.

    The result is only used as input for PROXY_PATTERN matching, so ':' is a
    safe separator (it cannot create a false ip:port pair on its own).
    """
    # NOTE(review): original literal was garbled to a no-op space->space
    # replace; '&nbsp;' is the evident intent - confirm against upstream.
    html = raw_html.replace('&nbsp;', ' ')
    html = re.sub(cleanhtml_re[0], ':', html)  # strip tags
    html = re.sub(cleanhtml_re[1], ':', html)  # collapse whitespace
    html = re.sub(cleanhtml_re[2], ':', html)  # collapse ':' runs
    return html


def fetch_contents(url, head=False, proxy=None):
    """Fetch url, trying each proxy in `proxy` (if given) until one succeeds.

    Returns the response body/header, or '' when every attempt failed.
    """
    content = None
    if proxy is not None and len(proxy):
        for p in proxy:
            content = _fetch_contents(url, head=head, proxy=p)
            if content is not None:
                break
    else:
        content = _fetch_contents(url, head=head)
    return content if content is not None else ''


# Body markers that indicate a rate-limited / useless response.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')


def _fetch_contents(url, head=False, proxy=None):
    """Fetch url through a random Tor circuit (plus optional extra proxy).

    Returns the header (head=True) or body, or None on connect failure or
    when the body contains one of retry_messages. Connection failures are
    retried once with a fresh circuit; failure logging is rate-limited to
    once per _fail_log_interval seconds.
    """
    global _last_fail_log

    network_stats.set_category('scraper')
    host, port, ssl, uri = _parse_url(url)
    headers = [
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")

    tor_retries = 0
    max_tor_retries = 1
    http = None
    try:
        while True:
            proxies = [rocksock.RocksockProxyFromURL(tor_proxy_url(random.choice(config.torhosts)))]
            if proxy:
                proxies.append(rocksock.RocksockProxyFromURL(proxy))
            http = RsHttp(host, ssl=ssl, port=port, keep_alive=True,
                          timeout=config.ppf.timeout,
                          max_tries=config.ppf.http_retries,
                          follow_redirects=True, auto_set_cookies=True,
                          proxies=proxies,
                          user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
                          log_errors=False)
            if not http.connect():
                http.disconnect()
                http = None
                tor_retries += 1
                if tor_retries <= max_tor_retries:
                    # Retry once with different circuit.
                    time.sleep(1)
                    continue
                # Log failure after retries exhausted (rate-limited).
                now = time.time()
                if (now - _last_fail_log) >= _fail_log_interval:
                    _log("failed to connect to %s" % url, "ppf")
                    _last_fail_log = now
                return None
            break

        ## only request header
        if head:
            hdr = http.head(uri, headers)
            return hdr

        hdr, res = http.get(uri, headers)
        res = res.encode('utf-8') if isinstance(res, unicode) else res
        for retry_message in retry_messages:
            if retry_message in res:
                return None
        return res
    finally:
        if http:
            http.disconnect()


def valid_port(port):
    """Check if port number is valid (1-65535)."""
    return port >= 1 and port <= 65535


def is_valid_ipv6(addr):
    """Validate IPv6 address format.

    Rejects:
    - Malformed addresses
    - Loopback (::1)
    - Link-local (fe80::/10)
    - Unique local (fc00::/7)
    - Multicast (ff00::/8)
    - Unspecified (::)
    """
    # Basic format check - must contain only hex digits and colons.
    if not re.match(r'^[0-9a-fA-F:]+$', addr):
        return False

    # Check for valid segment count (:: may appear at most once and
    # expands to fill; without it exactly 8 segments are required).
    if '::' in addr:
        if addr.count('::') > 1:
            return False
    else:
        if addr.count(':') != 7:
            return False

    # Reject special addresses.
    addr_lower = addr.lower()
    # Loopback ::1
    if addr_lower in ('::1', '0:0:0:0:0:0:0:1'):
        return False
    # Unspecified ::
    if addr_lower in ('::', '0:0:0:0:0:0:0:0'):
        return False
    # Link-local fe80::/10
    if addr_lower.startswith('fe8') or addr_lower.startswith('fe9') or \
            addr_lower.startswith('fea') or addr_lower.startswith('feb'):
        return False
    # Unique local fc00::/7 (fc00:: - fdff::)
    if addr_lower.startswith('fc') or addr_lower.startswith('fd'):
        return False
    # Multicast ff00::/8
    if addr_lower.startswith('ff'):
        return False

    return True


def is_usable_proxy(proxy):
    """Validate proxy string format and reject unusable addresses.

    Accepts formats:
    - ip:port (IPv4)
    - [ipv6]:port (IPv6)
    - user:pass@ip:port
    - user:pass@[ipv6]:port

    Rejects:
    - Malformed strings
    - Invalid port (0, >65535)
    - Private/reserved ranges

    Results are cached using LRU eviction to avoid repeated validation.
    """
    with _proxy_valid_cache_lock:
        if proxy in _proxy_valid_cache:
            # Move to end (most recently used) - Python 2 compatible.
            value = _proxy_valid_cache.pop(proxy)
            _proxy_valid_cache[proxy] = value
            return value

    # Validate outside the lock; worst case two threads validate the
    # same string once each.
    result = _validate_proxy(proxy)

    with _proxy_valid_cache_lock:
        # Evict oldest entries if at capacity.
        while len(_proxy_valid_cache) >= _proxy_valid_cache_max:
            _proxy_valid_cache.popitem(last=False)
        _proxy_valid_cache[proxy] = result

    return result


def _validate_proxy(proxy):
    """Internal validation logic for is_usable_proxy."""
    try:
        if ':' not in proxy:
            return False

        # Strip auth credentials if present (user:pass@ip:port -> ip:port).
        if '@' in proxy:
            proxy = proxy.split('@', 1)[1]

        # Check for IPv6 format: [ipv6]:port
        if proxy.startswith('['):
            match = re.match(r'^\[([^\]]+)\]:(\d+)$', proxy)
            if not match:
                return False
            ipv6_addr, port_str = match.groups()
            port = int(port_str)
            if not valid_port(port):
                return False
            return is_valid_ipv6(ipv6_addr)

        # IPv4 format: ip:port
        ip, port_str = proxy.rsplit(':', 1)
        port = int(port_str)
        if not valid_port(port):
            return False
        octets = ip.split('.')
        if len(octets) != 4:
            return False
        A, B, C, D = [int(o) for o in octets]
        # Validate octet ranges.
        if any(o < 0 or o > 255 for o in (A, B, C, D)):
            return False
        # Reject first octet 0 (0.0.0.0/8 - unspecified/invalid).
        if A == 0:
            return False
        # Reject loopback (127.0.0.0/8).
        if A == 127:
            return False
        # Reject private 10.0.0.0/8.
        if A == 10:
            return False
        # Reject private 172.16.0.0/12.
        if A == 172 and 16 <= B <= 31:
            return False
        # Reject private 192.168.0.0/16.
        if A == 192 and B == 168:
            return False
        # Reject link-local 169.254.0.0/16.
        if A == 169 and B == 254:
            return False
        # Reject CGNAT 100.64.0.0/10 (100.64.0.0 - 100.127.255.255).
        if A == 100 and 64 <= B <= 127:
            return False
        # Reject multicast 224.0.0.0/4 (224-239.x.x.x).
        if 224 <= A <= 239:
            return False
        # Reject reserved/future 240.0.0.0/4 (240-255.x.x.x).
        if A >= 240:
            return False
        return True
    except (ValueError, AttributeError, IndexError):
        return False


# Cache of proxies already present in the database.
_known_proxies = {}
_known_proxies_lock = threading.Lock()


def init_known_proxies(proxydb):
    """Initialize known proxies cache from database (first call only)."""
    global _known_proxies
    with _known_proxies_lock:
        if _known_proxies:
            return
        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        for k in known:
            _known_proxies[k[0]] = True


def add_known_proxies(proxies):
    """Add proxies to known cache."""
    global _known_proxies
    with _known_proxies_lock:
        for p in proxies:
            _known_proxies[p] = True


def is_known_proxy(proxy):
    """Check if proxy is in known cache."""
    with _known_proxies_lock:
        return proxy in _known_proxies


def detect_proto_from_path(url):
    """Detect proxy protocol from URL path.

    Many proxy lists indicate protocol in their path:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Args:
        url: Source URL path or full URL

    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not detected
    """
    url_lower = url.lower()
    # Check for socks5 indicators.
    if 'socks5' in url_lower:
        return 'socks5'
    # Check for socks4/socks4a indicators.
    if 'socks4' in url_lower:
        return 'socks4'
    # Check for http/https/ssl/connect indicators.
    if any(x in url_lower for x in ('/http', 'http-', 'http_', 'http.',
                                    '/https', '/ssl', '/connect')):
        return 'http'
    return None


def _normalize_proxy_addr(addr):
    """Normalize proxy address, handling auth and IPv6 formats.

    Formats:
    - ip:port
    - user:pass@ip:port
    - [ipv6]:port
    - user:pass@[ipv6]:port

    Returns normalized address or None if invalid.
    """
    auth_prefix = ''
    if '@' in addr:
        auth_prefix, addr = addr.rsplit('@', 1)
        auth_prefix += '@'

    if ':' not in addr:
        return None

    # IPv6 format: [ipv6]:port
    if addr.startswith('['):
        match = re.match(r'^\[([^\]]+)\]:(\d+)$', addr)
        if not match:
            return None
        ipv6, port = match.groups()
        try:
            port = int(port.lstrip('0') or '0')
        except ValueError:
            return None
        return '%s[%s]:%d' % (auth_prefix, ipv6, port)

    # IPv4 format: ip:port
    ip, port = addr.rsplit(':', 1)
    try:
        ip = '.'.join(str(int(o)) for o in ip.split('.'))
        port = int(port.lstrip('0') or '0')
    except (ValueError, AttributeError):
        return None
    return '%s%s:%d' % (auth_prefix, ip, port)


def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
    """Extract and normalize proxy addresses from content.

    Uses multiple extraction methods (in priority order):
    1. Authenticated proxy patterns (user:pass@ip:port)
    2. JSON parsing for API responses
    3. HTML table parsing with IP/Port/Protocol columns
    4. Protocol hints from surrounding text
    5. Regex extraction for raw IP:PORT patterns
    6. IPv6 regex extraction

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only
        proto: Protocol from source URL (fallback if not detected)

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
            new_proxies is list of (address, proto, confidence) tuples
        If not filter_known: list of (address, proto, confidence) tuples
    """
    # Dict: address -> (protocol, confidence)
    # Higher confidence wins; explicit proto upgrades confidence.
    found = {}

    def _record(addr, detected_proto, base_conf):
        """Validate, normalize and record addr with its confidence score."""
        if not is_usable_proxy(addr):
            return
        norm = _normalize_proxy_addr(addr)
        if not norm:
            return
        conf = base_conf
        if detected_proto:
            conf += CONFIDENCE_PROTO_EXPLICIT
        if norm not in found or conf > found[norm][1]:
            found[norm] = (detected_proto, conf)

    # 1. Extract authenticated proxies first (highest confidence).
    for addr, detected_proto in extract_auth_proxies(content):
        _record(addr, detected_proto, CONFIDENCE_AUTH)

    # 2. Try JSON extraction (reliable for protocol info).
    for addr, detected_proto in extract_proxies_from_json(content):
        _record(addr, detected_proto, CONFIDENCE_JSON)

    # 3. Try HTML table extraction (structured data with protocol columns).
    for addr, detected_proto in extract_proxies_from_table(content):
        _record(addr, detected_proto, CONFIDENCE_TABLE)

    # 4. Get protocol hints from content.
    hints = extract_proxies_with_hints(content)

    # 5. Regex extraction for remaining IPv4 proxies (no auth).
    matches = PROXY_PATTERN.findall(cleanhtml(content))
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets).
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port (remove leading zeros, handle empty case).
        port = int(port.lstrip('0') or '0')
        addr = '%s:%d' % (ip, port)
        if not is_usable_proxy(addr):
            continue
        if addr not in found:
            # Check for protocol hint (raw or normalized form).
            detected_proto = hints.get(p) or hints.get(addr)
            if detected_proto:
                conf = CONFIDENCE_HINT + CONFIDENCE_PROTO_EXPLICIT
            else:
                conf = CONFIDENCE_REGEX
            found[addr] = (detected_proto, conf)

    # 6. Regex extraction for IPv6 proxies [ipv6]:port.
    for match in IPV6_PROXY_PATTERN.finditer(content):
        ipv6, port = match.groups()
        port = int(port)
        if not is_valid_ipv6(ipv6):
            continue
        if not valid_port(port):
            continue
        addr = '[%s]:%d' % (ipv6, port)
        if addr not in found:
            found[addr] = (None, CONFIDENCE_REGEX)

    # Build result list with protocol and confidence.
    # Protocol priority: detected > URL-based > None.
    uniques = []
    for addr in found:
        detected_proto, conf = found[addr]
        final_proto = detected_proto if detected_proto else proto
        # Add URL proto bonus if proto was inferred from path.
        if not detected_proto and proto:
            conf += CONFIDENCE_PROTO_INFERRED
        uniques.append((addr, final_proto, conf))

    if not filter_known:
        return uniques

    # Initialize known proxies from DB if needed.
    if proxydb is not None:
        init_known_proxies(proxydb)

    new = []
    for p, pr, conf in uniques:
        if not is_known_proxy(p):
            new.append((p, pr, conf))
            add_known_proxies([p])

    return len(uniques), new