import re
import random
import time
import threading

import rocksock
import network_stats
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log

# Module-level configuration object, injected via set_config().
config = None

# Throttle state for the "failed to connect" log line.
_last_fail_log = 0
_fail_log_interval = 60


def set_config(cfg):
    """Inject the global configuration object used by this module."""
    global config
    config = cfg


# Pre-compiled regex patterns (compiled once at module load).
# cleanhtml() collapses tags and whitespace runs into ':' so that table
# layouts such as "IP</td><td>PORT" degrade into a scannable "IP:PORT".
cleanhtml_re = [
    re.compile(r'<.*?>'),   # any HTML tag
    re.compile(r'\s+'),     # whitespace runs
    re.compile(r'::+'),     # collapse the resulting colon runs
]

# Proxy extraction pattern: IP:PORT not followed by another digit.
# The lookahead (?![0-9]) also accepts end-of-input; the previous
# consuming class [\D$] did not, because '$' inside [...] is a literal
# dollar sign, so an address at the very end of the content was missed.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?![0-9])')


def cleanhtml(raw_html):
    """Reduce HTML to a colon-separated token stream for proxy scraping."""
    # NOTE(review): both replace() arguments render as a plain space,
    # which makes this a no-op; the first was presumably a non-breaking
    # space (U+00A0) — confirm against the original file's encoding.
    html = raw_html.replace(' ', ' ')
    html = re.sub(cleanhtml_re[0], ':', html)
    html = re.sub(cleanhtml_re[1], ':', html)
    html = re.sub(cleanhtml_re[2], ':', html)
    return html


def fetch_contents(url, head=False, proxy=None):
    """Fetch url, optionally trying each proxy of a list until one works.

    Args:
        url:   Full URL to fetch.
        head:  If True, request only the response header.
        proxy: Optional list of proxy URLs; tried in order until one
               yields a response.

    Returns:
        Response body (or header when head=True); '' on total failure.
    """
    content = None
    if proxy is not None and len(proxy):
        for p in proxy:
            content = _fetch_contents(url, head=head, proxy=p)
            if content is not None:
                break
    else:
        content = _fetch_contents(url, head=head)
    return content if content is not None else ''


# Response markers that indicate a useless (rate-limited) result.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')


def _fetch_contents(url, head=False, proxy=None):
    """Fetch url through a random tor host (plus an optional extra proxy).

    Returns the body (or header when head=True), or None on connection
    failure or when the response contains one of retry_messages.
    """
    network_stats.set_category('scraper')
    host, port, ssl, uri = _parse_url(url)
    headers = [
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
    tor_retries = 0
    max_tor_retries = 1
    http = None
    try:
        while True:
            proxies = [rocksock.RocksockProxyFromURL(
                'socks4://%s' % random.choice(config.torhosts))]
            if proxy:
                proxies.append(rocksock.RocksockProxyFromURL(proxy))
            http = RsHttp(host, ssl=ssl, port=port, keep_alive=True,
                          timeout=config.ppf.timeout,
                          max_tries=config.ppf.http_retries,
                          follow_redirects=True, auto_set_cookies=True,
                          proxies=proxies,
                          user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
                          log_errors=False)
            if not http.connect():
                # Rate-limit the failure log to one line per interval.
                global _last_fail_log
                now = time.time()
                if (now - _last_fail_log) >= _fail_log_interval:
                    _log("failed to connect to %s" % url, "ppf")
                    _last_fail_log = now
                e = http.get_last_rocksock_exception()
                if not e:
                    return None
                et = e.get_errortype()
                ee = e.get_error()
                ef = e.get_failedproxy()
                # Retry only when the FIRST proxy in the chain — the tor
                # host, failedproxy index 0 — refused the connection.
                if et == rocksock.RS_ET_OWN and \
                        ee == rocksock.RS_E_TARGET_CONN_REFUSED and \
                        ef == 0:
                    http.disconnect()
                    http = None
                    tor_retries += 1
                    if tor_retries >= max_tor_retries:
                        _log("tor proxy failed after %d retries" % tor_retries, "error")
                        return None
                    _log("tor proxy retry %d/%d" % (tor_retries, max_tor_retries), "warn")
                    time.sleep(5)
                    continue
                return None
            break
        ## only request header
        if head:
            hdr = http.head(uri, headers)
            return hdr
        hdr, res = http.get(uri, headers)
        # Normalize the body to bytes. The original tested the
        # Python-2-only name `unicode` (a NameError on python 3); the
        # bytes check below is behaviorally identical on python 2.
        if not isinstance(res, bytes):
            res = res.encode('utf-8')
        for retry_message in retry_messages:
            # Encode the marker so the containment test is bytes-vs-bytes
            # on python 3 as well (on python 2 encode() yields str).
            if retry_message.encode('utf-8') in res:
                return None
        return res
    finally:
        if http:
            http.disconnect()


def valid_port(port):
    """Check if port number is valid (1-65535)."""
    return port >= 1 and port <= 65535


def is_usable_proxy(proxy):
    """Validate proxy string format and reject unusable addresses.

    Rejects:
    - Malformed strings (not ip:port format)
    - Invalid port (0, >65535)
    - Invalid IP octets (>255)
    - Private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
    - Loopback: 127.0.0.0/8
    - Link-local: 169.254.0.0/16
    - CGNAT: 100.64.0.0/10
    - Multicast: 224.0.0.0/4
    - Reserved: 240.0.0.0/4
    - Unspecified: 0.0.0.0
    """
    try:
        if ':' not in proxy:
            return False
        ip, port_str = proxy.rsplit(':', 1)
        port = int(port_str)
        if not valid_port(port):
            return False
        octets = ip.split('.')
        if len(octets) != 4:
            return False
        A, B, C, D = [int(o) for o in octets]
        # Validate octet ranges
        if any(o < 0 or o > 255 for o in (A, B, C, D)):
            return False
        # Reject first octet 0 (0.0.0.0/8 - unspecified/invalid)
        if A == 0:
            return False
        # Reject loopback (127.0.0.0/8)
        if A == 127:
            return False
        # Reject private 10.0.0.0/8
        if A == 10:
            return False
        # Reject private 172.16.0.0/12
        if A == 172 and 16 <= B <= 31:
            return False
        # Reject private 192.168.0.0/16
        if A == 192 and B == 168:
            return False
        # Reject link-local 169.254.0.0/16
        if A == 169 and B == 254:
            return False
        # Reject CGNAT 100.64.0.0/10 (100.64.0.0 - 100.127.255.255)
        if A == 100 and 64 <= B <= 127:
            return False
        # Reject multicast 224.0.0.0/4 (224-239.x.x.x)
        if 224 <= A <= 239:
            return False
        # Reject reserved/future 240.0.0.0/4 (240-255.x.x.x)
        if A >= 240:
            return False
        return True
    except (ValueError, AttributeError, IndexError):
        return False


# Cache of proxies already present in the database, guarded by a lock
# because the scraper runs from multiple threads.
_known_proxies = {}
_known_proxies_lock = threading.Lock()


def init_known_proxies(proxydb):
    """Initialize known proxies cache from database (first call only)."""
    global _known_proxies
    with _known_proxies_lock:
        if _known_proxies:
            return
        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        for k in known:
            _known_proxies[k[0]] = True


def add_known_proxies(proxies):
    """Add proxies to known cache."""
    global _known_proxies
    with _known_proxies_lock:
        for p in proxies:
            _known_proxies[p] = True


def is_known_proxy(proxy):
    """Check if proxy is in known cache."""
    with _known_proxies_lock:
        return proxy in _known_proxies


def detect_proto_from_path(url):
    """Detect proxy protocol from URL path.

    Many proxy lists indicate protocol in their path:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Args:
        url: Source URL path or full URL

    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not
        detected
    """
    url_lower = url.lower()
    # Check for socks5 indicators
    if 'socks5' in url_lower:
        return 'socks5'
    # Check for socks4/socks4a indicators
    if 'socks4' in url_lower:
        return 'socks4'
    # Check for http/https/ssl/connect indicators
    if any(x in url_lower for x in ('/http', 'http-', 'http_', 'http.',
                                    '/https', '/ssl', '/connect')):
        return 'http'
    return None


def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only
        proto: Protocol to assign to all extracted proxies (from source URL)

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
            new_proxies is list of (address, proto) tuples
        If not filter_known: list of (address, proto) tuples
    """
    matches = PROXY_PATTERN.findall(cleanhtml(content))
    uniques_dict = {}
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets).
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port; int() already discards leading zeros since the
        # regex guarantees a digit-only string.
        port = int(port)
        p = '%s:%s' % (ip, port)
        uniques_dict[p] = True
    uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]
    if not filter_known:
        return uniques
    # Initialize known proxies from DB if needed
    if proxydb is not None:
        init_known_proxies(proxydb)
    # uniques holds distinct addresses, so filtering first and adding in
    # one batch is equivalent to the per-item add but takes the cache
    # lock once instead of once per proxy.
    new = [(p, pr) for p, pr in uniques if not is_known_proxy(p)]
    add_known_proxies([p for p, _ in new])
    return len(uniques), new


def extract_urls(content, urls=None, urignore=None):
    """Collect rel=noreferrer hrefs from content into urls.

    Args:
        content:  HTML to parse.
        urls:     Optional list to append to (also used for dedup); a new
                  list is created when omitted.
        urignore: Optional iterable of regex patterns; matching hrefs are
                  skipped. (Previously the None default crashed with a
                  TypeError on iteration.)

    Returns:
        The urls list with newly found hrefs appended.
    """
    urls = [] if not urls else urls
    urignore = urignore or []
    soup = soupify(content)
    body = soup.body
    if body is None:
        # Nothing to scan in a body-less document.
        return urls
    for a in body.find_all('a'):
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] \
                or a.attrs['href'] in urls:
            continue
        href = a.attrs['href']
        bad = False
        for pattern in urignore:
            # search() is truthy exactly when findall() was non-empty.
            if re.search(pattern, href):
                bad = True
                break
        if not bad:
            urls.append(href)
    return urls