From 98b232f3d3355df19728559500f8e320a8d2deff Mon Sep 17 00:00:00 2001
From: Username
Date: Sun, 22 Feb 2026 13:50:29 +0100
Subject: [PATCH] fetch: add short-circuit guards to extraction functions

Skip expensive regex scans when content lacks required markers:
- extract_auth_proxies: skip if no '@' in content
- extract_proxies_from_table: skip if no '<table' in content
- extract_proxies_from_json: skip if no '{' or '[' in content
---
[NOTE(review): this patch was mangled by an HTML-tag stripper that deleted
every span from a literal '<' to the next '>'. The diffstat, the
"diff --git" header, the extract_auth_proxies hunk, and the first hunk
header were consumed. The hunks below are reconstructed from the surviving
fragments; regex literals containing HTML tags are restored to their
evident original form — confirm the '<t[dh]' cell pattern against the
pre-patch file.]

@@ [hunk header lost in extraction] @@
+_TABLE_PATTERN = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
+_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
+_CELL_PATTERN = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
+_TAG_STRIP = re.compile(r'<[^>]+>')
+
+
 def extract_proxies_from_table(content):
     """Extract proxies from HTML tables with IP/Port/Protocol columns.
@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
     """
     proxies = []
 
-    # Simple regex-based table parsing (works without BeautifulSoup)
-    # Find all tables
-    table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
-    row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
-    cell_pattern = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
-    tag_strip = re.compile(r'<[^>]+>')
+    # Short-circuit: no HTML tables in plain text content
+    if '<table' not in content.lower():
+        return proxies
 
[NOTE(review): several lines lost here to the tag stripper — presumably the
table/row/cell findall loop, with the old pattern locals renamed to the new
module-level _TABLE_PATTERN/_ROW_PATTERN/_CELL_PATTERN constants.]
 
         if proto_col >= 0 and len(cells) > proto_col:
-            proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+            proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
             addr = '%s:%s' % (ip, port)
             if is_usable_proxy(addr):
                 proxies.append((addr, proto))
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):
 
             # Separate IP and Port columns
             if port_col >= 0 and len(cells) > port_col:
-                port_cell = tag_strip.sub('', cells[port_col]).strip()
+                port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
                 try:
                     port = int(port_cell)
                 except ValueError:
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):
 
                 proto = None
                 if proto_col >= 0 and len(cells) > proto_col:
-                    proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+                    proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
                 addr = '%s:%d' % (ip_cell, port)
                 if is_usable_proxy(addr):
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
     """
     proxies = []
 
+    # Short-circuit: content must contain JSON delimiters
+    if '{' not in content and '[' not in content:
+        return proxies
+
     # Try to find JSON in content (may be embedded in HTML)
     json_matches = []