diff --git a/fetch.py b/fetch.py index d57da6d..ab20f91 100644 --- a/fetch.py +++ b/fetch.py @@ -221,6 +221,10 @@ def extract_auth_proxies(content): """ proxies = [] + # Short-circuit: auth proxies always contain @ + if '@' not in content: + return proxies + # IPv4 auth proxies for match in AUTH_PROXY_PATTERN.finditer(content): proto_str, user, passwd, ip, port = match.groups() @@ -256,6 +260,12 @@ TABLE_PORT_HEADERS = ('port',) TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme') +_TABLE_PATTERN = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) +_ROW_PATTERN = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) +_CELL_PATTERN = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) +_TAG_STRIP = re.compile(r'<[^>]+>') + + def extract_proxies_from_table(content): """Extract proxies from HTML tables with IP/Port/Protocol columns. @@ -269,26 +279,23 @@ def extract_proxies_from_table(content): """ proxies = [] - # Simple regex-based table parsing (works without BeautifulSoup) - # Find all tables - table_pattern = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) - row_pattern = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) - cell_pattern = re.compile(r']*>(.*?)', re.IGNORECASE | re.DOTALL) - tag_strip = re.compile(r'<[^>]+>') + # Short-circuit: no HTML tables in plain text content + if '= 0 and len(cells) > proto_col: - proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip()) + proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip()) addr = '%s:%s' % (ip, port) if is_usable_proxy(addr): proxies.append((addr, proto)) @@ -323,7 +330,7 @@ def extract_proxies_from_table(content): # Separate IP and Port columns if port_col >= 0 and len(cells) > port_col: - port_cell = tag_strip.sub('', cells[port_col]).strip() + port_cell = _TAG_STRIP.sub('', cells[port_col]).strip() try: port = int(port_cell) except ValueError: @@ -335,7 +342,7 @@ def extract_proxies_from_table(content): proto = None if proto_col >= 0 and len(cells) > proto_col: - proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip()) + proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip()) addr = '%s:%d' % (ip_cell, port) if is_usable_proxy(addr): @@ -358,6 +365,10 @@ def extract_proxies_from_json(content): """ proxies = [] + # Short-circuit: content must contain JSON delimiters + if '{' not in content and '[' not in content: + return proxies + # Try to find JSON in content (may be embedded in HTML) json_matches = []