fetch: add short-circuit guards to extraction functions

Skip expensive regex scans when content lacks the required markers:

- extract_auth_proxies: skip if content contains no '@'
- extract_proxies_from_table: skip if content contains no '<table' or '<TABLE' tag
- extract_proxies_from_json: skip if content contains neither '{' nor '['

Also hoist the table-parsing regexes to module-level precompiled constants.
2026-02-22 13:50:29 +01:00
parent b300afed6c
commit 98b232f3d3
1 changed file with 26 additions and 15 deletions
--- a/fetch.py
+++ b/fetch.py
@@ -221,6 +221,10 @@ def extract_auth_proxies(content):
    """
    proxies = []

+    # Short-circuit: auth proxies always contain @
+    if '@' not in content:
+        return proxies
+
    # IPv4 auth proxies
    for match in AUTH_PROXY_PATTERN.finditer(content):
        proto_str, user, passwd, ip, port = match.groups()
@@ -256,6 +260,12 @@ TABLE_PORT_HEADERS = ('port',)
 TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')


+_TABLE_PATTERN = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
+_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
+_CELL_PATTERN = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
+_TAG_STRIP = re.compile(r'<[^>]+>')
+
+
 def extract_proxies_from_table(content):
    """Extract proxies from HTML tables with IP/Port/Protocol columns.

@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
    """
    proxies = []

-    # Simple regex-based table parsing (works without BeautifulSoup)
-    # Find all tables
-    table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
-    row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
-    cell_pattern = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
-    tag_strip = re.compile(r'<[^>]+>')
+    # Short-circuit: no HTML tables in plain text content
+    if '<table' not in content and '<TABLE' not in content:
+        return proxies

-    for table_match in table_pattern.finditer(content):
+    for table_match in _TABLE_PATTERN.finditer(content):
        table_html = table_match.group(1)
-        rows = row_pattern.findall(table_html)
+        rows = _ROW_PATTERN.findall(table_html)
        if not rows:
            continue

        # Parse header row to find column indices
        ip_col = port_col = proto_col = -1
        header_row = rows[0]
-        headers = cell_pattern.findall(header_row)
+        headers = _CELL_PATTERN.findall(header_row)

        for i, cell in enumerate(headers):
-            cell_text = tag_strip.sub('', cell).strip().lower()
+            cell_text = _TAG_STRIP.sub('', cell).strip().lower()
            if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
                ip_col = i
            elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
@@ -302,11 +309,11 @@ def extract_proxies_from_table(content):

        # Parse data rows
        for row in rows[1:]:
-            cells = cell_pattern.findall(row)
+            cells = _CELL_PATTERN.findall(row)
            if len(cells) <= ip_col:
                continue

-            ip_cell = tag_strip.sub('', cells[ip_col]).strip()
+            ip_cell = _TAG_STRIP.sub('', cells[ip_col]).strip()

            # Check if IP cell contains port (ip:port format)
            if ':' in ip_cell and port_col < 0:
@@ -315,7 +322,7 @@ def extract_proxies_from_table(content):
                    ip, port = match.groups()
                    proto = None
                    if proto_col >= 0 and len(cells) > proto_col:
-                        proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+                        proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
                    addr = '%s:%s' % (ip, port)
                    if is_usable_proxy(addr):
                        proxies.append((addr, proto))
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):

            # Separate IP and Port columns
            if port_col >= 0 and len(cells) > port_col:
-                port_cell = tag_strip.sub('', cells[port_col]).strip()
+                port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
                try:
                    port = int(port_cell)
                except ValueError:
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):

                proto = None
                if proto_col >= 0 and len(cells) > proto_col:
-                    proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+                    proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())

                addr = '%s:%d' % (ip_cell, port)
                if is_usable_proxy(addr):
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
    """
    proxies = []

+    # Short-circuit: content must contain JSON delimiters
+    if '{' not in content and '[' not in content:
+        return proxies
+
    # Try to find JSON in content (may be embedded in HTML)
    json_matches = []