From 98b232f3d3355df19728559500f8e320a8d2deff Mon Sep 17 00:00:00 2001
From: Username
Date: Sun, 22 Feb 2026 13:50:29 +0100
Subject: [PATCH] fetch: add short-circuit guards to extraction functions

Skip expensive regex scans when content lacks required markers:
- extract_auth_proxies: skip if no '@' in content
- extract_proxies_from_table: skip if no '<table' in content
- extract_proxies_from_json: skip if no '{' or '[' in content
---
[NOTE(review): this patch was mangled by an HTML-tag stripper that deleted
every span from a literal '<' to the next '>'. The diffstat, the
"diff --git" header, the extract_auth_proxies hunk, and the first hunk
header were consumed. The hunks below are reconstructed from the surviving
fragments; regex literals containing HTML tags are restored to their
evident original form — confirm the '<t[dh]' cell pattern against the
pre-patch file.]

@@ [hunk header lost in extraction] @@
+_TABLE_PATTERN = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
+_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
+_CELL_PATTERN = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
+_TAG_STRIP = re.compile(r'<[^>]+>')
+
+
 def extract_proxies_from_table(content):
     """Extract proxies from HTML tables with IP/Port/Protocol columns.
@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
     """
     proxies = []
 
-    # Simple regex-based table parsing (works without BeautifulSoup)
-    # Find all tables
-    table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
-    row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
-    cell_pattern = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
-    tag_strip = re.compile(r'<[^>]+>')
+    # Short-circuit: no HTML tables in plain text content
+    if '<table' not in content.lower():
+        return proxies
 
[NOTE(review): several lines lost here to the tag stripper — presumably the
table/row/cell findall loop, with the old pattern locals renamed to the new
module-level _TABLE_PATTERN/_ROW_PATTERN/_CELL_PATTERN constants.]
 
         if proto_col >= 0 and len(cells) > proto_col:
-            proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+            proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
             addr = '%s:%s' % (ip, port)
             if is_usable_proxy(addr):
                 proxies.append((addr, proto))
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):
 
             # Separate IP and Port columns
             if port_col >= 0 and len(cells) > port_col:
-                port_cell = tag_strip.sub('', cells[port_col]).strip()
+                port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
                 try:
                     port = int(port_cell)
                 except ValueError:
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):
 
                 proto = None
                 if proto_col >= 0 and len(cells) > proto_col:
-                    proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+                    proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
                 addr = '%s:%d' % (ip_cell, port)
                 if is_usable_proxy(addr):
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
     """
     proxies = []
 
+    # Short-circuit: content must contain JSON delimiters
+    if '{' not in content and '[' not in content:
+        return proxies
+
     # Try to find JSON in content (may be embedded in HTML)
     json_matches = []