diff --git a/fetch.py b/fetch.py
index d57da6d..ab20f91 100644
--- a/fetch.py
+++ b/fetch.py
@@ -221,6 +221,10 @@ def extract_auth_proxies(content):
"""
proxies = []
+ # Short-circuit: auth proxies always contain @
+ if '@' not in content:
+ return proxies
+
# IPv4 auth proxies
for match in AUTH_PROXY_PATTERN.finditer(content):
proto_str, user, passwd, ip, port = match.groups()
@@ -256,6 +260,12 @@ TABLE_PORT_HEADERS = ('port',)
TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')
+_TABLE_PATTERN = re.compile(r'<table.*?</table>', re.IGNORECASE | re.DOTALL)
+_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
+_CELL_PATTERN = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
+_TAG_STRIP = re.compile(r'<[^>]+>')
+
+
def extract_proxies_from_table(content):
"""Extract proxies from HTML tables with IP/Port/Protocol columns.
@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
"""
proxies = []
- # Simple regex-based table parsing (works without BeautifulSoup)
- # Find all tables
-    table_pattern = re.compile(r'<table.*?</table>', re.IGNORECASE | re.DOTALL)
-    row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
-    cell_pattern = re.compile(r'<t[dh][^>]*>(.*?)</t[dh]>', re.IGNORECASE | re.DOTALL)
- tag_strip = re.compile(r'<[^>]+>')
+ # Short-circuit: no HTML tables in plain text content
+    if '<table' not in content.lower():
+        return proxies
+
             if proto_col >= 0 and len(cells) > proto_col:
- proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+ proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
addr = '%s:%s' % (ip, port)
if is_usable_proxy(addr):
proxies.append((addr, proto))
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):
# Separate IP and Port columns
if port_col >= 0 and len(cells) > port_col:
- port_cell = tag_strip.sub('', cells[port_col]).strip()
+ port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
try:
port = int(port_cell)
except ValueError:
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):
proto = None
if proto_col >= 0 and len(cells) > proto_col:
- proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
+ proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
addr = '%s:%d' % (ip_cell, port)
if is_usable_proxy(addr):
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
"""
proxies = []
+ # Short-circuit: content must contain JSON delimiters
+ if '{' not in content and '[' not in content:
+ return proxies
+
# Try to find JSON in content (may be embedded in HTML)
json_matches = []