fetch: add short-circuit guards to extraction functions
Skip expensive regex scans when content lacks required markers:
- extract_auth_proxies: skip if no '@' in content
- extract_proxies_from_table: skip if no '<table' tag
- extract_proxies_from_json: skip if content contains neither '{' nor '['
- Separately, hoist the table-parsing regexes to module-level precompiled constants (_TABLE_PATTERN, _ROW_PATTERN, _CELL_PATTERN, _TAG_STRIP)
This commit is contained in:
41
fetch.py
41
fetch.py
@@ -221,6 +221,10 @@ def extract_auth_proxies(content):
|
||||
"""
|
||||
proxies = []
|
||||
|
||||
# Short-circuit: auth proxies always contain @
|
||||
if '@' not in content:
|
||||
return proxies
|
||||
|
||||
# IPv4 auth proxies
|
||||
for match in AUTH_PROXY_PATTERN.finditer(content):
|
||||
proto_str, user, passwd, ip, port = match.groups()
|
||||
@@ -256,6 +260,12 @@ TABLE_PORT_HEADERS = ('port',)
|
||||
TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')
|
||||
|
||||
|
||||
_TABLE_PATTERN = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
|
||||
_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
||||
_CELL_PATTERN = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
|
||||
_TAG_STRIP = re.compile(r'<[^>]+>')
|
||||
|
||||
|
||||
def extract_proxies_from_table(content):
|
||||
"""Extract proxies from HTML tables with IP/Port/Protocol columns.
|
||||
|
||||
@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
|
||||
"""
|
||||
proxies = []
|
||||
|
||||
# Simple regex-based table parsing (works without BeautifulSoup)
|
||||
# Find all tables
|
||||
table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
|
||||
row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
||||
cell_pattern = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
|
||||
tag_strip = re.compile(r'<[^>]+>')
|
||||
# Short-circuit: no HTML tables in plain text content
|
||||
if '<table' not in content and '<TABLE' not in content:
|
||||
return proxies
|
||||
|
||||
for table_match in table_pattern.finditer(content):
|
||||
for table_match in _TABLE_PATTERN.finditer(content):
|
||||
table_html = table_match.group(1)
|
||||
rows = row_pattern.findall(table_html)
|
||||
rows = _ROW_PATTERN.findall(table_html)
|
||||
if not rows:
|
||||
continue
|
||||
|
||||
# Parse header row to find column indices
|
||||
ip_col = port_col = proto_col = -1
|
||||
header_row = rows[0]
|
||||
headers = cell_pattern.findall(header_row)
|
||||
headers = _CELL_PATTERN.findall(header_row)
|
||||
|
||||
for i, cell in enumerate(headers):
|
||||
cell_text = tag_strip.sub('', cell).strip().lower()
|
||||
cell_text = _TAG_STRIP.sub('', cell).strip().lower()
|
||||
if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
|
||||
ip_col = i
|
||||
elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
|
||||
@@ -302,11 +309,11 @@ def extract_proxies_from_table(content):
|
||||
|
||||
# Parse data rows
|
||||
for row in rows[1:]:
|
||||
cells = cell_pattern.findall(row)
|
||||
cells = _CELL_PATTERN.findall(row)
|
||||
if len(cells) <= ip_col:
|
||||
continue
|
||||
|
||||
ip_cell = tag_strip.sub('', cells[ip_col]).strip()
|
||||
ip_cell = _TAG_STRIP.sub('', cells[ip_col]).strip()
|
||||
|
||||
# Check if IP cell contains port (ip:port format)
|
||||
if ':' in ip_cell and port_col < 0:
|
||||
@@ -315,7 +322,7 @@ def extract_proxies_from_table(content):
|
||||
ip, port = match.groups()
|
||||
proto = None
|
||||
if proto_col >= 0 and len(cells) > proto_col:
|
||||
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
||||
proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
|
||||
addr = '%s:%s' % (ip, port)
|
||||
if is_usable_proxy(addr):
|
||||
proxies.append((addr, proto))
|
||||
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):
|
||||
|
||||
# Separate IP and Port columns
|
||||
if port_col >= 0 and len(cells) > port_col:
|
||||
port_cell = tag_strip.sub('', cells[port_col]).strip()
|
||||
port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
|
||||
try:
|
||||
port = int(port_cell)
|
||||
except ValueError:
|
||||
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):
|
||||
|
||||
proto = None
|
||||
if proto_col >= 0 and len(cells) > proto_col:
|
||||
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
||||
proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
|
||||
|
||||
addr = '%s:%d' % (ip_cell, port)
|
||||
if is_usable_proxy(addr):
|
||||
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
|
||||
"""
|
||||
proxies = []
|
||||
|
||||
# Short-circuit: content must contain JSON delimiters
|
||||
if '{' not in content and '[' not in content:
|
||||
return proxies
|
||||
|
||||
# Try to find JSON in content (may be embedded in HTML)
|
||||
json_matches = []
|
||||
|
||||
|
||||
Reference in New Issue
Block a user