fetch: add short-circuit guards to extraction functions
Skip expensive regex scans when content lacks required markers:
- extract_auth_proxies: skip if no '@' in content
- extract_proxies_from_table: skip if no '<table' tag
- extract_proxies_from_json: skip if content contains neither '{' nor '['
- Also (separate from the guards above): hoist the table-parsing regexes out of extract_proxies_from_table into module-level precompiled constants
This commit is contained in:
41
fetch.py
41
fetch.py
@@ -221,6 +221,10 @@ def extract_auth_proxies(content):
|
|||||||
"""
|
"""
|
||||||
proxies = []
|
proxies = []
|
||||||
|
|
||||||
|
# Short-circuit: auth proxies always contain @
|
||||||
|
if '@' not in content:
|
||||||
|
return proxies
|
||||||
|
|
||||||
# IPv4 auth proxies
|
# IPv4 auth proxies
|
||||||
for match in AUTH_PROXY_PATTERN.finditer(content):
|
for match in AUTH_PROXY_PATTERN.finditer(content):
|
||||||
proto_str, user, passwd, ip, port = match.groups()
|
proto_str, user, passwd, ip, port = match.groups()
|
||||||
@@ -256,6 +260,12 @@ TABLE_PORT_HEADERS = ('port',)
|
|||||||
TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')
|
TABLE_PROTO_HEADERS = ('type', 'protocol', 'proto', 'scheme')
|
||||||
|
|
||||||
|
|
||||||
|
_TABLE_PATTERN = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
|
||||||
|
_ROW_PATTERN = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
||||||
|
_CELL_PATTERN = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
|
||||||
|
_TAG_STRIP = re.compile(r'<[^>]+>')
|
||||||
|
|
||||||
|
|
||||||
def extract_proxies_from_table(content):
|
def extract_proxies_from_table(content):
|
||||||
"""Extract proxies from HTML tables with IP/Port/Protocol columns.
|
"""Extract proxies from HTML tables with IP/Port/Protocol columns.
|
||||||
|
|
||||||
@@ -269,26 +279,23 @@ def extract_proxies_from_table(content):
|
|||||||
"""
|
"""
|
||||||
proxies = []
|
proxies = []
|
||||||
|
|
||||||
# Simple regex-based table parsing (works without BeautifulSoup)
|
# Short-circuit: no HTML tables in plain text content
|
||||||
# Find all tables
|
if '<table' not in content and '<TABLE' not in content:
|
||||||
table_pattern = re.compile(r'<table[^>]*>(.*?)</table>', re.IGNORECASE | re.DOTALL)
|
return proxies
|
||||||
row_pattern = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
|
||||||
cell_pattern = re.compile(r'<t[hd][^>]*>(.*?)</t[hd]>', re.IGNORECASE | re.DOTALL)
|
|
||||||
tag_strip = re.compile(r'<[^>]+>')
|
|
||||||
|
|
||||||
for table_match in table_pattern.finditer(content):
|
for table_match in _TABLE_PATTERN.finditer(content):
|
||||||
table_html = table_match.group(1)
|
table_html = table_match.group(1)
|
||||||
rows = row_pattern.findall(table_html)
|
rows = _ROW_PATTERN.findall(table_html)
|
||||||
if not rows:
|
if not rows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Parse header row to find column indices
|
# Parse header row to find column indices
|
||||||
ip_col = port_col = proto_col = -1
|
ip_col = port_col = proto_col = -1
|
||||||
header_row = rows[0]
|
header_row = rows[0]
|
||||||
headers = cell_pattern.findall(header_row)
|
headers = _CELL_PATTERN.findall(header_row)
|
||||||
|
|
||||||
for i, cell in enumerate(headers):
|
for i, cell in enumerate(headers):
|
||||||
cell_text = tag_strip.sub('', cell).strip().lower()
|
cell_text = _TAG_STRIP.sub('', cell).strip().lower()
|
||||||
if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
|
if ip_col < 0 and any(h in cell_text for h in TABLE_IP_HEADERS):
|
||||||
ip_col = i
|
ip_col = i
|
||||||
elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
|
elif port_col < 0 and any(h in cell_text for h in TABLE_PORT_HEADERS):
|
||||||
@@ -302,11 +309,11 @@ def extract_proxies_from_table(content):
|
|||||||
|
|
||||||
# Parse data rows
|
# Parse data rows
|
||||||
for row in rows[1:]:
|
for row in rows[1:]:
|
||||||
cells = cell_pattern.findall(row)
|
cells = _CELL_PATTERN.findall(row)
|
||||||
if len(cells) <= ip_col:
|
if len(cells) <= ip_col:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ip_cell = tag_strip.sub('', cells[ip_col]).strip()
|
ip_cell = _TAG_STRIP.sub('', cells[ip_col]).strip()
|
||||||
|
|
||||||
# Check if IP cell contains port (ip:port format)
|
# Check if IP cell contains port (ip:port format)
|
||||||
if ':' in ip_cell and port_col < 0:
|
if ':' in ip_cell and port_col < 0:
|
||||||
@@ -315,7 +322,7 @@ def extract_proxies_from_table(content):
|
|||||||
ip, port = match.groups()
|
ip, port = match.groups()
|
||||||
proto = None
|
proto = None
|
||||||
if proto_col >= 0 and len(cells) > proto_col:
|
if proto_col >= 0 and len(cells) > proto_col:
|
||||||
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
|
||||||
addr = '%s:%s' % (ip, port)
|
addr = '%s:%s' % (ip, port)
|
||||||
if is_usable_proxy(addr):
|
if is_usable_proxy(addr):
|
||||||
proxies.append((addr, proto))
|
proxies.append((addr, proto))
|
||||||
@@ -323,7 +330,7 @@ def extract_proxies_from_table(content):
|
|||||||
|
|
||||||
# Separate IP and Port columns
|
# Separate IP and Port columns
|
||||||
if port_col >= 0 and len(cells) > port_col:
|
if port_col >= 0 and len(cells) > port_col:
|
||||||
port_cell = tag_strip.sub('', cells[port_col]).strip()
|
port_cell = _TAG_STRIP.sub('', cells[port_col]).strip()
|
||||||
try:
|
try:
|
||||||
port = int(port_cell)
|
port = int(port_cell)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@@ -335,7 +342,7 @@ def extract_proxies_from_table(content):
|
|||||||
|
|
||||||
proto = None
|
proto = None
|
||||||
if proto_col >= 0 and len(cells) > proto_col:
|
if proto_col >= 0 and len(cells) > proto_col:
|
||||||
proto = _normalize_proto(tag_strip.sub('', cells[proto_col]).strip())
|
proto = _normalize_proto(_TAG_STRIP.sub('', cells[proto_col]).strip())
|
||||||
|
|
||||||
addr = '%s:%d' % (ip_cell, port)
|
addr = '%s:%d' % (ip_cell, port)
|
||||||
if is_usable_proxy(addr):
|
if is_usable_proxy(addr):
|
||||||
@@ -358,6 +365,10 @@ def extract_proxies_from_json(content):
|
|||||||
"""
|
"""
|
||||||
proxies = []
|
proxies = []
|
||||||
|
|
||||||
|
# Short-circuit: content must contain JSON delimiters
|
||||||
|
if '{' not in content and '[' not in content:
|
||||||
|
return proxies
|
||||||
|
|
||||||
# Try to find JSON in content (may be embedded in HTML)
|
# Try to find JSON in content (may be embedded in HTML)
|
||||||
json_matches = []
|
json_matches = []
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user