fetch: precompile proxy extraction regex
Move regex pattern compilation to module load time for better performance in repeated calls.
This commit is contained in:
7
fetch.py
7
fetch.py
@@ -12,11 +12,16 @@ def set_config(cfg):
|
|||||||
global config
|
global config
|
||||||
config = cfg
|
config = cfg
|
||||||
|
|
||||||
|
# Pre-compiled regex patterns (compiled once at module load)
|
||||||
cleanhtml_re = [
|
cleanhtml_re = [
|
||||||
re.compile('<.*?>'),
|
re.compile('<.*?>'),
|
||||||
re.compile('\s+'),
|
re.compile('\s+'),
|
||||||
re.compile('::+'),
|
re.compile('::+'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Proxy extraction pattern: IP:PORT followed by non-digit or end
|
||||||
|
# Pattern: four dot-separated groups of one or more digits (octets are not range-checked), colon, 2-5 digit port
|
||||||
|
# The trailing (?:\D|$) accepts either a non-digit character or end of input
# after the port. Inside a character class such as [\D$] the `$` is a literal
# dollar sign, so an address at the very end of the text would never match.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?:\D|$)')
|
||||||
def cleanhtml(raw_html):
|
def cleanhtml(raw_html):
|
||||||
html = raw_html.replace(' ', ' ')
|
html = raw_html.replace(' ', ' ')
|
||||||
html = re.sub(cleanhtml_re[0], ':', html)
|
html = re.sub(cleanhtml_re[0], ':', html)
|
||||||
@@ -233,7 +238,7 @@ def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
|||||||
new_proxies is list of (address, proto) tuples
|
new_proxies is list of (address, proto) tuples
|
||||||
If not filter_known: list of (address, proto) tuples
|
If not filter_known: list of (address, proto) tuples
|
||||||
"""
|
"""
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
matches = PROXY_PATTERN.findall(cleanhtml(content))
|
||||||
|
|
||||||
uniques_dict = {}
|
uniques_dict = {}
|
||||||
for p in matches:
|
for p in matches:
|
||||||
|
|||||||
Reference in New Issue
Block a user