fetch: precompile proxy extraction regex

Move regex pattern compilation to module load time
for better performance in repeated calls.
This commit is contained in:
Username
2025-12-24 00:20:06 +01:00
parent 33f9a211ce
commit 5e788c06d1

View File

@@ -12,11 +12,16 @@ def set_config(cfg):
global config global config
config = cfg config = cfg
# Pre-compiled regex patterns (compiled once at module load)
cleanhtml_re = [ cleanhtml_re = [
re.compile('<.*?>'), re.compile('<.*?>'),
re.compile('\s+'), re.compile('\s+'),
re.compile('::+'), re.compile('::+'),
] ]
# Proxy extraction pattern: captures IP:PORT where the port is not followed
# by another digit (any non-digit character or end of input may follow).
# Shape: four dot-separated digit runs (one or more digits each, values are
# not range-checked), a colon, then a 2-5 digit port.
# NOTE: a trailing character class such as [\D$] does not express "non-digit
# or end": inside a class `$` is a literal dollar sign, so an address at the
# very end of the input would never match. The zero-width lookahead (?!\d)
# covers both cases and does not consume the delimiter.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?!\d)')
def cleanhtml(raw_html): def cleanhtml(raw_html):
html = raw_html.replace('&nbsp;', ' ') html = raw_html.replace('&nbsp;', ' ')
html = re.sub(cleanhtml_re[0], ':', html) html = re.sub(cleanhtml_re[0], ':', html)
@@ -233,7 +238,7 @@ def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
new_proxies is list of (address, proto) tuples new_proxies is list of (address, proto) tuples
If not filter_known: list of (address, proto) tuples If not filter_known: list of (address, proto) tuples
""" """
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) matches = PROXY_PATTERN.findall(cleanhtml(content))
uniques_dict = {} uniques_dict = {}
for p in matches: for p in matches: