diff --git a/fetch.py b/fetch.py index 5dbbe7f..749ce3b 100644 --- a/fetch.py +++ b/fetch.py @@ -12,11 +12,16 @@ def set_config(cfg): global config config = cfg +# Pre-compiled regex patterns (compiled once at module load) cleanhtml_re = [ re.compile('<.*?>'), re.compile('\s+'), re.compile('::+'), ] + +# Proxy extraction pattern: IP:PORT followed by a non-digit or the end of input ("$" is literal inside [...], so the end anchor is an alternation) +# Pattern: a digit run, then dot + digit run repeated 3 times, colon, 2-5 digit port +PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?:\D|$)') def cleanhtml(raw_html): html = raw_html.replace(' ', ' ') html = re.sub(cleanhtml_re[0], ':', html) @@ -233,7 +238,7 @@ def extract_proxies(content, proxydb=None, filter_known=True, proto=None): new_proxies is list of (address, proto) tuples If not filter_known: list of (address, proto) tuples """ - matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) + matches = PROXY_PATTERN.findall(cleanhtml(content)) uniques_dict = {} for p in matches: