From 4780b6f09584c4cf8380ef10b41e1e520123575d Mon Sep 17 00:00:00 2001
From: Username
Date: Sat, 20 Dec 2025 22:50:39 +0100
Subject: [PATCH] fetch: consolidate extract_proxies into single implementation

---
 fetch.py | 38 +++++++++++++++++++++++++-------------
 ppf.py   | 21 +--------------------
 2 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/fetch.py b/fetch.py
index d85b75f..fdb0558 100644
--- a/fetch.py
+++ b/fetch.py
@@ -117,32 +117,44 @@ def is_known_proxy(proxy):
     """Check if proxy is in known cache."""
     return proxy in _known_proxies
 
-def extract_proxies(content, proxydb):
+def extract_proxies(content, proxydb=None, filter_known=True):
+    """Extract and normalize proxy addresses from content.
+
+    Args:
+        content: HTML/text content to parse
+        proxydb: Database connection for known proxy lookup (optional)
+        filter_known: If True, filter out known proxies and return new only
+
+    Returns:
+        If filter_known: (unique_count, new_proxies) tuple
+        If not filter_known: list of all unique valid proxies
+    """
     matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
     uniques_dict = {}
     for p in matches:
         ip, port = p.split(':')
-        ip = '.'.join( [ str(int(str(i))) for i in ip.split('.') ] )
-        port = int( port.lstrip('0') )
+        # Normalize IP (remove leading zeros from octets)
+        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
+        # Normalize port (remove leading zeros, handle empty case)
+        port = int(port.lstrip('0') or '0')
         p = '%s:%s' % (ip, port)
         uniques_dict[p] = True
 
-    uniques = []
-    for p in uniques_dict.keys():
-        if is_usable_proxy(p): uniques.append(p)
+    uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
 
-    global _known_proxies
-    if len(_known_proxies) == 0:
-        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
-        for k in known:
-            _known_proxies[k[0]] = True
+    if not filter_known:
+        return uniques
+
+    # Initialize known proxies from DB if needed
+    if proxydb is not None:
+        init_known_proxies(proxydb)
 
     new = []
     for p in uniques:
-        if not p in _known_proxies:
+        if not is_known_proxy(p):
             new.append(p)
-            _known_proxies[p] = True
+            add_known_proxies([p])
 
     return len(uniques), new
 
 
diff --git a/ppf.py b/ppf.py
index e8e210d..7f114fb 100755
--- a/ppf.py
+++ b/ppf.py
@@ -80,25 +80,6 @@ def import_proxies_from_file(proxydb, fn):
             return 0
     return 1
 
-def extract_proxies(content):
-    """Extract and normalize proxy addresses from content."""
-    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', fetch.cleanhtml(content))
-    uniques_dict = {}
-    for p in matches:
-        # Cleanse IP (remove leading zeros) and port
-        ip, port = p.split(':')
-        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
-        port = int(port.lstrip('0') or '0')
-        p = '%s:%s' % (ip, port)
-        uniques_dict[p] = True
-
-    uniques = []
-    for p in uniques_dict.keys():
-        if fetch.is_usable_proxy(p): uniques.append(p)
-
-    return uniques
-
-
 class Leechered(threading.Thread):
     def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
         self.status = 'nok'
@@ -135,7 +116,7 @@ class Leechered(threading.Thread):
         else:
             content = ''
 
-        unique = extract_proxies(content)
+        unique = fetch.extract_proxies(content, filter_known=False)
         self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
 
         proxy_count = len(self.proxylist)