fetch: consolidate extract_proxies into single implementation

This commit is contained in:
Username
2025-12-20 22:50:39 +01:00
parent 9588da92e7
commit 4780b6f095
2 changed files with 26 additions and 33 deletions

View File

@@ -117,32 +117,44 @@ def is_known_proxy(proxy):
"""Check if proxy is in known cache."""
return proxy in _known_proxies
def extract_proxies(content, proxydb=None, filter_known=True):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse.
        proxydb: Database connection for known proxy lookup (optional).
        filter_known: If True, filter out known proxies and return new only.

    Returns:
        If filter_known: (unique_count, new_proxies) tuple, where
        unique_count is the number of unique usable proxies found and
        new_proxies is the subset not previously known.
        If not filter_known: list of all unique valid proxies.
    """
    # Match "ip:port" tokens followed by a non-digit; cleanhtml strips markup
    # first so addresses split across tags are still found as plain text.
    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]',
                         cleanhtml(content))

    # Deduplicate while preserving first-seen order (dict keeps insertion order).
    uniques_dict = {}
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets)
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port (remove leading zeros, handle empty case)
        port = int(port.lstrip('0') or '0')
        p = '%s:%s' % (ip, port)
        uniques_dict[p] = True

    # Keep only proxies that pass the usability check.
    uniques = [p for p in uniques_dict if is_usable_proxy(p)]

    if not filter_known:
        return uniques

    # Initialize known-proxies cache from DB if a connection was provided.
    if proxydb is not None:
        init_known_proxies(proxydb)

    new = []
    for p in uniques:
        if not is_known_proxy(p):
            new.append(p)
            # Record immediately so the same proxy is not reported twice
            # across successive calls.
            add_known_proxies([p])
    return len(uniques), new