fetch: consolidate extract_proxies into single implementation
This commit is contained in:
38
fetch.py
38
fetch.py
@@ -117,32 +117,44 @@ def is_known_proxy(proxy):
|
|||||||
"""Check if proxy is in known cache."""
|
"""Check if proxy is in known cache."""
|
||||||
return proxy in _known_proxies
|
return proxy in _known_proxies
|
||||||
|
|
||||||
def extract_proxies(content, proxydb):
|
def extract_proxies(content, proxydb=None, filter_known=True):
|
||||||
|
"""Extract and normalize proxy addresses from content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: HTML/text content to parse
|
||||||
|
proxydb: Database connection for known proxy lookup (optional)
|
||||||
|
filter_known: If True, filter out known proxies and return new only
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
If filter_known: (unique_count, new_proxies) tuple
|
||||||
|
If not filter_known: list of all unique valid proxies
|
||||||
|
"""
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
||||||
|
|
||||||
uniques_dict = {}
|
uniques_dict = {}
|
||||||
for p in matches:
|
for p in matches:
|
||||||
ip, port = p.split(':')
|
ip, port = p.split(':')
|
||||||
ip = '.'.join( [ str(int(str(i))) for i in ip.split('.') ] )
|
# Normalize IP (remove leading zeros from octets)
|
||||||
port = int( port.lstrip('0') )
|
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
||||||
|
# Normalize port (remove leading zeros, handle empty case)
|
||||||
|
port = int(port.lstrip('0') or '0')
|
||||||
p = '%s:%s' % (ip, port)
|
p = '%s:%s' % (ip, port)
|
||||||
uniques_dict[p] = True
|
uniques_dict[p] = True
|
||||||
|
|
||||||
uniques = []
|
uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
|
||||||
for p in uniques_dict.keys():
|
|
||||||
if is_usable_proxy(p): uniques.append(p)
|
|
||||||
|
|
||||||
global _known_proxies
|
if not filter_known:
|
||||||
if len(_known_proxies) == 0:
|
return uniques
|
||||||
known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
|
|
||||||
for k in known:
|
# Initialize known proxies from DB if needed
|
||||||
_known_proxies[k[0]] = True
|
if proxydb is not None:
|
||||||
|
init_known_proxies(proxydb)
|
||||||
|
|
||||||
new = []
|
new = []
|
||||||
for p in uniques:
|
for p in uniques:
|
||||||
if not p in _known_proxies:
|
if not is_known_proxy(p):
|
||||||
new.append(p)
|
new.append(p)
|
||||||
_known_proxies[p] = True
|
add_known_proxies([p])
|
||||||
|
|
||||||
return len(uniques), new
|
return len(uniques), new
|
||||||
|
|
||||||
|
|||||||
21
ppf.py
21
ppf.py
@@ -80,25 +80,6 @@ def import_proxies_from_file(proxydb, fn):
|
|||||||
return 0
|
return 0
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def extract_proxies(content):
|
|
||||||
"""Extract and normalize proxy addresses from content."""
|
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', fetch.cleanhtml(content))
|
|
||||||
uniques_dict = {}
|
|
||||||
for p in matches:
|
|
||||||
# Cleanse IP (remove leading zeros) and port
|
|
||||||
ip, port = p.split(':')
|
|
||||||
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
|
||||||
port = int(port.lstrip('0') or '0')
|
|
||||||
p = '%s:%s' % (ip, port)
|
|
||||||
uniques_dict[p] = True
|
|
||||||
|
|
||||||
uniques = []
|
|
||||||
for p in uniques_dict.keys():
|
|
||||||
if fetch.is_usable_proxy(p): uniques.append(p)
|
|
||||||
|
|
||||||
return uniques
|
|
||||||
|
|
||||||
|
|
||||||
class Leechered(threading.Thread):
|
class Leechered(threading.Thread):
|
||||||
def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
|
def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
|
||||||
self.status = 'nok'
|
self.status = 'nok'
|
||||||
@@ -135,7 +116,7 @@ class Leechered(threading.Thread):
|
|||||||
else:
|
else:
|
||||||
content = ''
|
content = ''
|
||||||
|
|
||||||
unique = extract_proxies(content)
|
unique = fetch.extract_proxies(content, filter_known=False)
|
||||||
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
||||||
proxy_count = len(self.proxylist)
|
proxy_count = len(self.proxylist)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user