fetch: consolidate extract_proxies into single implementation
This commit is contained in:
21
ppf.py
21
ppf.py
@@ -80,25 +80,6 @@ def import_proxies_from_file(proxydb, fn):
|
||||
return 0
|
||||
return 1
|
||||
|
||||
def extract_proxies(content):
    """Extract and normalize proxy addresses from content.

    The content is first passed through ``fetch.cleanhtml``; every
    ``ip:port`` looking token in the result is then normalized (leading
    zeros removed from each octet and from the port), de-duplicated, and
    kept only if ``fetch.is_usable_proxy`` accepts it.

    Args:
        content: Raw text (e.g. a fetched page) to scan for proxies.

    Returns:
        A list of unique ``'ip:port'`` strings, in order of first
        appearance in the content.
    """
    # An address must be followed by a non-digit or by the end of the text.
    # The earlier trailing class [\D$] treated '$' as a literal dollar sign
    # (anchors have no special meaning inside a character class), so an
    # address sitting at the very end of the content was silently dropped.
    matches = re.findall(
        r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?:\D|$)',
        fetch.cleanhtml(content))

    seen = set()
    uniques = []
    for match in matches:
        # Cleanse IP (remove leading zeros) and port.
        ip, port = match.split(':')
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        proxy = '%s:%d' % (ip, int(port))

        # Different spellings (e.g. 01.2.3.4 vs 1.2.3.4) collapse to the
        # same normalized form, so de-duplicate after normalizing.
        if proxy in seen:
            continue
        seen.add(proxy)

        if fetch.is_usable_proxy(proxy):
            uniques.append(proxy)

    return uniques
|
||||
|
||||
|
||||
class Leechered(threading.Thread):
|
||||
def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
|
||||
self.status = 'nok'
|
||||
@@ -135,7 +116,7 @@ class Leechered(threading.Thread):
|
||||
else:
|
||||
content = ''
|
||||
|
||||
unique = extract_proxies(content)
|
||||
unique = fetch.extract_proxies(content, filter_known=False)
|
||||
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
||||
proxy_count = len(self.proxylist)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user