fetch: detect proxy protocol from source URL path
- detect_proto_from_path() infers socks4/socks5/http from the source URL or file path
- extract_proxies() now returns (address, proto) tuples
- ppf.py updated to handle protocol-tagged proxies
- ppf.py: SIGTERM handler now dumps profiler stats to data/profile.stats before exiting
This commit is contained in:
40
fetch.py
40
fetch.py
@@ -191,17 +191,47 @@ def is_known_proxy(proxy):
|
|||||||
"""Check if proxy is in known cache."""
|
"""Check if proxy is in known cache."""
|
||||||
return proxy in _known_proxies
|
return proxy in _known_proxies
|
||||||
|
|
||||||
def extract_proxies(content, proxydb=None, filter_known=True):
|
def detect_proto_from_path(url):
    """Detect proxy protocol from a source URL or file path.

    Many proxy lists indicate protocol in their path:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Matching is a case-insensitive substring search over the whole string,
    so a hint in the host name or file name counts as well.

    Args:
        url: Source URL path, full URL, or local file name. May be None
            or empty, in which case no protocol is detected.

    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not detected
    """
    # A missing/empty source carries no hint; bail out instead of raising
    # AttributeError on None.lower().
    if not url:
        return None

    url_lower = url.lower()

    # SOCKS markers are checked before HTTP ones, and socks5 before socks4,
    # preserving the original precedence. 'socks4' also covers 'socks4a'.
    for proto in ('socks5', 'socks4'):
        if proto in url_lower:
            return proto

    # Every HTTP marker needs a delimiter next to 'http' so that a bare
    # 'http://' or 'https://' scheme prefix does not tag the list as http.
    http_markers = ('/http', 'http-', 'http_', 'http.', '/https', '/ssl', '/connect')
    if any(marker in url_lower for marker in http_markers):
        return 'http'

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
||||||
"""Extract and normalize proxy addresses from content.
|
"""Extract and normalize proxy addresses from content.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
content: HTML/text content to parse
|
content: HTML/text content to parse
|
||||||
proxydb: Database connection for known proxy lookup (optional)
|
proxydb: Database connection for known proxy lookup (optional)
|
||||||
filter_known: If True, filter out known proxies and return new only
|
filter_known: If True, filter out known proxies and return new only
|
||||||
|
proto: Protocol to assign to all extracted proxies (from source URL)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
If filter_known: (unique_count, new_proxies) tuple
|
If filter_known: (unique_count, new_proxies) tuple
|
||||||
If not filter_known: list of all unique valid proxies
|
new_proxies is list of (address, proto) tuples
|
||||||
|
If not filter_known: list of (address, proto) tuples
|
||||||
"""
|
"""
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
||||||
|
|
||||||
@@ -215,7 +245,7 @@ def extract_proxies(content, proxydb=None, filter_known=True):
|
|||||||
p = '%s:%s' % (ip, port)
|
p = '%s:%s' % (ip, port)
|
||||||
uniques_dict[p] = True
|
uniques_dict[p] = True
|
||||||
|
|
||||||
uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
|
uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]
|
||||||
|
|
||||||
if not filter_known:
|
if not filter_known:
|
||||||
return uniques
|
return uniques
|
||||||
@@ -225,9 +255,9 @@ def extract_proxies(content, proxydb=None, filter_known=True):
|
|||||||
init_known_proxies(proxydb)
|
init_known_proxies(proxydb)
|
||||||
|
|
||||||
new = []
|
new = []
|
||||||
for p in uniques:
|
for p, pr in uniques:
|
||||||
if not is_known_proxy(p):
|
if not is_known_proxy(p):
|
||||||
new.append(p)
|
new.append((p, pr))
|
||||||
add_known_proxies([p])
|
add_known_proxies([p])
|
||||||
|
|
||||||
return len(uniques), new
|
return len(uniques), new
|
||||||
|
|||||||
41
ppf.py
41
ppf.py
@@ -16,8 +16,16 @@ import re
|
|||||||
import threading
|
import threading
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
# Global profiler for signal handler access
|
||||||
|
_profiler = None
|
||||||
|
|
||||||
# Handle SIGTERM gracefully (for container stop)
|
# Handle SIGTERM gracefully (for container stop)
|
||||||
def sigterm_handler(signum, frame):
|
def sigterm_handler(signum, frame):
|
||||||
|
global _profiler
|
||||||
|
if _profiler:
|
||||||
|
_profiler.disable()
|
||||||
|
_profiler.dump_stats('data/profile.stats')
|
||||||
|
_log('profile stats written to data/profile.stats (SIGTERM)', 'info')
|
||||||
raise KeyboardInterrupt
|
raise KeyboardInterrupt
|
||||||
|
|
||||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||||
@@ -83,7 +91,9 @@ def extract_urls(html, url):
|
|||||||
|
|
||||||
def import_proxies_from_file(proxydb, fn):
|
def import_proxies_from_file(proxydb, fn):
|
||||||
content = open(fn, 'r').read()
|
content = open(fn, 'r').read()
|
||||||
unique_count, new = fetch.extract_proxies(content, proxydb)
|
# Detect protocol from filename (e.g., socks5.txt, http-proxies.txt)
|
||||||
|
proto = fetch.detect_proto_from_path(fn)
|
||||||
|
unique_count, new = fetch.extract_proxies(content, proxydb, proto=proto)
|
||||||
if new:
|
if new:
|
||||||
dbs.insert_proxies(proxydb, new, fn)
|
dbs.insert_proxies(proxydb, new, fn)
|
||||||
return 0
|
return 0
|
||||||
@@ -134,7 +144,9 @@ class Leechered(threading.Thread):
|
|||||||
else:
|
else:
|
||||||
content = ''
|
content = ''
|
||||||
|
|
||||||
unique = fetch.extract_proxies(content, filter_known=False)
|
# Detect protocol from source URL (e.g., .../socks5/list.txt)
|
||||||
|
proto = fetch.detect_proto_from_path(self.url)
|
||||||
|
unique = fetch.extract_proxies(content, filter_known=False, proto=proto)
|
||||||
|
|
||||||
# Compute hash of all extracted proxies for change detection
|
# Compute hash of all extracted proxies for change detection
|
||||||
self.new_hash = dbs.compute_proxy_list_hash(unique)
|
self.new_hash = dbs.compute_proxy_list_hash(unique)
|
||||||
@@ -152,7 +164,8 @@ class Leechered(threading.Thread):
|
|||||||
|
|
||||||
# Content changed or first fetch - reset stale_count, proceed with normal processing
|
# Content changed or first fetch - reset stale_count, proceed with normal processing
|
||||||
self.stale_count = 0
|
self.stale_count = 0
|
||||||
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
# unique is list of (address, proto) tuples; filter by address, keep tuple
|
||||||
|
self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)]
|
||||||
proxy_count = len(self.proxylist)
|
proxy_count = len(self.proxylist)
|
||||||
|
|
||||||
if self.retrievals == 0: # new site
|
if self.retrievals == 0: # new site
|
||||||
@@ -250,9 +263,10 @@ def main():
|
|||||||
for thread in threads:
|
for thread in threads:
|
||||||
if thread.status == 'ok':
|
if thread.status == 'ok':
|
||||||
url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
|
url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
|
||||||
new = [ p for p in proxylist if not fetch.is_known_proxy(p) ]
|
# proxylist is list of (address, proto) tuples
|
||||||
|
new = [(addr, pr) for addr, pr in proxylist if not fetch.is_known_proxy(addr)]
|
||||||
if new:
|
if new:
|
||||||
fetch.add_known_proxies(new)
|
fetch.add_known_proxies([addr for addr, pr in new])
|
||||||
# Update content_hash if we have a new one
|
# Update content_hash if we have a new one
|
||||||
new_hash = thread.new_hash
|
new_hash = thread.new_hash
|
||||||
execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url)
|
execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url)
|
||||||
@@ -262,7 +276,7 @@ def main():
|
|||||||
|
|
||||||
threads = [ thread for thread in threads if thread.is_alive() ]
|
threads = [ thread for thread in threads if thread.is_alive() ]
|
||||||
if len(threads) < config.ppf.threads and rows:
|
if len(threads) < config.ppf.threads and rows:
|
||||||
p = random.sample(_proxylist, 5) if _proxylist is not None else None
|
p = random.sample(_proxylist, min(5, len(_proxylist))) if _proxylist else None
|
||||||
row = random.choice(rows)
|
row = random.choice(rows)
|
||||||
urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
|
urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
|
||||||
urldb.commit()
|
urldb.commit()
|
||||||
@@ -297,17 +311,16 @@ if __name__ == '__main__':
|
|||||||
set_nobs(True)
|
set_nobs(True)
|
||||||
|
|
||||||
if config.args.profile:
|
if config.args.profile:
|
||||||
_log('profiling enabled, output to profile.stats', 'info')
|
_log('profiling enabled, output to data/profile.stats', 'info')
|
||||||
profiler = cProfile.Profile()
|
_profiler = cProfile.Profile()
|
||||||
try:
|
try:
|
||||||
profiler.enable()
|
_profiler.enable()
|
||||||
main()
|
main()
|
||||||
finally:
|
finally:
|
||||||
profiler.disable()
|
_profiler.disable()
|
||||||
profiler.dump_stats('profile.stats')
|
_profiler.dump_stats('data/profile.stats')
|
||||||
_log('profile stats written to profile.stats', 'info')
|
_log('profile stats written to data/profile.stats', 'info')
|
||||||
# print top 20 by cumulative time
|
stats = pstats.Stats('data/profile.stats')
|
||||||
stats = pstats.Stats('profile.stats')
|
|
||||||
stats.strip_dirs().sort_stats('cumulative').print_stats(20)
|
stats.strip_dirs().sort_stats('cumulative').print_stats(20)
|
||||||
else:
|
else:
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user