fetch: detect proxy protocol from source URL path

- detect_proto_from_path() infers socks4/socks5/http from a source URL or file path
- extract_proxies() now returns (address, proto) tuples
- ppf.py updated to handle protocol-tagged proxies
- profiler signal handler for SIGTERM stats dump
This commit is contained in:
Username
2025-12-23 17:23:17 +01:00
parent e0e330301a
commit 68a34f2638
2 changed files with 62 additions and 19 deletions

41
ppf.py
View File

@@ -16,8 +16,16 @@ import re
import threading
import random
# Global profiler for signal handler access
_profiler = None
# Handle SIGTERM gracefully (for container stop)
def sigterm_handler(signum, frame):
    """Turn SIGTERM into a KeyboardInterrupt, saving profiler stats first.

    Signal-handler callback; both arguments are required by the handler
    signature and are not used.

    If the module-level ``_profiler`` is set, it is disabled and its stats
    are dumped to ``data/profile.stats`` before the interrupt is raised.

    Raises:
        KeyboardInterrupt: always.
    """
    active = _profiler  # read-only access to the module-level profiler
    if active:
        # Stop collecting before writing so the dump is a stable snapshot.
        active.disable()
        active.dump_stats('data/profile.stats')
        _log('profile stats written to data/profile.stats (SIGTERM)', 'info')
    # Raised whether or not profiling is active.
    raise KeyboardInterrupt
signal.signal(signal.SIGTERM, sigterm_handler)
@@ -83,7 +91,9 @@ def extract_urls(html, url):
def import_proxies_from_file(proxydb, fn):
content = open(fn, 'r').read()
unique_count, new = fetch.extract_proxies(content, proxydb)
# Detect protocol from filename (e.g., socks5.txt, http-proxies.txt)
proto = fetch.detect_proto_from_path(fn)
unique_count, new = fetch.extract_proxies(content, proxydb, proto=proto)
if new:
dbs.insert_proxies(proxydb, new, fn)
return 0
@@ -134,7 +144,9 @@ class Leechered(threading.Thread):
else:
content = ''
unique = fetch.extract_proxies(content, filter_known=False)
# Detect protocol from source URL (e.g., .../socks5/list.txt)
proto = fetch.detect_proto_from_path(self.url)
unique = fetch.extract_proxies(content, filter_known=False, proto=proto)
# Compute hash of all extracted proxies for change detection
self.new_hash = dbs.compute_proxy_list_hash(unique)
@@ -152,7 +164,8 @@ class Leechered(threading.Thread):
# Content changed or first fetch - reset stale_count, proceed with normal processing
self.stale_count = 0
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
# unique is list of (address, proto) tuples; filter by address, keep tuple
self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)]
proxy_count = len(self.proxylist)
if self.retrievals == 0: # new site
@@ -250,9 +263,10 @@ def main():
for thread in threads:
if thread.status == 'ok':
url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
new = [ p for p in proxylist if not fetch.is_known_proxy(p) ]
# proxylist is list of (address, proto) tuples
new = [(addr, pr) for addr, pr in proxylist if not fetch.is_known_proxy(addr)]
if new:
fetch.add_known_proxies(new)
fetch.add_known_proxies([addr for addr, pr in new])
# Update content_hash if we have a new one
new_hash = thread.new_hash
execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url)
@@ -262,7 +276,7 @@ def main():
threads = [ thread for thread in threads if thread.is_alive() ]
if len(threads) < config.ppf.threads and rows:
p = random.sample(_proxylist, 5) if _proxylist is not None else None
p = random.sample(_proxylist, min(5, len(_proxylist))) if _proxylist else None
row = random.choice(rows)
urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
urldb.commit()
@@ -297,17 +311,16 @@ if __name__ == '__main__':
set_nobs(True)
if config.args.profile:
_log('profiling enabled, output to profile.stats', 'info')
profiler = cProfile.Profile()
_log('profiling enabled, output to data/profile.stats', 'info')
_profiler = cProfile.Profile()
try:
profiler.enable()
_profiler.enable()
main()
finally:
profiler.disable()
profiler.dump_stats('profile.stats')
_log('profile stats written to profile.stats', 'info')
# print top 20 by cumulative time
stats = pstats.Stats('profile.stats')
_profiler.disable()
_profiler.dump_stats('data/profile.stats')
_log('profile stats written to data/profile.stats', 'info')
stats = pstats.Stats('data/profile.stats')
stats.strip_dirs().sort_stats('cumulative').print_stats(20)
else:
main()