def detect_proto_from_path(url):
    """Detect proxy protocol from a source URL or file path.

    Many proxy lists indicate the protocol in their path or file name:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/, /connect -> http (HTTPS proxies use HTTP CONNECT)

    The match is a case-insensitive substring test against the whole
    string, so it is a heuristic: socks5 wins over socks4, which wins
    over the http markers.

    Args:
        url: Source URL path, full URL or file name. May be None or empty.

    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not detected
    """
    # Callers may have no source location at all; treat that as "unknown"
    # instead of failing on None.lower().
    if not url:
        return None

    lowered = url.lower()

    # 'socks5' must be tested before 'socks4' so the more specific
    # protocol wins when both appear; 'socks4' also covers 'socks4a'.
    for socks_proto in ('socks5', 'socks4'):
        if socks_proto in lowered:
            return socks_proto

    # '/http' also matches '/https', so no separate '/https' marker is
    # needed. A bare scheme prefix ('http://', 'https://') matches none of
    # these markers, so a full URL is not classified by its scheme alone.
    http_markers = ('/http', 'http-', 'http_', 'http.', '/ssl', '/connect')
    for marker in http_markers:
        if marker in lowered:
            return 'http'

    return None
# Global profiler for signal handler access
_profiler = None


# Handle SIGTERM gracefully (for container stop)
def sigterm_handler(signum, frame):
    """Turn SIGTERM into KeyboardInterrupt, flushing profiler stats first.

    If the module-level ``_profiler`` is set, it is disabled and its stats
    are written to data/profile.stats before the interrupt is raised.
    A failure to write the stats is logged and otherwise ignored, so that
    the KeyboardInterrupt is raised in every case.

    Args:
        signum: Signal number passed in by the signal module (unused).
        frame: Interrupted stack frame passed in by the signal module (unused).

    Raises:
        KeyboardInterrupt: always.
    """
    # Only reads the global, so no ``global`` declaration is required.
    if _profiler:
        try:
            _profiler.disable()
            _profiler.dump_stats('data/profile.stats')
        except (IOError, OSError) as exc:
            # Best effort only: an unwritable stats file must not replace
            # the KeyboardInterrupt with an I/O error at an arbitrary point
            # of the interrupted code.
            _log('could not write data/profile.stats (SIGTERM): %s' % exc, 'info')
        else:
            _log('profile stats written to data/profile.stats (SIGTERM)', 'info')
    raise KeyboardInterrupt
fetch.extract_proxies(content, proxydb, proto=proto) if new: dbs.insert_proxies(proxydb, new, fn) return 0 @@ -134,7 +144,9 @@ class Leechered(threading.Thread): else: content = '' - unique = fetch.extract_proxies(content, filter_known=False) + # Detect protocol from source URL (e.g., .../socks5/list.txt) + proto = fetch.detect_proto_from_path(self.url) + unique = fetch.extract_proxies(content, filter_known=False, proto=proto) # Compute hash of all extracted proxies for change detection self.new_hash = dbs.compute_proxy_list_hash(unique) @@ -152,7 +164,8 @@ class Leechered(threading.Thread): # Content changed or first fetch - reset stale_count, proceed with normal processing self.stale_count = 0 - self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ] + # unique is list of (address, proto) tuples; filter by address, keep tuple + self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)] proxy_count = len(self.proxylist) if self.retrievals == 0: # new site @@ -250,9 +263,10 @@ def main(): for thread in threads: if thread.status == 'ok': url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve() - new = [ p for p in proxylist if not fetch.is_known_proxy(p) ] + # proxylist is list of (address, proto) tuples + new = [(addr, pr) for addr, pr in proxylist if not fetch.is_known_proxy(addr)] if new: - fetch.add_known_proxies(new) + fetch.add_known_proxies([addr for addr, pr in new]) # Update content_hash if we have a new one new_hash = thread.new_hash execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url) @@ -262,7 +276,7 @@ def main(): threads = [ thread for thread in threads if thread.is_alive() ] if len(threads) < config.ppf.threads and rows: - p = random.sample(_proxylist, 5) if _proxylist is not None else None + p = random.sample(_proxylist, min(5, len(_proxylist))) if _proxylist else None row = 
random.choice(rows) urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0])) urldb.commit() @@ -297,17 +311,16 @@ if __name__ == '__main__': set_nobs(True) if config.args.profile: - _log('profiling enabled, output to profile.stats', 'info') - profiler = cProfile.Profile() + _log('profiling enabled, output to data/profile.stats', 'info') + _profiler = cProfile.Profile() try: - profiler.enable() + _profiler.enable() main() finally: - profiler.disable() - profiler.dump_stats('profile.stats') - _log('profile stats written to profile.stats', 'info') - # print top 20 by cumulative time - stats = pstats.Stats('profile.stats') + _profiler.disable() + _profiler.dump_stats('data/profile.stats') + _log('profile stats written to data/profile.stats', 'info') + stats = pstats.Stats('data/profile.stats') stats.strip_dirs().sort_stats('cumulative').print_stats(20) else: main()