#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Multi-engine proxy list scraper."""

import dbs
import random
import time
import urllib
import mysqlite
import proxywatchd
from misc import _log
from config import Config
import fetch
import engines
import translations
import os

config = Config()

# Load Searx instances if file exists
searx_instances = []
if os.path.exists('searx.instances'):
    with open('searx.instances') as h:
        searx_instances = [line.strip() for line in h.readlines()
                           if line.lower().startswith('http')]


class EngineTracker(object):
    """Track multiple search engine instances with rate limiting."""

    def __init__(self, engine_names, searx_urls, base_delay=30, max_delay=3600):
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.failures = {}
        self.backoff_until = {}
        self.success_count = {}
        # Build list of (engine_instance, identifier)
        self.engines = []
        for name in engine_names:
            name = name.strip().lower()
            if name == 'searx':
                for url in searx_urls:
                    eng = engines.Searx(url)
                    self.engines.append((eng, url))
            elif name in engines.ENGINES:
                eng = engines.get_engine(name)
                self.engines.append((eng, name))
            else:
                _log('unknown engine: %s' % name, 'warn')

    def get_available(self):
        """Return engines not currently in backoff."""
        now = time.time()
        available = []
        for eng, ident in self.engines:
            if ident not in self.backoff_until or now >= self.backoff_until[ident]:
                available.append((eng, ident))
        return available

    def mark_success(self, ident):
        """Reset failure count on success."""
        self.failures[ident] = 0
        self.success_count[ident] = self.success_count.get(ident, 0) + 1
        if ident in self.backoff_until:
            del self.backoff_until[ident]

    def mark_failure(self, ident):
        """Increment failure count and set exponential backoff."""
        count = self.failures.get(ident, 0) + 1
        self.failures[ident] = count
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[ident] = time.time() + delay
        name = ident.split('/')[2] if '/' in ident else ident
        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
        return delay

    def get_status(self):
        """Return (available, in_backoff, total) counts."""
        available = len(self.get_available())
        in_backoff = len(self.engines) - available
        return available, in_backoff, len(self.engines)


engine_tracker = None


def build_search_query(sqlite=None):
    """Build a search query using configured sources."""
    search = ''

    # Search by working proxy
    if 'p' in config.scraper.query:
        proxydb = mysqlite.mysqlite(config.watchd.database, str)
        proxies = [i[0] for i in proxydb.execute(
            'SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        if proxies and random.random() < 0.5:
            # Sample at most as many proxies as are actually available
            search = ' '.join(random.sample(
                proxies, min(len(proxies), random.randint(1, 2))))

    # Search by known website
    if ('w' in config.scraper.query and not search) or random.random() < 0.5:
        if sqlite is None:
            sqlite = mysqlite.mysqlite(config.ppf.database, str)
        uris = [i[0] for i in sqlite.execute(
            'SELECT url FROM uris WHERE error=0 AND url NOT LIKE "%github%" ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        if uris and random.random() < 0.5:
            if search:
                search = '%s OR ' % search
            search = search + 'site:%s' % random.choice(uris).split('/')[2]

    # Search by term (multi-lingual)
    if ('s' in config.scraper.query and not search) or random.random() < 0.5:
        if search:
            search = '%s OR ' % search
        # 70% chance of non-English term
        if random.random() < 0.7:
            term = translations.get_random_search_term()
        else:
            term = random.choice(search_terms)
        search = search + term

    return search
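
# Illustrative sketch: build_search_query() above can return strings like the
# following, depending on which of the 'p'/'w'/'s' sources fire (the proxy
# address, domain and terms here are placeholders, not real data):
#
#   '203.0.113.7:8080'
#   'site:example.com OR free proxy list'
#   '203.0.113.7:8080 OR site:example.com OR socks5 proxy'
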
"""Scrape a single engine for proxy list URLs.""" max_pages = config.scraper.max_pages consecutive_empty = 0 total_urls = 0 for page in range(max_pages): try: url = engine.build_url(query, page) if config.scraper.debug: _log('%s page %d: %s' % (engine.name, page, url), 'debug') content = fetch.fetch_contents(url) # Check for rate limiting if engine.is_rate_limited(content): _log('%s: rate limited' % engine.name, 'rate') engine_tracker.mark_failure(ident) return total_urls if not content: consecutive_empty += 1 if consecutive_empty >= config.scraper.fail_threshold: engine_tracker.mark_failure(ident) return total_urls continue # Extract URLs urls = engine.extract_urls(content, urignore) if not urls: # Empty results on first page likely means rate limited if page == 0: engine_tracker.mark_failure(ident) return total_urls # Success engine_tracker.mark_success(ident) consecutive_empty = 0 # Deduplicate and insert urls = list(set(urls)) source = '%s (page %d, query: %s)' % (engine.name, page, query[:50]) dbs.insert_urls(urls, source, sqlite) total_urls += len(urls) # Small delay between pages time.sleep(random.uniform(1.0, 3.0)) except Exception as e: name = ident.split('/')[2] if '/' in ident else ident _log('%s: error: %s' % (name, str(e)), 'error') engine_tracker.mark_failure(ident) return total_urls return total_urls def proxyfind(sqlite=None, urignore=None): """Find proxy list URLs using available search engines.""" global engine_tracker # Get available engines available = engine_tracker.get_available() if not available: avail, backoff, total = engine_tracker.get_status() _log('all %d engines in backoff, sleeping 60s' % total, 'rate') time.sleep(60) return # Build search query query = build_search_query(sqlite) if not query: return if config.scraper.debug: _log('query: %s' % query, 'debug') # Shuffle and pick engines random.shuffle(available) # Use 1-3 engines per round num_engines = min(len(available), random.randint(1, 3)) for engine, ident in available[:num_engines]: total = scrape_engine(engine, ident, query, urignore, sqlite) if total > 0: name = ident.split('/')[2] if '/' in ident else ident _log('%s: found %d URLs' % (name, total), 'scraper') # Delay between engines time.sleep(random.uniform(2.0, 5.0)) def load_urignore(): """Load URL ignore patterns.""" urignore = [] # Load from file if os.path.exists('urignore.txt'): with open('urignore.txt', 'r') as f: urignore = [i.strip() for i in f.read().split('\n') if i.strip()] # Add Searx instances to ignore (avoid loops) for i in searx_instances: urignore.append(i.split('/')[2]) # Add search engine domains to ignore ignore_domains = [ 'duckduckgo.com', 'startpage.com', 'mojeek.com', 'qwant.com', 'yandex.com', 'yandex.ru', 'ecosia.org', 'brave.com', 'google.com', 'bing.com', 'yahoo.com', ] for domain in ignore_domains: urignore.append(domain) return urignore if __name__ == '__main__': config.load() errors = config.validate() if errors: for e in errors: _log(e, 'error') import sys sys.exit(1) fetch.set_config(config) translations.set_config(config) proxydb = mysqlite.mysqlite(config.watchd.database, str) dbs.create_table_if_not_exists(proxydb, 'proxylist') urldb = mysqlite.mysqlite(config.ppf.database, str) dbs.create_table_if_not_exists(urldb, 'uris') # Load search terms search_terms = ['free proxy list', 'socks5 proxy', 'http proxy'] if os.path.exists('search_terms.txt'): with open('search_terms.txt', 'r') as f: search_terms = [i.strip() for i in f.read().split('\n') if i.strip()] urignore = load_urignore() # Parse enabled engines from 
    enabled_engines = [e.strip() for e in config.scraper.engines.split(',')]

    # Initialize engine tracker
    engine_tracker = EngineTracker(
        enabled_engines, searx_instances,
        base_delay=config.scraper.backoff_base,
        max_delay=config.scraper.backoff_max
    )
    avail, backoff, total = engine_tracker.get_status()
    _log('loaded %d engine instances (%s)' % (total, ', '.join(enabled_engines)), 'info')

    try:
        while True:
            proxyfind(urldb, urignore)
            # Small delay between rounds
            time.sleep(random.uniform(5.0, 15.0))
    except KeyboardInterrupt:
        avail, backoff, total = engine_tracker.get_status()
        _log('scraper stopped (engines: %d/%d available)' % (avail, total), 'info')
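
# Config keys read by this script (all referenced above; their values are
# supplied by the config module loaded via config.load()):
#   scraper.query, scraper.engines, scraper.max_pages, scraper.fail_threshold,
#   scraper.debug, scraper.backoff_base, scraper.backoff_max,
#   watchd.database, ppf.database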