diff --git a/scraper.py b/scraper.py
index f50ecd7..b9e6eaf 100755
--- a/scraper.py
+++ b/scraper.py
@@ -1,94 +1,337 @@
 #!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""Multi-engine proxy list scraper."""
 import dbs
-import random, time
+import random
+import time
 import urllib
 import mysqlite
 import proxywatchd
 from misc import _log
 from config import Config
 import fetch
-import sys
+import engines
+import translations
+import os
 
 config = Config()
 
-with open('searx.instances') as h:
-	searx_instances = [ line.strip() for line in h.readlines() if line.lower().startswith('http') ]
+# Load Searx instances if file exists
+searx_instances = []
+if os.path.exists('searx.instances'):
+    with open('searx.instances') as h:
+        searx_instances = [line.strip() for line in h.readlines()
+                           if line.lower().startswith('http')]
 
-def proxyfind(sqlite = None, urignore=None):
-	search = ''
-	random.shuffle(searx_instances)
-	## search by working proxy
-	if 'p' in config.scraper.query:
-		proxydb = mysqlite.mysqlite(config.watchd.database,str)
-		proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ]
-		if len(proxies) and random.random() < random.random():
-			search = ' '.join( random.sample(proxies, random.randint(1,2)))
+class InstanceTracker(object):
+    """Track instance health with exponential backoff."""
 
-	## search by relative url
-	if 'w' in config.scraper.query and not len(search) or random.random() < random.random():
-		if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
-		uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
-		if len(uris) > 0 and random.random() < random.random():
-			if len(search): search = '%s OR ' % search
-			search = search + 'site:%s' % random.choice(uris).split('/')[2]
+    def __init__(self, instances, base_delay=30, max_delay=3600):
+        self.instances = list(instances)
+        self.base_delay = base_delay
+        self.max_delay = max_delay
+        self.failures = {}
+        self.backoff_until = {}
+        self.success_count = {}
 
-	## build string
-	if 's' in config.scraper.query and not len(search) or random.random() < random.random():
-		if len(search): search = '%s OR ' % search
-		search = search + random.choice(search_terms)
+    def get_available(self):
+        """Return instances not currently in backoff."""
+        now = time.time()
+        available = []
+        for inst in self.instances:
+            if inst not in self.backoff_until or now >= self.backoff_until[inst]:
+                available.append(inst)
+        return available
 
-	if not len(search): return
-	#search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ]
-	search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week']), 'q=%s' % urllib.quote_plus(search) ]
-	random.shuffle(search_args)
-	search_arg = '&'.join(search_args)
+    def mark_success(self, instance):
+        """Reset failure count on success."""
+        self.failures[instance] = 0
+        self.success_count[instance] = self.success_count.get(instance, 0) + 1
+        if instance in self.backoff_until:
+            del self.backoff_until[instance]
 
-	if config.scraper.debug:
-		print('search_arg: %s' % search_arg)
+    def mark_failure(self, instance):
+        """Increment failure count and set exponential backoff."""
+        count = self.failures.get(instance, 0) + 1
+        self.failures[instance] = count
+        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
+        self.backoff_until[instance] = time.time() + delay
+        name = instance.split('/')[2] if '/' in instance else instance
+        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
+        return delay
 
-	for srx in searx_instances:
-		x = 0
-		while 1:
-			urls = []
-			if x > 0: content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
-			else: content = fetch.fetch_contents('%s/?%s' % (srx,search_arg))
-			if content: urls = fetch.extract_urls(content, urls, urignore)
+    def get_status(self):
+        """Return status summary."""
+        available = len(self.get_available())
+        in_backoff = len(self.instances) - available
+        return available, in_backoff, len(self.instances)
 
-			if not len(urls): break
-			dbs.insert_urls(urls, '%s/?%s (pageno: %d)' % (srx.split('/')[2],search_arg,x) , sqlite)
-			x = x + 1
+
+class EngineTracker(object):
+    """Track multiple search engine instances with rate limiting."""
+
+    def __init__(self, engine_names, searx_urls, base_delay=30, max_delay=3600):
+        self.base_delay = base_delay
+        self.max_delay = max_delay
+        self.failures = {}
+        self.backoff_until = {}
+        self.success_count = {}
+
+        # Build list of (engine_instance, identifier)
+        self.engines = []
+        for name in engine_names:
+            name = name.strip().lower()
+            if name == 'searx':
+                for url in searx_urls:
+                    eng = engines.Searx(url)
+                    self.engines.append((eng, url))
+            elif name in engines.ENGINES:
+                eng = engines.get_engine(name)
+                self.engines.append((eng, name))
+            else:
+                _log('unknown engine: %s' % name, 'warn')
+
+    def get_available(self):
+        """Return engines not currently in backoff."""
+        now = time.time()
+        available = []
+        for eng, ident in self.engines:
+            if ident not in self.backoff_until or now >= self.backoff_until[ident]:
+                available.append((eng, ident))
+        return available
+
+    def mark_success(self, ident):
+        """Reset failure count on success."""
+        self.failures[ident] = 0
+        self.success_count[ident] = self.success_count.get(ident, 0) + 1
+        if ident in self.backoff_until:
+            del self.backoff_until[ident]
+
+    def mark_failure(self, ident):
+        """Increment failure count and set exponential backoff."""
+        count = self.failures.get(ident, 0) + 1
+        self.failures[ident] = count
+        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
+        self.backoff_until[ident] = time.time() + delay
+        name = ident.split('/')[2] if '/' in ident else ident
+        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
+        return delay
+
+    def get_status(self):
+        """Return status summary."""
+        available = len(self.get_available())
+        in_backoff = len(self.engines) - available
+        return available, in_backoff, len(self.engines)
+
+
+engine_tracker = None
+
+
+def build_search_query(sqlite=None):
+    """Build a search query using configured sources."""
+    search = ''
+
+    # Search by working proxy
+    if 'p' in config.scraper.query:
+        proxydb = mysqlite.mysqlite(config.watchd.database, str)
+        proxies = [i[0] for i in proxydb.execute(
+            'SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10'
+        ).fetchall()]
+        if proxies and random.random() < 0.5:
+            search = ' '.join(random.sample(proxies, random.randint(1, 2)))
+
+    # Search by known website
+    if ('w' in config.scraper.query and not search) or random.random() < 0.5:
+        if sqlite is None:
+            sqlite = mysqlite.mysqlite(config.ppf.database, str)
+        uris = [i[0] for i in sqlite.execute(
+            'SELECT url FROM uris WHERE error=0 AND url NOT LIKE "%github%" ORDER BY RANDOM() LIMIT 10'
+        ).fetchall()]
+        if uris and random.random() < 0.5:
+            if search:
+                search = '%s OR ' % search
+            search = search + 'site:%s' % random.choice(uris).split('/')[2]
+
+    # Search by term (multi-lingual)
+    if ('s' in config.scraper.query and not search) or random.random() < 0.5:
+        if search:
+            search = '%s OR ' % search
+        # 70% chance of non-English term
+        if random.random() < 0.7:
+            term = translations.get_random_search_term()
+        else:
+            term = random.choice(search_terms)
+        search = search + term
+
+    return search
+
+
+def scrape_engine(engine, ident, query, urignore, sqlite):
+    """Scrape a single engine for proxy list URLs."""
+    max_pages = config.scraper.max_pages
+    consecutive_empty = 0
+    total_urls = 0
+
+    for page in range(max_pages):
+        try:
+            url = engine.build_url(query, page)
+
+            if config.scraper.debug:
+                _log('%s page %d: %s' % (engine.name, page, url), 'debug')
+
+            content = fetch.fetch_contents(url)
+
+            # Check for rate limiting
+            if engine.is_rate_limited(content):
+                _log('%s: rate limited' % engine.name, 'rate')
+                engine_tracker.mark_failure(ident)
+                return total_urls
+
+            if not content:
+                consecutive_empty += 1
+                if consecutive_empty >= config.scraper.fail_threshold:
+                    engine_tracker.mark_failure(ident)
+                    return total_urls
+                continue
+
+            # Extract URLs
+            urls = engine.extract_urls(content, urignore)
+
+            if not urls:
+                # Empty results on first page likely means rate limited
+                if page == 0:
+                    engine_tracker.mark_failure(ident)
+                return total_urls
+
+            # Success
+            engine_tracker.mark_success(ident)
+            consecutive_empty = 0
+
+            # Deduplicate and insert
+            urls = list(set(urls))
+            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
+            dbs.insert_urls(urls, source, sqlite)
+            total_urls += len(urls)
+
+            # Small delay between pages
+            time.sleep(random.uniform(1.0, 3.0))
+
+        except Exception as e:
+            name = ident.split('/')[2] if '/' in ident else ident
+            _log('%s: error: %s' % (name, str(e)), 'error')
+            engine_tracker.mark_failure(ident)
+            return total_urls
+
+    return total_urls
+
+
+def proxyfind(sqlite=None, urignore=None):
+    """Find proxy list URLs using available search engines."""
+    global engine_tracker
+
+    # Get available engines
+    available = engine_tracker.get_available()
+    if not available:
+        avail, backoff, total = engine_tracker.get_status()
+        _log('all %d engines in backoff, sleeping 60s' % total, 'rate')
+        time.sleep(60)
+        return
+
+    # Build search query
+    query = build_search_query(sqlite)
+    if not query:
+        return
+
+    if config.scraper.debug:
+        _log('query: %s' % query, 'debug')
+
+    # Shuffle and pick engines
+    random.shuffle(available)
+
+    # Use 1-3 engines per round
+    num_engines = min(len(available), random.randint(1, 3))
+
+    for engine, ident in available[:num_engines]:
+        total = scrape_engine(engine, ident, query, urignore, sqlite)
+        if total > 0:
+            name = ident.split('/')[2] if '/' in ident else ident
+            _log('%s: found %d URLs' % (name, total), 'scraper')
+
+        # Delay between engines
+        time.sleep(random.uniform(2.0, 5.0))
 
 
 def load_urignore():
-	## load bad terms
-	with open('urignore.txt', 'r') as f:
-		urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
-	## add searx instances as bad terms (avoid loops)
-	for i in searx_instances:
-		urignore.append(i.split('/')[2])
-	return urignore
+    """Load URL ignore patterns."""
+    urignore = []
+
+    # Load from file
+    if os.path.exists('urignore.txt'):
+        with open('urignore.txt', 'r') as f:
+            urignore = [i.strip() for i in f.read().split('\n') if i.strip()]
+
+    # Add Searx instances to ignore (avoid loops)
+    for i in searx_instances:
+        urignore.append(i.split('/')[2])
+
+    # Add search engine domains to ignore
+    ignore_domains = [
+        'duckduckgo.com', 'startpage.com', 'mojeek.com', 'qwant.com',
+        'yandex.com', 'yandex.ru', 'ecosia.org', 'brave.com',
+        'google.com', 'bing.com', 'yahoo.com',
+    ]
+    for domain in ignore_domains:
+        urignore.append(domain)
+
+    return urignore
 
 
 if __name__ == '__main__':
-	config.load()
-	fetch.set_config(config)
+    config.load()
+    errors = config.validate()
+    if errors:
+        for e in errors:
+            _log(e, 'error')
+        import sys
+        sys.exit(1)
+    fetch.set_config(config)
+    translations.set_config(config)
 
-	proxydb = mysqlite.mysqlite(config.watchd.database, str)
-	dbs.create_table_if_not_exists(proxydb, 'proxylist')
+    proxydb = mysqlite.mysqlite(config.watchd.database, str)
+    dbs.create_table_if_not_exists(proxydb, 'proxylist')
 
-	urldb = mysqlite.mysqlite(config.ppf.database, str)
-	dbs.create_table_if_not_exists(urldb, 'uris')
+    urldb = mysqlite.mysqlite(config.ppf.database, str)
+    dbs.create_table_if_not_exists(urldb, 'uris')
 
-	## load search terms
-	with open('search_terms.txt', 'r') as f:
-		search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+    # Load search terms
+    search_terms = ['free proxy list', 'socks5 proxy', 'http proxy']
+    if os.path.exists('search_terms.txt'):
+        with open('search_terms.txt', 'r') as f:
+            search_terms = [i.strip() for i in f.read().split('\n') if i.strip()]
 
-	urignore = load_urignore()
+    urignore = load_urignore()
 
-	while True:
-		try: proxyfind(urldb, urignore)
-		except KeyboardInterrupt: break
+    # Parse enabled engines from config
+    enabled_engines = [e.strip() for e in config.scraper.engines.split(',')]
 
-	print '\r',
+    # Initialize engine tracker
+    engine_tracker = EngineTracker(
+        enabled_engines,
+        searx_instances,
+        base_delay=config.scraper.backoff_base,
+        max_delay=config.scraper.backoff_max
+    )
+
+    avail, backoff, total = engine_tracker.get_status()
+    _log('loaded %d engine instances (%s)' % (total, ', '.join(enabled_engines)), 'info')
+
+    try:
+        while True:
+            proxyfind(urldb, urignore)
+            # Small delay between rounds
+            time.sleep(random.uniform(5.0, 15.0))
+    except KeyboardInterrupt:
+        avail, backoff, total = engine_tracker.get_status()
+        _log('scraper stopped (engines: %d/%d available)' % (avail, total), 'info')
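
Review context: the patch calls into a new `engines` module that is not part of this diff. Below is a minimal sketch, assuming nothing beyond the names scraper.py actually touches (`engines.ENGINES`, `engines.get_engine()`, `engines.Searx`, and the per-engine `name`, `build_url()`, `is_rate_limited()`, `extract_urls()` members); the bodies are illustrative stand-ins, not the real module.

```python
# Hypothetical sketch of the engines.py interface assumed by scraper.py.
# Only the names used in the diff are real; every body is illustrative.
import urllib


class Engine(object):
    name = 'engine'

    def build_url(self, query, page):
        """Return the search URL for `query` at 0-based `page`."""
        raise NotImplementedError

    def is_rate_limited(self, content):
        # scrape_engine() calls this before its `if not content:` check,
        # so implementations must tolerate None/empty content.
        return bool(content) and 'captcha' in content.lower()  # placeholder heuristic

    def extract_urls(self, content, urignore):
        """Return result URLs, minus anything matching `urignore`."""
        raise NotImplementedError


class Searx(Engine):
    def __init__(self, base_url):
        self.base_url = base_url.rstrip('/')
        self.name = 'searx:%s' % self.base_url.split('/')[2]

    def build_url(self, query, page):
        # Same URL shape the old scraper built by hand: /?q=...&pageno=N
        url = '%s/?category=general&q=%s' % (self.base_url,
                                             urllib.quote_plus(query))
        if page > 0:
            url = '%s&pageno=%d' % (url, page + 1)
        return url


# Registry consulted by EngineTracker; keys are lowercase engine names.
ENGINES = {}


def get_engine(name):
    """Instantiate a registered engine by name."""
    return ENGINES[name]()
```

Whatever the real module looks like, the `is_rate_limited()` ordering note above is worth checking during review, since an implementation that assumes non-empty content would raise inside `scrape_engine()` and trip the broad `except Exception` handler.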
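Both trackers share the same backoff rule, `delay = min(base_delay * 2 ** (failures - 1), max_delay)`. A quick sketch of the schedule with the defaults used in the patch (base 30s, cap 3600s):

```python
base_delay, max_delay = 30, 3600  # defaults from the trackers above

for failures in range(1, 9):
    delay = min(base_delay * (2 ** (failures - 1)), max_delay)
    print('failure %d -> backoff %ds' % (failures, delay))

# failure 1 -> 30s, 2 -> 60s, 3 -> 120s, ... 7 -> 1920s;
# failure 8 and beyond stay capped at 3600s (one hour).
```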
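The patch also reads several config keys the diff does not define (`scraper.engines`, `scraper.max_pages`, `scraper.fail_threshold`, `scraper.backoff_base`, `scraper.backoff_max`), plus a `config.validate()` method and the `translations` module. The dict below just catalogs those keys; only the 30/3600 backoff defaults are visible in the code, so every other value is a placeholder assumption.

```python
# Hypothetical defaults for the new config keys this patch reads.
# Key names come from the diff; the values are placeholders.
SCRAPER_DEFAULTS = {
    'query': 'pws',          # p = working proxies, w = known sites, s = terms
    'engines': 'searx',      # comma-separated; 'searx' expands to searx.instances
    'max_pages': 3,          # pages fetched per engine per round
    'fail_threshold': 2,     # empty fetches before marking a failure
    'backoff_base': 30,      # seconds; doubles per consecutive failure
    'backoff_max': 3600,     # cap, one hour
    'debug': 0,
}
```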