scraper: integrate multi-lingual search terms
- Use translations module for 70% non-English search terms
- Initialize translations config on startup
- Add engines module for multi-engine support
This commit is contained in:
367
scraper.py
367
scraper.py
@@ -1,94 +1,337 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Multi-engine proxy list scraper."""
|
||||
|
||||
import dbs
|
||||
import random, time
|
||||
import random
|
||||
import time
|
||||
import urllib
|
||||
import mysqlite
|
||||
import proxywatchd
|
||||
from misc import _log
|
||||
from config import Config
|
||||
import fetch
|
||||
import sys
|
||||
import engines
|
||||
import translations
|
||||
import os
|
||||
|
||||
config = Config()

# Load Searx instances if file exists; each usable line must be a full URL.
# (Guarded with os.path.exists so a missing file leaves the list empty
# instead of raising IOError at import time.)
searx_instances = []
if os.path.exists('searx.instances'):
    with open('searx.instances') as h:
        searx_instances = [line.strip() for line in h.readlines()
                           if line.lower().startswith('http')]
|
||||
|
||||
class InstanceTracker(object):
    """Track instance health with exponential backoff.

    NOTE(review): appears unused in this file -- superseded by
    EngineTracker below; candidate for removal once confirmed no
    external callers remain.
    """

    def __init__(self, instances, base_delay=30, max_delay=3600):
        # instances: iterable of instance URLs/identifiers to track.
        self.instances = list(instances)
        self.base_delay = base_delay    # initial backoff delay (seconds)
        self.max_delay = max_delay      # backoff cap (seconds)
        self.failures = {}              # instance -> consecutive failure count
        self.backoff_until = {}         # instance -> epoch time backoff expires
        self.success_count = {}         # instance -> lifetime success count

    def get_available(self):
        """Return instances not currently in backoff."""
        now = time.time()
        available = []
        for inst in self.instances:
            if inst not in self.backoff_until or now >= self.backoff_until[inst]:
                available.append(inst)
        return available

    def mark_success(self, instance):
        """Reset failure count on success."""
        self.failures[instance] = 0
        self.success_count[instance] = self.success_count.get(instance, 0) + 1
        if instance in self.backoff_until:
            del self.backoff_until[instance]

    def mark_failure(self, instance):
        """Increment failure count and set exponential backoff.

        Returns the delay (seconds) applied to this instance.
        """
        count = self.failures.get(instance, 0) + 1
        self.failures[instance] = count
        # Exponential backoff: base * 2^(n-1), capped at max_delay.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[instance] = time.time() + delay
        name = instance.split('/')[2] if '/' in instance else instance
        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
        return delay

    def get_status(self):
        """Return status summary as (available, in_backoff, total)."""
        available = len(self.get_available())
        in_backoff = len(self.instances) - available
        return available, in_backoff, len(self.instances)
|
||||
|
||||
class EngineTracker(object):
    """Track multiple search engine instances with rate limiting."""

    def __init__(self, engine_names, searx_urls, base_delay=30, max_delay=3600):
        # Backoff tuning and per-identifier bookkeeping.
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.failures = {}
        self.backoff_until = {}
        self.success_count = {}

        # Resolve each configured name into (engine_instance, identifier)
        # pairs; 'searx' fans out to one instance per configured URL.
        self.engines = []
        for raw_name in engine_names:
            engine_name = raw_name.strip().lower()
            if engine_name == 'searx':
                for url in searx_urls:
                    self.engines.append((engines.Searx(url), url))
            elif engine_name in engines.ENGINES:
                self.engines.append((engines.get_engine(engine_name), engine_name))
            else:
                _log('unknown engine: %s' % engine_name, 'warn')

    def get_available(self):
        """Return engines not currently in backoff."""
        now = time.time()
        return [(eng, ident) for eng, ident in self.engines
                if self.backoff_until.get(ident, 0) <= now]

    def mark_success(self, ident):
        """Reset failure count on success."""
        self.failures[ident] = 0
        self.success_count[ident] = self.success_count.get(ident, 0) + 1
        self.backoff_until.pop(ident, None)

    def mark_failure(self, ident):
        """Increment failure count and set exponential backoff.

        Returns the delay (seconds) applied to this identifier.
        """
        count = self.failures.get(ident, 0) + 1
        self.failures[ident] = count
        # Doubles per consecutive failure, clamped at max_delay.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[ident] = time.time() + delay
        name = ident if '/' not in ident else ident.split('/')[2]
        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
        return delay

    def get_status(self):
        """Summarize engine availability as (available, in_backoff, total)."""
        n_avail = len(self.get_available())
        return n_avail, len(self.engines) - n_avail, len(self.engines)
|
||||
|
||||
|
||||
# Module-level tracker; assigned a real EngineTracker in __main__ before
# proxyfind() runs (proxyfind reads it via `global engine_tracker`).
engine_tracker = None
|
||||
|
||||
|
||||
def build_search_query(sqlite=None):
    """Build a search query using configured sources.

    Combines up to three sources, each gated by a flag in
    config.scraper.query and/or a coin flip: known working proxies ('p'),
    hosts from previously-found URLs ('w'), and search terms ('s').
    Returns '' when no source contributed anything.

    NOTE: each random.random() draw below is part of the observable
    behavior -- do not reorder conditions or short-circuit branches.
    """
    search = ''

    # Search by working proxy
    if 'p' in config.scraper.query:
        proxydb = mysqlite.mysqlite(config.watchd.database, str)
        proxies = [i[0] for i in proxydb.execute(
            'SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        # Only use this source half the time, searching for 1-2 proxies.
        if proxies and random.random() < 0.5:
            search = ' '.join(random.sample(proxies, random.randint(1, 2)))

    # Search by known website
    # NOTE(review): the `or random.random() < 0.5` lets this branch fire
    # even when 'w' is not configured -- matches the pre-existing logic;
    # confirm this is intended.
    if ('w' in config.scraper.query and not search) or random.random() < 0.5:
        if sqlite is None:
            sqlite = mysqlite.mysqlite(config.ppf.database, str)
        uris = [i[0] for i in sqlite.execute(
            'SELECT url FROM uris WHERE error=0 AND url NOT LIKE "%github%" ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        if uris and random.random() < 0.5:
            if search:
                search = '%s OR ' % search
            # site:<host> restricts the search to a known proxy-list host.
            search = search + 'site:%s' % random.choice(uris).split('/')[2]

    # Search by term (multi-lingual)
    if ('s' in config.scraper.query and not search) or random.random() < 0.5:
        if search:
            search = '%s OR ' % search
        # 70% chance of non-English term
        if random.random() < 0.7:
            term = translations.get_random_search_term()
        else:
            term = random.choice(search_terms)
        search = search + term

    return search
|
||||
|
||||
|
||||
def scrape_engine(engine, ident, query, urignore, sqlite):
    """Scrape a single engine for proxy list URLs.

    Pages through results until max_pages, the engine rate-limits, results
    run out, or repeated fetch failures hit the fail threshold.

    Returns the total number of URLs inserted across all pages.
    """
    max_pages = config.scraper.max_pages
    consecutive_empty = 0
    total_urls = 0

    for page in range(max_pages):
        try:
            url = engine.build_url(query, page)

            if config.scraper.debug:
                _log('%s page %d: %s' % (engine.name, page, url), 'debug')

            content = fetch.fetch_contents(url)

            # Check for rate limiting
            if engine.is_rate_limited(content):
                _log('%s: rate limited' % engine.name, 'rate')
                engine_tracker.mark_failure(ident)
                return total_urls

            if not content:
                consecutive_empty += 1
                if consecutive_empty >= config.scraper.fail_threshold:
                    engine_tracker.mark_failure(ident)
                    return total_urls
                continue

            # Extract URLs
            urls = engine.extract_urls(content, urignore)

            if not urls:
                # Empty results on first page likely means rate limited
                if page == 0:
                    engine_tracker.mark_failure(ident)
                    return total_urls
                # FIX: on later pages, no results means we ran off the end of
                # the result set -- stop paginating instead of marking success
                # and inserting empty batches until max_pages.
                break

            # Success
            engine_tracker.mark_success(ident)
            consecutive_empty = 0

            # Deduplicate and insert
            urls = list(set(urls))
            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
            dbs.insert_urls(urls, source, sqlite)
            total_urls += len(urls)

            # Small delay between pages
            time.sleep(random.uniform(1.0, 3.0))

        except Exception as e:
            name = ident.split('/')[2] if '/' in ident else ident
            _log('%s: error: %s' % (name, str(e)), 'error')
            engine_tracker.mark_failure(ident)
            return total_urls

    return total_urls
|
||||
|
||||
|
||||
def proxyfind(sqlite=None, urignore=None):
    """Find proxy list URLs using available search engines."""
    global engine_tracker

    # Every engine is backing off: wait a minute before the next round.
    ready = engine_tracker.get_available()
    if not ready:
        avail, backoff, total = engine_tracker.get_status()
        _log('all %d engines in backoff, sleeping 60s' % total, 'rate')
        time.sleep(60)
        return

    # Nothing to search for this round.
    query = build_search_query(sqlite)
    if not query:
        return

    if config.scraper.debug:
        _log('query: %s' % query, 'debug')

    # Randomize engine order and pick 1-3 of them for this round.
    random.shuffle(ready)
    num_engines = min(len(ready), random.randint(1, 3))

    for engine, ident in ready[:num_engines]:
        found = scrape_engine(engine, ident, query, urignore, sqlite)
        if found > 0:
            name = ident.split('/')[2] if '/' in ident else ident
            _log('%s: found %d URLs' % (name, found), 'scraper')

        # Pause between engines.
        time.sleep(random.uniform(2.0, 5.0))
|
||||
|
||||
|
||||
def load_urignore():
    """Load URL ignore patterns.

    Combines patterns from urignore.txt (if present), the configured Searx
    instance hosts (to avoid scraping ourselves in a loop), and the major
    search engine domains.

    Returns a list of substrings/hosts to exclude from scraped URLs.
    """
    urignore = []

    # Load from file
    if os.path.exists('urignore.txt'):
        with open('urignore.txt', 'r') as f:
            urignore = [i.strip() for i in f.read().split('\n') if i.strip()]

    # Add Searx instances to ignore (avoid loops)
    for i in searx_instances:
        urignore.append(i.split('/')[2])

    # Add search engine domains to ignore
    ignore_domains = [
        'duckduckgo.com', 'startpage.com', 'mojeek.com', 'qwant.com',
        'yandex.com', 'yandex.ru', 'ecosia.org', 'brave.com',
        'google.com', 'bing.com', 'yahoo.com',
    ]
    for domain in ignore_domains:
        urignore.append(domain)

    return urignore
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Load and validate configuration before touching any database.
    config.load()
    errors = config.validate()
    if errors:
        for e in errors:
            _log(e, 'error')
        sys.exit(1)
    fetch.set_config(config)
    translations.set_config(config)

    # Ensure both databases/tables exist.
    proxydb = mysqlite.mysqlite(config.watchd.database, str)
    dbs.create_table_if_not_exists(proxydb, 'proxylist')

    urldb = mysqlite.mysqlite(config.ppf.database, str)
    dbs.create_table_if_not_exists(urldb, 'uris')

    # Load search terms; built-in defaults cover a missing file.
    search_terms = ['free proxy list', 'socks5 proxy', 'http proxy']
    if os.path.exists('search_terms.txt'):
        with open('search_terms.txt', 'r') as f:
            search_terms = [i.strip() for i in f.read().split('\n') if i.strip()]

    urignore = load_urignore()

    # Parse enabled engines from config
    enabled_engines = [e.strip() for e in config.scraper.engines.split(',')]

    # Initialize engine tracker
    engine_tracker = EngineTracker(
        enabled_engines,
        searx_instances,
        base_delay=config.scraper.backoff_base,
        max_delay=config.scraper.backoff_max
    )

    avail, backoff, total = engine_tracker.get_status()
    _log('loaded %d engine instances (%s)' % (total, ', '.join(enabled_engines)), 'info')

    try:
        while True:
            proxyfind(urldb, urignore)
            # Small delay between rounds
            time.sleep(random.uniform(5.0, 15.0))
    except KeyboardInterrupt:
        avail, backoff, total = engine_tracker.get_status()
        _log('scraper stopped (engines: %d/%d available)' % (avail, total), 'info')
|
||||
|
||||
Reference in New Issue
Block a user