#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Search engine implementations for proxy list discovery."""

import re
import urllib
import random
import time

from soup_parser import soupify
from misc import _log


def _date_weeks_ago(weeks):
    """Return date string (YYYY-MM-DD) for N weeks ago."""
    secs = time.time() - (weeks * 7 * 24 * 3600)
    return time.strftime('%Y-%m-%d', time.gmtime(secs))


def _urlencode(params):
    """URL-encode params dict, handling Unicode strings.

    Python 2's urllib.urlencode() expects byte strings.  This helper
    encodes any Unicode values to UTF-8 before URL encoding.

    Args:
        params: Dictionary of query parameters

    Returns:
        URL-encoded query string
    """
    encoded = {}
    for key, value in params.items():
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        encoded[key] = value
    return urllib.urlencode(encoded)


class _EmptyBody(object):
    """Stand-in for a missing <body>: find_all() always yields nothing."""

    def find_all(self, *args, **kwargs):
        return []


# Shared singleton: _EmptyBody is stateless, no need to build one per call.
_EMPTY_BODY = _EmptyBody()


def _get_body(soup):
    """Get body element from soup, handling None case.

    Args:
        soup: BeautifulSoup or SoupResult object

    Returns:
        Body element, or a stand-in whose find_all() returns [] when the
        soup or its body is None (avoids AttributeError in callers)
    """
    if soup is None or soup.body is None:
        return _EMPTY_BODY
    return soup.body


class SearchEngine(object):
    """Base class for search engines."""

    name = 'base'
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0
    # Domains to skip (engine's own domain)
    skip_domains = []
    # Path patterns to skip (internal pages)
    skip_patterns = []
    # Base URL for relative link conversion (None = skip relative links)
    relative_base = None

    def __init__(self):
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    @staticmethod
    def _contains_any(href, needles):
        """Return True if any substring in needles occurs in href."""
        for needle in needles:
            if needle in href:
                return True
        return False

    @staticmethod
    def _matches_ignore(href, urignore):
        """Return True if href matches any regex pattern in urignore."""
        for pattern in urignore:
            if re.search(pattern, href):
                return True
        return False

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content.

        Base implementation handles common patterns:
        - Skips empty content
        - Parses HTML with soupify
        - Converts relative URLs if relative_base is set
        - Skips domains in skip_domains
        - Skips paths matching skip_patterns
        - Applies urignore regex patterns
        - Deduplicates results
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Handle relative URLs
            if not href.startswith('http'):
                if self.relative_base and href.startswith('/'):
                    href = self.relative_base + href
                else:
                    continue
            # Skip engine's own domain(s) and internal paths
            if self._contains_any(href, self.skip_domains):
                continue
            if self._contains_any(href, self.skip_patterns):
                continue
            # Check urignore patterns
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting.

        NOTE: this is a coarse substring heuristic ('403'/'429' may match
        unrelated page text); an empty response also counts as limited.
        """
        if not content:
            return True
        rate_signals = (
            'rate limit',
            'too many requests',
            'blocked',
            'captcha',
            'please verify',
            'unusual traffic',
            'access denied',
            '403',
            '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False


class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required)."""

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10
    skip_domains = ['duckduckgo.com']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from DuckDuckGo redirect wrapper.

        Returns None when the wrapper is present but cannot be decoded,
        so callers can skip the link.
        """
        # 'uddg=' covers the '/l/?uddg=' redirect form as well.
        if 'uddg=' in href:
            match = re.search(r'uddg=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            if not href:
                continue
            # Skip engine's domain
            if 'duckduckgo.com' in href:
                continue
            # Check urignore patterns
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls


class Startpage(SearchEngine):
    """Startpage search (privacy-focused, uses Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5
    skip_domains = ['startpage.com']

    def build_url(self, query, page=0):
        params = {
            'query': query,
            'cat': 'web',
            'language': 'english',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Mojeek(SearchEngine):
    """Mojeek search (UK-based, independent index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10
    skip_domains = ['mojeek.com', 'mojeek.co.uk']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # Mojeek uses 's' for start position (10 results per page)
            params['s'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Qwant(SearchEngine):
    """Qwant Lite search (French, EU-based, privacy-focused)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10
    skip_domains = ['qwant.com']

    def build_url(self, query, page=0):
        params = {
            'q': query,
            't': 'web',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index)."""

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5
    skip_domains = ['yandex.com', 'yandex.ru']

    def build_url(self, query, page=0):
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yandex redirect wrapper.

        Returns None for yandex links that cannot be unwrapped, so the
        engine's own pages are dropped.
        """
        if '//yandex.' in href:
            match = re.search(r'url=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
            return None
        return href

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            if not href or not href.startswith('http'):
                continue
            # Check urignore patterns
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls


class Ecosia(SearchEngine):
    """Ecosia search (German, eco-friendly, uses Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10
    skip_domains = ['ecosia.org']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Bing(SearchEngine):
    """Bing search (Microsoft)."""

    name = 'bing'
    base_url = 'https://www.bing.com/search'
    rate_limit = 10
    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['first'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Yahoo(SearchEngine):
    """Yahoo search."""

    name = 'yahoo'
    base_url = 'https://search.yahoo.com/search'
    rate_limit = 10
    skip_domains = ['yahoo.com', 'yahooapis.com']

    def build_url(self, query, page=0):
        params = {'p': query}
        if page > 0:
            params['b'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yahoo redirect wrapper (/RU=.../)."""
        if 'yahoo.com' in href and '/RU=' in href:
            match = re.search(r'/RU=([^/]+)/', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            href = self._unwrap_url(href)
            if not href:
                continue
            if self._contains_any(href, self.skip_domains):
                continue
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls


class Gigablast(SearchEngine):
    """Gigablast search (independent US index)."""

    name = 'gigablast'
    base_url = 'https://www.gigablast.com/search'
    rate_limit = 15
    skip_domains = ['gigablast.com']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['s'] = str(page * 10)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Metager(SearchEngine):
    """MetaGer search (German, privacy-focused meta search)."""

    name = 'metager'
    base_url = 'https://metager.org/meta/meta.ger3'
    rate_limit = 10
    skip_domains = ['metager.org', 'metager.de']

    def build_url(self, query, page=0):
        params = {
            'eingabe': query,
            'lang': 'en',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Swisscows(SearchEngine):
    """Swisscows search (Swiss, privacy-focused)."""

    name = 'swisscows'
    base_url = 'https://swisscows.com/web'
    rate_limit = 10
    skip_domains = ['swisscows.com']

    def build_url(self, query, page=0):
        params = {
            'query': query,
            'region': 'en-US',
        }
        if page > 0:
            params['offset'] = str(page * 10)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Alexandria(SearchEngine):
    """Alexandria search (independent, non-commercial focus)."""

    name = 'alexandria'
    base_url = 'https://www.alexandria.org/search'
    rate_limit = 15
    skip_domains = ['alexandria.org']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class Brave(SearchEngine):
    """Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10
    skip_domains = ['brave.com']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return '%s?%s' % (self.base_url, _urlencode(params))


class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists."""

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5
    relative_base = 'https://github.com'
    skip_patterns = [
        '/login', '/signup', '/join', '/settings', '/notifications',
        '/marketplace', '/explore', '/sponsors', '/pricing', '/features',
    ]

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        # GitHub search for repositories and code (pushed in last 2 weeks)
        base_query = query if query else random.choice(self.github_queries)
        date_filter = _date_weeks_ago(2)
        search_query = '%s pushed:>%s' % (base_query, date_filter)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
            's': 'updated',  # Sort by recently updated
            'o': 'desc',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs with blob-to-raw conversion for direct file access."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href
            if not href.startswith('http'):
                continue
            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue
            # Skip internal pages
            if self._contains_any(href, self.skip_patterns):
                continue
            # Check urignore patterns
            if self._matches_ignore(href, urignore):
                continue
            # Keep raw file links and repo links
            if ('/raw/' in href or '/blob/' in href
                    or re.match(r'https://github\.com/[^/]+/[^/]+$', href)):
                # Convert blob to raw for direct access (keep both forms)
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls:
                        urls.append(raw_href)
                if href not in urls:
                    urls.append(href)
        return urls


class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5
    relative_base = 'https://gitlab.com'
    skip_patterns = [
        '/users/', '/-/', '/explore', '/help', '/admin',
        '/dashboard', '/profile',
    ]

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
            'sort': 'updated_desc',  # Most recently updated first
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract project URLs only (whitelist pattern)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href
            if not href.startswith('http') or 'gitlab.com' not in href:
                continue
            # Skip internal pages
            if self._contains_any(href, self.skip_patterns):
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Keep only project links
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls:
                    urls.append(href)
        return urls


class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10
    relative_base = 'https://codeberg.org'

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs only (whitelist pattern)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href
            if not href.startswith('http') or 'codeberg.org' not in href:
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)
        return urls


class Bitbucket(SearchEngine):
    """Bitbucket repository search."""

    name = 'bitbucket'
    base_url = 'https://bitbucket.org/repo/all'
    rate_limit = 10
    relative_base = 'https://bitbucket.org'
    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']

    def build_url(self, query, page=0):
        params = {'name': query if query else 'proxy'}
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            if href.startswith('/'):
                href = self.relative_base + href
            if not href.startswith('http') or 'bitbucket.org' not in href:
                continue
            if self._contains_any(href, self.skip_patterns):
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Keep repo links (format: /workspace/repo)
            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)
        return urls


class Sourcehut(SearchEngine):
    """Sourcehut (sr.ht) repository search."""

    name = 'sourcehut'
    base_url = 'https://sr.ht/projects'
    rate_limit = 10
    relative_base = 'https://sr.ht'

    def build_url(self, query, page=0):
        params = {'search': query if query else 'proxy'}
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            if href.startswith('/'):
                href = self.relative_base + href
            if not href.startswith('http'):
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Keep git.sr.ht repo links
            if 'git.sr.ht/~' in href:
                if href not in urls:
                    urls.append(href)
        return urls


class Pastebin(SearchEngine):
    """Pastebin search via DuckDuckGo site: query (recent only)."""

    name = 'pastebin'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    # Pastebin sites to search
    paste_sites = [
        'pastebin.com',
        'paste.ee',
        'dpaste.org',
        'hastebin.com',
        'ghostbin.com',
        'paste.ubuntu.com',
        'bpa.st',
    ]

    def build_url(self, query, page=0):
        # Search for proxy lists on paste sites (past week only, via 'df')
        site = random.choice(self.paste_sites)
        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
            if not href.startswith('http'):
                continue
            # Only keep paste site links
            if not self._contains_any(href, self.paste_sites):
                continue
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls


class Rentry(SearchEngine):
    """Rentry.co/org search (markdown pastebin often used for proxy lists)."""

    name = 'rentry'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        # Search rentry for proxy lists (past week only, via 'df')
        search_query = ('site:rentry.co OR site:rentry.org %s'
                        % (query if query else 'proxy socks'))
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
            if not href.startswith('http'):
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Only keep rentry links
            if 'rentry.co' in href or 'rentry.org' in href:
                if href not in urls:
                    urls.append(href)
        return urls


class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable)."""

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://gitea.com',
        'https://try.gitea.io',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        # One instance is picked per engine object, not per request
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance,
                                        _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs for current dynamic instance."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        instance_domain = self.current_instance.split('//')[1]
        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href
            if not href.startswith('http'):
                continue
            # Honor caller-supplied ignore patterns (was silently dropped)
            if self._matches_ignore(href, urignore):
                continue
            # Keep repo links for this instance (format: /user/repo)
            if instance_domain in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if href not in urls:
                        urls.append(href)
        return urls


class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file)."""

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        self.base_url = instance_url

    def build_url(self, query, page=0):
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links only)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in _get_body(soup).find_all('a'):
            # Searx uses rel="noreferrer" for result links
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Check urignore patterns
            if self._matches_ignore(href, urignore):
                continue
            if href not in urls:
                urls.append(href)
        return urls


class DuckDuckGoOnion(DuckDuckGo):
    """DuckDuckGo via Tor hidden service."""

    name = 'duckduckgo_onion'
    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
    skip_domains = ['duckduckgo.com', '.onion']


class StartpageOnion(Startpage):
    """Startpage via Tor hidden service."""

    name = 'startpage_onion'
    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
    skip_domains = ['startpage.com', '.onion']


class BraveOnion(Brave):
    """Brave Search via Tor hidden service."""

    name = 'brave_onion'
    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
    skip_domains = ['brave.com', '.onion']


class Ahmia(SearchEngine):
    """Ahmia dark web search engine (indexes .onion sites)."""

    name = 'ahmia'
    base_url = 'https://ahmia.fi/search/'
    rate_limit = 10
    skip_domains = ['ahmia.fi']

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))


class AhmiaOnion(Ahmia):
    """Ahmia via Tor hidden service."""

    name = 'ahmia_onion'
    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
    skip_domains = ['ahmia.fi', '.onion']


# Registry of available engines.
# NOTE: Searx is intentionally absent — it requires an instance_url argument
# and cannot be constructed by get_engine()/get_all_engines().
ENGINES = {
    # Major search engines
    'bing': Bing,
    'yahoo': Yahoo,
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'duckduckgo_onion': DuckDuckGoOnion,
    'startpage': Startpage,
    'startpage_onion': StartpageOnion,
    'brave': Brave,
    'brave_onion': BraveOnion,
    'ahmia': Ahmia,
    'ahmia_onion': AhmiaOnion,
    'ecosia': Ecosia,
    'metager': Metager,
    'swisscows': Swisscows,
    # Independent/regional search engines
    'mojeek': Mojeek,    # UK
    'qwant': Qwant,      # France
    'yandex': Yandex,    # Russia
    'gigablast': Gigablast,
    'alexandria': Alexandria,
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
    'bitbucket': Bitbucket,
    'sourcehut': Sourcehut,
    # Paste sites
    'pastebin': Pastebin,
    'rentry': Rentry,
}


def get_engine(name):
    """Get engine instance by name, or None if unknown."""
    if name not in ENGINES:
        return None
    return ENGINES[name]()


def get_all_engines():
    """Get instances of all available engines."""
    return [cls() for cls in ENGINES.values()]


def list_engines():
    """List available engine names."""
    return list(ENGINES.keys())