#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Search engine implementations for proxy list discovery."""

import re
import urllib
import random

from soup_parser import soupify
from misc import _log


class SearchEngine(object):
    """Base class for search engines."""

    name = 'base'
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0

    def __init__(self):
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content."""
        raise NotImplementedError

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting."""
        if not content:
            return True
        rate_signals = (
            'rate limit',
            'too many requests',
            'blocked',
            'captcha',
            'please verify',
            'unusual traffic',
            'access denied',
            '403',
            '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False


class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required)."""

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        # DuckDuckGo HTML results are plain anchor links in the page body
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip DuckDuckGo internal links
            if 'duckduckgo.com' in href:
                continue
            # DuckDuckGo wraps URLs - extract actual URL from redirect
            if '/l/?uddg=' in href or 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Startpage(SearchEngine):
    """Startpage search (privacy-focused, uses Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        params = {
            'query': query,
            'cat': 'web',
            'language': 'english',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip Startpage internal links
            if 'startpage.com' in href:
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Mojeek(SearchEngine):
    """Mojeek search (UK-based, independent index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # Mojeek uses 's' for start position (10 results per page)
            params['s'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip Mojeek internal links
            if 'mojeek.com' in href or 'mojeek.co.uk' in href:
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Qwant(SearchEngine):
    """Qwant Lite search (French, EU-based, privacy-focused)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {
            'q': query,
            't': 'web',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip Qwant internal links
            if 'qwant.com' in href:
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index)."""

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5

    def build_url(self, query, page=0):
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Yandex uses redirect URLs, try to extract actual URL
            if '//yandex.' in href:
                # Try to find embedded URL
                match = re.search(r'url=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
                else:
                    continue
            if not href.startswith('http'):
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Ecosia(SearchEngine):
    """Ecosia search (German, eco-friendly, uses Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip Ecosia internal links
            if 'ecosia.org' in href:
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class Brave(SearchEngine):
    """Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Skip Brave internal links
            if 'brave.com' in href:
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists."""

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        # GitHub search for repositories and code
        search_query = query if query else random.choice(self.github_queries)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://github.com' + href
            if not href.startswith('http'):
                continue
            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue
            # Skip non-content links
            skip_patterns = [
                '/login', '/signup', '/join', '/settings',
                '/notifications', '/marketplace', '/explore',
                '/sponsors', '/pricing', '/features',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue
            # Keep raw file links and repo links
            if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls:
                        urls.append(raw_href)
                if href not in urls:
                    urls.append(href)
        return urls


class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://gitlab.com' + href
            if not href.startswith('http'):
                continue
            # Only keep GitLab project links
            if 'gitlab.com' not in href:
                continue
            # Skip non-project links
            skip_patterns = [
                '/users/', '/-/', '/explore', '/help',
                '/admin', '/dashboard', '/profile',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue
            # Keep project and file links
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls:
                    urls.append(href)
        return urls


class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://codeberg.org' + href
            if not href.startswith('http'):
                continue
            # Only keep Codeberg repo links
            if 'codeberg.org' not in href:
                continue
            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)
        return urls


class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable)."""

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://git.sr.ht',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue
            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href
            if not href.startswith('http'):
                continue
            # Keep repo links
            if self.current_instance.split('//')[1] in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if href not in urls:
                        urls.append(href)
        return urls


class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file)."""

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        self.base_url = instance_url

    def build_url(self, query, page=0):
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []
        soup = soupify(content, nohtml=True)
        for a in soup.body.find_all('a'):
            # Searx uses rel="noreferrer" for result links
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue
            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break
            if not bad and href not in urls:
                urls.append(href)
        return urls


# Registry of available engines
ENGINES = {
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'startpage': Startpage,
    'brave': Brave,
    'ecosia': Ecosia,
    # Regional/non-US search engines
    'mojeek': Mojeek,    # UK
    'qwant': Qwant,      # France
    'yandex': Yandex,    # Russia
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
}


def get_engine(name):
    """Get engine instance by name."""
    if name not in ENGINES:
        return None
    return ENGINES[name]()


def get_all_engines():
    """Get instances of all available engines."""
    return [cls() for cls in ENGINES.values()]


def list_engines():
    """List available engine names."""
    return list(ENGINES.keys())
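

# Usage sketch (illustrative only): shows the intended flow of
# build_url() -> fetch -> is_rate_limited() -> extract_urls(). The
# fetch_page() helper below is a placeholder for whichever HTTP client
# the caller supplies; this module performs no network requests itself.
if __name__ == '__main__':
    def fetch_page(url):
        """Placeholder fetcher -- swap in urllib2/requests as needed."""
        return ''

    engine = get_engine('duckduckgo')
    search_url = engine.build_url('free proxy list', page=0)
    html = fetch_page(search_url)
    if engine.is_rate_limited(html):
        print('empty response or rate limiting detected')
    else:
        for result in engine.extract_urls(html, urignore=[r'\.pdf$']):
            print(result)

    # Searx is not in ENGINES because it must be constructed with an
    # instance URL (e.g. one read from the instances file).
    searx = Searx('https://searx.example.org')
    print(searx.build_url('socks5 proxy list', page=1))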