#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Search engine implementations for proxy list discovery."""

import re
import urllib
import random
from soup_parser import soupify
from misc import _log


class SearchEngine(object):
    """Base class for search engines.

    Subclasses implement build_url() and extract_urls(), and may override
    rate_limit to advertise how aggressively the engine can be polled.
    """

    name = 'base'
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0

    def __init__(self):
        # Timestamp of the last request; not enforced here, callers may
        # use it together with rate_limit to throttle themselves.
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content.

        urignore is an optional list of regex patterns; any URL matching
        one of them must be dropped by the implementation.
        """
        raise NotImplementedError

    def _is_ignored(self, href, urignore):
        """Return True if href matches any regex pattern in urignore."""
        for pattern in urignore:
            if re.search(pattern, href):
                return True
        return False

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting.

        NOTE(review): the bare '403'/'429' substrings are broad and can
        false-positive on pages that merely mention those numbers.
        """
        if not content:
            return True
        rate_signals = (
            'rate limit', 'too many requests', 'blocked',
            'captcha', 'please verify', 'unusual traffic',
            'access denied', '403', '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False


class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required)."""

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a DuckDuckGo HTML search URL."""
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping DuckDuckGo redirect links."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # DuckDuckGo wraps result URLs in a redirect. Unwrap FIRST:
            # redirect links are hosted on duckduckgo.com, so the
            # internal-link filter below would otherwise discard them
            # before the real target URL was extracted (bug fix).
            if '/l/?uddg=' in href or 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            # Skip DuckDuckGo internal links (after unwrapping)
            if 'duckduckgo.com' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Startpage(SearchEngine):
    """Startpage search (privacy-focused, uses Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a Startpage search URL (1-based 'page' parameter)."""
        params = {
            'query': query,
            'cat': 'web',
            'language': 'english',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract external result URLs from a Startpage results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Startpage internal links
            if 'startpage.com' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Mojeek(SearchEngine):
    """Mojeek search (UK-based, independent index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Mojeek search URL."""
        params = {'q': query}
        if page > 0:
            # Mojeek uses 's' for start position (10 results per page)
            params['s'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract external result URLs from a Mojeek results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Mojeek internal links
            if 'mojeek.com' in href or 'mojeek.co.uk' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Qwant(SearchEngine):
    """Qwant Lite search (French, EU-based, privacy-focused)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Qwant Lite search URL (1-based 'p' parameter)."""
        params = {
            'q': query,
            't': 'web',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract external result URLs from a Qwant Lite results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Qwant internal links
            if 'qwant.com' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index)."""

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a Yandex search URL (0-based 'p' page parameter)."""
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping Yandex redirect links."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Yandex uses redirect URLs, try to extract actual URL
            if '//yandex.' in href:
                # Try to find embedded URL; drop pure internal links
                match = re.search(r'url=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
                else:
                    continue

            if not href.startswith('http'):
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Ecosia(SearchEngine):
    """Ecosia search (German, eco-friendly, uses Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build an Ecosia search URL (0-based 'p' page parameter)."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract external result URLs from an Ecosia results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Ecosia internal links
            if 'ecosia.org' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class Brave(SearchEngine):
    """Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Brave Search URL ('offset' page parameter)."""
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract external result URLs from a Brave results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Brave internal links
            if 'brave.com' in href:
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists."""

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        """Build a GitHub search URL; falls back to a random stock query."""
        # GitHub search for repositories and code
        search_query = query if query else random.choice(self.github_queries)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract GitHub repo/file URLs, converting blob links to raw."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://github.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue

            # Skip non-content links
            skip_patterns = [
                '/login', '/signup', '/join', '/settings',
                '/notifications', '/marketplace', '/explore',
                '/sponsors', '/pricing', '/features',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep raw file links and repo links; apply urignore (it was
            # previously accepted but never applied here — consistency
            # fix with the other engines)
            if ('/raw/' in href or '/blob/' in href
                    or re.match(r'https://github\.com/[^/]+/[^/]+$', href)):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if (raw_href not in urls
                            and not self._is_ignored(raw_href, urignore)):
                        urls.append(raw_href)

                if href not in urls and not self._is_ignored(href, urignore):
                    urls.append(href)

        return urls


class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a GitLab project-search URL (default query 'proxy list')."""
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract GitLab project URLs from a search results page."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://gitlab.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitLab project links
            if 'gitlab.com' not in href:
                continue

            # Skip non-project links
            skip_patterns = [
                '/users/', '/-/', '/explore', '/help',
                '/admin', '/dashboard', '/profile',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep project and file links; urignore is now applied
            # (consistency fix — it was accepted but unused before)
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls and not self._is_ignored(href, urignore):
                    urls.append(href)

        return urls


class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Codeberg repo-explore URL (default query 'proxy')."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract Codeberg repository URLs (/user/repo) from results."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://codeberg.org' + href

            if not href.startswith('http'):
                continue

            # Only keep Codeberg repo links
            if 'codeberg.org' not in href:
                continue

            # Keep repo links (format: /user/repo); urignore is now
            # applied (consistency fix — it was accepted but unused)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls and not self._is_ignored(href, urignore):
                    urls.append(href)

        return urls


class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable)."""

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://git.sr.ht',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        # One instance is picked per engine object and used for both
        # building URLs and resolving relative hrefs.
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        """Build an explore/repos URL on the chosen Gitea instance."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance,
                                        urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repository URLs hosted on the current instance."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href

            if not href.startswith('http'):
                continue

            # Keep repo links on this instance only; urignore is now
            # applied (consistency fix — it was accepted but unused)
            if self.current_instance.split('//')[1] in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if (href not in urls
                            and not self._is_ignored(href, urignore)):
                        urls.append(href)

        return urls


class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file)."""

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        # Unlike the other engines, the base URL is supplied per instance.
        self.base_url = instance_url

    def build_url(self, query, page=0):
        """Build a search URL on this Searx instance."""
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            # Searx uses rel="noreferrer" for result links
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue

            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            if not self._is_ignored(href, urignore) and href not in urls:
                urls.append(href)

        return urls


# Registry of available engines. Searx is intentionally absent: it
# requires an instance_url argument and cannot be built by get_engine().
ENGINES = {
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'startpage': Startpage,
    'brave': Brave,
    'ecosia': Ecosia,
    # Regional/non-US search engines
    'mojeek': Mojeek,   # UK
    'qwant': Qwant,     # France
    'yandex': Yandex,   # Russia
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
}


def get_engine(name):
    """Get engine instance by name, or None if unknown."""
    if name not in ENGINES:
        return None
    return ENGINES[name]()


def get_all_engines():
    """Get instances of all available engines."""
    return [cls() for cls in ENGINES.values()]


def list_engines():
    """List available engine names."""
    return list(ENGINES.keys())