diff --git a/engines.py b/engines.py
new file mode 100644
index 0000000..47c1ba5
--- /dev/null
+++ b/engines.py
@@ -0,0 +1,716 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""Search engine implementations for proxy list discovery."""
+
+import re
+import urllib
+import random
+from soup_parser import soupify
+from misc import _log
+
+
class SearchEngine(object):
    """Base class for search engines.

    Subclasses must implement build_url() and extract_urls(); the
    rate-limit detection in is_rate_limited() is shared.
    """

    name = 'base'
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0

    def __init__(self):
        # Timestamp of the most recent request (0 = never requested).
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content."""
        raise NotImplementedError

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting.

        An empty or missing response is treated as rate limited.
        """
        if not content:
            return True
        # Markers commonly present in throttling / captcha / block pages.
        rate_signals = (
            'rate limit', 'too many requests', 'blocked',
            'captcha', 'please verify', 'unusual traffic',
            'access denied', '403', '429',
        )
        lowered = content.lower()
        return any(signal in lowered for signal in rate_signals)
+
+
class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required)."""

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a DuckDuckGo HTML-endpoint search URL."""
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            offset = page * 30
            params['s'] = str(offset)
            params['dc'] = str(offset + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, unwrapping DDG redirect URLs."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip DuckDuckGo internal links
            if 'duckduckgo.com' in href:
                continue

            # DuckDuckGo wraps URLs - extract actual URL from redirect
            if '/l/?uddg=' in href or 'uddg=' in href:
                wrapped = re.search(r'uddg=([^&]+)', href)
                if wrapped:
                    try:
                        href = urllib.unquote(wrapped.group(1))
                    except Exception:
                        continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Startpage(SearchEngine):
    """Startpage search (privacy-focused, uses Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a Startpage web-search URL ('page' param is 1-based)."""
        params = {
            'query': query,
            'cat': 'web',
            'language': 'english',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, dropping Startpage-internal ones."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Startpage internal links
            if 'startpage.com' in href:
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Mojeek(SearchEngine):
    """Mojeek search (UK-based, independent index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Mojeek search URL."""
        params = {'q': query}
        if page > 0:
            # Mojeek uses 's' for start position (10 results per page)
            params['s'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, dropping Mojeek-internal ones."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Mojeek internal links
            if 'mojeek.com' in href or 'mojeek.co.uk' in href:
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Qwant(SearchEngine):
    """Qwant Lite search (French, EU-based, privacy-focused)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Qwant Lite web-search URL ('p' param is 1-based)."""
        params = {
            'q': query,
            't': 'web',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, dropping Qwant-internal ones."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Qwant internal links
            if 'qwant.com' in href:
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index)."""

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a Yandex search URL ('p' is the 0-based page number)."""
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect result links, unwrapping Yandex redirect URLs."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href:
                continue

            # Yandex uses redirect URLs, try to extract actual URL;
            # internal links with no embedded url= param are dropped.
            if '//yandex.' in href:
                embedded = re.search(r'url=([^&]+)', href)
                if not embedded:
                    continue
                try:
                    href = urllib.unquote(embedded.group(1))
                except Exception:
                    continue

            if not href.startswith('http'):
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Ecosia(SearchEngine):
    """Ecosia search (German, eco-friendly, uses Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build an Ecosia search URL."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, dropping Ecosia-internal ones."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Ecosia internal links
            if 'ecosia.org' in href:
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class Brave(SearchEngine):
    """Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Brave Search URL."""
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect external result links, dropping Brave-internal ones."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Brave internal links
            if 'brave.com' in href:
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists."""

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        """Build a GitHub search URL.

        Falls back to a random proxy-related query when none is given,
        and randomly alternates between repository and code search.
        """
        search_query = query if query else random.choice(self.github_queries)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def _ignored(self, href, urignore):
        """Return True if href matches any caller-supplied ignore regex."""
        for pattern in urignore:
            if re.search(pattern, href):
                return True
        return False

    def extract_urls(self, content, urignore=None):
        """Extract GitHub repo and raw-file links from a results page.

        Blob links are also emitted as /raw/ equivalents for direct
        download. Bug fix: urignore patterns were previously accepted
        but never applied; they are now enforced like in the other
        engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://github.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue

            # Skip non-content links (auth, marketing, settings pages)
            skip_patterns = (
                '/login', '/signup', '/join', '/settings',
                '/notifications', '/marketplace', '/explore',
                '/sponsors', '/pricing', '/features',
            )
            if any(pattern in href for pattern in skip_patterns):
                continue

            # Keep raw file links and repo links
            if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls and not self._ignored(raw_href, urignore):
                        urls.append(raw_href)

                if href not in urls and not self._ignored(href, urignore):
                    urls.append(href)

        return urls
+
+
class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        """Build a GitLab project-search URL (defaults to 'proxy list')."""
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract GitLab project links from a search results page.

        Bug fix: urignore patterns were previously accepted but never
        applied; they are now enforced like in the other engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://gitlab.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitLab project links
            if 'gitlab.com' not in href:
                continue

            # Skip non-project links
            skip_patterns = [
                '/users/', '/-/', '/explore', '/help',
                '/admin', '/dashboard', '/profile',
            ]
            if any(pattern in href for pattern in skip_patterns):
                continue

            # Keep project and file links (format: /namespace/project)
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                # Apply caller-supplied ignore patterns (previously unused).
                bad = False
                for pattern in urignore:
                    if re.search(pattern, href):
                        bad = True
                        break
                if not bad and href not in urls:
                    urls.append(href)

        return urls
+
+
class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a Codeberg repo-explore URL (defaults to 'proxy')."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract Codeberg repository links from an explore page.

        Bug fix: urignore patterns were previously accepted but never
        applied; they are now enforced like in the other engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://codeberg.org' + href

            if not href.startswith('http'):
                continue

            # Only keep Codeberg repo links
            if 'codeberg.org' not in href:
                continue

            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                # Apply caller-supplied ignore patterns (previously unused).
                bad = False
                for pattern in urignore:
                    if re.search(pattern, href):
                        bad = True
                        break
                if not bad and href not in urls:
                    urls.append(href)

        return urls
+
+
class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable)."""

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    # NOTE(review): git.sr.ht is SourceHut, not Gitea — its explore page
    # layout likely differs; confirm this scraper actually works there.
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://git.sr.ht',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        # One instance is picked per object, spreading load across instances.
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        """Build an explore/repos search URL on the chosen instance."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repository links from the instance's explore page.

        Bug fix: urignore patterns were previously accepted but never
        applied; they are now enforced like in the other engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        # Host part of the instance URL, used to reject external links.
        instance_host = self.current_instance.split('//')[1]

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href

            if not href.startswith('http'):
                continue

            # Keep repo links (format: scheme://host/user/repo)
            if instance_host in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    # Apply caller-supplied ignore patterns (previously unused).
                    bad = False
                    for pattern in urignore:
                        if re.search(pattern, href):
                            bad = True
                            break
                    if not bad and href not in urls:
                        urls.append(href)

        return urls
+
+
class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file)."""

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        # Each Searx object is bound to one concrete instance URL.
        self.base_url = instance_url

    def build_url(self, query, page=0):
        """Build a search URL on the configured Searx instance.

        A random recent time_range is chosen to vary the result set.
        """
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links)."""
        if not content:
            return []
        ignore_patterns = urignore or []
        found = []

        soup = soupify(content, nohtml=True)

        for anchor in soup.body.find_all('a'):
            # Searx uses rel="noreferrer" for result links
            rel = anchor.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue

            href = anchor.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Drop anything matching a caller-supplied ignore pattern.
            if any(re.search(pat, href) for pat in ignore_patterns):
                continue

            if href not in found:
                found.append(href)

        return found
+
+
# Registry of available engines, keyed by engine name.
# NOTE: Searx is intentionally absent — its constructor requires an
# instance_url argument, so it cannot be built via get_engine().
ENGINES = {
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'startpage': Startpage,
    'brave': Brave,
    'ecosia': Ecosia,
    # Regional/non-US search engines
    'mojeek': Mojeek,  # UK
    'qwant': Qwant,  # France
    'yandex': Yandex,  # Russia
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
}
+
+
def get_engine(name):
    """Get engine instance by name.

    Returns None when the name is not registered in ENGINES.
    """
    engine_cls = ENGINES.get(name)
    return engine_cls() if engine_cls is not None else None
+
+
def get_all_engines():
    """Instantiate every registered engine and return them as a list."""
    return [engine_cls() for engine_cls in ENGINES.values()]
+
+
def list_engines():
    """List available engine names."""
    return [engine_name for engine_name in ENGINES]