#!/usr/bin/env python2
|
|
# -*- coding: utf-8 -*-
|
|
"""Search engine implementations for proxy list discovery."""
|
|
|
|
import re
|
|
import urllib
|
|
import random
|
|
import time
|
|
from soup_parser import soupify
|
|
from misc import _log
|
|
|
|
|
|
def _date_weeks_ago(weeks):
|
|
"""Return date string (YYYY-MM-DD) for N weeks ago."""
|
|
secs = time.time() - (weeks * 7 * 24 * 3600)
|
|
return time.strftime('%Y-%m-%d', time.gmtime(secs))
|
|
|
|
|
|
def _urlencode(params):
|
|
"""URL-encode params dict, handling Unicode strings.
|
|
|
|
Python 2's urllib.urlencode() expects byte strings. This helper
|
|
encodes any Unicode values to UTF-8 before URL encoding.
|
|
|
|
Args:
|
|
params: Dictionary of query parameters
|
|
|
|
Returns:
|
|
URL-encoded query string
|
|
"""
|
|
encoded = {}
|
|
for k, v in params.items():
|
|
if isinstance(v, unicode):
|
|
v = v.encode('utf-8')
|
|
encoded[k] = v
|
|
return urllib.urlencode(encoded)
|
|
|
|
|
|
def _get_body(soup):
|
|
"""Get body element from soup, handling None case.
|
|
|
|
Args:
|
|
soup: BeautifulSoup or SoupResult object
|
|
|
|
Returns:
|
|
Body element or empty list wrapper if None
|
|
"""
|
|
if soup is None or soup.body is None:
|
|
# Return object with empty find_all to avoid AttributeError
|
|
class EmptyBody:
|
|
def find_all(self, *args, **kwargs):
|
|
return []
|
|
return EmptyBody()
|
|
return soup.body
|
|
|
|
|
|
class SearchEngine(object):
    """Base class for search engines.

    Subclasses must implement build_url() and may override extract_urls()
    when the engine wraps result links (redirects) or needs a whitelist.
    Class attributes below act as declarative configuration consumed by
    the shared extract_urls() pipeline.
    """

    # Engine identifier used in the ENGINES registry.
    name = 'base'
    # Endpoint that build_url() appends a query string to.
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0
    # Domains to skip (engine's own domain)
    skip_domains = []
    # Path patterns to skip (internal pages)
    skip_patterns = []
    # Base URL for relative link conversion (None = skip relative links)
    relative_base = None

    def __init__(self):
        # Timestamp of the last request; 0 means "never requested".
        # NOTE(review): nothing in this file updates it — presumably the
        # caller's rate limiter does; confirm against the fetch loop.
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content.

        Base implementation handles common patterns:
        - Skips empty content
        - Parses HTML with soupify
        - Converts relative URLs if relative_base is set
        - Skips domains in skip_domains
        - Skips paths matching skip_patterns
        - Applies urignore regex patterns
        - Deduplicates results

        Args:
            content: Raw HTML response body (may be empty/None)
            urignore: Optional list of regex patterns; matching hrefs
                are dropped.

        Returns:
            Ordered, de-duplicated list of absolute http(s) URLs.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Handle relative URLs
            if not href.startswith('http'):
                if self.relative_base and href.startswith('/'):
                    href = self.relative_base + href
                else:
                    # Non-http scheme or relative link with no base: drop.
                    continue

            # Skip engine's own domain(s)
            # `skip` is reused by the next two filters; it stays False here
            # whenever we fall through (a True always hits `continue`).
            skip = False
            for domain in self.skip_domains:
                if domain in href:
                    skip = True
                    break
            if skip:
                continue

            # Skip internal paths
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Check urignore patterns
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            # Preserve first-seen order while de-duplicating (O(n) scan).
            if href not in urls:
                urls.append(href)

        return urls

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting.

        An empty/None body is treated as rate-limited (conservative).
        NOTE(review): signals are plain substring matches on the lowered
        body — '403'/'429' anywhere in a page (e.g. in result text) will
        also trigger; acceptable for best-effort scraping.
        """
        if not content:
            return True
        rate_signals = (
            'rate limit', 'too many requests', 'blocked',
            'captcha', 'please verify', 'unusual traffic',
            'access denied', '403', '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False
|
|
|
|
|
|
class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required).

    Uses the html.duckduckgo.com endpoint, whose result links are wrapped
    in a /l/?uddg=<percent-encoded-url> redirect that _unwrap_url() peels.
    """

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10
    skip_domains = ['duckduckgo.com']

    def build_url(self, query, page=0):
        """Build the HTML-endpoint search URL for *query* and *page*."""
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            # 'dc' mirrors the offset as a 1-based result counter.
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from DuckDuckGo redirect wrapper.

        Returns the decoded target for uddg= wrappers, None when decoding
        fails (caller drops the link), or *href* unchanged for plain links
        (and for a wrapper whose uddg value is empty, which falls through).
        """
        if '/l/?uddg=' in href or 'uddg=' in href:
            match = re.search(r'uddg=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping DDG redirects first.

        Overrides the base pipeline because the domain filter must run on
        the *unwrapped* target, not the wrapper URL.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            if not href:
                continue

            # Skip engine's domain
            if 'duckduckgo.com' in href:
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Startpage(SearchEngine):
    """Adapter for Startpage (privacy proxy over Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5
    skip_domains = ['startpage.com']

    def build_url(self, query, page=0):
        """Return the Startpage search URL for *query* at *page* (0-based)."""
        params = {'query': query, 'cat': 'web', 'language': 'english'}
        if page > 0:
            # Startpage pages are 1-based.
            params['page'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Mojeek(SearchEngine):
    """Adapter for Mojeek (UK-based engine with its own index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10
    skip_domains = ['mojeek.com', 'mojeek.co.uk']

    def build_url(self, query, page=0):
        """Return the Mojeek search URL for *query* at *page* (0-based)."""
        params = {'q': query}
        if page > 0:
            # 's' is the 1-based start position; Mojeek serves 10 hits/page.
            params['s'] = str(10 * page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Qwant(SearchEngine):
    """Adapter for Qwant Lite (EU-based, privacy-focused, JS-free UI)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10
    skip_domains = ['qwant.com']

    def build_url(self, query, page=0):
        """Return the Qwant Lite web-search URL for *query* at *page*."""
        params = {'q': query, 't': 'web'}
        if page > 0:
            # Qwant Lite pages are 1-based.
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index).

    Result links may be wrapped in a yandex redirect carrying the target
    in a url= query parameter; _unwrap_url() decodes it.
    """

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5
    skip_domains = ['yandex.com', 'yandex.ru']

    def build_url(self, query, page=0):
        """Build the Yandex search URL; 'p' is the 0-based page number."""
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yandex redirect wrapper.

        Returns the decoded url= target for yandex-hosted links, None for
        yandex links without one (this doubles as the own-domain filter —
        such links are internal navigation), or *href* unchanged otherwise.
        """
        if '//yandex.' in href:
            match = re.search(r'url=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
            return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping Yandex redirects first."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            # Post-unwrap check: decoded targets must still be http(s).
            if not href or not href.startswith('http'):
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Ecosia(SearchEngine):
    """Adapter for Ecosia (German engine backed by Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10
    skip_domains = ['ecosia.org']

    def build_url(self, query, page=0):
        """Return the Ecosia search URL; 'p' is the 0-based page index."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Bing(SearchEngine):
    """Adapter for Microsoft Bing."""

    name = 'bing'
    base_url = 'https://www.bing.com/search'
    rate_limit = 10
    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']

    def build_url(self, query, page=0):
        """Return the Bing search URL; 'first' is the 1-based result offset."""
        params = {'q': query}
        if page > 0:
            # Bing serves 10 results per page.
            params['first'] = str(10 * page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Yahoo(SearchEngine):
    """Yahoo search.

    Result links are wrapped in a redirect of the form .../RU=<encoded>/...
    which _unwrap_url() decodes before filtering.
    """

    name = 'yahoo'
    base_url = 'https://search.yahoo.com/search'
    rate_limit = 10
    skip_domains = ['yahoo.com', 'yahooapis.com']

    def build_url(self, query, page=0):
        """Build the Yahoo search URL; 'b' is the 1-based result offset."""
        params = {'p': query}
        if page > 0:
            # Yahoo serves 10 results per page.
            params['b'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yahoo redirect wrapper.

        Returns the decoded /RU= target, None when decoding fails, or
        *href* unchanged for non-wrapped links.
        """
        if 'yahoo.com' in href and '/RU=' in href:
            match = re.search(r'/RU=([^/]+)/', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping Yahoo redirects first.

        Domain filtering runs on the unwrapped target, so yahoo-internal
        links (which unwrap to themselves) are still dropped below.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            href = self._unwrap_url(href)
            if not href:
                continue

            skip = False
            for domain in self.skip_domains:
                if domain in href:
                    skip = True
                    break
            if skip:
                continue

            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Gigablast(SearchEngine):
    """Adapter for Gigablast (independent US index)."""

    name = 'gigablast'
    base_url = 'https://www.gigablast.com/search'
    rate_limit = 15
    skip_domains = ['gigablast.com']

    def build_url(self, query, page=0):
        """Return the Gigablast search URL; 's' is the 0-based offset."""
        params = {'q': query}
        if page > 0:
            # 10 results per page.
            params['s'] = str(10 * page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Metager(SearchEngine):
    """Adapter for MetaGer (German privacy-focused meta search)."""

    name = 'metager'
    base_url = 'https://metager.org/meta/meta.ger3'
    rate_limit = 10
    skip_domains = ['metager.org', 'metager.de']

    def build_url(self, query, page=0):
        """Return the MetaGer search URL for *query* at *page* (0-based)."""
        params = {'eingabe': query, 'lang': 'en'}
        if page > 0:
            # MetaGer pages are 1-based.
            params['page'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Swisscows(SearchEngine):
    """Adapter for Swisscows (Swiss, privacy-focused)."""

    name = 'swisscows'
    base_url = 'https://swisscows.com/web'
    rate_limit = 10
    skip_domains = ['swisscows.com']

    def build_url(self, query, page=0):
        """Return the Swisscows web-search URL; 'offset' counts results."""
        params = {'query': query, 'region': 'en-US'}
        if page > 0:
            # 10 results per page.
            params['offset'] = str(10 * page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Alexandria(SearchEngine):
    """Adapter for Alexandria (independent, non-commercial focus)."""

    name = 'alexandria'
    base_url = 'https://www.alexandria.org/search'
    rate_limit = 15
    skip_domains = ['alexandria.org']

    def build_url(self, query, page=0):
        """Return the Alexandria search URL; 'p' is the 1-based page."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Brave(SearchEngine):
    """Adapter for Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10
    skip_domains = ['brave.com']

    def build_url(self, query, page=0):
        """Return the Brave Search URL; 'offset' is the 0-based page."""
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists.

    Searches recently-pushed repos/code and yields repo root URLs plus
    file URLs; blob links are additionally duplicated as raw links so
    downstream fetchers can grab the file contents directly.
    """

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5
    relative_base = 'https://github.com'
    skip_patterns = [
        '/login', '/signup', '/join', '/settings',
        '/notifications', '/marketplace', '/explore',
        '/sponsors', '/pricing', '/features',
    ]

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        """Build a GitHub search URL (non-deterministic: random type/query).

        Falls back to a random entry of github_queries when *query* is
        empty, and randomly targets repository or code search.
        """
        # GitHub search for repositories and code (pushed in last 2 weeks)
        base_query = query if query else random.choice(self.github_queries)
        date_filter = _date_weeks_ago(2)
        search_query = '%s pushed:>%s' % (base_query, date_filter)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
            's': 'updated',  # Sort by recently updated
            'o': 'desc',
        }
        if page > 0:
            # GitHub pagination is 1-based.
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs with blob-to-raw conversion for direct file access.

        Keeps only github.com links that are /raw/ files, /blob/ files, or
        bare user/repo roots; each /blob/ link is emitted twice — once
        rewritten to /raw/ and once as-is.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http'):
                continue

            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue

            # Skip internal pages
            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Check urignore patterns
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            # Keep raw file links and repo links
            if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls:
                        urls.append(raw_href)

                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5
    relative_base = 'https://gitlab.com'
    skip_patterns = [
        '/users/', '/-/', '/explore', '/help',
        '/admin', '/dashboard', '/profile',
    ]

    def build_url(self, query, page=0):
        """Build a GitLab project-search URL, newest updates first."""
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
            'sort': 'updated_desc',  # Most recently updated first
        }
        if page > 0:
            # GitLab pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract project URLs only (whitelist pattern).

        NOTE(review): *urignore* is accepted for interface parity but not
        applied here — presumably intentional since only gitlab.com
        project roots are kept; confirm with callers.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'gitlab.com' not in href:
                continue

            # Skip internal pages
            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep only project links
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10
    relative_base = 'https://codeberg.org'

    def build_url(self, query, page=0):
        """Build a Codeberg repo-explore URL sorted by recent updates."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            # Forgejo pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs only (whitelist pattern).

        Keeps only bare https://codeberg.org/<user>/<repo> links; the
        *urignore* parameter is unused here (whitelist already narrow).
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'codeberg.org' not in href:
                continue

            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Bitbucket(SearchEngine):
    """Bitbucket repository search."""

    name = 'bitbucket'
    base_url = 'https://bitbucket.org/repo/all'
    rate_limit = 10
    relative_base = 'https://bitbucket.org'
    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']

    def build_url(self, query, page=0):
        """Build a Bitbucket repo-listing URL filtered by repo name."""
        params = {'name': query if query else 'proxy'}
        if page > 0:
            # Bitbucket pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract workspace/repo root URLs (whitelist pattern).

        Only bare https://bitbucket.org/<workspace>/<repo> links survive;
        *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'bitbucket.org' not in href:
                continue

            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep repo links (format: /workspace/repo)
            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Sourcehut(SearchEngine):
    """Sourcehut (sr.ht) repository search."""

    name = 'sourcehut'
    base_url = 'https://sr.ht/projects'
    rate_limit = 10
    relative_base = 'https://sr.ht'

    def build_url(self, query, page=0):
        """Build a sr.ht project-search URL for *query* at *page*."""
        params = {'search': query if query else 'proxy'}
        if page > 0:
            # Sourcehut pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract git.sr.ht repository URLs (whitelist pattern).

        Only links containing 'git.sr.ht/~' (user-owned repos) are kept;
        *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http'):
                continue

            # Keep git.sr.ht repo links
            if 'git.sr.ht/~' in href:
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Pastebin(SearchEngine):
    """Pastebin search via DuckDuckGo site: query (recent only).

    Piggybacks on the DDG HTML endpoint with a site: filter on a
    randomly chosen paste host, then whitelists paste-site links.
    """

    name = 'pastebin'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    # Pastebin sites to search
    paste_sites = [
        'pastebin.com',
        'paste.ee',
        'dpaste.org',
        'hastebin.com',
        'ghostbin.com',
        'paste.ubuntu.com',
        'bpa.st',
    ]

    def build_url(self, query, page=0):
        """Build a DDG site:-scoped URL (non-deterministic: random site)."""
        # Search for proxy lists on paste sites (last 2 weeks only)
        site = random.choice(self.paste_sites)
        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            # DDG HTML endpoint: 30 results per page via 's' offset.
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract paste-site URLs from DDG results.

        Unwraps uddg= redirects inline (a decode failure skips the link),
        keeps only hosts in paste_sites, then applies *urignore*.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Only keep paste site links
            is_paste = False
            for site in self.paste_sites:
                if site in href:
                    is_paste = True
                    break
            if not is_paste:
                continue

            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Rentry(SearchEngine):
    """Rentry.co/org search (markdown pastebin often used for proxy lists).

    Like Pastebin, piggybacks on the DDG HTML endpoint with site: filters.
    """

    name = 'rentry'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a DDG URL scoped to rentry.co/rentry.org."""
        # Search rentry for proxy lists (last 2 weeks only)
        search_query = 'site:rentry.co OR site:rentry.org %s' % (query if query else 'proxy socks')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            # DDG HTML endpoint: 30 results per page via 's' offset.
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract rentry links from DDG results (whitelist pattern).

        Unwraps uddg= redirects inline; *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Only keep rentry links
            if 'rentry.co' in href or 'rentry.org' in href:
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable).

    Each instance of this class pins one randomly chosen public Gitea
    host for both URL building and result filtering.
    """

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://gitea.com',
        'https://try.gitea.io',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        # Chosen once per object; extract_urls() filters against it.
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        """Build an explore/repos URL on the pinned instance."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            # Gitea pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs for current dynamic instance.

        Keeps only <scheme>://<host>/<owner>/<repo> links on the pinned
        instance's domain; *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)
        # Strip the scheme ('https://...') to get the bare host for matching.
        instance_domain = self.current_instance.split('//')[1]

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href

            if not href.startswith('http'):
                continue

            # Keep repo links for this instance
            if instance_domain in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if href not in urls:
                        urls.append(href)

        return urls
|
|
|
|
|
|
class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file).

    Unlike the other engines, the instance URL is injected per object
    rather than declared as a class attribute.
    """

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        # Root URL of the specific Searx instance to query.
        self.base_url = instance_url

    def build_url(self, query, page=0):
        """Build a Searx query URL (non-deterministic: random time range)."""
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            # Searx pagination is 1-based.
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links only)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            # Searx uses rel="noreferrer" for result links
            # (str() because parsers may return rel as a list of tokens).
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue

            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class DuckDuckGoOnion(DuckDuckGo):
    """DuckDuckGo via Tor hidden service.

    Inherits build/extract logic; only the endpoint and the own-domain
    filter (which also drops .onion results) change.
    """

    name = 'duckduckgo_onion'
    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
    skip_domains = ['duckduckgo.com', '.onion']
|
|
|
|
|
|
class StartpageOnion(Startpage):
    """Startpage via Tor hidden service.

    Same query interface as Startpage; .onion results are filtered out.
    """

    name = 'startpage_onion'
    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
    skip_domains = ['startpage.com', '.onion']
|
|
|
|
|
|
class BraveOnion(Brave):
    """Brave Search via Tor hidden service.

    Same query interface as Brave; .onion results are filtered out.
    """

    name = 'brave_onion'
    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
    skip_domains = ['brave.com', '.onion']
|
|
|
|
|
|
class Ahmia(SearchEngine):
    """Adapter for Ahmia, a search engine indexing .onion sites."""

    name = 'ahmia'
    base_url = 'https://ahmia.fi/search/'
    rate_limit = 10
    skip_domains = ['ahmia.fi']

    def build_url(self, query, page=0):
        """Return the Ahmia search URL; 'p' is the 1-based page."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class AhmiaOnion(Ahmia):
    """Ahmia via Tor hidden service.

    Same query interface as Ahmia; .onion results are filtered out.
    """

    name = 'ahmia_onion'
    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
    skip_domains = ['ahmia.fi', '.onion']
|
|
|
|
|
|
# Registry of available engines
# Maps engine name -> class; get_engine()/get_all_engines() instantiate
# from here.  Note: Searx is intentionally absent (it needs an
# instance_url constructor argument).
ENGINES = {
    # Major search engines
    'bing': Bing,
    'yahoo': Yahoo,
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'duckduckgo_onion': DuckDuckGoOnion,
    'startpage': Startpage,
    'startpage_onion': StartpageOnion,
    'brave': Brave,
    'brave_onion': BraveOnion,
    'ahmia': Ahmia,
    'ahmia_onion': AhmiaOnion,
    'ecosia': Ecosia,
    'metager': Metager,
    'swisscows': Swisscows,
    # Independent/regional search engines
    'mojeek': Mojeek,  # UK
    'qwant': Qwant,  # France
    'yandex': Yandex,  # Russia
    'gigablast': Gigablast,
    'alexandria': Alexandria,
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
    'bitbucket': Bitbucket,
    'sourcehut': Sourcehut,
    # Paste sites
    'pastebin': Pastebin,
    'rentry': Rentry,
}
|
|
|
|
|
|
def get_engine(name):
    """Instantiate the engine registered under *name* (None if unknown)."""
    engine_cls = ENGINES.get(name)
    if engine_cls is None:
        return None
    return engine_cls()
|
|
|
|
|
|
def get_all_engines():
    """Instantiate and return every engine in the registry."""
    engines = []
    for engine_cls in ENGINES.values():
        engines.append(engine_cls())
    return engines
|
|
|
|
|
|
def list_engines():
    """Return the names of all registered engines."""
    return [engine_name for engine_name in ENGINES]
|