From 630ed96aa2a391fb5720fbd4598429f225bfd93d Mon Sep 17 00:00:00 2001
From: Username <user@mymx.me>
Date: Thu, 25 Dec 2025 02:51:11 +0100
Subject: [PATCH] engines: add Bing, Yahoo and more engines; add recency filters

---
 engines.py | 442 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 438 insertions(+), 4 deletions(-)

diff --git a/engines.py b/engines.py
index a743656..e016d46 100644
--- a/engines.py
+++ b/engines.py
@@ -5,10 +5,17 @@
 import re
 import urllib
 import random
+import time
 from soup_parser import soupify
 from misc import _log
 
 
+def _date_weeks_ago(weeks):
+    """Return date string (YYYY-MM-DD) for N weeks ago."""
+    secs = time.time() - (weeks * 7 * 24 * 3600)
+    return time.strftime('%Y-%m-%d', time.gmtime(secs))
+
+
 def _urlencode(params):
     """URL-encode params dict, handling Unicode strings.
 
@@ -335,6 +342,148 @@ class Ecosia(SearchEngine):
         return '%s?%s' % (self.base_url, _urlencode(params))
 
 
+class Bing(SearchEngine):
+    """Bing search (Microsoft)."""
+
+    name = 'bing'
+    base_url = 'https://www.bing.com/search'
+    rate_limit = 10
+    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['first'] = str(page * 10 + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Yahoo(SearchEngine):
+    """Yahoo search."""
+
+    name = 'yahoo'
+    base_url = 'https://search.yahoo.com/search'
+    rate_limit = 10
+    skip_domains = ['yahoo.com', 'yahooapis.com']
+
+    def build_url(self, query, page=0):
+        params = {'p': query}
+        if page > 0:
+            params['b'] = str(page * 10 + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def _unwrap_url(self, href):
+        """Extract actual URL from Yahoo redirect wrapper."""
+        if 'yahoo.com' in href and '/RU=' in href:
+            match = re.search(r'/RU=([^/]+)/', href)
+            if match:
+                try:
+                    return urllib.unquote(match.group(1))
+                except Exception:
+                    return None
+        return href
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+        urignore = urignore or []
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href or not href.startswith('http'):
+                continue
+
+            href = self._unwrap_url(href)
+            if not href:
+                continue
+
+            skip = False
+            for domain in self.skip_domains:
+                if domain in href:
+                    skip = True
+                    break
+            if skip:
+                continue
+
+            for pattern in urignore:
+                if re.search(pattern, href):
+                    skip = True
+                    break
+
+            if not skip and href not in urls:
+                urls.append(href)
+
+        return urls
+
+
+class Gigablast(SearchEngine):
+    """Gigablast search (independent US index)."""
+
+    name = 'gigablast'
+    base_url = 'https://www.gigablast.com/search'
+    rate_limit = 15
+    skip_domains = ['gigablast.com']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['s'] = str(page * 10)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Metager(SearchEngine):
+    """MetaGer search (German, privacy-focused meta search)."""
+
+    name = 'metager'
+    base_url = 'https://metager.org/meta/meta.ger3'
+    rate_limit = 10
+    skip_domains = ['metager.org', 'metager.de']
+
+    def build_url(self, query, page=0):
+        params = {
+            'eingabe': query,
+            'lang': 'en',
+        }
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Swisscows(SearchEngine):
+    """Swisscows search (Swiss, privacy-focused)."""
+
+    name = 'swisscows'
+    base_url = 'https://swisscows.com/web'
+    rate_limit = 10
+    skip_domains = ['swisscows.com']
+
+    def build_url(self, query, page=0):
+        params = {
+            'query': query,
+            'region': 'en-US',
+        }
+        if page > 0:
+            params['offset'] = str(page * 10)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Alexandria(SearchEngine):
+    """Alexandria search (independent, non-commercial focus)."""
+
+    name = 'alexandria'
+    base_url = 'https://www.alexandria.org/search'
+    rate_limit = 15
+    skip_domains = ['alexandria.org']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['p'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
 class Brave(SearchEngine):
     """Brave Search (privacy-focused, independent index)."""
 
@@ -376,11 +525,15 @@ class GitHub(SearchEngine):
     ]
 
     def build_url(self, query, page=0):
-        # GitHub search for repositories and code
-        search_query = query if query else random.choice(self.github_queries)
+        # GitHub search for repositories and code (pushed in last 2 weeks)
+        base_query = query if query else random.choice(self.github_queries)
+        date_filter = _date_weeks_ago(2)
+        search_query = '%s pushed:>%s' % (base_query, date_filter)
         params = {
             'q': search_query,
             'type': random.choice(['repositories', 'code']),
+            's': 'updated',  # Sort by recently updated
+            'o': 'desc',
         }
         if page > 0:
             params['p'] = str(page + 1)
@@ -459,6 +612,7 @@ class GitLab(SearchEngine):
         params = {
             'search': search_query,
             'scope': 'projects',
+            'sort': 'updated_desc',  # Most recently updated first
         }
         if page > 0:
             params['page'] = str(page + 1)
@@ -547,6 +701,221 @@ class Codeberg(SearchEngine):
         return urls
 
 
+class Bitbucket(SearchEngine):
+    """Bitbucket repository search."""
+
+    name = 'bitbucket'
+    base_url = 'https://bitbucket.org/repo/all'
+    rate_limit = 10
+    relative_base = 'https://bitbucket.org'
+    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']
+
+    def build_url(self, query, page=0):
+        params = {'name': query if query else 'proxy'}
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            if href.startswith('/'):
+                href = self.relative_base + href
+
+            if not href.startswith('http') or 'bitbucket.org' not in href:
+                continue
+
+            skip = False
+            for pattern in self.skip_patterns:
+                if pattern in href:
+                    skip = True
+                    break
+            if skip:
+                continue
+
+            # Keep repo links (format: /workspace/repo)
+            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
+class Sourcehut(SearchEngine):
+    """Sourcehut (sr.ht) repository search."""
+
+    name = 'sourcehut'
+    base_url = 'https://sr.ht/projects'
+    rate_limit = 10
+    relative_base = 'https://sr.ht'
+
+    def build_url(self, query, page=0):
+        params = {'search': query if query else 'proxy'}
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            if href.startswith('/'):
+                href = self.relative_base + href
+
+            if not href.startswith('http'):
+                continue
+
+            # Keep git.sr.ht repo links
+            if 'git.sr.ht/~' in href:
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
+class Pastebin(SearchEngine):
+    """Pastebin search via DuckDuckGo site: query (recent only)."""
+
+    name = 'pastebin'
+    base_url = 'https://html.duckduckgo.com/html/'
+    rate_limit = 10
+
+    # Pastebin sites to search
+    paste_sites = [
+        'pastebin.com',
+        'paste.ee',
+        'dpaste.org',
+        'hastebin.com',
+        'ghostbin.com',
+        'paste.ubuntu.com',
+        'bpa.st',
+    ]
+
+    def build_url(self, query, page=0):
+        # Search for proxy lists on paste sites (past week only, via 'df')
+        site = random.choice(self.paste_sites)
+        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
+        params = {
+            'q': search_query,
+            'df': 'w',  # Past week (DuckDuckGo date filter)
+        }
+        if page > 0:
+            params['s'] = str(page * 30)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+        urignore = urignore or []
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            # Unwrap DDG redirect
+            if 'uddg=' in href:
+                match = re.search(r'uddg=([^&]+)', href)
+                if match:
+                    try:
+                        href = urllib.unquote(match.group(1))
+                    except Exception:
+                        continue
+
+            if not href.startswith('http'):
+                continue
+
+            # Only keep paste site links
+            is_paste = False
+            for site in self.paste_sites:
+                if site in href:
+                    is_paste = True
+                    break
+            if not is_paste:
+                continue
+
+            skip = False
+            for pattern in urignore:
+                if re.search(pattern, href):
+                    skip = True
+                    break
+
+            if not skip and href not in urls:
+                urls.append(href)
+
+        return urls
+
+
+class Rentry(SearchEngine):
+    """Rentry.co/org search via DuckDuckGo (markdown pastebin often used for proxy lists)."""
+
+    name = 'rentry'
+    base_url = 'https://html.duckduckgo.com/html/'
+    rate_limit = 10
+
+    def build_url(self, query, page=0):
+        # Search rentry for proxy lists (past week only, via 'df')
+        search_query = 'site:rentry.co OR site:rentry.org %s' % (query if query else 'proxy socks')
+        params = {
+            'q': search_query,
+            'df': 'w',  # Past week (DuckDuckGo date filter)
+        }
+        if page > 0:
+            params['s'] = str(page * 30)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            # Unwrap DDG redirect
+            if 'uddg=' in href:
+                match = re.search(r'uddg=([^&]+)', href)
+                if match:
+                    try:
+                        href = urllib.unquote(match.group(1))
+                    except Exception:
+                        continue
+
+            if not href.startswith('http'):
+                continue
+
+            # Only keep rentry links
+            if 'rentry.co' in href or 'rentry.org' in href:
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
 class Gitea(SearchEngine):
     """Generic Gitea instance search (configurable)."""
 
@@ -557,7 +926,8 @@ class Gitea(SearchEngine):
     instances = [
         'https://git.disroot.org',
         'https://git.envs.net',
-        'https://git.sr.ht',
+        'https://gitea.com',
+        'https://try.gitea.io',
     ]
 
     def __init__(self):
@@ -656,22 +1026,86 @@ class Searx(SearchEngine):
         return urls
 
 
+class DuckDuckGoOnion(DuckDuckGo):
+    """DuckDuckGo via Tor hidden service."""
+
+    name = 'duckduckgo_onion'
+    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
+    skip_domains = ['duckduckgo.com', '.onion']
+
+
+class StartpageOnion(Startpage):
+    """Startpage via Tor hidden service."""
+
+    name = 'startpage_onion'
+    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
+    skip_domains = ['startpage.com', '.onion']
+
+
+class BraveOnion(Brave):
+    """Brave Search via Tor hidden service."""
+
+    name = 'brave_onion'
+    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
+    skip_domains = ['brave.com', '.onion']
+
+
+class Ahmia(SearchEngine):
+    """Ahmia dark web search engine (indexes .onion sites)."""
+
+    name = 'ahmia'
+    base_url = 'https://ahmia.fi/search/'
+    rate_limit = 10
+    skip_domains = ['ahmia.fi']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['p'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class AhmiaOnion(Ahmia):
+    """Ahmia via Tor hidden service."""
+
+    name = 'ahmia_onion'
+    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
+    skip_domains = ['ahmia.fi', '.onion']
+
+
 # Registry of available engines
 ENGINES = {
+    # Major search engines
+    'bing': Bing,
+    'yahoo': Yahoo,
     # Privacy-focused search engines
     'duckduckgo': DuckDuckGo,
+    'duckduckgo_onion': DuckDuckGoOnion,
     'startpage': Startpage,
+    'startpage_onion': StartpageOnion,
     'brave': Brave,
+    'brave_onion': BraveOnion,
+    'ahmia': Ahmia,
+    'ahmia_onion': AhmiaOnion,
     'ecosia': Ecosia,
-    # Regional/non-US search engines
+    'metager': Metager,
+    'swisscows': Swisscows,
+    # Independent/regional search engines
     'mojeek': Mojeek,      # UK
     'qwant': Qwant,        # France
     'yandex': Yandex,      # Russia
+    'gigablast': Gigablast,
+    'alexandria': Alexandria,
     # Git hosting platforms
     'github': GitHub,
     'gitlab': GitLab,
     'codeberg': Codeberg,
     'gitea': Gitea,
+    'bitbucket': Bitbucket,
+    'sourcehut': Sourcehut,
+    # Paste sites
+    'pastebin': Pastebin,
+    'rentry': Rentry,
 }