From 630ed96aa2a391fb5720fbd4598429f225bfd93d Mon Sep 17 00:00:00 2001
From: Username
Date: Thu, 25 Dec 2025 02:51:11 +0100
Subject: [PATCH] engines: add Bing and Yahoo search engines

---
Reviewer note: besides Bing and Yahoo, this patch also adds the Gigablast,
MetaGer, Swisscows, Alexandria, Bitbucket, Sourcehut, Pastebin, Rentry and
Ahmia engines, .onion variants of DuckDuckGo, Startpage, Brave and Ahmia,
recency/sort filters for the GitHub and GitLab queries, and it replaces
git.sr.ht in the Gitea instance list with gitea.com and try.gitea.io.

 engines.py | 442 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 438 insertions(+), 4 deletions(-)

diff --git a/engines.py b/engines.py
index a743656..e016d46 100644
--- a/engines.py
+++ b/engines.py
@@ -5,10 +5,17 @@
 import re
 import urllib
 import random
+import time
 from soup_parser import soupify
 from misc import _log
 
 
+def _date_weeks_ago(weeks):
+    """Return the UTC date (YYYY-MM-DD) that lies `weeks` weeks before now."""
+    secs = time.time() - (weeks * 7 * 24 * 3600)
+    return time.strftime('%Y-%m-%d', time.gmtime(secs))
+
+
 def _urlencode(params):
     """URL-encode params dict, handling Unicode strings.
 
@@ -335,6 +342,148 @@ class Ecosia(SearchEngine):
         return '%s?%s' % (self.base_url, _urlencode(params))
 
 
+class Bing(SearchEngine):
+    """Bing search (Microsoft)."""
+
+    name = 'bing'
+    base_url = 'https://www.bing.com/search'
+    rate_limit = 10
+    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['first'] = str(page * 10 + 1)  # page 1 -> first=11
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Yahoo(SearchEngine):
+    """Yahoo search; result links are unwrapped from the /RU= redirect."""
+
+    name = 'yahoo'
+    base_url = 'https://search.yahoo.com/search'
+    rate_limit = 10
+    skip_domains = ['yahoo.com', 'yahooapis.com']
+
+    def build_url(self, query, page=0):
+        params = {'p': query}
+        if page > 0:
+            params['b'] = str(page * 10 + 1)  # page 1 -> b=11
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def _unwrap_url(self, href):
+        """Return the URL carried in a Yahoo /RU= redirect, else href; None on error."""
+        if 'yahoo.com' in href and '/RU=' in href:
+            match = re.search(r'/RU=([^/]+)/', href)
+            if match:
+                try:
+                    return urllib.unquote(match.group(1))
+                except Exception:
+                    return None
+        return href
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+        urignore = urignore or []
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href or not href.startswith('http'):
+                continue
+
+            href = self._unwrap_url(href)
+            if not href:  # _unwrap_url gave None (decode failed)
+                continue
+
+            skip = False
+            for domain in self.skip_domains:
+                if domain in href:
+                    skip = True
+                    break
+            if skip:
+                continue
+
+            for pattern in urignore:
+                if re.search(pattern, href):
+                    skip = True
+                    break
+
+            if not skip and href not in urls:
+                urls.append(href)
+
+        return urls
+
+
+class Gigablast(SearchEngine):
+    """Gigablast search (independent US index)."""
+
+    name = 'gigablast'
+    base_url = 'https://www.gigablast.com/search'
+    rate_limit = 15
+    skip_domains = ['gigablast.com']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['s'] = str(page * 10)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Metager(SearchEngine):
+    """MetaGer search (German, privacy-focused meta search)."""
+
+    name = 'metager'
+    base_url = 'https://metager.org/meta/meta.ger3'
+    rate_limit = 10
+    skip_domains = ['metager.org', 'metager.de']
+
+    def build_url(self, query, page=0):
+        params = {
+            'eingabe': query,
+            'lang': 'en',
+        }
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Swisscows(SearchEngine):
+    """Swisscows search (Swiss, privacy-focused)."""
+
+    name = 'swisscows'
+    base_url = 'https://swisscows.com/web'
+    rate_limit = 10
+    skip_domains = ['swisscows.com']
+
+    def build_url(self, query, page=0):
+        params = {
+            'query': query,
+            'region': 'en-US',
+        }
+        if page > 0:
+            params['offset'] = str(page * 10)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class Alexandria(SearchEngine):
+    """Alexandria search (independent, non-commercial focus)."""
+
+    name = 'alexandria'
+    base_url = 'https://www.alexandria.org/search'
+    rate_limit = 15
+    skip_domains = ['alexandria.org']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['p'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
 class Brave(SearchEngine):
     """Brave Search (privacy-focused, independent index)."""
 
@@ -376,11 +525,15 @@ class GitHub(SearchEngine):
     ]
 
     def build_url(self, query, page=0):
-        # GitHub search for repositories and code
-        search_query = query if query else random.choice(self.github_queries)
+        # GitHub search for repositories and code (pushed in last 2 weeks)
+        base_query = query if query else random.choice(self.github_queries)
+        date_filter = _date_weeks_ago(2)
+        search_query = '%s pushed:>%s' % (base_query, date_filter)
         params = {
             'q': search_query,
             'type': random.choice(['repositories', 'code']),
+            's': 'updated',  # Sort by recently updated
+            'o': 'desc',
         }
         if page > 0:
             params['p'] = str(page + 1)
@@ -459,6 +612,7 @@ class GitLab(SearchEngine):
         params = {
             'search': search_query,
             'scope': 'projects',
+            'sort': 'updated_desc',  # Most recently updated first
         }
         if page > 0:
             params['page'] = str(page + 1)
@@ -547,6 +701,221 @@ class Codeberg(SearchEngine):
         return urls
 
 
+class Bitbucket(SearchEngine):
+    """Bitbucket repository search."""
+
+    name = 'bitbucket'
+    base_url = 'https://bitbucket.org/repo/all'
+    rate_limit = 10
+    relative_base = 'https://bitbucket.org'
+    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']
+
+    def build_url(self, query, page=0):
+        params = {'name': query if query else 'proxy'}
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):  # NOTE: urignore is not applied here
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            if href.startswith('/'):
+                href = self.relative_base + href
+
+            if not href.startswith('http') or 'bitbucket.org' not in href:
+                continue
+
+            skip = False
+            for pattern in self.skip_patterns:
+                if pattern in href:
+                    skip = True
+                    break
+            if skip:
+                continue
+
+            # Keep repo links only: exactly two path segments (/workspace/repo)
+            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
+class Sourcehut(SearchEngine):
+    """Sourcehut (sr.ht) repository search."""
+
+    name = 'sourcehut'
+    base_url = 'https://sr.ht/projects'
+    rate_limit = 10
+    relative_base = 'https://sr.ht'
+
+    def build_url(self, query, page=0):
+        params = {'search': query if query else 'proxy'}
+        if page > 0:
+            params['page'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):  # NOTE: urignore is not applied here
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            if href.startswith('/'):
+                href = self.relative_base + href
+
+            if not href.startswith('http'):
+                continue
+
+            # Keep git.sr.ht repo links (user namespaces start with '~')
+            if 'git.sr.ht/~' in href:
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
+class Pastebin(SearchEngine):
+    """Pastebin search via DuckDuckGo site: query (recent only)."""
+
+    name = 'pastebin'
+    base_url = 'https://html.duckduckgo.com/html/'
+    rate_limit = 10
+
+    # Paste sites to search; also the allow-list applied in extract_urls
+    paste_sites = [
+        'pastebin.com',
+        'paste.ee',
+        'dpaste.org',
+        'hastebin.com',
+        'ghostbin.com',
+        'paste.ubuntu.com',
+        'bpa.st',
+    ]
+
+    def build_url(self, query, page=0):
+        # Search for proxy lists on paste sites (past week only, per 'df' below)
+        site = random.choice(self.paste_sites)  # one site per query
+        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
+        params = {
+            'q': search_query,
+            'df': 'w',  # Past week (DuckDuckGo date filter)
+        }
+        if page > 0:
+            params['s'] = str(page * 30)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):
+        urls = []
+        if not content:
+            return urls
+        urignore = urignore or []
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            # Unwrap DDG redirect (target URL is percent-encoded in 'uddg=')
+            if 'uddg=' in href:
+                match = re.search(r'uddg=([^&]+)', href)
+                if match:
+                    try:
+                        href = urllib.unquote(match.group(1))
+                    except Exception:
+                        continue
+
+            if not href.startswith('http'):
+                continue
+
+            # Only keep paste site links (any configured site, not just the queried one)
+            is_paste = False
+            for site in self.paste_sites:
+                if site in href:
+                    is_paste = True
+                    break
+            if not is_paste:
+                continue
+
+            skip = False
+            for pattern in urignore:
+                if re.search(pattern, href):
+                    skip = True
+                    break
+
+            if not skip and href not in urls:
+                urls.append(href)
+
+        return urls
+
+
+class Rentry(SearchEngine):
+    """Rentry.co/org search (markdown pastebin often used for proxy lists)."""
+
+    name = 'rentry'
+    base_url = 'https://html.duckduckgo.com/html/'
+    rate_limit = 10
+
+    def build_url(self, query, page=0):
+        # Search rentry for proxy lists (past week only, per 'df' below)
+        search_query = 'site:rentry.co OR site:rentry.org %s' % (query if query else 'proxy socks')
+        params = {
+            'q': search_query,
+            'df': 'w',  # Past week (DuckDuckGo date filter)
+        }
+        if page > 0:
+            params['s'] = str(page * 30)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+    def extract_urls(self, content, urignore=None):  # NOTE: urignore is not applied here
+        urls = []
+        if not content:
+            return urls
+
+        soup = soupify(content, nohtml=True)
+
+        for a in _get_body(soup).find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            # Unwrap DDG redirect (target URL is percent-encoded in 'uddg=')
+            if 'uddg=' in href:
+                match = re.search(r'uddg=([^&]+)', href)
+                if match:
+                    try:
+                        href = urllib.unquote(match.group(1))
+                    except Exception:
+                        continue
+
+            if not href.startswith('http'):
+                continue
+
+            # Only keep rentry links
+            if 'rentry.co' in href or 'rentry.org' in href:
+                if href not in urls:
+                    urls.append(href)
+
+        return urls
+
+
 class Gitea(SearchEngine):
     """Generic Gitea instance search (configurable)."""
 
@@ -557,7 +926,8 @@ class Gitea(SearchEngine):
     instances = [
         'https://git.disroot.org',
         'https://git.envs.net',
-        'https://git.sr.ht',
+        'https://gitea.com',
+        'https://try.gitea.io',
     ]
 
     def __init__(self):
@@ -656,22 +1026,86 @@ class Searx(SearchEngine):
         return urls
 
 
+class DuckDuckGoOnion(DuckDuckGo):
+    """DuckDuckGo via Tor hidden service."""
+
+    name = 'duckduckgo_onion'
+    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
+    skip_domains = ['duckduckgo.com', '.onion']
+
+
+class StartpageOnion(Startpage):
+    """Startpage via Tor hidden service."""
+
+    name = 'startpage_onion'
+    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
+    skip_domains = ['startpage.com', '.onion']
+
+
+class BraveOnion(Brave):
+    """Brave Search via Tor hidden service (https here, http on the other .onion engines - confirm)."""
+
+    name = 'brave_onion'
+    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
+    skip_domains = ['brave.com', '.onion']
+
+
+class Ahmia(SearchEngine):
+    """Ahmia dark web search engine (indexes .onion sites)."""
+
+    name = 'ahmia'
+    base_url = 'https://ahmia.fi/search/'
+    rate_limit = 10
+    skip_domains = ['ahmia.fi']
+
+    def build_url(self, query, page=0):
+        params = {'q': query}
+        if page > 0:
+            params['p'] = str(page + 1)
+        return '%s?%s' % (self.base_url, _urlencode(params))
+
+
+class AhmiaOnion(Ahmia):
+    """Ahmia via Tor hidden service."""
+
+    name = 'ahmia_onion'
+    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
+    skip_domains = ['ahmia.fi', '.onion']  # NOTE(review): '.onion' may filter out every Ahmia hit - confirm
+
+
 # Registry of available engines
 ENGINES = {
+    # Major search engines
+    'bing': Bing,
+    'yahoo': Yahoo,
     # Privacy-focused search engines
     'duckduckgo': DuckDuckGo,
+    'duckduckgo_onion': DuckDuckGoOnion,
     'startpage': Startpage,
+    'startpage_onion': StartpageOnion,
     'brave': Brave,
+    'brave_onion': BraveOnion,
+    'ahmia': Ahmia,
+    'ahmia_onion': AhmiaOnion,
     'ecosia': Ecosia,
-    # Regional/non-US search engines
+    'metager': Metager,
+    'swisscows': Swisscows,
+    # Independent/regional search engines
     'mojeek': Mojeek,  # UK
     'qwant': Qwant,  # France
     'yandex': Yandex,  # Russia
+    'gigablast': Gigablast,
+    'alexandria': Alexandria,
     # Git hosting platforms
     'github': GitHub,
     'gitlab': GitLab,
     'codeberg': Codeberg,
     'gitea': Gitea,
+    'bitbucket': Bitbucket,
+    'sourcehut': Sourcehut,
+    # Paste sites
+    'pastebin': Pastebin,
+    'rentry': Rentry,
 }
 
 