engines: add Bing and Yahoo search engines
This commit is contained in:
442
engines.py
442
engines.py
@@ -5,10 +5,17 @@
|
||||
import re
|
||||
import urllib
|
||||
import random
|
||||
import time
|
||||
from soup_parser import soupify
|
||||
from misc import _log
|
||||
|
||||
|
||||
def _date_weeks_ago(weeks):
|
||||
"""Return date string (YYYY-MM-DD) for N weeks ago."""
|
||||
secs = time.time() - (weeks * 7 * 24 * 3600)
|
||||
return time.strftime('%Y-%m-%d', time.gmtime(secs))
|
||||
|
||||
|
||||
def _urlencode(params):
|
||||
"""URL-encode params dict, handling Unicode strings.
|
||||
|
||||
@@ -335,6 +342,148 @@ class Ecosia(SearchEngine):
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Bing(SearchEngine):
    """Bing search (Microsoft)."""

    name = 'bing'
    base_url = 'https://www.bing.com/search'
    rate_limit = 10
    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']

    def build_url(self, query, page=0):
        """Return the Bing search URL for *query* at result *page*."""
        params = {'q': query}
        if page > 0:
            # Bing's 'first' parameter is the 1-based index of the
            # first result on the page: page N starts at N*10 + 1.
            params['first'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Yahoo(SearchEngine):
    """Yahoo search."""

    name = 'yahoo'
    base_url = 'https://search.yahoo.com/search'
    rate_limit = 10
    skip_domains = ['yahoo.com', 'yahooapis.com']

    def build_url(self, query, page=0):
        """Return the Yahoo search URL for *query* at result *page*."""
        params = {'p': query}
        if page > 0:
            # Yahoo's 'b' parameter is the 1-based index of the first
            # result on the page.
            params['b'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yahoo redirect wrapper.

        Returns the href unchanged when it is not a redirect, or None
        when the wrapped URL cannot be decoded.
        """
        if 'yahoo.com' not in href or '/RU=' not in href:
            return href
        wrapped = re.search(r'/RU=([^/]+)/', href)
        if not wrapped:
            return href
        try:
            return urllib.unquote(wrapped.group(1))
        except Exception:
            return None

    def extract_urls(self, content, urignore=None):
        """Collect result URLs from a Yahoo results page.

        Skips Yahoo's own domains and any href matching a regex in
        *urignore*; preserves first-seen order without duplicates.
        """
        if not content:
            return []
        urignore = urignore or []
        urls = []

        soup = soupify(content, nohtml=True)

        for anchor in _get_body(soup).find_all('a'):
            href = anchor.get('href', '')
            if not href.startswith('http'):
                continue

            href = self._unwrap_url(href)
            if not href:
                continue

            if any(domain in href for domain in self.skip_domains):
                continue

            if any(re.search(pattern, href) for pattern in urignore):
                continue

            if href not in urls:
                urls.append(href)

        return urls
|
||||
|
||||
|
||||
class Gigablast(SearchEngine):
    """Gigablast search (independent US index)."""

    name = 'gigablast'
    base_url = 'https://www.gigablast.com/search'
    rate_limit = 15
    skip_domains = ['gigablast.com']

    def build_url(self, query, page=0):
        """Return the Gigablast search URL for *query* at result *page*."""
        params = {'q': query}
        if page > 0:
            # Gigablast's 's' parameter is a 0-based result offset.
            params['s'] = str(page * 10)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Metager(SearchEngine):
    """MetaGer search (German, privacy-focused meta search)."""

    name = 'metager'
    base_url = 'https://metager.org/meta/meta.ger3'
    rate_limit = 10
    skip_domains = ['metager.org', 'metager.de']

    def build_url(self, query, page=0):
        """Return the MetaGer search URL for *query* at result *page*."""
        params = {'eingabe': query, 'lang': 'en'}
        if page > 0:
            # MetaGer page numbers are 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Swisscows(SearchEngine):
    """Swisscows search (Swiss, privacy-focused)."""

    name = 'swisscows'
    base_url = 'https://swisscows.com/web'
    rate_limit = 10
    skip_domains = ['swisscows.com']

    def build_url(self, query, page=0):
        """Return the Swisscows search URL for *query* at result *page*."""
        params = {'query': query, 'region': 'en-US'}
        if page > 0:
            # Swisscows paginates with a 0-based result offset.
            params['offset'] = str(page * 10)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Alexandria(SearchEngine):
    """Alexandria search (independent, non-commercial focus)."""

    name = 'alexandria'
    base_url = 'https://www.alexandria.org/search'
    rate_limit = 15
    skip_domains = ['alexandria.org']

    def build_url(self, query, page=0):
        """Return the Alexandria search URL for *query* at result *page*."""
        params = {'q': query}
        if page > 0:
            # Alexandria page numbers are 1-based.
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Brave(SearchEngine):
|
||||
"""Brave Search (privacy-focused, independent index)."""
|
||||
|
||||
@@ -376,11 +525,15 @@ class GitHub(SearchEngine):
|
||||
]
|
||||
|
||||
def build_url(self, query, page=0):
|
||||
# GitHub search for repositories and code
|
||||
search_query = query if query else random.choice(self.github_queries)
|
||||
# GitHub search for repositories and code (pushed in last 2 weeks)
|
||||
base_query = query if query else random.choice(self.github_queries)
|
||||
date_filter = _date_weeks_ago(2)
|
||||
search_query = '%s pushed:>%s' % (base_query, date_filter)
|
||||
params = {
|
||||
'q': search_query,
|
||||
'type': random.choice(['repositories', 'code']),
|
||||
's': 'updated', # Sort by recently updated
|
||||
'o': 'desc',
|
||||
}
|
||||
if page > 0:
|
||||
params['p'] = str(page + 1)
|
||||
@@ -459,6 +612,7 @@ class GitLab(SearchEngine):
|
||||
params = {
|
||||
'search': search_query,
|
||||
'scope': 'projects',
|
||||
'sort': 'updated_desc', # Most recently updated first
|
||||
}
|
||||
if page > 0:
|
||||
params['page'] = str(page + 1)
|
||||
@@ -547,6 +701,221 @@ class Codeberg(SearchEngine):
|
||||
return urls
|
||||
|
||||
|
||||
class Bitbucket(SearchEngine):
    """Bitbucket repository search.

    Scrapes the public repository listing and keeps only links of the
    canonical repository form ``https://bitbucket.org/<workspace>/<repo>``.
    """

    name = 'bitbucket'
    base_url = 'https://bitbucket.org/repo/all'
    rate_limit = 10
    relative_base = 'https://bitbucket.org'
    # Navigation/service paths that must never be treated as results.
    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']

    def build_url(self, query, page=0):
        """Return the Bitbucket repository listing URL for *query*.

        Falls back to the 'proxy' keyword when *query* is empty; *page*
        maps to Bitbucket's 1-based ``page`` parameter.
        """
        params = {'name': query if query else 'proxy'}
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect repository URLs from a Bitbucket listing page.

        Bug fix: the *urignore* regex list was accepted but silently
        ignored; it is now applied like in the other engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Resolve site-relative links against the Bitbucket host.
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'bitbucket.org' not in href:
                continue

            # Drop navigation/service pages.
            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Apply caller-supplied ignore patterns (previously unused).
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            # Keep repo links (format: /workspace/repo)
            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
||||
|
||||
|
||||
class Sourcehut(SearchEngine):
    """Sourcehut (sr.ht) repository search."""

    name = 'sourcehut'
    base_url = 'https://sr.ht/projects'
    rate_limit = 10
    relative_base = 'https://sr.ht'

    def build_url(self, query, page=0):
        """Return the sr.ht project search URL for *query*.

        Falls back to the 'proxy' keyword when *query* is empty; *page*
        maps to sr.ht's 1-based ``page`` parameter.
        """
        params = {'search': query if query else 'proxy'}
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect git.sr.ht repository URLs from a project listing.

        Bug fix: the *urignore* regex list was accepted but never
        applied; it is now honoured like in the other engines.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Resolve site-relative links against the sr.ht host.
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http'):
                continue

            # Keep git.sr.ht repo links
            if 'git.sr.ht/~' not in href:
                continue

            # Apply caller-supplied ignore patterns (previously unused).
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            if href not in urls:
                urls.append(href)

        return urls
|
||||
|
||||
|
||||
class Pastebin(SearchEngine):
    """Pastebin search via DuckDuckGo site: query (recent only)."""

    name = 'pastebin'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    # Pastebin sites to search
    paste_sites = [
        'pastebin.com',
        'paste.ee',
        'dpaste.org',
        'hastebin.com',
        'ghostbin.com',
        'paste.ubuntu.com',
        'bpa.st',
    ]

    def build_url(self, query, page=0):
        """Return a DuckDuckGo URL searching one random paste site.

        Results are limited to the past week via DDG's 'df' filter.
        """
        site = random.choice(self.paste_sites)
        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            # DDG HTML endpoint paginates with a result offset of 30.
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect paste-site URLs from a DuckDuckGo HTML results page."""
        if not content:
            return []
        urignore = urignore or []
        urls = []

        soup = soupify(content, nohtml=True)

        for anchor in _get_body(soup).find_all('a'):
            href = anchor.get('href', '')
            if not href:
                continue

            # DDG wraps each result in a redirect; unwrap the 'uddg='
            # parameter when present.
            if 'uddg=' in href:
                wrapped = re.search(r'uddg=([^&]+)', href)
                if wrapped:
                    try:
                        href = urllib.unquote(wrapped.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Accept only links pointing at a known paste site.
            if not any(site in href for site in self.paste_sites):
                continue

            if any(re.search(pattern, href) for pattern in urignore):
                continue

            if href not in urls:
                urls.append(href)

        return urls
|
||||
|
||||
|
||||
class Rentry(SearchEngine):
    """Rentry.co/org search (markdown pastebin often used for proxy lists)."""

    name = 'rentry'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Return a DuckDuckGo URL searching rentry.co / rentry.org.

        Results are limited to the past week via DDG's 'df' filter.
        """
        search_query = 'site:rentry.co OR site:rentry.org %s' % (query if query else 'proxy socks')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Collect rentry links from a DuckDuckGo HTML results page.

        Bug fix: the *urignore* regex list was accepted but never
        applied; it is now honoured like in Pastebin/Yahoo.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Only keep rentry links
            if 'rentry.co' not in href and 'rentry.org' not in href:
                continue

            # Apply caller-supplied ignore patterns (previously unused).
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            if href not in urls:
                urls.append(href)

        return urls
|
||||
|
||||
|
||||
class Gitea(SearchEngine):
|
||||
"""Generic Gitea instance search (configurable)."""
|
||||
|
||||
@@ -557,7 +926,8 @@ class Gitea(SearchEngine):
|
||||
instances = [
|
||||
'https://git.disroot.org',
|
||||
'https://git.envs.net',
|
||||
'https://git.sr.ht',
|
||||
'https://gitea.com',
|
||||
'https://try.gitea.io',
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
@@ -656,22 +1026,86 @@ class Searx(SearchEngine):
|
||||
return urls
|
||||
|
||||
|
||||
class DuckDuckGoOnion(DuckDuckGo):
    """DuckDuckGo via Tor hidden service.

    Overrides only the endpoint and skip list; everything else is
    inherited from DuckDuckGo.
    """

    name = 'duckduckgo_onion'
    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
    # Skip DDG's own domain and any .onion result links.
    skip_domains = ['duckduckgo.com', '.onion']
|
||||
|
||||
|
||||
class StartpageOnion(Startpage):
    """Startpage via Tor hidden service.

    Overrides only the endpoint and skip list; everything else is
    inherited from Startpage.
    """

    name = 'startpage_onion'
    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
    # Skip Startpage's own domain and any .onion result links.
    skip_domains = ['startpage.com', '.onion']
|
||||
|
||||
|
||||
class BraveOnion(Brave):
    """Brave Search via Tor hidden service.

    Overrides only the endpoint and skip list; everything else is
    inherited from Brave.
    """

    name = 'brave_onion'
    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
    # Skip Brave's own domain and any .onion result links.
    skip_domains = ['brave.com', '.onion']
|
||||
|
||||
|
||||
class Ahmia(SearchEngine):
    """Ahmia dark web search engine (indexes .onion sites)."""

    name = 'ahmia'
    base_url = 'https://ahmia.fi/search/'
    rate_limit = 10
    skip_domains = ['ahmia.fi']

    def build_url(self, query, page=0):
        """Return the Ahmia search URL for *query* at result *page*."""
        params = {'q': query}
        if page > 0:
            # Ahmia page numbers are 1-based.
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class AhmiaOnion(Ahmia):
    """Ahmia via Tor hidden service.

    Overrides only the endpoint and skip list; everything else is
    inherited from Ahmia.
    """

    name = 'ahmia_onion'
    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
    # Skip Ahmia's clearnet domain and any .onion result links.
    skip_domains = ['ahmia.fi', '.onion']
|
||||
|
||||
|
||||
# Registry of available engines.
# Maps an engine's string name to its class; entries are grouped by
# the kind of source they scrape.
ENGINES = {
    # Major search engines
    'bing': Bing,
    'yahoo': Yahoo,
    # Privacy-focused search engines (clearnet and .onion endpoints)
    'duckduckgo': DuckDuckGo,
    'duckduckgo_onion': DuckDuckGoOnion,
    'startpage': Startpage,
    'startpage_onion': StartpageOnion,
    'brave': Brave,
    'brave_onion': BraveOnion,
    'ahmia': Ahmia,
    'ahmia_onion': AhmiaOnion,
    'ecosia': Ecosia,
    # Regional/non-US search engines
    'metager': Metager,
    'swisscows': Swisscows,
    # Independent/regional search engines
    'mojeek': Mojeek,  # UK
    'qwant': Qwant,  # France
    'yandex': Yandex,  # Russia
    'gigablast': Gigablast,
    'alexandria': Alexandria,
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
    'bitbucket': Bitbucket,
    'sourcehut': Sourcehut,
    # Paste sites
    'pastebin': Pastebin,
    'rentry': Rentry,
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user