#!/usr/bin/env python2
|
|
# -*- coding: utf-8 -*-
|
|
"""Search engine implementations for proxy list discovery."""
|
|
|
|
import re
|
|
import urllib
|
|
import random
|
|
import time
|
|
from soup_parser import soupify
|
|
from misc import _log
|
|
|
|
|
|
def _date_weeks_ago(weeks):
|
|
"""Return date string (YYYY-MM-DD) for N weeks ago."""
|
|
secs = time.time() - (weeks * 7 * 24 * 3600)
|
|
return time.strftime('%Y-%m-%d', time.gmtime(secs))
|
|
|
|
|
|
def _urlencode(params):
|
|
"""URL-encode params dict, handling Unicode strings.
|
|
|
|
Python 2's urllib.urlencode() expects byte strings. This helper
|
|
encodes any Unicode values to UTF-8 before URL encoding.
|
|
|
|
Args:
|
|
params: Dictionary of query parameters
|
|
|
|
Returns:
|
|
URL-encoded query string
|
|
"""
|
|
encoded = {}
|
|
for k, v in params.items():
|
|
if isinstance(v, unicode):
|
|
v = v.encode('utf-8')
|
|
encoded[k] = v
|
|
return urllib.urlencode(encoded)
|
|
|
|
|
|
def _get_body(soup):
|
|
"""Get body element from soup, handling None case.
|
|
|
|
Args:
|
|
soup: BeautifulSoup or SoupResult object
|
|
|
|
Returns:
|
|
Body element or empty list wrapper if None
|
|
"""
|
|
if soup is None or soup.body is None:
|
|
# Return object with empty find_all to avoid AttributeError
|
|
class EmptyBody:
|
|
def find_all(self, *args, **kwargs):
|
|
return []
|
|
return EmptyBody()
|
|
return soup.body
|
|
|
|
|
|
class SearchEngine(object):
    """Base class for search engines.

    Subclasses must implement build_url() and may override extract_urls()
    when the engine wraps result links (redirects) or needs a whitelist.
    Class attributes below act as declarative configuration consumed by
    the shared extract_urls() pipeline.
    """

    # Engine identifier used in the ENGINES registry.
    name = 'base'
    # Endpoint that build_url() appends a query string to.
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0
    # Domains to skip (engine's own domain)
    skip_domains = []
    # Path patterns to skip (internal pages)
    skip_patterns = []
    # Base URL for relative link conversion (None = skip relative links)
    relative_base = None

    def __init__(self):
        # Timestamp of the last request; 0 means "never requested".
        # NOTE(review): nothing in this file updates it — presumably the
        # caller's rate limiter does; confirm against the fetch loop.
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content.

        Base implementation handles common patterns:
        - Skips empty content
        - Parses HTML with soupify
        - Converts relative URLs if relative_base is set
        - Skips domains in skip_domains
        - Skips paths matching skip_patterns
        - Applies urignore regex patterns
        - Deduplicates results

        Args:
            content: Raw HTML response body (may be empty/None)
            urignore: Optional list of regex patterns; matching hrefs
                are dropped.

        Returns:
            Ordered, de-duplicated list of absolute http(s) URLs.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Handle relative URLs
            if not href.startswith('http'):
                if self.relative_base and href.startswith('/'):
                    href = self.relative_base + href
                else:
                    # Non-http scheme or relative link with no base: drop.
                    continue

            # Skip engine's own domain(s)
            # `skip` is reused by the next two filters; it stays False here
            # whenever we fall through (a True always hits `continue`).
            skip = False
            for domain in self.skip_domains:
                if domain in href:
                    skip = True
                    break
            if skip:
                continue

            # Skip internal paths
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Check urignore patterns
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            # Preserve first-seen order while de-duplicating (O(n) scan).
            if href not in urls:
                urls.append(href)

        return urls

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting.

        An empty/None body is treated as rate-limited (conservative).
        NOTE(review): signals are plain substring matches on the lowered
        body — '403'/'429' anywhere in a page (e.g. in result text) will
        also trigger; acceptable for best-effort scraping.
        """
        if not content:
            return True
        rate_signals = (
            'rate limit', 'too many requests', 'blocked',
            'captcha', 'please verify', 'unusual traffic',
            'access denied', '403', '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False
|
|
|
|
|
|
class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required).

    Uses the html.duckduckgo.com endpoint, whose result links are wrapped
    in a /l/?uddg=<percent-encoded-url> redirect that _unwrap_url() peels.
    """

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10
    skip_domains = ['duckduckgo.com']

    def build_url(self, query, page=0):
        """Build the HTML-endpoint search URL for *query* and *page*."""
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            # 'dc' mirrors the offset as a 1-based result counter.
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from DuckDuckGo redirect wrapper.

        Returns the decoded target for uddg= wrappers, None when decoding
        fails (caller drops the link), or *href* unchanged for plain links
        (and for a wrapper whose uddg value is empty, which falls through).
        """
        if '/l/?uddg=' in href or 'uddg=' in href:
            match = re.search(r'uddg=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping DDG redirects first.

        Overrides the base pipeline because the domain filter must run on
        the *unwrapped* target, not the wrapper URL.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            if not href:
                continue

            # Skip engine's domain
            if 'duckduckgo.com' in href:
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Startpage(SearchEngine):
    """Adapter for Startpage (privacy proxy over Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5
    skip_domains = ['startpage.com']

    def build_url(self, query, page=0):
        """Return the Startpage search URL for *query* at *page* (0-based)."""
        params = {'query': query, 'cat': 'web', 'language': 'english'}
        if page > 0:
            # Startpage pages are 1-based.
            params['page'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Mojeek(SearchEngine):
    """Adapter for Mojeek (UK-based engine with its own index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10
    skip_domains = ['mojeek.com', 'mojeek.co.uk']

    def build_url(self, query, page=0):
        """Return the Mojeek search URL for *query* at *page* (0-based)."""
        params = {'q': query}
        if page > 0:
            # 's' is the 1-based start position; Mojeek serves 10 hits/page.
            params['s'] = str(10 * page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Qwant(SearchEngine):
    """Adapter for Qwant Lite (EU-based, privacy-focused, JS-free UI)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10
    skip_domains = ['qwant.com']

    def build_url(self, query, page=0):
        """Return the Qwant Lite web-search URL for *query* at *page*."""
        params = {'q': query, 't': 'web'}
        if page > 0:
            # Qwant Lite pages are 1-based.
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index).

    Result links may be wrapped in a yandex redirect carrying the target
    in a url= query parameter; _unwrap_url() decodes it.
    """

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5
    skip_domains = ['yandex.com', 'yandex.ru']

    def build_url(self, query, page=0):
        """Build the Yandex search URL; 'p' is the 0-based page number."""
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yandex redirect wrapper.

        Returns the decoded url= target for yandex-hosted links, None for
        yandex links without one (this doubles as the own-domain filter —
        such links are internal navigation), or *href* unchanged otherwise.
        """
        if '//yandex.' in href:
            match = re.search(r'url=([^&]+)', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
            return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping Yandex redirects first."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap redirect URLs
            href = self._unwrap_url(href)
            # Post-unwrap check: decoded targets must still be http(s).
            if not href or not href.startswith('http'):
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Ecosia(SearchEngine):
    """Adapter for Ecosia (German engine backed by Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10
    skip_domains = ['ecosia.org']

    def build_url(self, query, page=0):
        """Return the Ecosia search URL; 'p' is the 0-based page index."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Bing(SearchEngine):
    """Adapter for Microsoft Bing."""

    name = 'bing'
    base_url = 'https://www.bing.com/search'
    rate_limit = 10
    skip_domains = ['bing.com', 'microsoft.com', 'msn.com']

    def build_url(self, query, page=0):
        """Return the Bing search URL; 'first' is the 1-based result offset."""
        params = {'q': query}
        if page > 0:
            # Bing serves 10 results per page.
            params['first'] = str(10 * page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Yahoo(SearchEngine):
    """Yahoo search.

    Result links are wrapped in a redirect of the form .../RU=<encoded>/...
    which _unwrap_url() decodes before filtering.
    """

    name = 'yahoo'
    base_url = 'https://search.yahoo.com/search'
    rate_limit = 10
    skip_domains = ['yahoo.com', 'yahooapis.com']

    def build_url(self, query, page=0):
        """Build the Yahoo search URL; 'b' is the 1-based result offset."""
        params = {'p': query}
        if page > 0:
            # Yahoo serves 10 results per page.
            params['b'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def _unwrap_url(self, href):
        """Extract actual URL from Yahoo redirect wrapper.

        Returns the decoded /RU= target, None when decoding fails, or
        *href* unchanged for non-wrapped links.
        """
        if 'yahoo.com' in href and '/RU=' in href:
            match = re.search(r'/RU=([^/]+)/', href)
            if match:
                try:
                    return urllib.unquote(match.group(1))
                except Exception:
                    return None
        return href

    def extract_urls(self, content, urignore=None):
        """Extract result URLs, unwrapping Yahoo redirects first.

        Domain filtering runs on the unwrapped target, so yahoo-internal
        links (which unwrap to themselves) are still dropped below.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            href = self._unwrap_url(href)
            if not href:
                continue

            skip = False
            for domain in self.skip_domains:
                if domain in href:
                    skip = True
                    break
            if skip:
                continue

            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Gigablast(SearchEngine):
    """Adapter for Gigablast (independent US index)."""

    name = 'gigablast'
    base_url = 'https://www.gigablast.com/search'
    rate_limit = 15
    skip_domains = ['gigablast.com']

    def build_url(self, query, page=0):
        """Return the Gigablast search URL; 's' is the 0-based offset."""
        params = {'q': query}
        if page > 0:
            # 10 results per page.
            params['s'] = str(10 * page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Metager(SearchEngine):
    """Adapter for MetaGer (German privacy-focused meta search)."""

    name = 'metager'
    base_url = 'https://metager.org/meta/meta.ger3'
    rate_limit = 10
    skip_domains = ['metager.org', 'metager.de']

    def build_url(self, query, page=0):
        """Return the MetaGer search URL for *query* at *page* (0-based)."""
        params = {'eingabe': query, 'lang': 'en'}
        if page > 0:
            # MetaGer pages are 1-based.
            params['page'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Swisscows(SearchEngine):
    """Adapter for Swisscows (Swiss, privacy-focused)."""

    name = 'swisscows'
    base_url = 'https://swisscows.com/web'
    rate_limit = 10
    skip_domains = ['swisscows.com']

    def build_url(self, query, page=0):
        """Return the Swisscows web-search URL; 'offset' counts results."""
        params = {'query': query, 'region': 'en-US'}
        if page > 0:
            # 10 results per page.
            params['offset'] = str(10 * page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Alexandria(SearchEngine):
    """Adapter for Alexandria (independent, non-commercial focus)."""

    name = 'alexandria'
    base_url = 'https://www.alexandria.org/search'
    rate_limit = 15
    skip_domains = ['alexandria.org']

    def build_url(self, query, page=0):
        """Return the Alexandria search URL; 'p' is the 1-based page."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class Brave(SearchEngine):
    """Adapter for Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10
    skip_domains = ['brave.com']

    def build_url(self, query, page=0):
        """Return the Brave Search URL; 'offset' is the 0-based page."""
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists.

    Searches recently-pushed repos/code and yields repo root URLs plus
    file URLs; blob links are additionally duplicated as raw links so
    downstream fetchers can grab the file contents directly.
    """

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5
    relative_base = 'https://github.com'
    skip_patterns = [
        '/login', '/signup', '/join', '/settings',
        '/notifications', '/marketplace', '/explore',
        '/sponsors', '/pricing', '/features',
    ]

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        """Build a GitHub search URL (non-deterministic: random type/query).

        Falls back to a random entry of github_queries when *query* is
        empty, and randomly targets repository or code search.
        """
        # GitHub search for repositories and code (pushed in last 2 weeks)
        base_query = query if query else random.choice(self.github_queries)
        date_filter = _date_weeks_ago(2)
        search_query = '%s pushed:>%s' % (base_query, date_filter)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
            's': 'updated',  # Sort by recently updated
            'o': 'desc',
        }
        if page > 0:
            # GitHub pagination is 1-based.
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs with blob-to-raw conversion for direct file access.

        Keeps only github.com links that are /raw/ files, /blob/ files, or
        bare user/repo roots; each /blob/ link is emitted twice — once
        rewritten to /raw/ and once as-is.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http'):
                continue

            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue

            # Skip internal pages
            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Check urignore patterns
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break
            if skip:
                continue

            # Keep raw file links and repo links
            if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls:
                        urls.append(raw_href)

                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5
    relative_base = 'https://gitlab.com'
    skip_patterns = [
        '/users/', '/-/', '/explore', '/help',
        '/admin', '/dashboard', '/profile',
    ]

    def build_url(self, query, page=0):
        """Build a GitLab project-search URL, newest updates first."""
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
            'sort': 'updated_desc',  # Most recently updated first
        }
        if page > 0:
            # GitLab pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract project URLs only (whitelist pattern).

        NOTE(review): *urignore* is accepted for interface parity but not
        applied here — presumably intentional since only gitlab.com
        project roots are kept; confirm with callers.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'gitlab.com' not in href:
                continue

            # Skip internal pages
            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep only project links
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10
    relative_base = 'https://codeberg.org'

    def build_url(self, query, page=0):
        """Build a Codeberg repo-explore URL sorted by recent updates."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            # Forgejo pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs only (whitelist pattern).

        Keeps only bare https://codeberg.org/<user>/<repo> links; the
        *urignore* parameter is unused here (whitelist already narrow).
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'codeberg.org' not in href:
                continue

            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Bitbucket(SearchEngine):
    """Bitbucket repository search."""

    name = 'bitbucket'
    base_url = 'https://bitbucket.org/repo/all'
    rate_limit = 10
    relative_base = 'https://bitbucket.org'
    skip_patterns = ['/account/', '/dashboard/', '/support/', '/-/']

    def build_url(self, query, page=0):
        """Build a Bitbucket repo-listing URL filtered by repo name."""
        params = {'name': query if query else 'proxy'}
        if page > 0:
            # Bitbucket pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract workspace/repo root URLs (whitelist pattern).

        Only bare https://bitbucket.org/<workspace>/<repo> links survive;
        *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http') or 'bitbucket.org' not in href:
                continue

            skip = False
            for pattern in self.skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep repo links (format: /workspace/repo)
            if re.match(r'https://bitbucket\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Sourcehut(SearchEngine):
    """Sourcehut (sr.ht) repository search."""

    name = 'sourcehut'
    base_url = 'https://sr.ht/projects'
    rate_limit = 10
    relative_base = 'https://sr.ht'

    def build_url(self, query, page=0):
        """Build a sr.ht project-search URL for *query* at *page*."""
        params = {'search': query if query else 'proxy'}
        if page > 0:
            # Sourcehut pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract git.sr.ht repository URLs (whitelist pattern).

        Only links containing 'git.sr.ht/~' (user-owned repos) are kept;
        *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            if href.startswith('/'):
                href = self.relative_base + href

            if not href.startswith('http'):
                continue

            # Keep git.sr.ht repo links
            if 'git.sr.ht/~' in href:
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Pastebin(SearchEngine):
    """Pastebin search via DuckDuckGo site: query (recent only).

    Piggybacks on the DDG HTML endpoint with a site: filter on a
    randomly chosen paste host, then whitelists paste-site links.
    """

    name = 'pastebin'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    # Pastebin sites to search
    paste_sites = [
        'pastebin.com',
        'paste.ee',
        'dpaste.org',
        'hastebin.com',
        'ghostbin.com',
        'paste.ubuntu.com',
        'bpa.st',
    ]

    def build_url(self, query, page=0):
        """Build a DDG site:-scoped URL (non-deterministic: random site)."""
        # Search for proxy lists on paste sites (last 2 weeks only)
        site = random.choice(self.paste_sites)
        search_query = 'site:%s %s' % (site, query if query else 'proxy list')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            # DDG HTML endpoint: 30 results per page via 's' offset.
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract paste-site URLs from DDG results.

        Unwraps uddg= redirects inline (a decode failure skips the link),
        keeps only hosts in paste_sites, then applies *urignore*.
        """
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Only keep paste site links
            is_paste = False
            for site in self.paste_sites:
                if site in href:
                    is_paste = True
                    break
            if not is_paste:
                continue

            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class Rentry(SearchEngine):
    """Rentry.co/org search (markdown pastebin often used for proxy lists).

    Like Pastebin, piggybacks on the DDG HTML endpoint with site: filters.
    """

    name = 'rentry'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        """Build a DDG URL scoped to rentry.co/rentry.org."""
        # Search rentry for proxy lists (last 2 weeks only)
        search_query = 'site:rentry.co OR site:rentry.org %s' % (query if query else 'proxy socks')
        params = {
            'q': search_query,
            'df': 'w',  # Past week (DuckDuckGo date filter)
        }
        if page > 0:
            # DDG HTML endpoint: 30 results per page via 's' offset.
            params['s'] = str(page * 30)
        return '%s?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract rentry links from DDG results (whitelist pattern).

        Unwraps uddg= redirects inline; *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Unwrap DDG redirect
            if 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            if not href.startswith('http'):
                continue

            # Only keep rentry links
            if 'rentry.co' in href or 'rentry.org' in href:
                if href not in urls:
                    urls.append(href)

        return urls
|
|
|
|
|
|
class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable).

    Each instance of this class pins one randomly chosen public Gitea
    host for both URL building and result filtering.
    """

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://gitea.com',
        'https://try.gitea.io',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        # Chosen once per object; extract_urls() filters against it.
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        """Build an explore/repos URL on the pinned instance."""
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            # Gitea pagination is 1-based.
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract repo URLs for current dynamic instance.

        Keeps only <scheme>://<host>/<owner>/<repo> links on the pinned
        instance's domain; *urignore* is unused here.
        """
        urls = []
        if not content:
            return urls

        soup = soupify(content, nohtml=True)
        # Strip the scheme ('https://...') to get the bare host for matching.
        instance_domain = self.current_instance.split('//')[1]

        for a in _get_body(soup).find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href

            if not href.startswith('http'):
                continue

            # Keep repo links for this instance
            if instance_domain in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if href not in urls:
                        urls.append(href)

        return urls
|
|
|
|
|
|
class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file).

    Unlike the other engines, the instance URL is injected per object
    rather than declared as a class attribute.
    """

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        # Root URL of the specific Searx instance to query.
        self.base_url = instance_url

    def build_url(self, query, page=0):
        """Build a Searx query URL (non-deterministic: random time range)."""
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            # Searx pagination is 1-based.
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, _urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links only)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in _get_body(soup).find_all('a'):
            # Searx uses rel="noreferrer" for result links
            # (str() because parsers may return rel as a list of tokens).
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue

            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Check urignore patterns
            skip = False
            for pattern in urignore:
                if re.search(pattern, href):
                    skip = True
                    break

            if not skip and href not in urls:
                urls.append(href)

        return urls
|
|
|
|
|
|
class DuckDuckGoOnion(DuckDuckGo):
    """DuckDuckGo via Tor hidden service.

    Inherits build/extract logic; only the endpoint and the own-domain
    filter (which also drops .onion results) change.
    """

    name = 'duckduckgo_onion'
    base_url = 'http://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/html/'
    skip_domains = ['duckduckgo.com', '.onion']
|
|
|
|
|
|
class StartpageOnion(Startpage):
    """Startpage via Tor hidden service.

    Same query interface as Startpage; .onion results are filtered out.
    """

    name = 'startpage_onion'
    base_url = 'http://startpagel6srwcjlue4zgq3zevrujfaow726kjytqbbjyrswwmjzcqd.onion/do/search'
    skip_domains = ['startpage.com', '.onion']
|
|
|
|
|
|
class BraveOnion(Brave):
    """Brave Search via Tor hidden service.

    Same query interface as Brave; .onion results are filtered out.
    """

    name = 'brave_onion'
    base_url = 'https://search.brave4u7jddbv7cyviptqjc7jusxh72uik7zt6adtckl5f4nwy2v72qd.onion/search'
    skip_domains = ['brave.com', '.onion']
|
|
|
|
|
|
class Ahmia(SearchEngine):
    """Adapter for Ahmia, a search engine indexing .onion sites."""

    name = 'ahmia'
    base_url = 'https://ahmia.fi/search/'
    rate_limit = 10
    skip_domains = ['ahmia.fi']

    def build_url(self, query, page=0):
        """Return the Ahmia search URL; 'p' is the 1-based page."""
        params = {'q': query}
        if page > 0:
            params['p'] = str(page + 1)
        return self.base_url + '?' + _urlencode(params)
|
|
|
|
|
|
class AhmiaOnion(Ahmia):
    """Ahmia via Tor hidden service.

    Same query interface as Ahmia; .onion results are filtered out.
    """

    name = 'ahmia_onion'
    base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/'
    skip_domains = ['ahmia.fi', '.onion']
|
|
|
|
|
|
# Registry of available engines
# Maps engine name -> class; get_engine()/get_all_engines() instantiate
# from here.  Note: Searx is intentionally absent (it needs an
# instance_url constructor argument).
ENGINES = {
    # Major search engines
    'bing': Bing,
    'yahoo': Yahoo,
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'duckduckgo_onion': DuckDuckGoOnion,
    'startpage': Startpage,
    'startpage_onion': StartpageOnion,
    'brave': Brave,
    'brave_onion': BraveOnion,
    'ahmia': Ahmia,
    'ahmia_onion': AhmiaOnion,
    'ecosia': Ecosia,
    'metager': Metager,
    'swisscows': Swisscows,
    # Independent/regional search engines
    'mojeek': Mojeek,  # UK
    'qwant': Qwant,  # France
    'yandex': Yandex,  # Russia
    'gigablast': Gigablast,
    'alexandria': Alexandria,
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
    'bitbucket': Bitbucket,
    'sourcehut': Sourcehut,
    # Paste sites
    'pastebin': Pastebin,
    'rentry': Rentry,
}
|
|
|
|
|
|
def get_engine(name):
    """Instantiate the engine registered under *name* (None if unknown)."""
    engine_cls = ENGINES.get(name)
    if engine_cls is None:
        return None
    return engine_cls()
|
|
|
|
|
|
def get_all_engines():
    """Instantiate and return every engine in the registry."""
    engines = []
    for engine_cls in ENGINES.values():
        engines.append(engine_cls())
    return engines
|
|
|
|
|
|
def list_engines():
    """Return the names of all registered engines."""
    return [engine_name for engine_name in ENGINES]
|