engines: add modular search engine abstraction
- SearchEngine base class with build_url, extract_urls, is_rate_limited
- Implementations: DuckDuckGo, Startpage, Mojeek, Qwant, Yandex, Ecosia, Brave
- Git hosters: GitHub, GitLab, Codeberg, Gitea
- Searx wrapper for SearXNG instances
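Example usage (a sketch only: the HTTP fetch layer is not part of this commit, so urllib2 stands in for whatever downloader the caller actually uses; the query string and urignore pattern are illustrative):

    # Sketch, not shipped code: assumes engines.py is importable and that
    # urllib2 is an acceptable stand-in for the project's real fetch layer.
    import urllib2

    from engines import get_engine

    engine = get_engine('duckduckgo')
    search_url = engine.build_url('free proxy list txt', page=0)
    content = urllib2.urlopen(search_url).read()

    if engine.is_rate_limited(content):
        pass  # back off here, or rotate to another engine from ENGINES
    else:
        for url in engine.extract_urls(content, urignore=[r'\.(png|jpg|css)$']):
            print(url)

Searx is the one engine not in the ENGINES registry: it needs per-instance state, so callers construct it directly with an instance URL.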
engines.py (new file, 716 lines)
@@ -0,0 +1,716 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Search engine implementations for proxy list discovery."""

import re
import urllib
import random
from soup_parser import soupify
from misc import _log


class SearchEngine(object):
    """Base class for search engines."""

    name = 'base'
    base_url = ''
    # Rate limiting: requests per minute (0 = no limit)
    rate_limit = 0

    def __init__(self):
        self.last_request = 0

    def build_url(self, query, page=0):
        """Build search URL for query and page number."""
        raise NotImplementedError

    def extract_urls(self, content, urignore=None):
        """Extract result URLs from response content."""
        raise NotImplementedError

    def is_rate_limited(self, content):
        """Check if response indicates rate limiting."""
        if not content:
            return True
        rate_signals = (
            'rate limit', 'too many requests', 'blocked',
            'captcha', 'please verify', 'unusual traffic',
            'access denied', '403', '429',
        )
        content_lower = content.lower()
        for signal in rate_signals:
            if signal in content_lower:
                return True
        return False


class DuckDuckGo(SearchEngine):
    """DuckDuckGo HTML search (no JavaScript required)."""

    name = 'duckduckgo'
    base_url = 'https://html.duckduckgo.com/html/'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # DuckDuckGo uses 's' param for offset (30 results per page)
            params['s'] = str(page * 30)
            params['dc'] = str(page * 30 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        # DuckDuckGo HTML results are in <a class="result__url"> or <a class="result__a">
        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip DuckDuckGo internal links
            if 'duckduckgo.com' in href:
                continue

            # DuckDuckGo wraps URLs - extract actual URL from redirect
            if '/l/?uddg=' in href or 'uddg=' in href:
                match = re.search(r'uddg=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Startpage(SearchEngine):
    """Startpage search (privacy-focused, uses Google results)."""

    name = 'startpage'
    base_url = 'https://www.startpage.com/do/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        params = {
            'query': query,
            'cat': 'web',
            'language': 'english',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Startpage internal links
            if 'startpage.com' in href:
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Mojeek(SearchEngine):
    """Mojeek search (UK-based, independent index)."""

    name = 'mojeek'
    base_url = 'https://www.mojeek.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            # Mojeek uses 's' for start position (10 results per page)
            params['s'] = str(page * 10 + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Mojeek internal links
            if 'mojeek.com' in href or 'mojeek.co.uk' in href:
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Qwant(SearchEngine):
    """Qwant Lite search (French, EU-based, privacy-focused)."""

    name = 'qwant'
    base_url = 'https://lite.qwant.com/'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {
            'q': query,
            't': 'web',
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Qwant internal links
            if 'qwant.com' in href:
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Yandex(SearchEngine):
    """Yandex search (Russian, large independent index)."""

    name = 'yandex'
    base_url = 'https://yandex.com/search/'
    rate_limit = 5

    def build_url(self, query, page=0):
        params = {
            'text': query,
            'lr': '84',  # Worldwide
        }
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Yandex uses redirect URLs, try to extract actual URL
            if '//yandex.' in href:
                # Try to find embedded URL
                match = re.search(r'url=([^&]+)', href)
                if match:
                    try:
                        href = urllib.unquote(match.group(1))
                    except Exception:
                        continue
                else:
                    continue

            if not href.startswith('http'):
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Ecosia(SearchEngine):
    """Ecosia search (German, eco-friendly, uses Bing results)."""

    name = 'ecosia'
    base_url = 'https://www.ecosia.org/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['p'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Ecosia internal links
            if 'ecosia.org' in href:
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class Brave(SearchEngine):
    """Brave Search (privacy-focused, independent index)."""

    name = 'brave'
    base_url = 'https://search.brave.com/search'
    rate_limit = 10

    def build_url(self, query, page=0):
        params = {'q': query}
        if page > 0:
            params['offset'] = str(page)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Skip Brave internal links
            if 'brave.com' in href:
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


class GitHub(SearchEngine):
    """GitHub code/repository search for proxy lists."""

    name = 'github'
    base_url = 'https://github.com/search'
    rate_limit = 5

    # Search terms specific to proxy lists on GitHub
    github_queries = [
        'proxy list',
        'socks5 proxy list',
        'free proxy',
        'proxy scraper',
        'proxy checker',
        'proxies txt',
        'socks4 list',
        'http proxy list',
    ]

    def build_url(self, query, page=0):
        # GitHub search for repositories and code
        search_query = query if query else random.choice(self.github_queries)
        params = {
            'q': search_query,
            'type': random.choice(['repositories', 'code']),
        }
        if page > 0:
            params['p'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://github.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitHub repo/file links
            if 'github.com' not in href:
                continue

            # Skip non-content links
            skip_patterns = [
                '/login', '/signup', '/join', '/settings',
                '/notifications', '/marketplace', '/explore',
                '/sponsors', '/pricing', '/features',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep raw file links and repo links
            if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                # Convert blob to raw for direct access
                if '/blob/' in href:
                    raw_href = href.replace('/blob/', '/raw/')
                    if raw_href not in urls:
                        urls.append(raw_href)

                if href not in urls:
                    urls.append(href)

        return urls


class GitLab(SearchEngine):
    """GitLab search for proxy lists."""

    name = 'gitlab'
    base_url = 'https://gitlab.com/search'
    rate_limit = 5

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy list'
        params = {
            'search': search_query,
            'scope': 'projects',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://gitlab.com' + href

            if not href.startswith('http'):
                continue

            # Only keep GitLab project links
            if 'gitlab.com' not in href:
                continue

            # Skip non-project links
            skip_patterns = [
                '/users/', '/-/', '/explore', '/help',
                '/admin', '/dashboard', '/profile',
            ]
            skip = False
            for pattern in skip_patterns:
                if pattern in href:
                    skip = True
                    break
            if skip:
                continue

            # Keep project and file links
            if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                if href not in urls:
                    urls.append(href)

        return urls


class Codeberg(SearchEngine):
    """Codeberg (Forgejo) search for proxy lists."""

    name = 'codeberg'
    base_url = 'https://codeberg.org/explore/repos'
    rate_limit = 10

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = 'https://codeberg.org' + href

            if not href.startswith('http'):
                continue

            # Only keep Codeberg repo links
            if 'codeberg.org' not in href:
                continue

            # Keep repo links (format: /user/repo)
            if re.match(r'https://codeberg\.org/[^/]+/[^/]+$', href):
                if href not in urls:
                    urls.append(href)

        return urls


class Gitea(SearchEngine):
    """Generic Gitea instance search (configurable)."""

    name = 'gitea'
    rate_limit = 10

    # Public Gitea instances with proxy-related content
    instances = [
        'https://git.disroot.org',
        'https://git.envs.net',
        'https://git.sr.ht',
    ]

    def __init__(self):
        super(Gitea, self).__init__()
        self.current_instance = random.choice(self.instances)

    def build_url(self, query, page=0):
        search_query = query if query else 'proxy'
        params = {
            'q': search_query,
            'sort': 'updated',
        }
        if page > 0:
            params['page'] = str(page + 1)
        return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            href = a.get('href', '')
            if not href:
                continue

            # Convert relative to absolute
            if href.startswith('/'):
                href = self.current_instance + href

            if not href.startswith('http'):
                continue

            # Keep repo links
            if self.current_instance.split('//')[1] in href:
                if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                    if href not in urls:
                        urls.append(href)

        return urls


class Searx(SearchEngine):
    """Searx meta-search engine (uses instances from file)."""

    name = 'searx'
    rate_limit = 0  # Handled by instance tracker

    def __init__(self, instance_url):
        super(Searx, self).__init__()
        self.base_url = instance_url

    def build_url(self, query, page=0):
        params = {
            'q': query,
            'category': 'general',
            'time_range': random.choice(['day', 'week']),
        }
        if page > 0:
            params['pageno'] = str(page + 1)
        return '%s/?%s' % (self.base_url, urllib.urlencode(params))

    def extract_urls(self, content, urignore=None):
        """Extract URLs from Searx results (noreferrer links)."""
        urls = []
        if not content:
            return urls
        urignore = urignore or []

        soup = soupify(content, nohtml=True)

        for a in soup.body.find_all('a'):
            # Searx uses rel="noreferrer" for result links
            rel = a.get('rel', '')
            if not rel or 'noreferrer' not in str(rel):
                continue

            href = a.get('href', '')
            if not href or not href.startswith('http'):
                continue

            # Check urignore patterns
            bad = False
            for pattern in urignore:
                if re.search(pattern, href):
                    bad = True
                    break

            if not bad and href not in urls:
                urls.append(href)

        return urls


# Registry of available engines
ENGINES = {
    # Privacy-focused search engines
    'duckduckgo': DuckDuckGo,
    'startpage': Startpage,
    'brave': Brave,
    'ecosia': Ecosia,
    # Regional/non-US search engines
    'mojeek': Mojeek,  # UK
    'qwant': Qwant,  # France
    'yandex': Yandex,  # Russia
    # Git hosting platforms
    'github': GitHub,
    'gitlab': GitLab,
    'codeberg': Codeberg,
    'gitea': Gitea,
}


def get_engine(name):
    """Get engine instance by name."""
    if name not in ENGINES:
        return None
    return ENGINES[name]()


def get_all_engines():
    """Get instances of all available engines."""
    return [cls() for cls in ENGINES.values()]


def list_engines():
    """List available engine names."""
    return list(ENGINES.keys())