From ce79ef7d7f6f1998127004f0d7d0fb786f2b7628 Mon Sep 17 00:00:00 2001 From: Username Date: Sat, 20 Dec 2025 22:50:46 +0100 Subject: [PATCH] engines: consolidate extract_urls with base class method --- engines.py | 364 ++++++++++++++++++++++------------------------------- 1 file changed, 151 insertions(+), 213 deletions(-) diff --git a/engines.py b/engines.py index 47c1ba5..4353e9f 100644 --- a/engines.py +++ b/engines.py @@ -16,6 +16,12 @@ class SearchEngine(object): base_url = '' # Rate limiting: requests per minute (0 = no limit) rate_limit = 0 + # Domains to skip (engine's own domain) + skip_domains = [] + # Path patterns to skip (internal pages) + skip_patterns = [] + # Base URL for relative link conversion (None = skip relative links) + relative_base = None def __init__(self): self.last_request = 0 @@ -25,8 +31,65 @@ class SearchEngine(object): raise NotImplementedError def extract_urls(self, content, urignore=None): - """Extract result URLs from response content.""" - raise NotImplementedError + """Extract result URLs from response content. + + Base implementation handles common patterns: + - Skips empty content + - Parses HTML with soupify + - Converts relative URLs if relative_base is set + - Skips domains in skip_domains + - Skips paths matching skip_patterns + - Applies urignore regex patterns + - Deduplicates results + """ + urls = [] + if not content: + return urls + urignore = urignore or [] + + soup = soupify(content, nohtml=True) + + for a in soup.body.find_all('a'): + href = a.get('href', '') + if not href: + continue + + # Handle relative URLs + if not href.startswith('http'): + if self.relative_base and href.startswith('/'): + href = self.relative_base + href + else: + continue + + # Skip engine's own domain(s) + skip = False + for domain in self.skip_domains: + if domain in href: + skip = True + break + if skip: + continue + + # Skip internal paths + for pattern in self.skip_patterns: + if pattern in href: + skip = True + break + if skip: + continue + + # Check urignore patterns + for pattern in urignore: + if re.search(pattern, href): + skip = True + break + if skip: + continue + + if href not in urls: + urls.append(href) + + return urls def is_rate_limited(self, content): """Check if response indicates rate limiting.""" @@ -50,6 +113,7 @@ class DuckDuckGo(SearchEngine): name = 'duckduckgo' base_url = 'https://html.duckduckgo.com/html/' rate_limit = 10 + skip_domains = ['duckduckgo.com'] def build_url(self, query, page=0): params = {'q': query} @@ -59,6 +123,17 @@ class DuckDuckGo(SearchEngine): params['dc'] = str(page * 30 + 1) return '%s?%s' % (self.base_url, urllib.urlencode(params)) + def _unwrap_url(self, href): + """Extract actual URL from DuckDuckGo redirect wrapper.""" + if '/l/?uddg=' in href or 'uddg=' in href: + match = re.search(r'uddg=([^&]+)', href) + if match: + try: + return urllib.unquote(match.group(1)) + except Exception: + return None + return href + def extract_urls(self, content, urignore=None): urls = [] if not content: @@ -67,33 +142,28 @@ class DuckDuckGo(SearchEngine): soup = soupify(content, nohtml=True) - # DuckDuckGo HTML results are in or for a in soup.body.find_all('a'): href = a.get('href', '') if not href or not href.startswith('http'): continue - # Skip DuckDuckGo internal links + # Unwrap redirect URLs + href = self._unwrap_url(href) + if not href: + continue + + # Skip engine's domain if 'duckduckgo.com' in href: continue - # DuckDuckGo wraps URLs - extract actual URL from redirect - if '/l/?uddg=' in href or 'uddg=' in href: - match = re.search(r'uddg=([^&]+)', href) - if match: - try: - href = urllib.unquote(match.group(1)) - except Exception: - continue - # Check urignore patterns - bad = False + skip = False for pattern in urignore: if re.search(pattern, href): - bad = True + skip = True break - if not bad and href not in urls: + if not skip and href not in urls: urls.append(href) return urls @@ -105,6 +175,7 @@ class Startpage(SearchEngine): name = 'startpage' base_url = 'https://www.startpage.com/do/search' rate_limit = 5 + skip_domains = ['startpage.com'] def build_url(self, query, page=0): params = { @@ -116,35 +187,6 @@ class Startpage(SearchEngine): params['page'] = str(page + 1) return '%s?%s' % (self.base_url, urllib.urlencode(params)) - def extract_urls(self, content, urignore=None): - urls = [] - if not content: - return urls - urignore = urignore or [] - - soup = soupify(content, nohtml=True) - - for a in soup.body.find_all('a'): - href = a.get('href', '') - if not href or not href.startswith('http'): - continue - - # Skip Startpage internal links - if 'startpage.com' in href: - continue - - # Check urignore patterns - bad = False - for pattern in urignore: - if re.search(pattern, href): - bad = True - break - - if not bad and href not in urls: - urls.append(href) - - return urls - class Mojeek(SearchEngine): """Mojeek search (UK-based, independent index).""" @@ -152,6 +194,7 @@ class Mojeek(SearchEngine): name = 'mojeek' base_url = 'https://www.mojeek.com/search' rate_limit = 10 + skip_domains = ['mojeek.com', 'mojeek.co.uk'] def build_url(self, query, page=0): params = {'q': query} @@ -160,35 +203,6 @@ class Mojeek(SearchEngine): params['s'] = str(page * 10 + 1) return '%s?%s' % (self.base_url, urllib.urlencode(params)) - def extract_urls(self, content, urignore=None): - urls = [] - if not content: - return urls - urignore = urignore or [] - - soup = soupify(content, nohtml=True) - - for a in soup.body.find_all('a'): - href = a.get('href', '') - if not href or not href.startswith('http'): - continue - - # Skip Mojeek internal links - if 'mojeek.com' in href or 'mojeek.co.uk' in href: - continue - - # Check urignore patterns - bad = False - for pattern in urignore: - if re.search(pattern, href): - bad = True - break - - if not bad and href not in urls: - urls.append(href) - - return urls - class Qwant(SearchEngine): """Qwant Lite search (French, EU-based, privacy-focused).""" @@ -196,6 +210,7 @@ class Qwant(SearchEngine): name = 'qwant' base_url = 'https://lite.qwant.com/' rate_limit = 10 + skip_domains = ['qwant.com'] def build_url(self, query, page=0): params = { @@ -206,35 +221,6 @@ class Qwant(SearchEngine): params['p'] = str(page + 1) return '%s?%s' % (self.base_url, urllib.urlencode(params)) - def extract_urls(self, content, urignore=None): - urls = [] - if not content: - return urls - urignore = urignore or [] - - soup = soupify(content, nohtml=True) - - for a in soup.body.find_all('a'): - href = a.get('href', '') - if not href or not href.startswith('http'): - continue - - # Skip Qwant internal links - if 'qwant.com' in href: - continue - - # Check urignore patterns - bad = False - for pattern in urignore: - if re.search(pattern, href): - bad = True - break - - if not bad and href not in urls: - urls.append(href) - - return urls - class Yandex(SearchEngine): """Yandex search (Russian, large independent index).""" @@ -242,6 +228,7 @@ class Yandex(SearchEngine): name = 'yandex' base_url = 'https://yandex.com/search/' rate_limit = 5 + skip_domains = ['yandex.com', 'yandex.ru'] def build_url(self, query, page=0): params = { @@ -252,6 +239,18 @@ class Yandex(SearchEngine): params['p'] = str(page) return '%s?%s' % (self.base_url, urllib.urlencode(params)) + def _unwrap_url(self, href): + """Extract actual URL from Yandex redirect wrapper.""" + if '//yandex.' in href: + match = re.search(r'url=([^&]+)', href) + if match: + try: + return urllib.unquote(match.group(1)) + except Exception: + return None + return None + return href + def extract_urls(self, content, urignore=None): urls = [] if not content: @@ -265,29 +264,19 @@ class Yandex(SearchEngine): if not href: continue - # Yandex uses redirect URLs, try to extract actual URL - if '//yandex.' in href: - # Try to find embedded URL - match = re.search(r'url=([^&]+)', href) - if match: - try: - href = urllib.unquote(match.group(1)) - except Exception: - continue - else: - continue - - if not href.startswith('http'): + # Unwrap redirect URLs + href = self._unwrap_url(href) + if not href or not href.startswith('http'): continue # Check urignore patterns - bad = False + skip = False for pattern in urignore: if re.search(pattern, href): - bad = True + skip = True break - if not bad and href not in urls: + if not skip and href not in urls: urls.append(href) return urls @@ -299,6 +288,7 @@ class Ecosia(SearchEngine): name = 'ecosia' base_url = 'https://www.ecosia.org/search' rate_limit = 10 + skip_domains = ['ecosia.org'] def build_url(self, query, page=0): params = {'q': query} @@ -306,35 +296,6 @@ class Ecosia(SearchEngine): params['p'] = str(page) return '%s?%s' % (self.base_url, urllib.urlencode(params)) - def extract_urls(self, content, urignore=None): - urls = [] - if not content: - return urls - urignore = urignore or [] - - soup = soupify(content, nohtml=True) - - for a in soup.body.find_all('a'): - href = a.get('href', '') - if not href or not href.startswith('http'): - continue - - # Skip Ecosia internal links - if 'ecosia.org' in href: - continue - - # Check urignore patterns - bad = False - for pattern in urignore: - if re.search(pattern, href): - bad = True - break - - if not bad and href not in urls: - urls.append(href) - - return urls - class Brave(SearchEngine): """Brave Search (privacy-focused, independent index).""" @@ -342,6 +303,7 @@ class Brave(SearchEngine): name = 'brave' base_url = 'https://search.brave.com/search' rate_limit = 10 + skip_domains = ['brave.com'] def build_url(self, query, page=0): params = {'q': query} @@ -349,35 +311,6 @@ class Brave(SearchEngine): params['offset'] = str(page) return '%s?%s' % (self.base_url, urllib.urlencode(params)) - def extract_urls(self, content, urignore=None): - urls = [] - if not content: - return urls - urignore = urignore or [] - - soup = soupify(content, nohtml=True) - - for a in soup.body.find_all('a'): - href = a.get('href', '') - if not href or not href.startswith('http'): - continue - - # Skip Brave internal links - if 'brave.com' in href: - continue - - # Check urignore patterns - bad = False - for pattern in urignore: - if re.search(pattern, href): - bad = True - break - - if not bad and href not in urls: - urls.append(href) - - return urls - class GitHub(SearchEngine): """GitHub code/repository search for proxy lists.""" @@ -385,6 +318,12 @@ class GitHub(SearchEngine): name = 'github' base_url = 'https://github.com/search' rate_limit = 5 + relative_base = 'https://github.com' + skip_patterns = [ + '/login', '/signup', '/join', '/settings', + '/notifications', '/marketplace', '/explore', + '/sponsors', '/pricing', '/features', + ] # Search terms specific to proxy lists on GitHub github_queries = [ @@ -410,6 +349,7 @@ class GitHub(SearchEngine): return '%s?%s' % (self.base_url, urllib.urlencode(params)) def extract_urls(self, content, urignore=None): + """Extract URLs with blob-to-raw conversion for direct file access.""" urls = [] if not content: return urls @@ -424,7 +364,7 @@ class GitHub(SearchEngine): # Convert relative to absolute if href.startswith('/'): - href = 'https://github.com' + href + href = self.relative_base + href if not href.startswith('http'): continue @@ -433,20 +373,23 @@ class GitHub(SearchEngine): if 'github.com' not in href: continue - # Skip non-content links - skip_patterns = [ - '/login', '/signup', '/join', '/settings', - '/notifications', '/marketplace', '/explore', - '/sponsors', '/pricing', '/features', - ] + # Skip internal pages skip = False - for pattern in skip_patterns: + for pattern in self.skip_patterns: if pattern in href: skip = True break if skip: continue + # Check urignore patterns + for pattern in urignore: + if re.search(pattern, href): + skip = True + break + if skip: + continue + # Keep raw file links and repo links if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href): # Convert blob to raw for direct access @@ -467,6 +410,11 @@ class GitLab(SearchEngine): name = 'gitlab' base_url = 'https://gitlab.com/search' rate_limit = 5 + relative_base = 'https://gitlab.com' + skip_patterns = [ + '/users/', '/-/', '/explore', '/help', + '/admin', '/dashboard', '/profile', + ] def build_url(self, query, page=0): search_query = query if query else 'proxy list' @@ -479,10 +427,10 @@ class GitLab(SearchEngine): return '%s?%s' % (self.base_url, urllib.urlencode(params)) def extract_urls(self, content, urignore=None): + """Extract project URLs only (whitelist pattern).""" urls = [] if not content: return urls - urignore = urignore or [] soup = soupify(content, nohtml=True) @@ -493,29 +441,21 @@ class GitLab(SearchEngine): # Convert relative to absolute if href.startswith('/'): - href = 'https://gitlab.com' + href + href = self.relative_base + href - if not href.startswith('http'): + if not href.startswith('http') or 'gitlab.com' not in href: continue - # Only keep GitLab project links - if 'gitlab.com' not in href: - continue - - # Skip non-project links - skip_patterns = [ - '/users/', '/-/', '/explore', '/help', - '/admin', '/dashboard', '/profile', - ] + # Skip internal pages skip = False - for pattern in skip_patterns: + for pattern in self.skip_patterns: if pattern in href: skip = True break if skip: continue - # Keep project and file links + # Keep only project links if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href): if href not in urls: urls.append(href) @@ -529,6 +469,7 @@ class Codeberg(SearchEngine): name = 'codeberg' base_url = 'https://codeberg.org/explore/repos' rate_limit = 10 + relative_base = 'https://codeberg.org' def build_url(self, query, page=0): search_query = query if query else 'proxy' @@ -541,10 +482,10 @@ class Codeberg(SearchEngine): return '%s?%s' % (self.base_url, urllib.urlencode(params)) def extract_urls(self, content, urignore=None): + """Extract repo URLs only (whitelist pattern).""" urls = [] if not content: return urls - urignore = urignore or [] soup = soupify(content, nohtml=True) @@ -555,13 +496,9 @@ class Codeberg(SearchEngine): # Convert relative to absolute if href.startswith('/'): - href = 'https://codeberg.org' + href + href = self.relative_base + href - if not href.startswith('http'): - continue - - # Only keep Codeberg repo links - if 'codeberg.org' not in href: + if not href.startswith('http') or 'codeberg.org' not in href: continue # Keep repo links (format: /user/repo) @@ -600,12 +537,13 @@ class Gitea(SearchEngine): return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params)) def extract_urls(self, content, urignore=None): + """Extract repo URLs for current dynamic instance.""" urls = [] if not content: return urls - urignore = urignore or [] soup = soupify(content, nohtml=True) + instance_domain = self.current_instance.split('//')[1] for a in soup.body.find_all('a'): href = a.get('href', '') @@ -619,8 +557,8 @@ class Gitea(SearchEngine): if not href.startswith('http'): continue - # Keep repo links - if self.current_instance.split('//')[1] in href: + # Keep repo links for this instance + if instance_domain in href: if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href): if href not in urls: urls.append(href) @@ -649,7 +587,7 @@ class Searx(SearchEngine): return '%s/?%s' % (self.base_url, urllib.urlencode(params)) def extract_urls(self, content, urignore=None): - """Extract URLs from Searx results (noreferrer links).""" + """Extract URLs from Searx results (noreferrer links only).""" urls = [] if not content: return urls @@ -668,13 +606,13 @@ class Searx(SearchEngine): continue # Check urignore patterns - bad = False + skip = False for pattern in urignore: if re.search(pattern, href): - bad = True + skip = True break - if not bad and href not in urls: + if not skip and href not in urls: urls.append(href) return urls