diff --git a/engines.py b/engines.py
index 47c1ba5..4353e9f 100644
--- a/engines.py
+++ b/engines.py
@@ -16,6 +16,12 @@ class SearchEngine(object):
base_url = ''
# Rate limiting: requests per minute (0 = no limit)
rate_limit = 0
+ # URL substrings to skip, normally the engine's own domain(s); matched anywhere in the URL
+ skip_domains = []
+ # Plain substrings (not regexes) of internal pages to skip; matched anywhere in the URL
+ skip_patterns = []
+ # Base URL prepended to hrefs that start with '/' (None = skip relative links)
+ relative_base = None
def __init__(self):
self.last_request = 0
@@ -25,8 +31,65 @@ class SearchEngine(object):
raise NotImplementedError
def extract_urls(self, content, urignore=None):
- """Extract result URLs from response content."""
- raise NotImplementedError
+ """Extract result URLs from response content.
+
+ Base implementation handles common patterns:
+ - Skips empty content
+ - Parses HTML with soupify
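+ - Converts root-relative URLs ('/...') if relative_base is set; drops other non-http hrefs
+ - Skips URLs containing any skip_domains entry (plain substring match)
+ - Skips URLs containing any skip_patterns entry (plain substring match)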
+ - Applies urignore regex patterns
+ - Deduplicates results
+ """
+ urls = []
+ if not content:
+ return urls
+ urignore = urignore or []
+
+ soup = soupify(content, nohtml=True)
+
+ for a in soup.body.find_all('a'):
+ href = a.get('href', '')
+ if not href:
+ continue
+
+ # Handle relative URLs
+ if not href.startswith('http'):
+ if self.relative_base and href.startswith('/'):
+ href = self.relative_base + href
+ else:
+ continue
+
+ # Skip engine's own domain(s)
+ skip = False
+ for domain in self.skip_domains:
+ if domain in href:
+ skip = True
+ break
+ if skip:
+ continue
+
+ # Skip internal paths
+ for pattern in self.skip_patterns:
+ if pattern in href:
+ skip = True
+ break
+ if skip:
+ continue
+
+ # Check urignore patterns
+ for pattern in urignore:
+ if re.search(pattern, href):
+ skip = True
+ break
+ if skip:
+ continue
+
+ if href not in urls:
+ urls.append(href)
+
+ return urls
def is_rate_limited(self, content):
"""Check if response indicates rate limiting."""
@@ -50,6 +113,7 @@ class DuckDuckGo(SearchEngine):
name = 'duckduckgo'
base_url = 'https://html.duckduckgo.com/html/'
rate_limit = 10
+ skip_domains = ['duckduckgo.com']
def build_url(self, query, page=0):
params = {'q': query}
@@ -59,6 +123,17 @@ class DuckDuckGo(SearchEngine):
params['dc'] = str(page * 30 + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
+ def _unwrap_url(self, href):
+ """Return the unquoted uddg= target of a DuckDuckGo redirect link (None if decoding fails); other hrefs are returned unchanged."""
+ if '/l/?uddg=' in href or 'uddg=' in href:
+ match = re.search(r'uddg=([^&]+)', href)
+ if match:
+ try:
+ return urllib.unquote(match.group(1))
+ except Exception:
+ return None
+ return href
+
def extract_urls(self, content, urignore=None):
urls = []
if not content:
@@ -67,33 +142,28 @@ class DuckDuckGo(SearchEngine):
soup = soupify(content, nohtml=True)
- # DuckDuckGo HTML results are in or
for a in soup.body.find_all('a'):
href = a.get('href', '')
if not href or not href.startswith('http'):
continue
- # Skip DuckDuckGo internal links
+ # Unwrap redirect URLs
+ href = self._unwrap_url(href)
+ if not href:
+ continue
+
+ # Skip engine's domain
if 'duckduckgo.com' in href:
continue
- # DuckDuckGo wraps URLs - extract actual URL from redirect
- if '/l/?uddg=' in href or 'uddg=' in href:
- match = re.search(r'uddg=([^&]+)', href)
- if match:
- try:
- href = urllib.unquote(match.group(1))
- except Exception:
- continue
-
# Check urignore patterns
- bad = False
+ skip = False
for pattern in urignore:
if re.search(pattern, href):
- bad = True
+ skip = True
break
- if not bad and href not in urls:
+ if not skip and href not in urls:
urls.append(href)
return urls
@@ -105,6 +175,7 @@ class Startpage(SearchEngine):
name = 'startpage'
base_url = 'https://www.startpage.com/do/search'
rate_limit = 5
+ skip_domains = ['startpage.com']
def build_url(self, query, page=0):
params = {
@@ -116,35 +187,6 @@ class Startpage(SearchEngine):
params['page'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
- def extract_urls(self, content, urignore=None):
- urls = []
- if not content:
- return urls
- urignore = urignore or []
-
- soup = soupify(content, nohtml=True)
-
- for a in soup.body.find_all('a'):
- href = a.get('href', '')
- if not href or not href.startswith('http'):
- continue
-
- # Skip Startpage internal links
- if 'startpage.com' in href:
- continue
-
- # Check urignore patterns
- bad = False
- for pattern in urignore:
- if re.search(pattern, href):
- bad = True
- break
-
- if not bad and href not in urls:
- urls.append(href)
-
- return urls
-
class Mojeek(SearchEngine):
"""Mojeek search (UK-based, independent index)."""
@@ -152,6 +194,7 @@ class Mojeek(SearchEngine):
name = 'mojeek'
base_url = 'https://www.mojeek.com/search'
rate_limit = 10
+ skip_domains = ['mojeek.com', 'mojeek.co.uk']
def build_url(self, query, page=0):
params = {'q': query}
@@ -160,35 +203,6 @@ class Mojeek(SearchEngine):
params['s'] = str(page * 10 + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
- def extract_urls(self, content, urignore=None):
- urls = []
- if not content:
- return urls
- urignore = urignore or []
-
- soup = soupify(content, nohtml=True)
-
- for a in soup.body.find_all('a'):
- href = a.get('href', '')
- if not href or not href.startswith('http'):
- continue
-
- # Skip Mojeek internal links
- if 'mojeek.com' in href or 'mojeek.co.uk' in href:
- continue
-
- # Check urignore patterns
- bad = False
- for pattern in urignore:
- if re.search(pattern, href):
- bad = True
- break
-
- if not bad and href not in urls:
- urls.append(href)
-
- return urls
-
class Qwant(SearchEngine):
"""Qwant Lite search (French, EU-based, privacy-focused)."""
@@ -196,6 +210,7 @@ class Qwant(SearchEngine):
name = 'qwant'
base_url = 'https://lite.qwant.com/'
rate_limit = 10
+ skip_domains = ['qwant.com']
def build_url(self, query, page=0):
params = {
@@ -206,35 +221,6 @@ class Qwant(SearchEngine):
params['p'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
- def extract_urls(self, content, urignore=None):
- urls = []
- if not content:
- return urls
- urignore = urignore or []
-
- soup = soupify(content, nohtml=True)
-
- for a in soup.body.find_all('a'):
- href = a.get('href', '')
- if not href or not href.startswith('http'):
- continue
-
- # Skip Qwant internal links
- if 'qwant.com' in href:
- continue
-
- # Check urignore patterns
- bad = False
- for pattern in urignore:
- if re.search(pattern, href):
- bad = True
- break
-
- if not bad and href not in urls:
- urls.append(href)
-
- return urls
-
class Yandex(SearchEngine):
"""Yandex search (Russian, large independent index)."""
@@ -242,6 +228,7 @@ class Yandex(SearchEngine):
name = 'yandex'
base_url = 'https://yandex.com/search/'
rate_limit = 5
+ skip_domains = ['yandex.com', 'yandex.ru']
def build_url(self, query, page=0):
params = {
@@ -252,6 +239,18 @@ class Yandex(SearchEngine):
params['p'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
+ def _unwrap_url(self, href):
+ """Return the unquoted url= target of a yandex.* link (None if absent or undecodable); other hrefs are returned unchanged."""
+ if '//yandex.' in href:
+ match = re.search(r'url=([^&]+)', href)
+ if match:
+ try:
+ return urllib.unquote(match.group(1))
+ except Exception:
+ return None
+ return None
+ return href
+
def extract_urls(self, content, urignore=None):
urls = []
if not content:
@@ -265,29 +264,19 @@ class Yandex(SearchEngine):
if not href:
continue
- # Yandex uses redirect URLs, try to extract actual URL
- if '//yandex.' in href:
- # Try to find embedded URL
- match = re.search(r'url=([^&]+)', href)
- if match:
- try:
- href = urllib.unquote(match.group(1))
- except Exception:
- continue
- else:
- continue
-
- if not href.startswith('http'):
+ # Unwrap redirect URLs
+ href = self._unwrap_url(href)
+ if not href or not href.startswith('http'):
continue
# Check urignore patterns
- bad = False
+ skip = False
for pattern in urignore:
if re.search(pattern, href):
- bad = True
+ skip = True
break
- if not bad and href not in urls:
+ if not skip and href not in urls:
urls.append(href)
return urls
@@ -299,6 +288,7 @@ class Ecosia(SearchEngine):
name = 'ecosia'
base_url = 'https://www.ecosia.org/search'
rate_limit = 10
+ skip_domains = ['ecosia.org']
def build_url(self, query, page=0):
params = {'q': query}
@@ -306,35 +296,6 @@ class Ecosia(SearchEngine):
params['p'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
- def extract_urls(self, content, urignore=None):
- urls = []
- if not content:
- return urls
- urignore = urignore or []
-
- soup = soupify(content, nohtml=True)
-
- for a in soup.body.find_all('a'):
- href = a.get('href', '')
- if not href or not href.startswith('http'):
- continue
-
- # Skip Ecosia internal links
- if 'ecosia.org' in href:
- continue
-
- # Check urignore patterns
- bad = False
- for pattern in urignore:
- if re.search(pattern, href):
- bad = True
- break
-
- if not bad and href not in urls:
- urls.append(href)
-
- return urls
-
class Brave(SearchEngine):
"""Brave Search (privacy-focused, independent index)."""
@@ -342,6 +303,7 @@ class Brave(SearchEngine):
name = 'brave'
base_url = 'https://search.brave.com/search'
rate_limit = 10
+ skip_domains = ['brave.com']
def build_url(self, query, page=0):
params = {'q': query}
@@ -349,35 +311,6 @@ class Brave(SearchEngine):
params['offset'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
- def extract_urls(self, content, urignore=None):
- urls = []
- if not content:
- return urls
- urignore = urignore or []
-
- soup = soupify(content, nohtml=True)
-
- for a in soup.body.find_all('a'):
- href = a.get('href', '')
- if not href or not href.startswith('http'):
- continue
-
- # Skip Brave internal links
- if 'brave.com' in href:
- continue
-
- # Check urignore patterns
- bad = False
- for pattern in urignore:
- if re.search(pattern, href):
- bad = True
- break
-
- if not bad and href not in urls:
- urls.append(href)
-
- return urls
-
class GitHub(SearchEngine):
"""GitHub code/repository search for proxy lists."""
@@ -385,6 +318,12 @@ class GitHub(SearchEngine):
name = 'github'
base_url = 'https://github.com/search'
rate_limit = 5
+ relative_base = 'https://github.com'
+ skip_patterns = [
+ '/login', '/signup', '/join', '/settings',
+ '/notifications', '/marketplace', '/explore',
+ '/sponsors', '/pricing', '/features',
+ ]
# Search terms specific to proxy lists on GitHub
github_queries = [
@@ -410,6 +349,7 @@ class GitHub(SearchEngine):
return '%s?%s' % (self.base_url, urllib.urlencode(params))
def extract_urls(self, content, urignore=None):
+ """Extract URLs with blob-to-raw conversion for direct file access."""
urls = []
if not content:
return urls
@@ -424,7 +364,7 @@ class GitHub(SearchEngine):
# Convert relative to absolute
if href.startswith('/'):
- href = 'https://github.com' + href
+ href = self.relative_base + href
if not href.startswith('http'):
continue
@@ -433,20 +373,23 @@ class GitHub(SearchEngine):
if 'github.com' not in href:
continue
- # Skip non-content links
- skip_patterns = [
- '/login', '/signup', '/join', '/settings',
- '/notifications', '/marketplace', '/explore',
- '/sponsors', '/pricing', '/features',
- ]
+ # Skip internal pages
skip = False
- for pattern in skip_patterns:
+ for pattern in self.skip_patterns:
if pattern in href:
skip = True
break
if skip:
continue
+ # Check urignore patterns
+ for pattern in urignore:
+ if re.search(pattern, href):
+ skip = True
+ break
+ if skip:
+ continue
+
# Keep raw file links and repo links
if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
# Convert blob to raw for direct access
@@ -467,6 +410,11 @@ class GitLab(SearchEngine):
name = 'gitlab'
base_url = 'https://gitlab.com/search'
rate_limit = 5
+ relative_base = 'https://gitlab.com'
+ skip_patterns = [
+ '/users/', '/-/', '/explore', '/help',
+ '/admin', '/dashboard', '/profile',
+ ]
def build_url(self, query, page=0):
search_query = query if query else 'proxy list'
@@ -479,10 +427,10 @@ class GitLab(SearchEngine):
return '%s?%s' % (self.base_url, urllib.urlencode(params))
def extract_urls(self, content, urignore=None):
+ """Extract gitlab.com URLs at or below a project path (whitelist pattern)."""
urls = []
if not content:
return urls
- urignore = urignore or []
soup = soupify(content, nohtml=True)
@@ -493,29 +441,21 @@ class GitLab(SearchEngine):
# Convert relative to absolute
if href.startswith('/'):
- href = 'https://gitlab.com' + href
+ href = self.relative_base + href
- if not href.startswith('http'):
+ if not href.startswith('http') or 'gitlab.com' not in href:
continue
- # Only keep GitLab project links
- if 'gitlab.com' not in href:
- continue
-
- # Skip non-project links
- skip_patterns = [
- '/users/', '/-/', '/explore', '/help',
- '/admin', '/dashboard', '/profile',
- ]
+ # Skip internal pages
skip = False
- for pattern in skip_patterns:
+ for pattern in self.skip_patterns:
if pattern in href:
skip = True
break
if skip:
continue
- # Keep project and file links
+ # Keep project links and anything beneath them (pattern has no '$' anchor)
if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
if href not in urls:
urls.append(href)
@@ -529,6 +469,7 @@ class Codeberg(SearchEngine):
name = 'codeberg'
base_url = 'https://codeberg.org/explore/repos'
rate_limit = 10
+ relative_base = 'https://codeberg.org'
def build_url(self, query, page=0):
search_query = query if query else 'proxy'
@@ -541,10 +482,10 @@ class Codeberg(SearchEngine):
return '%s?%s' % (self.base_url, urllib.urlencode(params))
def extract_urls(self, content, urignore=None):
+ """Extract repo URLs only (whitelist pattern)."""
urls = []
if not content:
return urls
- urignore = urignore or []
soup = soupify(content, nohtml=True)
@@ -555,13 +496,9 @@ class Codeberg(SearchEngine):
# Convert relative to absolute
if href.startswith('/'):
- href = 'https://codeberg.org' + href
+ href = self.relative_base + href
- if not href.startswith('http'):
- continue
-
- # Only keep Codeberg repo links
- if 'codeberg.org' not in href:
+ if not href.startswith('http') or 'codeberg.org' not in href:
continue
# Keep repo links (format: /user/repo)
@@ -600,12 +537,13 @@ class Gitea(SearchEngine):
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
def extract_urls(self, content, urignore=None):
+ """Extract repo URLs for current dynamic instance."""
urls = []
if not content:
return urls
- urignore = urignore or []
soup = soupify(content, nohtml=True)
+ instance_domain = self.current_instance.split('//')[1]
for a in soup.body.find_all('a'):
href = a.get('href', '')
@@ -619,8 +557,8 @@ class Gitea(SearchEngine):
if not href.startswith('http'):
continue
- # Keep repo links
- if self.current_instance.split('//')[1] in href:
+ # Keep repo links for this instance
+ if instance_domain in href:
if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
if href not in urls:
urls.append(href)
@@ -649,7 +587,7 @@ class Searx(SearchEngine):
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
def extract_urls(self, content, urignore=None):
- """Extract URLs from Searx results (noreferrer links)."""
+ """Extract URLs from Searx results (noreferrer links only)."""
urls = []
if not content:
return urls
@@ -668,13 +606,13 @@ class Searx(SearchEngine):
continue
# Check urignore patterns
- bad = False
+ skip = False
for pattern in urignore:
if re.search(pattern, href):
- bad = True
+ skip = True
break
- if not bad and href not in urls:
+ if not skip and href not in urls:
urls.append(href)
return urls