From ce79ef7d7f6f1998127004f0d7d0fb786f2b7628 Mon Sep 17 00:00:00 2001
From: Username <user@mymx.me>
Date: Sat, 20 Dec 2025 22:50:46 +0100
Subject: [PATCH] engines: consolidate extract_urls with base class method

---
 engines.py | 364 ++++++++++++++++++++++-------------------------------
 1 file changed, 151 insertions(+), 213 deletions(-)
diff --git a/engines.py b/engines.py
index 47c1ba5..4353e9f 100644
--- a/engines.py
+++ b/engines.py
@@ -16,6 +16,12 @@ class SearchEngine(object):
     base_url = ''
     # Rate limiting: requests per minute (0 = no limit)
     rate_limit = 0
+    # URL substrings to skip, e.g. the engine's own domain (plain `in` match)
+    skip_domains = []
+    # URL substrings marking internal pages to skip (plain `in`, not regex)
+    skip_patterns = []
+    # Prefix prepended to '/'-relative links (None = relative links dropped)
+    relative_base = None
 
     def __init__(self):
         self.last_request = 0
@@ -25,8 +31,65 @@ class SearchEngine(object):
         raise NotImplementedError
 
     def extract_urls(self, content, urignore=None):
-        """Extract result URLs from response content."""
-        raise NotImplementedError
+        """Extract result URLs from response content.
+
+        Shared implementation for engines whose result pages need no
+        special handling; engines with redirect wrappers or whitelist
+        rules override it. Every <a href> in the parsed body is kept unless:
+        - it is relative and relative_base is unset (site-relative links
+          are otherwise prefixed with relative_base)
+        - it is protocol-relative ('//host/...') or otherwise non-http
+        - it contains an entry of skip_domains or skip_patterns
+        - it matches one of the urignore regex patterns
+        - it was already collected (results are deduplicated)
+
+        Args:
+            content: response body to parse; falsy content yields [].
+            urignore: optional list of regex patterns for URLs to drop.
+
+        Returns:
+            List of unique URLs in first-seen order.
+        """
+        urls = []
+        if not content:
+            return urls
+        urignore = urignore or []
+
+        soup = soupify(content, nohtml=True)
+
+        for a in soup.body.find_all('a'):
+            href = a.get('href', '')
+            if not href:
+                continue
+
+            # Handle relative URLs
+            if not href.startswith('http'):
+                # '//host/path' is protocol-relative, not site-relative:
+                # prefixing relative_base would produce a bogus URL such
+                # as 'https://example.org//host/path', so drop it.
+                if href.startswith('//') or not href.startswith('/'):
+                    continue
+                if not self.relative_base:
+                    continue
+                href = self.relative_base + href
+
+            # Skip engine's own domain(s)
+            if any(domain in href for domain in self.skip_domains):
+                continue
+
+            # Skip internal paths
+            if any(pattern in href for pattern in self.skip_patterns):
+                continue
+
+            # Check urignore patterns
+            if any(re.search(pattern, href) for pattern in urignore):
+                continue
+
+            # Deduplicate while keeping first-seen order
+            if href not in urls:
+                urls.append(href)
+
+        return urls
 
     def is_rate_limited(self, content):
         """Check if response indicates rate limiting."""
@@ -50,6 +113,7 @@ class DuckDuckGo(SearchEngine):
     name = 'duckduckgo'
     base_url = 'https://html.duckduckgo.com/html/'
     rate_limit = 10
+    skip_domains = ['duckduckgo.com']
 
     def build_url(self, query, page=0):
         params = {'q': query}
@@ -59,6 +123,17 @@ class DuckDuckGo(SearchEngine):
             params['dc'] = str(page * 30 + 1)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
+    def _unwrap_url(self, href):
+        """Return the decoded uddg= target of a redirect link, else href."""
+        target = re.search(r'uddg=([^&]+)', href)
+        if target is None:
+            # Not a redirect wrapper (or empty uddg=): pass through as-is.
+            return href
+        try:
+            return urllib.unquote(target.group(1))
+        except Exception:
+            return None
+
     def extract_urls(self, content, urignore=None):
         urls = []
         if not content:
@@ -67,33 +142,28 @@ class DuckDuckGo(SearchEngine):
 
         soup = soupify(content, nohtml=True)
 
-        # DuckDuckGo HTML results are in <a class="result__url"> or <a class="result__a">
         for a in soup.body.find_all('a'):
             href = a.get('href', '')
             if not href or not href.startswith('http'):
                 continue
 
-            # Skip DuckDuckGo internal links
+            # Unwrap redirect URLs
+            href = self._unwrap_url(href)
+            if not href:
+                continue
+
+            # Skip engine's domain
             if 'duckduckgo.com' in href:
                 continue
 
-            # DuckDuckGo wraps URLs - extract actual URL from redirect
-            if '/l/?uddg=' in href or 'uddg=' in href:
-                match = re.search(r'uddg=([^&]+)', href)
-                if match:
-                    try:
-                        href = urllib.unquote(match.group(1))
-                    except Exception:
-                        continue
-
             # Check urignore patterns
-            bad = False
+            skip = False
             for pattern in urignore:
                 if re.search(pattern, href):
-                    bad = True
+                    skip = True
                     break
 
-            if not bad and href not in urls:
+            if not skip and href not in urls:
                 urls.append(href)
 
         return urls
@@ -105,6 +175,7 @@ class Startpage(SearchEngine):
     name = 'startpage'
     base_url = 'https://www.startpage.com/do/search'
     rate_limit = 5
+    skip_domains = ['startpage.com']
 
     def build_url(self, query, page=0):
         params = {
@@ -116,35 +187,6 @@ class Startpage(SearchEngine):
             params['page'] = str(page + 1)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
-    def extract_urls(self, content, urignore=None):
-        urls = []
-        if not content:
-            return urls
-        urignore = urignore or []
-
-        soup = soupify(content, nohtml=True)
-
-        for a in soup.body.find_all('a'):
-            href = a.get('href', '')
-            if not href or not href.startswith('http'):
-                continue
-
-            # Skip Startpage internal links
-            if 'startpage.com' in href:
-                continue
-
-            # Check urignore patterns
-            bad = False
-            for pattern in urignore:
-                if re.search(pattern, href):
-                    bad = True
-                    break
-
-            if not bad and href not in urls:
-                urls.append(href)
-
-        return urls
-
 
 class Mojeek(SearchEngine):
     """Mojeek search (UK-based, independent index)."""
@@ -152,6 +194,7 @@ class Mojeek(SearchEngine):
     name = 'mojeek'
     base_url = 'https://www.mojeek.com/search'
     rate_limit = 10
+    skip_domains = ['mojeek.com', 'mojeek.co.uk']
 
     def build_url(self, query, page=0):
         params = {'q': query}
@@ -160,35 +203,6 @@ class Mojeek(SearchEngine):
             params['s'] = str(page * 10 + 1)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
-    def extract_urls(self, content, urignore=None):
-        urls = []
-        if not content:
-            return urls
-        urignore = urignore or []
-
-        soup = soupify(content, nohtml=True)
-
-        for a in soup.body.find_all('a'):
-            href = a.get('href', '')
-            if not href or not href.startswith('http'):
-                continue
-
-            # Skip Mojeek internal links
-            if 'mojeek.com' in href or 'mojeek.co.uk' in href:
-                continue
-
-            # Check urignore patterns
-            bad = False
-            for pattern in urignore:
-                if re.search(pattern, href):
-                    bad = True
-                    break
-
-            if not bad and href not in urls:
-                urls.append(href)
-
-        return urls
-
 
 class Qwant(SearchEngine):
     """Qwant Lite search (French, EU-based, privacy-focused)."""
@@ -196,6 +210,7 @@ class Qwant(SearchEngine):
     name = 'qwant'
     base_url = 'https://lite.qwant.com/'
     rate_limit = 10
+    skip_domains = ['qwant.com']
 
     def build_url(self, query, page=0):
         params = {
@@ -206,35 +221,6 @@ class Qwant(SearchEngine):
             params['p'] = str(page + 1)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
-    def extract_urls(self, content, urignore=None):
-        urls = []
-        if not content:
-            return urls
-        urignore = urignore or []
-
-        soup = soupify(content, nohtml=True)
-
-        for a in soup.body.find_all('a'):
-            href = a.get('href', '')
-            if not href or not href.startswith('http'):
-                continue
-
-            # Skip Qwant internal links
-            if 'qwant.com' in href:
-                continue
-
-            # Check urignore patterns
-            bad = False
-            for pattern in urignore:
-                if re.search(pattern, href):
-                    bad = True
-                    break
-
-            if not bad and href not in urls:
-                urls.append(href)
-
-        return urls
-
 
 class Yandex(SearchEngine):
     """Yandex search (Russian, large independent index)."""
@@ -242,6 +228,7 @@ class Yandex(SearchEngine):
     name = 'yandex'
     base_url = 'https://yandex.com/search/'
     rate_limit = 5
+    skip_domains = ['yandex.com', 'yandex.ru']
 
     def build_url(self, query, page=0):
         params = {
@@ -252,6 +239,18 @@ class Yandex(SearchEngine):
             params['p'] = str(page)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
+    def _unwrap_url(self, href):
+        """Pass non-Yandex links through; unwrap url= from Yandex ones."""
+        if '//yandex.' not in href:
+            return href
+        target = re.search(r'url=([^&]+)', href)
+        if target is None:
+            return None
+        try:
+            return urllib.unquote(target.group(1))
+        except Exception:
+            return None
+
     def extract_urls(self, content, urignore=None):
         urls = []
         if not content:
@@ -265,29 +264,19 @@ class Yandex(SearchEngine):
             if not href:
                 continue
 
-            # Yandex uses redirect URLs, try to extract actual URL
-            if '//yandex.' in href:
-                # Try to find embedded URL
-                match = re.search(r'url=([^&]+)', href)
-                if match:
-                    try:
-                        href = urllib.unquote(match.group(1))
-                    except Exception:
-                        continue
-                else:
-                    continue
-
-            if not href.startswith('http'):
+            # Unwrap redirect URLs
+            href = self._unwrap_url(href)
+            if not href or not href.startswith('http'):
                 continue
 
             # Check urignore patterns
-            bad = False
+            skip = False
             for pattern in urignore:
                 if re.search(pattern, href):
-                    bad = True
+                    skip = True
                     break
 
-            if not bad and href not in urls:
+            if not skip and href not in urls:
                 urls.append(href)
 
         return urls
@@ -299,6 +288,7 @@ class Ecosia(SearchEngine):
     name = 'ecosia'
     base_url = 'https://www.ecosia.org/search'
     rate_limit = 10
+    skip_domains = ['ecosia.org']
 
     def build_url(self, query, page=0):
         params = {'q': query}
@@ -306,35 +296,6 @@ class Ecosia(SearchEngine):
             params['p'] = str(page)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
-    def extract_urls(self, content, urignore=None):
-        urls = []
-        if not content:
-            return urls
-        urignore = urignore or []
-
-        soup = soupify(content, nohtml=True)
-
-        for a in soup.body.find_all('a'):
-            href = a.get('href', '')
-            if not href or not href.startswith('http'):
-                continue
-
-            # Skip Ecosia internal links
-            if 'ecosia.org' in href:
-                continue
-
-            # Check urignore patterns
-            bad = False
-            for pattern in urignore:
-                if re.search(pattern, href):
-                    bad = True
-                    break
-
-            if not bad and href not in urls:
-                urls.append(href)
-
-        return urls
-
 
 class Brave(SearchEngine):
     """Brave Search (privacy-focused, independent index)."""
@@ -342,6 +303,7 @@ class Brave(SearchEngine):
     name = 'brave'
     base_url = 'https://search.brave.com/search'
     rate_limit = 10
+    skip_domains = ['brave.com']
 
     def build_url(self, query, page=0):
         params = {'q': query}
@@ -349,35 +311,6 @@ class Brave(SearchEngine):
             params['offset'] = str(page)
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
-    def extract_urls(self, content, urignore=None):
-        urls = []
-        if not content:
-            return urls
-        urignore = urignore or []
-
-        soup = soupify(content, nohtml=True)
-
-        for a in soup.body.find_all('a'):
-            href = a.get('href', '')
-            if not href or not href.startswith('http'):
-                continue
-
-            # Skip Brave internal links
-            if 'brave.com' in href:
-                continue
-
-            # Check urignore patterns
-            bad = False
-            for pattern in urignore:
-                if re.search(pattern, href):
-                    bad = True
-                    break
-
-            if not bad and href not in urls:
-                urls.append(href)
-
-        return urls
-
 
 class GitHub(SearchEngine):
     """GitHub code/repository search for proxy lists."""
@@ -385,6 +318,12 @@ class GitHub(SearchEngine):
     name = 'github'
     base_url = 'https://github.com/search'
     rate_limit = 5
+    relative_base = 'https://github.com'
+    skip_patterns = [
+        '/login', '/signup', '/join', '/settings',
+        '/notifications', '/marketplace', '/explore',
+        '/sponsors', '/pricing', '/features',
+    ]
 
     # Search terms specific to proxy lists on GitHub
     github_queries = [
@@ -410,6 +349,7 @@ class GitHub(SearchEngine):
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
     def extract_urls(self, content, urignore=None):
+        """Extract URLs with blob-to-raw conversion for direct file access."""
         urls = []
         if not content:
             return urls
@@ -424,7 +364,7 @@ class GitHub(SearchEngine):
 
             # Convert relative to absolute
             if href.startswith('/'):
-                href = 'https://github.com' + href
+                href = self.relative_base + href
 
             if not href.startswith('http'):
                 continue
@@ -433,20 +373,23 @@ class GitHub(SearchEngine):
             if 'github.com' not in href:
                 continue
 
-            # Skip non-content links
-            skip_patterns = [
-                '/login', '/signup', '/join', '/settings',
-                '/notifications', '/marketplace', '/explore',
-                '/sponsors', '/pricing', '/features',
-            ]
+            # Skip internal pages
             skip = False
-            for pattern in skip_patterns:
+            for pattern in self.skip_patterns:
                 if pattern in href:
                     skip = True
                     break
             if skip:
                 continue
 
+            # Check urignore patterns
+            for pattern in urignore:
+                if re.search(pattern, href):
+                    skip = True
+                    break
+            if skip:
+                continue
+
             # Keep raw file links and repo links
             if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
                 # Convert blob to raw for direct access
@@ -467,6 +410,11 @@ class GitLab(SearchEngine):
     name = 'gitlab'
     base_url = 'https://gitlab.com/search'
     rate_limit = 5
+    relative_base = 'https://gitlab.com'
+    skip_patterns = [
+        '/users/', '/-/', '/explore', '/help',
+        '/admin', '/dashboard', '/profile',
+    ]
 
     def build_url(self, query, page=0):
         search_query = query if query else 'proxy list'
@@ -479,10 +427,10 @@ class GitLab(SearchEngine):
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
     def extract_urls(self, content, urignore=None):
+        """Extract project URLs only (whitelist pattern)."""
         urls = []
         if not content:
             return urls
-        urignore = urignore or []
 
         soup = soupify(content, nohtml=True)
 
@@ -493,29 +441,21 @@ class GitLab(SearchEngine):
 
             # Convert relative to absolute
             if href.startswith('/'):
-                href = 'https://gitlab.com' + href
+                href = self.relative_base + href
 
-            if not href.startswith('http'):
+            if not href.startswith('http') or 'gitlab.com' not in href:
                 continue
 
-            # Only keep GitLab project links
-            if 'gitlab.com' not in href:
-                continue
-
-            # Skip non-project links
-            skip_patterns = [
-                '/users/', '/-/', '/explore', '/help',
-                '/admin', '/dashboard', '/profile',
-            ]
+            # Skip internal pages
             skip = False
-            for pattern in skip_patterns:
+            for pattern in self.skip_patterns:
                 if pattern in href:
                     skip = True
                     break
             if skip:
                 continue
 
-            # Keep project and file links
+            # Keep only project links
             if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
                 if href not in urls:
                     urls.append(href)
@@ -529,6 +469,7 @@ class Codeberg(SearchEngine):
     name = 'codeberg'
     base_url = 'https://codeberg.org/explore/repos'
     rate_limit = 10
+    relative_base = 'https://codeberg.org'
 
     def build_url(self, query, page=0):
         search_query = query if query else 'proxy'
@@ -541,10 +482,10 @@ class Codeberg(SearchEngine):
         return '%s?%s' % (self.base_url, urllib.urlencode(params))
 
     def extract_urls(self, content, urignore=None):
+        """Extract repo URLs only (whitelist pattern)."""
         urls = []
         if not content:
             return urls
-        urignore = urignore or []
 
         soup = soupify(content, nohtml=True)
 
@@ -555,13 +496,9 @@ class Codeberg(SearchEngine):
 
             # Convert relative to absolute
             if href.startswith('/'):
-                href = 'https://codeberg.org' + href
+                href = self.relative_base + href
 
-            if not href.startswith('http'):
-                continue
-
-            # Only keep Codeberg repo links
-            if 'codeberg.org' not in href:
+            if not href.startswith('http') or 'codeberg.org' not in href:
                 continue
 
             # Keep repo links (format: /user/repo)
@@ -600,12 +537,13 @@ class Gitea(SearchEngine):
         return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
 
     def extract_urls(self, content, urignore=None):
+        """Extract repo URLs for current dynamic instance."""
         urls = []
         if not content:
             return urls
-        urignore = urignore or []
 
         soup = soupify(content, nohtml=True)
+        instance_domain = self.current_instance.split('//')[1]
 
         for a in soup.body.find_all('a'):
             href = a.get('href', '')
@@ -619,8 +557,8 @@ class Gitea(SearchEngine):
             if not href.startswith('http'):
                 continue
 
-            # Keep repo links
-            if self.current_instance.split('//')[1] in href:
+            # Keep repo links for this instance
+            if instance_domain in href:
                 if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
                     if href not in urls:
                         urls.append(href)
@@ -649,7 +587,7 @@ class Searx(SearchEngine):
         return '%s/?%s' % (self.base_url, urllib.urlencode(params))
 
     def extract_urls(self, content, urignore=None):
-        """Extract URLs from Searx results (noreferrer links)."""
+        """Extract URLs from Searx results (noreferrer links only)."""
         urls = []
         if not content:
             return urls
@@ -668,13 +606,13 @@ class Searx(SearchEngine):
                 continue
 
             # Check urignore patterns
-            bad = False
+            skip = False
             for pattern in urignore:
                 if re.search(pattern, href):
-                    bad = True
+                    skip = True
                     break
 
-            if not bad and href not in urls:
+            if not skip and href not in urls:
                 urls.append(href)
 
         return urls