engines: consolidate extract_urls with base class method
This commit is contained in:
364
engines.py
364
engines.py
@@ -16,6 +16,12 @@ class SearchEngine(object):
|
|||||||
base_url = ''
|
base_url = ''
|
||||||
# Rate limiting: requests per minute (0 = no limit)
|
# Rate limiting: requests per minute (0 = no limit)
|
||||||
rate_limit = 0
|
rate_limit = 0
|
||||||
|
# Domains to skip (engine's own domain)
|
||||||
|
skip_domains = []
|
||||||
|
# Path patterns to skip (internal pages)
|
||||||
|
skip_patterns = []
|
||||||
|
# Base URL for relative link conversion (None = skip relative links)
|
||||||
|
relative_base = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.last_request = 0
|
self.last_request = 0
|
||||||
@@ -25,8 +31,65 @@ class SearchEngine(object):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
    """Collect unique result links from a search response.

    Shared implementation for engines that need no special handling.
    Every ``<a>`` under the parsed document body is examined in order:

    - anchors with no ``href`` are ignored
    - an href not starting with ``http`` is kept only when
      ``relative_base`` is set and the href starts with ``/``; it is then
      prefixed with ``relative_base``
    - links containing any entry of ``skip_domains`` are dropped
    - links containing any entry of ``skip_patterns`` are dropped
    - links matching any ``urignore`` regex (``re.search``) are dropped
    - links already collected are not added again

    :param content: raw response body; a falsy value yields ``[]``
    :param urignore: optional iterable of regex patterns to exclude
    :returns: list of URLs in first-seen order
    """
    if not content:
        return []

    ignore_patterns = urignore or []
    collected = []

    # soupify is a project helper; presumably returns a BeautifulSoup-like
    # tree exposing .body -- verify against its definition.
    document = soupify(content, nohtml=True)

    for anchor in document.body.find_all('a'):
        link = anchor.get('href', '')
        if not link:
            continue

        # Non-absolute links survive only as site-relative paths.
        # NOTE(review): protocol-relative hrefs ('//host/path') also start
        # with '/' and therefore get relative_base prepended -- confirm
        # this is intended.
        if not link.startswith('http'):
            if not (self.relative_base and link.startswith('/')):
                continue
            link = self.relative_base + link

        # Plain substring tests, not parsed-host comparisons.
        if any(domain in link for domain in self.skip_domains):
            continue

        if any(fragment in link for fragment in self.skip_patterns):
            continue

        if any(re.search(regex, link) for regex in ignore_patterns):
            continue

        if link not in collected:
            collected.append(link)

    return collected
|
||||||
|
|
||||||
def is_rate_limited(self, content):
|
def is_rate_limited(self, content):
|
||||||
"""Check if response indicates rate limiting."""
|
"""Check if response indicates rate limiting."""
|
||||||
@@ -50,6 +113,7 @@ class DuckDuckGo(SearchEngine):
|
|||||||
name = 'duckduckgo'
|
name = 'duckduckgo'
|
||||||
base_url = 'https://html.duckduckgo.com/html/'
|
base_url = 'https://html.duckduckgo.com/html/'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
skip_domains = ['duckduckgo.com']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
@@ -59,6 +123,17 @@ class DuckDuckGo(SearchEngine):
|
|||||||
params['dc'] = str(page * 30 + 1)
|
params['dc'] = str(page * 30 + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
|
def _unwrap_url(self, href):
|
||||||
|
"""Extract actual URL from DuckDuckGo redirect wrapper."""
|
||||||
|
if '/l/?uddg=' in href or 'uddg=' in href:
|
||||||
|
match = re.search(r'uddg=([^&]+)', href)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return urllib.unquote(match.group(1))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return href
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
@@ -67,33 +142,28 @@ class DuckDuckGo(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
# DuckDuckGo HTML results are in <a class="result__url"> or <a class="result__a">
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in soup.body.find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href or not href.startswith('http'):
|
if not href or not href.startswith('http'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip DuckDuckGo internal links
|
# Unwrap redirect URLs
|
||||||
|
href = self._unwrap_url(href)
|
||||||
|
if not href:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip engine's domain
|
||||||
if 'duckduckgo.com' in href:
|
if 'duckduckgo.com' in href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# DuckDuckGo wraps URLs - extract actual URL from redirect
|
|
||||||
if '/l/?uddg=' in href or 'uddg=' in href:
|
|
||||||
match = re.search(r'uddg=([^&]+)', href)
|
|
||||||
if match:
|
|
||||||
try:
|
|
||||||
href = urllib.unquote(match.group(1))
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
# Check urignore patterns
|
||||||
bad = False
|
skip = False
|
||||||
for pattern in urignore:
|
for pattern in urignore:
|
||||||
if re.search(pattern, href):
|
if re.search(pattern, href):
|
||||||
bad = True
|
skip = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not bad and href not in urls:
|
if not skip and href not in urls:
|
||||||
urls.append(href)
|
urls.append(href)
|
||||||
|
|
||||||
return urls
|
return urls
|
||||||
@@ -105,6 +175,7 @@ class Startpage(SearchEngine):
|
|||||||
name = 'startpage'
|
name = 'startpage'
|
||||||
base_url = 'https://www.startpage.com/do/search'
|
base_url = 'https://www.startpage.com/do/search'
|
||||||
rate_limit = 5
|
rate_limit = 5
|
||||||
|
skip_domains = ['startpage.com']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {
|
params = {
|
||||||
@@ -116,35 +187,6 @@ class Startpage(SearchEngine):
|
|||||||
params['page'] = str(page + 1)
|
params['page'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
|
||||||
urls = []
|
|
||||||
if not content:
|
|
||||||
return urls
|
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
|
||||||
href = a.get('href', '')
|
|
||||||
if not href or not href.startswith('http'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip Startpage internal links
|
|
||||||
if 'startpage.com' in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
|
||||||
bad = False
|
|
||||||
for pattern in urignore:
|
|
||||||
if re.search(pattern, href):
|
|
||||||
bad = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if not bad and href not in urls:
|
|
||||||
urls.append(href)
|
|
||||||
|
|
||||||
return urls
|
|
||||||
|
|
||||||
|
|
||||||
class Mojeek(SearchEngine):
|
class Mojeek(SearchEngine):
|
||||||
"""Mojeek search (UK-based, independent index)."""
|
"""Mojeek search (UK-based, independent index)."""
|
||||||
@@ -152,6 +194,7 @@ class Mojeek(SearchEngine):
|
|||||||
name = 'mojeek'
|
name = 'mojeek'
|
||||||
base_url = 'https://www.mojeek.com/search'
|
base_url = 'https://www.mojeek.com/search'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
skip_domains = ['mojeek.com', 'mojeek.co.uk']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
@@ -160,35 +203,6 @@ class Mojeek(SearchEngine):
|
|||||||
params['s'] = str(page * 10 + 1)
|
params['s'] = str(page * 10 + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
|
||||||
urls = []
|
|
||||||
if not content:
|
|
||||||
return urls
|
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
|
||||||
href = a.get('href', '')
|
|
||||||
if not href or not href.startswith('http'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip Mojeek internal links
|
|
||||||
if 'mojeek.com' in href or 'mojeek.co.uk' in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
|
||||||
bad = False
|
|
||||||
for pattern in urignore:
|
|
||||||
if re.search(pattern, href):
|
|
||||||
bad = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if not bad and href not in urls:
|
|
||||||
urls.append(href)
|
|
||||||
|
|
||||||
return urls
|
|
||||||
|
|
||||||
|
|
||||||
class Qwant(SearchEngine):
|
class Qwant(SearchEngine):
|
||||||
"""Qwant Lite search (French, EU-based, privacy-focused)."""
|
"""Qwant Lite search (French, EU-based, privacy-focused)."""
|
||||||
@@ -196,6 +210,7 @@ class Qwant(SearchEngine):
|
|||||||
name = 'qwant'
|
name = 'qwant'
|
||||||
base_url = 'https://lite.qwant.com/'
|
base_url = 'https://lite.qwant.com/'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
skip_domains = ['qwant.com']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {
|
params = {
|
||||||
@@ -206,35 +221,6 @@ class Qwant(SearchEngine):
|
|||||||
params['p'] = str(page + 1)
|
params['p'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
|
||||||
urls = []
|
|
||||||
if not content:
|
|
||||||
return urls
|
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
|
||||||
href = a.get('href', '')
|
|
||||||
if not href or not href.startswith('http'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip Qwant internal links
|
|
||||||
if 'qwant.com' in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
|
||||||
bad = False
|
|
||||||
for pattern in urignore:
|
|
||||||
if re.search(pattern, href):
|
|
||||||
bad = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if not bad and href not in urls:
|
|
||||||
urls.append(href)
|
|
||||||
|
|
||||||
return urls
|
|
||||||
|
|
||||||
|
|
||||||
class Yandex(SearchEngine):
|
class Yandex(SearchEngine):
|
||||||
"""Yandex search (Russian, large independent index)."""
|
"""Yandex search (Russian, large independent index)."""
|
||||||
@@ -242,6 +228,7 @@ class Yandex(SearchEngine):
|
|||||||
name = 'yandex'
|
name = 'yandex'
|
||||||
base_url = 'https://yandex.com/search/'
|
base_url = 'https://yandex.com/search/'
|
||||||
rate_limit = 5
|
rate_limit = 5
|
||||||
|
skip_domains = ['yandex.com', 'yandex.ru']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {
|
params = {
|
||||||
@@ -252,6 +239,18 @@ class Yandex(SearchEngine):
|
|||||||
params['p'] = str(page)
|
params['p'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
|
def _unwrap_url(self, href):
|
||||||
|
"""Extract actual URL from Yandex redirect wrapper."""
|
||||||
|
if '//yandex.' in href:
|
||||||
|
match = re.search(r'url=([^&]+)', href)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return urllib.unquote(match.group(1))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
|
return href
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
@@ -265,29 +264,19 @@ class Yandex(SearchEngine):
|
|||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Yandex uses redirect URLs, try to extract actual URL
|
# Unwrap redirect URLs
|
||||||
if '//yandex.' in href:
|
href = self._unwrap_url(href)
|
||||||
# Try to find embedded URL
|
if not href or not href.startswith('http'):
|
||||||
match = re.search(r'url=([^&]+)', href)
|
|
||||||
if match:
|
|
||||||
try:
|
|
||||||
href = urllib.unquote(match.group(1))
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not href.startswith('http'):
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check urignore patterns
|
# Check urignore patterns
|
||||||
bad = False
|
skip = False
|
||||||
for pattern in urignore:
|
for pattern in urignore:
|
||||||
if re.search(pattern, href):
|
if re.search(pattern, href):
|
||||||
bad = True
|
skip = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not bad and href not in urls:
|
if not skip and href not in urls:
|
||||||
urls.append(href)
|
urls.append(href)
|
||||||
|
|
||||||
return urls
|
return urls
|
||||||
@@ -299,6 +288,7 @@ class Ecosia(SearchEngine):
|
|||||||
name = 'ecosia'
|
name = 'ecosia'
|
||||||
base_url = 'https://www.ecosia.org/search'
|
base_url = 'https://www.ecosia.org/search'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
skip_domains = ['ecosia.org']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
@@ -306,35 +296,6 @@ class Ecosia(SearchEngine):
|
|||||||
params['p'] = str(page)
|
params['p'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
|
||||||
urls = []
|
|
||||||
if not content:
|
|
||||||
return urls
|
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
|
||||||
href = a.get('href', '')
|
|
||||||
if not href or not href.startswith('http'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip Ecosia internal links
|
|
||||||
if 'ecosia.org' in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
|
||||||
bad = False
|
|
||||||
for pattern in urignore:
|
|
||||||
if re.search(pattern, href):
|
|
||||||
bad = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if not bad and href not in urls:
|
|
||||||
urls.append(href)
|
|
||||||
|
|
||||||
return urls
|
|
||||||
|
|
||||||
|
|
||||||
class Brave(SearchEngine):
|
class Brave(SearchEngine):
|
||||||
"""Brave Search (privacy-focused, independent index)."""
|
"""Brave Search (privacy-focused, independent index)."""
|
||||||
@@ -342,6 +303,7 @@ class Brave(SearchEngine):
|
|||||||
name = 'brave'
|
name = 'brave'
|
||||||
base_url = 'https://search.brave.com/search'
|
base_url = 'https://search.brave.com/search'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
skip_domains = ['brave.com']
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
@@ -349,35 +311,6 @@ class Brave(SearchEngine):
|
|||||||
params['offset'] = str(page)
|
params['offset'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
|
||||||
urls = []
|
|
||||||
if not content:
|
|
||||||
return urls
|
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
|
||||||
href = a.get('href', '')
|
|
||||||
if not href or not href.startswith('http'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip Brave internal links
|
|
||||||
if 'brave.com' in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check urignore patterns
|
|
||||||
bad = False
|
|
||||||
for pattern in urignore:
|
|
||||||
if re.search(pattern, href):
|
|
||||||
bad = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if not bad and href not in urls:
|
|
||||||
urls.append(href)
|
|
||||||
|
|
||||||
return urls
|
|
||||||
|
|
||||||
|
|
||||||
class GitHub(SearchEngine):
|
class GitHub(SearchEngine):
|
||||||
"""GitHub code/repository search for proxy lists."""
|
"""GitHub code/repository search for proxy lists."""
|
||||||
@@ -385,6 +318,12 @@ class GitHub(SearchEngine):
|
|||||||
name = 'github'
|
name = 'github'
|
||||||
base_url = 'https://github.com/search'
|
base_url = 'https://github.com/search'
|
||||||
rate_limit = 5
|
rate_limit = 5
|
||||||
|
relative_base = 'https://github.com'
|
||||||
|
skip_patterns = [
|
||||||
|
'/login', '/signup', '/join', '/settings',
|
||||||
|
'/notifications', '/marketplace', '/explore',
|
||||||
|
'/sponsors', '/pricing', '/features',
|
||||||
|
]
|
||||||
|
|
||||||
# Search terms specific to proxy lists on GitHub
|
# Search terms specific to proxy lists on GitHub
|
||||||
github_queries = [
|
github_queries = [
|
||||||
@@ -410,6 +349,7 @@ class GitHub(SearchEngine):
|
|||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
|
"""Extract URLs with blob-to-raw conversion for direct file access."""
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
return urls
|
return urls
|
||||||
@@ -424,7 +364,7 @@ class GitHub(SearchEngine):
|
|||||||
|
|
||||||
# Convert relative to absolute
|
# Convert relative to absolute
|
||||||
if href.startswith('/'):
|
if href.startswith('/'):
|
||||||
href = 'https://github.com' + href
|
href = self.relative_base + href
|
||||||
|
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
continue
|
continue
|
||||||
@@ -433,20 +373,23 @@ class GitHub(SearchEngine):
|
|||||||
if 'github.com' not in href:
|
if 'github.com' not in href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip non-content links
|
# Skip internal pages
|
||||||
skip_patterns = [
|
|
||||||
'/login', '/signup', '/join', '/settings',
|
|
||||||
'/notifications', '/marketplace', '/explore',
|
|
||||||
'/sponsors', '/pricing', '/features',
|
|
||||||
]
|
|
||||||
skip = False
|
skip = False
|
||||||
for pattern in skip_patterns:
|
for pattern in self.skip_patterns:
|
||||||
if pattern in href:
|
if pattern in href:
|
||||||
skip = True
|
skip = True
|
||||||
break
|
break
|
||||||
if skip:
|
if skip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check urignore patterns
|
||||||
|
for pattern in urignore:
|
||||||
|
if re.search(pattern, href):
|
||||||
|
skip = True
|
||||||
|
break
|
||||||
|
if skip:
|
||||||
|
continue
|
||||||
|
|
||||||
# Keep raw file links and repo links
|
# Keep raw file links and repo links
|
||||||
if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
|
if '/raw/' in href or '/blob/' in href or re.match(r'https://github\.com/[^/]+/[^/]+$', href):
|
||||||
# Convert blob to raw for direct access
|
# Convert blob to raw for direct access
|
||||||
@@ -467,6 +410,11 @@ class GitLab(SearchEngine):
|
|||||||
name = 'gitlab'
|
name = 'gitlab'
|
||||||
base_url = 'https://gitlab.com/search'
|
base_url = 'https://gitlab.com/search'
|
||||||
rate_limit = 5
|
rate_limit = 5
|
||||||
|
relative_base = 'https://gitlab.com'
|
||||||
|
skip_patterns = [
|
||||||
|
'/users/', '/-/', '/explore', '/help',
|
||||||
|
'/admin', '/dashboard', '/profile',
|
||||||
|
]
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
search_query = query if query else 'proxy list'
|
search_query = query if query else 'proxy list'
|
||||||
@@ -479,10 +427,10 @@ class GitLab(SearchEngine):
|
|||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
|
"""Extract project URLs only (whitelist pattern)."""
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
return urls
|
return urls
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
@@ -493,29 +441,21 @@ class GitLab(SearchEngine):
|
|||||||
|
|
||||||
# Convert relative to absolute
|
# Convert relative to absolute
|
||||||
if href.startswith('/'):
|
if href.startswith('/'):
|
||||||
href = 'https://gitlab.com' + href
|
href = self.relative_base + href
|
||||||
|
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http') or 'gitlab.com' not in href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Only keep GitLab project links
|
# Skip internal pages
|
||||||
if 'gitlab.com' not in href:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip non-project links
|
|
||||||
skip_patterns = [
|
|
||||||
'/users/', '/-/', '/explore', '/help',
|
|
||||||
'/admin', '/dashboard', '/profile',
|
|
||||||
]
|
|
||||||
skip = False
|
skip = False
|
||||||
for pattern in skip_patterns:
|
for pattern in self.skip_patterns:
|
||||||
if pattern in href:
|
if pattern in href:
|
||||||
skip = True
|
skip = True
|
||||||
break
|
break
|
||||||
if skip:
|
if skip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Keep project and file links
|
# Keep only project links
|
||||||
if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
|
if re.match(r'https://gitlab\.com/[^/]+/[^/]+', href):
|
||||||
if href not in urls:
|
if href not in urls:
|
||||||
urls.append(href)
|
urls.append(href)
|
||||||
@@ -529,6 +469,7 @@ class Codeberg(SearchEngine):
|
|||||||
name = 'codeberg'
|
name = 'codeberg'
|
||||||
base_url = 'https://codeberg.org/explore/repos'
|
base_url = 'https://codeberg.org/explore/repos'
|
||||||
rate_limit = 10
|
rate_limit = 10
|
||||||
|
relative_base = 'https://codeberg.org'
|
||||||
|
|
||||||
def build_url(self, query, page=0):
|
def build_url(self, query, page=0):
|
||||||
search_query = query if query else 'proxy'
|
search_query = query if query else 'proxy'
|
||||||
@@ -541,10 +482,10 @@ class Codeberg(SearchEngine):
|
|||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
|
"""Extract repo URLs only (whitelist pattern)."""
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
return urls
|
return urls
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
@@ -555,13 +496,9 @@ class Codeberg(SearchEngine):
|
|||||||
|
|
||||||
# Convert relative to absolute
|
# Convert relative to absolute
|
||||||
if href.startswith('/'):
|
if href.startswith('/'):
|
||||||
href = 'https://codeberg.org' + href
|
href = self.relative_base + href
|
||||||
|
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http') or 'codeberg.org' not in href:
|
||||||
continue
|
|
||||||
|
|
||||||
# Only keep Codeberg repo links
|
|
||||||
if 'codeberg.org' not in href:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Keep repo links (format: /user/repo)
|
# Keep repo links (format: /user/repo)
|
||||||
@@ -600,12 +537,13 @@ class Gitea(SearchEngine):
|
|||||||
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
|
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
|
"""Extract repo URLs for current dynamic instance."""
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
return urls
|
return urls
|
||||||
urignore = urignore or []
|
|
||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
instance_domain = self.current_instance.split('//')[1]
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in soup.body.find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
@@ -619,8 +557,8 @@ class Gitea(SearchEngine):
|
|||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Keep repo links
|
# Keep repo links for this instance
|
||||||
if self.current_instance.split('//')[1] in href:
|
if instance_domain in href:
|
||||||
if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
|
if re.match(r'https?://[^/]+/[^/]+/[^/]+$', href):
|
||||||
if href not in urls:
|
if href not in urls:
|
||||||
urls.append(href)
|
urls.append(href)
|
||||||
@@ -649,7 +587,7 @@ class Searx(SearchEngine):
|
|||||||
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract URLs from Searx results (noreferrer links)."""
|
"""Extract URLs from Searx results (noreferrer links only)."""
|
||||||
urls = []
|
urls = []
|
||||||
if not content:
|
if not content:
|
||||||
return urls
|
return urls
|
||||||
@@ -668,13 +606,13 @@ class Searx(SearchEngine):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Check urignore patterns
|
# Check urignore patterns
|
||||||
bad = False
|
skip = False
|
||||||
for pattern in urignore:
|
for pattern in urignore:
|
||||||
if re.search(pattern, href):
|
if re.search(pattern, href):
|
||||||
bad = True
|
skip = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if not bad and href not in urls:
|
if not skip and href not in urls:
|
||||||
urls.append(href)
|
urls.append(href)
|
||||||
|
|
||||||
return urls
|
return urls
|
||||||
|
|||||||
Reference in New Issue
Block a user