engines: improve search engine rate limiting

This commit is contained in:
Username
2025-12-21 23:37:48 +01:00
parent 00623f3a18
commit 0274b84af8

View File

@@ -9,6 +9,44 @@ from soup_parser import soupify
from misc import _log
def _urlencode(params):
    """URL-encode a params dict, handling Unicode strings.

    Python 2's urllib.urlencode() expects byte strings and raises
    UnicodeEncodeError when given unicode containing non-ASCII
    characters. This helper encodes any unicode keys *and* values to
    UTF-8 before URL encoding (the original only handled values, so a
    non-ASCII unicode key would still blow up).

    Args:
        params: Dictionary of query parameters.

    Returns:
        URL-encoded query string suitable for appending after '?'.
    """
    encoded = {}
    for k, v in params.items():
        if isinstance(k, unicode):
            k = k.encode('utf-8')
        if isinstance(v, unicode):
            v = v.encode('utf-8')
        encoded[k] = v
    return urllib.urlencode(encoded)
def _get_body(soup):
"""Get body element from soup, handling None case.
Args:
soup: BeautifulSoup or SoupResult object
Returns:
Body element or empty list wrapper if None
"""
if soup is None or soup.body is None:
# Return object with empty find_all to avoid AttributeError
class EmptyBody:
def find_all(self, *args, **kwargs):
return []
return EmptyBody()
return soup.body
class SearchEngine(object):
"""Base class for search engines."""
@@ -49,7 +87,7 @@ class SearchEngine(object):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -121,7 +159,7 @@ class DuckDuckGo(SearchEngine):
# DuckDuckGo uses 's' param for offset (30 results per page)
params['s'] = str(page * 30)
params['dc'] = str(page * 30 + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
def _unwrap_url(self, href):
"""Extract actual URL from DuckDuckGo redirect wrapper."""
@@ -142,7 +180,7 @@ class DuckDuckGo(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href or not href.startswith('http'):
continue
@@ -185,7 +223,7 @@ class Startpage(SearchEngine):
}
if page > 0:
params['page'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
class Mojeek(SearchEngine):
@@ -201,7 +239,7 @@ class Mojeek(SearchEngine):
if page > 0:
# Mojeek uses 's' for start position (10 results per page)
params['s'] = str(page * 10 + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
class Qwant(SearchEngine):
@@ -219,7 +257,7 @@ class Qwant(SearchEngine):
}
if page > 0:
params['p'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
class Yandex(SearchEngine):
@@ -237,7 +275,7 @@ class Yandex(SearchEngine):
}
if page > 0:
params['p'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
def _unwrap_url(self, href):
"""Extract actual URL from Yandex redirect wrapper."""
@@ -259,7 +297,7 @@ class Yandex(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -294,7 +332,7 @@ class Ecosia(SearchEngine):
params = {'q': query}
if page > 0:
params['p'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
class Brave(SearchEngine):
@@ -309,7 +347,7 @@ class Brave(SearchEngine):
params = {'q': query}
if page > 0:
params['offset'] = str(page)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
class GitHub(SearchEngine):
@@ -346,7 +384,7 @@ class GitHub(SearchEngine):
}
if page > 0:
params['p'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
def extract_urls(self, content, urignore=None):
"""Extract URLs with blob-to-raw conversion for direct file access."""
@@ -357,7 +395,7 @@ class GitHub(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -424,7 +462,7 @@ class GitLab(SearchEngine):
}
if page > 0:
params['page'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
def extract_urls(self, content, urignore=None):
"""Extract project URLs only (whitelist pattern)."""
@@ -434,7 +472,7 @@ class GitLab(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -479,7 +517,7 @@ class Codeberg(SearchEngine):
}
if page > 0:
params['page'] = str(page + 1)
return '%s?%s' % (self.base_url, urllib.urlencode(params))
return '%s?%s' % (self.base_url, _urlencode(params))
def extract_urls(self, content, urignore=None):
"""Extract repo URLs only (whitelist pattern)."""
@@ -489,7 +527,7 @@ class Codeberg(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -534,7 +572,7 @@ class Gitea(SearchEngine):
}
if page > 0:
params['page'] = str(page + 1)
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params))
def extract_urls(self, content, urignore=None):
"""Extract repo URLs for current dynamic instance."""
@@ -545,7 +583,7 @@ class Gitea(SearchEngine):
soup = soupify(content, nohtml=True)
instance_domain = self.current_instance.split('//')[1]
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
href = a.get('href', '')
if not href:
continue
@@ -584,7 +622,7 @@ class Searx(SearchEngine):
}
if page > 0:
params['pageno'] = str(page + 1)
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
return '%s/?%s' % (self.base_url, _urlencode(params))
def extract_urls(self, content, urignore=None):
"""Extract URLs from Searx results (noreferrer links only)."""
@@ -595,7 +633,7 @@ class Searx(SearchEngine):
soup = soupify(content, nohtml=True)
for a in soup.body.find_all('a'):
for a in _get_body(soup).find_all('a'):
# Searx uses rel="noreferrer" for result links
rel = a.get('rel', '')
if not rel or 'noreferrer' not in str(rel):