engines: improve search engine rate limiting
This commit is contained in:
78
engines.py
78
engines.py
@@ -9,6 +9,44 @@ from soup_parser import soupify
|
||||
from misc import _log
|
||||
|
||||
|
||||
def _urlencode(params):
|
||||
"""URL-encode params dict, handling Unicode strings.
|
||||
|
||||
Python 2's urllib.urlencode() expects byte strings. This helper
|
||||
encodes any Unicode values to UTF-8 before URL encoding.
|
||||
|
||||
Args:
|
||||
params: Dictionary of query parameters
|
||||
|
||||
Returns:
|
||||
URL-encoded query string
|
||||
"""
|
||||
encoded = {}
|
||||
for k, v in params.items():
|
||||
if isinstance(v, unicode):
|
||||
v = v.encode('utf-8')
|
||||
encoded[k] = v
|
||||
return urllib.urlencode(encoded)
|
||||
|
||||
|
||||
def _get_body(soup):
|
||||
"""Get body element from soup, handling None case.
|
||||
|
||||
Args:
|
||||
soup: BeautifulSoup or SoupResult object
|
||||
|
||||
Returns:
|
||||
Body element or empty list wrapper if None
|
||||
"""
|
||||
if soup is None or soup.body is None:
|
||||
# Return object with empty find_all to avoid AttributeError
|
||||
class EmptyBody:
|
||||
def find_all(self, *args, **kwargs):
|
||||
return []
|
||||
return EmptyBody()
|
||||
return soup.body
|
||||
|
||||
|
||||
class SearchEngine(object):
|
||||
"""Base class for search engines."""
|
||||
|
||||
@@ -49,7 +87,7 @@ class SearchEngine(object):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -121,7 +159,7 @@ class DuckDuckGo(SearchEngine):
|
||||
# DuckDuckGo uses 's' param for offset (30 results per page)
|
||||
params['s'] = str(page * 30)
|
||||
params['dc'] = str(page * 30 + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def _unwrap_url(self, href):
|
||||
"""Extract actual URL from DuckDuckGo redirect wrapper."""
|
||||
@@ -142,7 +180,7 @@ class DuckDuckGo(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href or not href.startswith('http'):
|
||||
continue
|
||||
@@ -185,7 +223,7 @@ class Startpage(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['page'] = str(page + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Mojeek(SearchEngine):
|
||||
@@ -201,7 +239,7 @@ class Mojeek(SearchEngine):
|
||||
if page > 0:
|
||||
# Mojeek uses 's' for start position (10 results per page)
|
||||
params['s'] = str(page * 10 + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Qwant(SearchEngine):
|
||||
@@ -219,7 +257,7 @@ class Qwant(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['p'] = str(page + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Yandex(SearchEngine):
|
||||
@@ -237,7 +275,7 @@ class Yandex(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['p'] = str(page)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def _unwrap_url(self, href):
|
||||
"""Extract actual URL from Yandex redirect wrapper."""
|
||||
@@ -259,7 +297,7 @@ class Yandex(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -294,7 +332,7 @@ class Ecosia(SearchEngine):
|
||||
params = {'q': query}
|
||||
if page > 0:
|
||||
params['p'] = str(page)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class Brave(SearchEngine):
|
||||
@@ -309,7 +347,7 @@ class Brave(SearchEngine):
|
||||
params = {'q': query}
|
||||
if page > 0:
|
||||
params['offset'] = str(page)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
|
||||
class GitHub(SearchEngine):
|
||||
@@ -346,7 +384,7 @@ class GitHub(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['p'] = str(page + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def extract_urls(self, content, urignore=None):
|
||||
"""Extract URLs with blob-to-raw conversion for direct file access."""
|
||||
@@ -357,7 +395,7 @@ class GitHub(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -424,7 +462,7 @@ class GitLab(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['page'] = str(page + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def extract_urls(self, content, urignore=None):
|
||||
"""Extract project URLs only (whitelist pattern)."""
|
||||
@@ -434,7 +472,7 @@ class GitLab(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -479,7 +517,7 @@ class Codeberg(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['page'] = str(page + 1)
|
||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def extract_urls(self, content, urignore=None):
|
||||
"""Extract repo URLs only (whitelist pattern)."""
|
||||
@@ -489,7 +527,7 @@ class Codeberg(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -534,7 +572,7 @@ class Gitea(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['page'] = str(page + 1)
|
||||
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
|
||||
return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params))
|
||||
|
||||
def extract_urls(self, content, urignore=None):
|
||||
"""Extract repo URLs for current dynamic instance."""
|
||||
@@ -545,7 +583,7 @@ class Gitea(SearchEngine):
|
||||
soup = soupify(content, nohtml=True)
|
||||
instance_domain = self.current_instance.split('//')[1]
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
href = a.get('href', '')
|
||||
if not href:
|
||||
continue
|
||||
@@ -584,7 +622,7 @@ class Searx(SearchEngine):
|
||||
}
|
||||
if page > 0:
|
||||
params['pageno'] = str(page + 1)
|
||||
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
|
||||
return '%s/?%s' % (self.base_url, _urlencode(params))
|
||||
|
||||
def extract_urls(self, content, urignore=None):
|
||||
"""Extract URLs from Searx results (noreferrer links only)."""
|
||||
@@ -595,7 +633,7 @@ class Searx(SearchEngine):
|
||||
|
||||
soup = soupify(content, nohtml=True)
|
||||
|
||||
for a in soup.body.find_all('a'):
|
||||
for a in _get_body(soup).find_all('a'):
|
||||
# Searx uses rel="noreferrer" for result links
|
||||
rel = a.get('rel', '')
|
||||
if not rel or 'noreferrer' not in str(rel):
|
||||
|
||||
Reference in New Issue
Block a user