engines: improve search engine rate limiting
This commit is contained in:
78
engines.py
78
engines.py
@@ -9,6 +9,44 @@ from soup_parser import soupify
|
|||||||
from misc import _log
|
from misc import _log
|
||||||
|
|
||||||
|
|
||||||
|
def _urlencode(params):
|
||||||
|
"""URL-encode params dict, handling Unicode strings.
|
||||||
|
|
||||||
|
Python 2's urllib.urlencode() expects byte strings. This helper
|
||||||
|
encodes any Unicode values to UTF-8 before URL encoding.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
params: Dictionary of query parameters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
URL-encoded query string
|
||||||
|
"""
|
||||||
|
encoded = {}
|
||||||
|
for k, v in params.items():
|
||||||
|
if isinstance(v, unicode):
|
||||||
|
v = v.encode('utf-8')
|
||||||
|
encoded[k] = v
|
||||||
|
return urllib.urlencode(encoded)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_body(soup):
|
||||||
|
"""Get body element from soup, handling None case.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
soup: BeautifulSoup or SoupResult object
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Body element or empty list wrapper if None
|
||||||
|
"""
|
||||||
|
if soup is None or soup.body is None:
|
||||||
|
# Return object with empty find_all to avoid AttributeError
|
||||||
|
class EmptyBody:
|
||||||
|
def find_all(self, *args, **kwargs):
|
||||||
|
return []
|
||||||
|
return EmptyBody()
|
||||||
|
return soup.body
|
||||||
|
|
||||||
|
|
||||||
class SearchEngine(object):
|
class SearchEngine(object):
|
||||||
"""Base class for search engines."""
|
"""Base class for search engines."""
|
||||||
|
|
||||||
@@ -49,7 +87,7 @@ class SearchEngine(object):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -121,7 +159,7 @@ class DuckDuckGo(SearchEngine):
|
|||||||
# DuckDuckGo uses 's' param for offset (30 results per page)
|
# DuckDuckGo uses 's' param for offset (30 results per page)
|
||||||
params['s'] = str(page * 30)
|
params['s'] = str(page * 30)
|
||||||
params['dc'] = str(page * 30 + 1)
|
params['dc'] = str(page * 30 + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def _unwrap_url(self, href):
|
def _unwrap_url(self, href):
|
||||||
"""Extract actual URL from DuckDuckGo redirect wrapper."""
|
"""Extract actual URL from DuckDuckGo redirect wrapper."""
|
||||||
@@ -142,7 +180,7 @@ class DuckDuckGo(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href or not href.startswith('http'):
|
if not href or not href.startswith('http'):
|
||||||
continue
|
continue
|
||||||
@@ -185,7 +223,7 @@ class Startpage(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['page'] = str(page + 1)
|
params['page'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
|
|
||||||
class Mojeek(SearchEngine):
|
class Mojeek(SearchEngine):
|
||||||
@@ -201,7 +239,7 @@ class Mojeek(SearchEngine):
|
|||||||
if page > 0:
|
if page > 0:
|
||||||
# Mojeek uses 's' for start position (10 results per page)
|
# Mojeek uses 's' for start position (10 results per page)
|
||||||
params['s'] = str(page * 10 + 1)
|
params['s'] = str(page * 10 + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
|
|
||||||
class Qwant(SearchEngine):
|
class Qwant(SearchEngine):
|
||||||
@@ -219,7 +257,7 @@ class Qwant(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['p'] = str(page + 1)
|
params['p'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
|
|
||||||
class Yandex(SearchEngine):
|
class Yandex(SearchEngine):
|
||||||
@@ -237,7 +275,7 @@ class Yandex(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['p'] = str(page)
|
params['p'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def _unwrap_url(self, href):
|
def _unwrap_url(self, href):
|
||||||
"""Extract actual URL from Yandex redirect wrapper."""
|
"""Extract actual URL from Yandex redirect wrapper."""
|
||||||
@@ -259,7 +297,7 @@ class Yandex(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -294,7 +332,7 @@ class Ecosia(SearchEngine):
|
|||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['p'] = str(page)
|
params['p'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
|
|
||||||
class Brave(SearchEngine):
|
class Brave(SearchEngine):
|
||||||
@@ -309,7 +347,7 @@ class Brave(SearchEngine):
|
|||||||
params = {'q': query}
|
params = {'q': query}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['offset'] = str(page)
|
params['offset'] = str(page)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
|
|
||||||
class GitHub(SearchEngine):
|
class GitHub(SearchEngine):
|
||||||
@@ -346,7 +384,7 @@ class GitHub(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['p'] = str(page + 1)
|
params['p'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract URLs with blob-to-raw conversion for direct file access."""
|
"""Extract URLs with blob-to-raw conversion for direct file access."""
|
||||||
@@ -357,7 +395,7 @@ class GitHub(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -424,7 +462,7 @@ class GitLab(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['page'] = str(page + 1)
|
params['page'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract project URLs only (whitelist pattern)."""
|
"""Extract project URLs only (whitelist pattern)."""
|
||||||
@@ -434,7 +472,7 @@ class GitLab(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -479,7 +517,7 @@ class Codeberg(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['page'] = str(page + 1)
|
params['page'] = str(page + 1)
|
||||||
return '%s?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract repo URLs only (whitelist pattern)."""
|
"""Extract repo URLs only (whitelist pattern)."""
|
||||||
@@ -489,7 +527,7 @@ class Codeberg(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -534,7 +572,7 @@ class Gitea(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['page'] = str(page + 1)
|
params['page'] = str(page + 1)
|
||||||
return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params))
|
return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract repo URLs for current dynamic instance."""
|
"""Extract repo URLs for current dynamic instance."""
|
||||||
@@ -545,7 +583,7 @@ class Gitea(SearchEngine):
|
|||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
instance_domain = self.current_instance.split('//')[1]
|
instance_domain = self.current_instance.split('//')[1]
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
href = a.get('href', '')
|
href = a.get('href', '')
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@@ -584,7 +622,7 @@ class Searx(SearchEngine):
|
|||||||
}
|
}
|
||||||
if page > 0:
|
if page > 0:
|
||||||
params['pageno'] = str(page + 1)
|
params['pageno'] = str(page + 1)
|
||||||
return '%s/?%s' % (self.base_url, urllib.urlencode(params))
|
return '%s/?%s' % (self.base_url, _urlencode(params))
|
||||||
|
|
||||||
def extract_urls(self, content, urignore=None):
|
def extract_urls(self, content, urignore=None):
|
||||||
"""Extract URLs from Searx results (noreferrer links only)."""
|
"""Extract URLs from Searx results (noreferrer links only)."""
|
||||||
@@ -595,7 +633,7 @@ class Searx(SearchEngine):
|
|||||||
|
|
||||||
soup = soupify(content, nohtml=True)
|
soup = soupify(content, nohtml=True)
|
||||||
|
|
||||||
for a in soup.body.find_all('a'):
|
for a in _get_body(soup).find_all('a'):
|
||||||
# Searx uses rel="noreferrer" for result links
|
# Searx uses rel="noreferrer" for result links
|
||||||
rel = a.get('rel', '')
|
rel = a.get('rel', '')
|
||||||
if not rel or 'noreferrer' not in str(rel):
|
if not rel or 'noreferrer' not in str(rel):
|
||||||
|
|||||||
Reference in New Issue
Block a user