try:  # Python 2: urlencode lives directly in urllib
    from urllib import urlencode as _stdlib_urlencode
except ImportError:  # Python 3: it moved to urllib.parse
    from urllib.parse import urlencode as _stdlib_urlencode

try:
    _TEXT_TYPE = unicode  # Python 2 text type
except NameError:
    _TEXT_TYPE = str  # Python 3: str is already Unicode text


def _to_utf8(value):
    """Return value UTF-8 encoded if it is Unicode text, else unchanged."""
    if isinstance(value, _TEXT_TYPE):
        return value.encode('utf-8')
    return value


def _urlencode(params):
    """URL-encode query parameters, handling Unicode keys and values.

    Python 2's urllib.urlencode() expects byte strings. This helper
    encodes Unicode keys *and* values to UTF-8 before URL encoding, so a
    non-ASCII key no longer slips through unencoded.

    Args:
        params: Dictionary of query parameters, or a sequence of
            (key, value) pairs. Pair order is passed through as given.

    Returns:
        URL-encoded query string
    """
    # urlencode accepts a sequence of pairs as well as a mapping; mirror that
    # instead of requiring .items().
    pairs = params.items() if hasattr(params, 'items') else params
    return _stdlib_urlencode([(_to_utf8(k), _to_utf8(v)) for k, v in pairs])


class _EmptyBody(object):
    """Stand-in for a missing body element: every search finds nothing."""

    def find_all(self, *args, **kwargs):
        """Accept any find_all() arguments and return an empty list."""
        return []


def _get_body(soup):
    """Get body element from soup, handling the None case.

    Args:
        soup: Parsed document object exposing a ``body`` attribute, or None

    Returns:
        ``soup.body`` when present; otherwise an object whose find_all()
        always returns [] so callers can iterate without an AttributeError.
    """
    if soup is None or soup.body is None:
        return _EmptyBody()
    return soup.body
0: params['page'] = str(page + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) class Mojeek(SearchEngine): @@ -201,7 +239,7 @@ class Mojeek(SearchEngine): if page > 0: # Mojeek uses 's' for start position (10 results per page) params['s'] = str(page * 10 + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) class Qwant(SearchEngine): @@ -219,7 +257,7 @@ class Qwant(SearchEngine): } if page > 0: params['p'] = str(page + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) class Yandex(SearchEngine): @@ -237,7 +275,7 @@ class Yandex(SearchEngine): } if page > 0: params['p'] = str(page) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) def _unwrap_url(self, href): """Extract actual URL from Yandex redirect wrapper.""" @@ -259,7 +297,7 @@ class Yandex(SearchEngine): soup = soupify(content, nohtml=True) - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): href = a.get('href', '') if not href: continue @@ -294,7 +332,7 @@ class Ecosia(SearchEngine): params = {'q': query} if page > 0: params['p'] = str(page) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) class Brave(SearchEngine): @@ -309,7 +347,7 @@ class Brave(SearchEngine): params = {'q': query} if page > 0: params['offset'] = str(page) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) class GitHub(SearchEngine): @@ -346,7 +384,7 @@ class GitHub(SearchEngine): } if page > 0: params['p'] = str(page + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) def extract_urls(self, content, urignore=None): """Extract URLs with blob-to-raw 
conversion for direct file access.""" @@ -357,7 +395,7 @@ class GitHub(SearchEngine): soup = soupify(content, nohtml=True) - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): href = a.get('href', '') if not href: continue @@ -424,7 +462,7 @@ class GitLab(SearchEngine): } if page > 0: params['page'] = str(page + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) def extract_urls(self, content, urignore=None): """Extract project URLs only (whitelist pattern).""" @@ -434,7 +472,7 @@ class GitLab(SearchEngine): soup = soupify(content, nohtml=True) - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): href = a.get('href', '') if not href: continue @@ -479,7 +517,7 @@ class Codeberg(SearchEngine): } if page > 0: params['page'] = str(page + 1) - return '%s?%s' % (self.base_url, urllib.urlencode(params)) + return '%s?%s' % (self.base_url, _urlencode(params)) def extract_urls(self, content, urignore=None): """Extract repo URLs only (whitelist pattern).""" @@ -489,7 +527,7 @@ class Codeberg(SearchEngine): soup = soupify(content, nohtml=True) - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): href = a.get('href', '') if not href: continue @@ -534,7 +572,7 @@ class Gitea(SearchEngine): } if page > 0: params['page'] = str(page + 1) - return '%s/explore/repos?%s' % (self.current_instance, urllib.urlencode(params)) + return '%s/explore/repos?%s' % (self.current_instance, _urlencode(params)) def extract_urls(self, content, urignore=None): """Extract repo URLs for current dynamic instance.""" @@ -545,7 +583,7 @@ class Gitea(SearchEngine): soup = soupify(content, nohtml=True) instance_domain = self.current_instance.split('//')[1] - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): href = a.get('href', '') if not href: continue @@ -584,7 +622,7 @@ class Searx(SearchEngine): } if page > 0: params['pageno'] = str(page 
+ 1) - return '%s/?%s' % (self.base_url, urllib.urlencode(params)) + return '%s/?%s' % (self.base_url, _urlencode(params)) def extract_urls(self, content, urignore=None): """Extract URLs from Searx results (noreferrer links only).""" @@ -595,7 +633,7 @@ class Searx(SearchEngine): soup = soupify(content, nohtml=True) - for a in soup.body.find_all('a'): + for a in _get_body(soup).find_all('a'): # Searx uses rel="noreferrer" for result links rel = a.get('rel', '') if not rel or 'noreferrer' not in str(rel):