fetch: encode unicode URLs to bytes before HTTP/SOCKS ops
When URLs arrive as unicode (e.g. from JSON API responses), the unicode type propagates through _parse_url into the SOCKS5 packet construction in rocksock. Port bytes greater than 127, formatted via %c into a unicode string, produce non-ASCII characters, which then fail when socket sendall() implicitly encodes the string as ASCII. Encode URLs to UTF-8 bytes at the fetch entry points to keep the entire request pipeline in the str (bytes) domain.
This commit is contained in:
4
fetch.py
4
fetch.py
@@ -56,6 +56,8 @@ class FetchSession(object):
|
||||
def fetch(self, url, head=False):
|
||||
"""Fetch URL, reusing connection if possible."""
|
||||
network_stats.set_category('scraper')
|
||||
if isinstance(url, unicode):
|
||||
url = url.encode('utf-8')
|
||||
host, port, ssl, uri = _parse_url(url)
|
||||
|
||||
# Check if we can reuse existing connection
|
||||
@@ -489,6 +491,8 @@ def fetch_contents(url, head=False, proxy=None):
|
||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||
def _fetch_contents(url, head = False, proxy=None):
|
||||
network_stats.set_category('scraper')
|
||||
if isinstance(url, unicode):
|
||||
url = url.encode('utf-8')
|
||||
host, port, ssl, uri = _parse_url(url)
|
||||
headers=[
|
||||
'Accept-Language: en-US,en;q=0.8',
|
||||
|
||||
Reference in New Issue
Block a user