fetch: encode unicode URLs to bytes before HTTP/SOCKS ops
When URLs arrive as unicode (e.g. from JSON API responses), the unicode type propagates through _parse_url into the SOCKS5 packet construction in rocksock. Port bytes > 127 formatted via %c into a unicode string become non-ASCII characters, which then fail the implicit ASCII encode performed by socket sendall(). Encode URLs to UTF-8 bytes at the fetch entry points so that the entire request pipeline stays in the str (bytes) domain.
This commit is contained in:
4
fetch.py
4
fetch.py
@@ -56,6 +56,8 @@ class FetchSession(object):
|
|||||||
def fetch(self, url, head=False):
|
def fetch(self, url, head=False):
|
||||||
"""Fetch URL, reusing connection if possible."""
|
"""Fetch URL, reusing connection if possible."""
|
||||||
network_stats.set_category('scraper')
|
network_stats.set_category('scraper')
|
||||||
|
if isinstance(url, unicode):
|
||||||
|
url = url.encode('utf-8')
|
||||||
host, port, ssl, uri = _parse_url(url)
|
host, port, ssl, uri = _parse_url(url)
|
||||||
|
|
||||||
# Check if we can reuse existing connection
|
# Check if we can reuse existing connection
|
||||||
@@ -489,6 +491,8 @@ def fetch_contents(url, head=False, proxy=None):
|
|||||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||||
def _fetch_contents(url, head = False, proxy=None):
|
def _fetch_contents(url, head = False, proxy=None):
|
||||||
network_stats.set_category('scraper')
|
network_stats.set_category('scraper')
|
||||||
|
if isinstance(url, unicode):
|
||||||
|
url = url.encode('utf-8')
|
||||||
host, port, ssl, uri = _parse_url(url)
|
host, port, ssl, uri = _parse_url(url)
|
||||||
headers=[
|
headers=[
|
||||||
'Accept-Language: en-US,en;q=0.8',
|
'Accept-Language: en-US,en;q=0.8',
|
||||||
|
|||||||
Reference in New Issue
Block a user