fetch: encode unicode URLs to bytes before HTTP/SOCKS ops

When URLs arrive as unicode (e.g. from JSON API responses), the unicode
type propagates through _parse_url into the SOCKS5 packet construction
in rocksock. Port bytes > 127 formatted via %c into a unicode string
produce non-ASCII characters, which then fail the implicit ASCII encode
performed by socket sendall().

Encode URLs to UTF-8 bytes at the fetch entry points to keep the entire
request pipeline in the str (bytes) domain.
This commit is contained in:
Username
2026-02-17 16:43:26 +01:00
parent e74782ad3f
commit 0311abb46a

View File

@@ -56,6 +56,8 @@ class FetchSession(object):
def fetch(self, url, head=False):
"""Fetch URL, reusing connection if possible."""
network_stats.set_category('scraper')
if isinstance(url, unicode):
url = url.encode('utf-8')
host, port, ssl, uri = _parse_url(url)
# Check if we can reuse existing connection
@@ -489,6 +491,8 @@ def fetch_contents(url, head=False, proxy=None):
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def _fetch_contents(url, head = False, proxy=None):
network_stats.set_category('scraper')
if isinstance(url, unicode):
url = url.encode('utf-8')
host, port, ssl, uri = _parse_url(url)
headers=[
'Accept-Language: en-US,en;q=0.8',