scraper: reuse connections, cycle circuit on block
All checks were successful
CI / syntax-check (push) Successful in 6s
CI / memory-leak-check (push) Successful in 15s

This commit is contained in:
Username
2025-12-25 19:26:23 +01:00
parent 68e8b88afa
commit 272eba0f05
2 changed files with 151 additions and 37 deletions

102
fetch.py
View File

@@ -15,6 +15,108 @@ def tor_proxy_url(torhost):
user = ''.join(random.choice(chars) for _ in range(8))
passwd = ''.join(random.choice(chars) for _ in range(8))
return 'socks5://%s:%s@%s' % (user, passwd, torhost)
class FetchSession(object):
    """Reusable fetch session with a persistent Tor circuit.

    Keeps one HTTP connection and one Tor proxy URL (with its randomly
    generated credentials) alive across requests, so consecutive fetches to
    the same host/port/scheme reuse the existing connection.  Call cycle()
    to drop the connection and pick new credentials when blocked.
    """

    # Extra request headers sent with every HEAD/GET.
    _REQUEST_HEADERS = (
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    )
    _USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; rv:60.0) '
                   'Gecko/20100101 Firefox/60.0')

    def __init__(self):
        self.http = None
        # Endpoint the current connection was opened for; compared against
        # each new URL to decide whether the connection can be reused.
        self.current_host = None
        self.current_port = None
        self.current_ssl = None
        self.tor_url = None
        self._new_circuit()

    def _new_circuit(self):
        """Generate new Tor credentials for a fresh circuit.

        Leaves tor_url untouched when no Tor hosts are configured.
        """
        if config and config.torhosts:
            torhost = random.choice(config.torhosts)
            self.tor_url = tor_proxy_url(torhost)

    def cycle(self):
        """Cycle to a new Tor circuit (call when blocked)."""
        self.close()
        self._new_circuit()

    def close(self):
        """Close the current connection and forget which endpoint it served."""
        if self.http:
            try:
                self.http.disconnect()
            except Exception:
                # Best effort: the connection is being discarded anyway.
                pass
        self.http = None
        # Reset the whole endpoint triple, not just the host, so no stale
        # port/ssl value outlives the connection it described.
        self.current_host = None
        self.current_port = None
        self.current_ssl = None

    def _request(self, uri, head):
        """Issue one HEAD or GET on the current connection.

        HEAD returns whatever self.http.head() returns; GET returns the
        response body, UTF-8 encoded when it arrives as unicode.
        """
        # Pass a fresh list each time in case the HTTP client mutates it.
        headers = list(self._REQUEST_HEADERS)
        if head:
            return self.http.head(uri, headers)
        _, body = self.http.get(uri, headers)
        if isinstance(body, unicode):
            body = body.encode('utf-8')
        return body

    def fetch(self, url, head=False):
        """Fetch URL, reusing the open connection if possible.

        url:  URL to request; split by _parse_url() into host/port/ssl/uri.
        head: when true, issue a HEAD request instead of a GET.

        Returns the result of the request (see _request), or None when a
        new connection cannot be established or the request on it fails.
        """
        network_stats.set_category('scraper')
        host, port, ssl, uri = _parse_url(url)

        same_endpoint = (self.current_host == host and
                         self.current_port == port and
                         self.current_ssl == ssl)
        if self.http and same_endpoint:
            try:
                return self._request(uri, head)
            except Exception:
                # Kept-alive connection died; fall through and reconnect.
                pass

        # Need a new connection: drop whatever is left of the old one.
        self.close()
        if not self.tor_url:
            self._new_circuit()
        proxies = [rocksock.RocksockProxyFromURL(self.tor_url)]
        self.http = RsHttp(
            host, ssl=ssl, port=port, keep_alive=True,
            timeout=config.ppf.timeout, max_tries=config.ppf.http_retries,
            follow_redirects=True, auto_set_cookies=True, proxies=proxies,
            user_agent=self._USER_AGENT,
            log_errors=False
        )
        if not self.http.connect():
            self.close()
            return None

        # Record the endpoint only once the connection is actually up.
        self.current_host = host
        self.current_port = port
        self.current_ssl = ssl
        try:
            return self._request(uri, head)
        except Exception:
            self.close()
            return None
# NOTE(review): the code that reads/writes these two names is outside this
# view. Presumably _last_fail_log holds the time of the most recent failure
# log line (0 = nothing logged yet) — confirm against its users.
_last_fail_log = 0
# Presumably the minimum interval, in seconds, between failure log lines —
# TODO confirm the unit.
_fail_log_interval = 60