scraper: reuse connections, cycle circuit on block
This commit is contained in:
102
fetch.py
102
fetch.py
@@ -15,6 +15,108 @@ def tor_proxy_url(torhost):
|
||||
user = ''.join(random.choice(chars) for _ in range(8))
|
||||
passwd = ''.join(random.choice(chars) for _ in range(8))
|
||||
return 'socks5://%s:%s@%s' % (user, passwd, torhost)
|
||||
|
||||
|
||||
class FetchSession(object):
    """Reusable fetch session with a persistent Tor circuit.

    Keeps a single HTTP connection and a single Tor proxy URL (with its
    randomly generated credentials) alive across requests, so consecutive
    fetches to the same host/port/scheme reuse the open connection.
    Call cycle() to get a new Tor circuit when blocked.
    """

    # Extra request headers sent with every HEAD/GET request.
    _REQUEST_HEADERS = (
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    )
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

    def __init__(self):
        # Open connection (an RsHttp instance) or None when disconnected.
        self.http = None
        # Endpoint the open connection belongs to; compared in fetch() to
        # decide whether the connection can be reused.
        self.current_host = None
        self.current_port = None
        self.current_ssl = None
        # Proxy URL built by tor_proxy_url(); None when no Tor host is
        # configured.
        self.tor_url = None
        self._new_circuit()

    def _new_circuit(self):
        """Generate new Tor credentials for a fresh circuit.

        Picks a random entry from config.torhosts and stores a proxy URL
        for it in self.tor_url. When no Tor hosts are configured, tor_url
        is left unchanged.
        """
        if config and config.torhosts:
            torhost = random.choice(config.torhosts)
            self.tor_url = tor_proxy_url(torhost)

    def cycle(self):
        """Cycle to a new Tor circuit (call when blocked)."""
        self.close()
        self._new_circuit()

    def close(self):
        """Close the current connection and forget which endpoint it served.

        Errors raised while disconnecting are ignored on purpose: the
        connection is being discarded either way.
        """
        if self.http:
            try:
                self.http.disconnect()
            except Exception:
                pass
        self.http = None
        # Reset the whole endpoint identity, not just the host, so no stale
        # port/ssl values survive a closed connection.
        self.current_host = None
        self.current_port = None
        self.current_ssl = None

    def _request(self, uri, head):
        """Issue a HEAD or GET for uri on the currently open connection.

        Returns whatever self.http.head() returns when head is true;
        otherwise returns the GET body, UTF-8 encoded if it arrived as a
        unicode object. Exceptions propagate to the caller.
        """
        # A fresh list per call, exactly as the original inline literals did.
        headers = list(self._REQUEST_HEADERS)
        if head:
            return self.http.head(uri, headers)
        hdr, res = self.http.get(uri, headers)
        return res.encode('utf-8') if isinstance(res, unicode) else res

    def fetch(self, url, head=False):
        """Fetch URL, reusing the open connection if possible.

        url  -- URL to request; split by _parse_url() into host, port,
                ssl flag and request URI.
        head -- when true, issue a HEAD request instead of a GET.

        Returns the result of the request (see _request), or None when no
        Tor proxy URL is available, the connection cannot be established,
        or the request raises.
        """
        network_stats.set_category('scraper')
        host, port, ssl, uri = _parse_url(url)

        same_endpoint = (self.current_host == host and
                         self.current_port == port and
                         self.current_ssl == ssl)
        if self.http and same_endpoint:
            try:
                return self._request(uri, head)
            except Exception:
                # Connection died; drop it and fall through to reconnect.
                self.close()

        # Need a new connection.
        self.close()
        if not self.tor_url:
            self._new_circuit()
        if not self.tor_url:
            # No Tor host configured: report failure instead of handing
            # None to the proxy URL parser or connecting without a proxy.
            return None

        proxies = [rocksock.RocksockProxyFromURL(self.tor_url)]
        self.http = RsHttp(
            host, ssl=ssl, port=port, keep_alive=True,
            timeout=config.ppf.timeout, max_tries=config.ppf.http_retries,
            follow_redirects=True, auto_set_cookies=True, proxies=proxies,
            user_agent=self._USER_AGENT,
            log_errors=False
        )

        if not self.http.connect():
            self.close()
            return None

        self.current_host = host
        self.current_port = port
        self.current_ssl = ssl

        try:
            return self._request(uri, head)
        except Exception:
            self.close()
            return None
|
||||
|
||||
|
||||
# NOTE(review): presumably the time of the most recent failure log message; the code that reads/writes it is outside this view - confirm.
_last_fail_log = 0
|
||||
# NOTE(review): presumably the minimum number of seconds between failure log messages - confirm against the code that reads it.
_fail_log_interval = 60
|
||||
|
||||
|
||||
Reference in New Issue
Block a user