update from twatscrape
This commit is contained in:
54
http2.py
54
http2.py
@@ -32,6 +32,11 @@ def _parse_url(url):
|
||||
ssl = False
|
||||
url = url[7:]
|
||||
port = 80
|
||||
elif url_l.startswith('//'):
|
||||
# can happen with a redirect
|
||||
ssl = False
|
||||
url = url[2:]
|
||||
port = -1
|
||||
elif url_l.startswith('/'):
|
||||
# can happen with a redirect
|
||||
url = url[1:]
|
||||
@@ -45,7 +50,12 @@ def _parse_url(url):
|
||||
return "", 0, False, url
|
||||
|
||||
port_index = -1
|
||||
fixed_amazon_redirect = False
|
||||
for i in range(len(url)):
|
||||
if url[i] == '?':
|
||||
if not fixed_amazon_redirect:
|
||||
url = url.replace('?','/?',True)
|
||||
fixed_amazon_redirect = True
|
||||
if url[i] == ':':
|
||||
host = url[:i]
|
||||
port_index = i+1
|
||||
@@ -78,12 +88,14 @@ class RsHttp():
|
||||
def __init__(self, host, port=80, ssl=False, follow_redirects=False, \
|
||||
auto_set_cookies=False, keep_alive=False, timeout=60, \
|
||||
user_agent=None, proxies=None, max_tries=10, log_errors=True, \
|
||||
verify_cert=False,
|
||||
**kwargs):
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.use_ssl = ssl
|
||||
self.debugreq = False
|
||||
self.follow_redirects = follow_redirects
|
||||
self.redirect_counter = 0
|
||||
self.auto_set_cookies = auto_set_cookies
|
||||
self.keep_alive = keep_alive
|
||||
self.timeout = timeout
|
||||
@@ -93,6 +105,7 @@ class RsHttp():
|
||||
self.max_tries = max_tries
|
||||
self.log_errors = log_errors
|
||||
self.last_rs_exception = None
|
||||
self.verify_cert=verify_cert
|
||||
self.headers = []
|
||||
|
||||
def get_last_rocksock_exception(self):
|
||||
@@ -166,6 +179,15 @@ class RsHttp():
|
||||
data = urllib.urlencode(values)
|
||||
return self._make_post_request_raw(url, data, extras)
|
||||
|
||||
def _try_gunzip(self, data):
|
||||
try:
|
||||
res = zlib.decompress(data, 16+zlib.MAX_WBITS)
|
||||
return 0, res
|
||||
except zlib.error as e:
|
||||
if 'incomplete' in e.message:
|
||||
return -1, ''
|
||||
return -2, ''
|
||||
|
||||
def _get_response(self):
|
||||
def parse_header_fields(line):
|
||||
if not ':' in line: return line.rstrip(' '), ""
|
||||
@@ -181,7 +203,9 @@ class RsHttp():
|
||||
s = ''
|
||||
res = ''
|
||||
#'HTTP/1.1 302 Found\r\n'
|
||||
l = self.conn.recvline().strip()
|
||||
l = ''
|
||||
while not l.startswith('HTTP/'):
|
||||
l = self.conn.recvline().strip()
|
||||
s = l + '\n'
|
||||
foo, code, msg = _parse_errorcode(l)
|
||||
while True:
|
||||
@@ -227,7 +251,13 @@ class RsHttp():
|
||||
|
||||
if len(res) != 0:
|
||||
if unzip == 'gzip':
|
||||
res = zlib.decompress(res, 16+zlib.MAX_WBITS)
|
||||
ec, extr = self._try_gunzip(res)
|
||||
while ec == -1:
|
||||
res += self.conn.recv(-1)
|
||||
ec, extr = self._try_gunzip(res)
|
||||
if ec == -2:
|
||||
raise zlib.error
|
||||
res = extr
|
||||
elif unzip == 'deflate':
|
||||
try:
|
||||
res = zlib.decompress(res)
|
||||
@@ -247,7 +277,7 @@ class RsHttp():
|
||||
while tries < self.max_tries:
|
||||
tries += 1
|
||||
try:
|
||||
self.conn = Rocksock(host=self.host, port=self.port, proxies=self.proxies, ssl=self.use_ssl, timeout=self.timeout)
|
||||
self.conn = Rocksock(host=self.host, port=self.port, proxies=self.proxies, ssl=self.use_ssl, timeout=self.timeout, verifycert=self.verify_cert)
|
||||
self.conn.connect()
|
||||
return True
|
||||
except RocksockException as e:
|
||||
@@ -314,15 +344,23 @@ class RsHttp():
|
||||
hdr, res, redirect = self._send_and_recv(req)
|
||||
|
||||
if redirect != '' and self.follow_redirects:
|
||||
MAX_REDIRECTS = 16
|
||||
self.redirect_counter += 1
|
||||
if self.redirect_counter > MAX_REDIRECTS:
|
||||
return '', ''
|
||||
|
||||
host, port, use_ssl, url = _parse_url(redirect)
|
||||
if port != 0:
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.use_ssl = use_ssl
|
||||
if port != -1: # -1: use existing port/ssl
|
||||
self.port = port
|
||||
self.use_ssl = use_ssl
|
||||
self.conn.disconnect()
|
||||
self.conn = None
|
||||
self.reconnect()
|
||||
return self.get(url, extras)
|
||||
else:
|
||||
self.redirect_counter = 0
|
||||
|
||||
return hdr, res
|
||||
|
||||
@@ -371,6 +409,12 @@ class RsHttp():
|
||||
# http.add_header("Referer: http://bbc.com")
|
||||
self.headers.append(s)
|
||||
|
||||
def add_headers(self, lines):
    """Store a multi-line header chunk in self.headers, one entry per
    non-empty line, each line taken verbatim."""
    # the chunk is split on '\n'; trailing '\r' is dropped so CRLF input
    # yields the same entries as LF input
    stripped = (raw.rstrip('\r') for raw in lines.split('\n'))
    self.headers.extend(entry for entry in stripped if entry)
|
||||
|
||||
def set_cookie(self, c):
|
||||
if c.lower().startswith('set-cookie: '):
|
||||
c = c[len('Set-Cookie: '):]
|
||||
|
||||
Reference in New Issue
Block a user