update from twatscrape

This commit is contained in:
Your Name
2021-07-03 17:17:18 +02:00
parent 11c5bd67b3
commit 37622d8a17
2 changed files with 126 additions and 58 deletions

View File

@@ -32,6 +32,11 @@ def _parse_url(url):
ssl = False
url = url[7:]
port = 80
elif url_l.startswith('//'):
# can happen with a redirect
ssl = False
url = url[2:]
port = -1
elif url_l.startswith('/'):
# can happen with a redirect
url = url[1:]
@@ -45,7 +50,12 @@ def _parse_url(url):
return "", 0, False, url
port_index = -1
fixed_amazon_redirect = False
for i in range(len(url)):
if url[i] == '?':
if not fixed_amazon_redirect:
url = url.replace('?','/?',True)
fixed_amazon_redirect = True
if url[i] == ':':
host = url[:i]
port_index = i+1
@@ -78,12 +88,14 @@ class RsHttp():
def __init__(self, host, port=80, ssl=False, follow_redirects=False, \
auto_set_cookies=False, keep_alive=False, timeout=60, \
user_agent=None, proxies=None, max_tries=10, log_errors=True, \
verify_cert=False,
**kwargs):
self.host = host
self.port = port
self.use_ssl = ssl
self.debugreq = False
self.follow_redirects = follow_redirects
self.redirect_counter = 0
self.auto_set_cookies = auto_set_cookies
self.keep_alive = keep_alive
self.timeout = timeout
@@ -93,6 +105,7 @@ class RsHttp():
self.max_tries = max_tries
self.log_errors = log_errors
self.last_rs_exception = None
self.verify_cert=verify_cert
self.headers = []
def get_last_rocksock_exception(self):
@@ -166,6 +179,15 @@ class RsHttp():
data = urllib.urlencode(values)
return self._make_post_request_raw(url, data, extras)
def _try_gunzip(self, data):
try:
res = zlib.decompress(data, 16+zlib.MAX_WBITS)
return 0, res
except zlib.error as e:
if 'incomplete' in e.message:
return -1, ''
return -2, ''
def _get_response(self):
def parse_header_fields(line):
if not ':' in line: return line.rstrip(' '), ""
@@ -181,7 +203,9 @@ class RsHttp():
s = ''
res = ''
#'HTTP/1.1 302 Found\r\n'
l = self.conn.recvline().strip()
l = ''
while not l.startswith('HTTP/'):
l = self.conn.recvline().strip()
s = l + '\n'
foo, code, msg = _parse_errorcode(l)
while True:
@@ -227,7 +251,13 @@ class RsHttp():
if len(res) != 0:
if unzip == 'gzip':
res = zlib.decompress(res, 16+zlib.MAX_WBITS)
ec, extr = self._try_gunzip(res)
while ec == -1:
res += self.conn.recv(-1)
ec, extr = self._try_gunzip(res)
if ec == -2:
raise zlib.error
res = extr
elif unzip == 'deflate':
try:
res = zlib.decompress(res)
@@ -247,7 +277,7 @@ class RsHttp():
while tries < self.max_tries:
tries += 1
try:
self.conn = Rocksock(host=self.host, port=self.port, proxies=self.proxies, ssl=self.use_ssl, timeout=self.timeout)
self.conn = Rocksock(host=self.host, port=self.port, proxies=self.proxies, ssl=self.use_ssl, timeout=self.timeout, verifycert=self.verify_cert)
self.conn.connect()
return True
except RocksockException as e:
@@ -314,15 +344,23 @@ class RsHttp():
hdr, res, redirect = self._send_and_recv(req)
if redirect != '' and self.follow_redirects:
MAX_REDIRECTS = 16
self.redirect_counter += 1
if self.redirect_counter > MAX_REDIRECTS:
return '', ''
host, port, use_ssl, url = _parse_url(redirect)
if port != 0:
self.host = host
self.port = port
self.use_ssl = use_ssl
if port != -1: # -1: use existing port/ssl
self.port = port
self.use_ssl = use_ssl
self.conn.disconnect()
self.conn = None
self.reconnect()
return self.get(url, extras)
else:
self.redirect_counter = 0
return hdr, res
@@ -371,6 +409,12 @@ class RsHttp():
# http.add_header("Referer: http://bbc.com")
self.headers.append(s)
def add_headers(self, lines):
    """Append every non-empty line of a multi-line header chunk to
    ``self.headers``.

    ``lines`` is split on ``'\\n'``; trailing ``'\\r'`` characters are
    removed from each piece, and pieces that end up empty are skipped.
    """
    trimmed = (raw.rstrip('\r') for raw in lines.split('\n'))
    self.headers.extend(entry for entry in trimmed if entry)
def set_cookie(self, c):
if c.lower().startswith('set-cookie: '):
c = c[len('Set-Cookie: '):]