factor out http related code from ppf.py

2019-01-18 19:30:42 +00:00
parent 0dad0176f3
commit 4a41796b19
2 changed files with 120 additions and 107 deletions
@@ -0,0 +1,113 @@
+import re, random, time
+import rocksock
+from http2 import RsHttp, _parse_url
+from soup_parser import soupify
+from misc import _log
+
+# Module-wide configuration object; stays None until set_config() is called.
+config = None
+def set_config(cfg):
+	"""Install cfg as this module's global `config`.
+
+	fetch_contents() reads config.ppf.debug, config.ppf.timeout,
+	config.ppf.http_retries and config.torhosts, so a configuration has to
+	be installed before it is called.
+	"""
+	globals()['config'] = cfg
+
+# Patterns applied in this order by cleanhtml(); every match becomes ':'.
+cleanhtml_re = [
+	re.compile(r'<.*?>'),	# a tag (shortest match, does not cross line breaks)
+	re.compile(r'\s+'),	# a run of whitespace
+	re.compile(r'::+'),	# a run of two or more colons
+]
+def cleanhtml(raw_html):
+	"""Flatten raw_html into colon-separated text.
+
+	'&nbsp;' is first turned into a space. Then tags, whitespace runs and
+	finally runs of colons are each replaced by a single ':', so that
+	"<td>1.2.3.4</td> <td>80</td>" ends up as ":1.2.3.4:80:".
+
+	The tag pattern is compiled without DOTALL, so a tag that contains a
+	line break is not removed as a tag.
+	"""
+	text = raw_html.replace('&nbsp;', ' ')
+	for pattern in cleanhtml_re:
+		text = pattern.sub(':', text)
+	return text
+
+# A response body containing any of these strings is treated as unusable.
+retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
+def fetch_contents(url):
+	"""Download url through a socks4 proxy drawn from config.torhosts.
+
+	Returns the response body, or '' when the connection could not be
+	established or the body contains one of retry_messages. A unicode body
+	is encoded to UTF-8 before it is checked and returned.
+
+	When connecting fails with rocksock error type RS_ET_OWN, error
+	RS_E_TARGET_CONN_REFUSED and get_failedproxy() == 0, the function
+	sleeps 5 seconds and tries again with a freshly drawn proxy host; the
+	number of such attempts is not limited.
+	"""
+	host, port, ssl, uri = _parse_url(url)
+	request_headers = [
+		'Accept-Language: en-US,en;q=0.8',
+		'Cache-Control: max-age=0',
+	]
+	if config.ppf.debug:
+		_log("connecting to %s..."%url, "debug")
+
+	while True:
+		# a new proxy host is drawn for every connection attempt
+		proxy_url = 'socks4://%s' % random.choice( config.torhosts )
+		proxies = [rocksock.RocksockProxyFromURL(proxy_url)]
+		http = RsHttp(
+			host,
+			ssl=ssl,
+			port=port,
+			keep_alive=True,
+			timeout=config.ppf.timeout,
+			max_tries=config.ppf.http_retries,
+			follow_redirects=True,
+			auto_set_cookies=True,
+			proxies=proxies,
+			user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
+		)
+		if http.connect():
+			break
+
+		_log("failed to connect to %s"%url, "ppf")
+		err = http.get_last_rocksock_exception()
+		if not err:
+			return ''
+		err_type = err.get_errortype()
+		err_code = err.get_error()
+		failed_proxy = err.get_failedproxy()
+		proxy_refused = (
+			err_type == rocksock.RS_ET_OWN
+			and err_code == rocksock.RS_E_TARGET_CONN_REFUSED
+			and failed_proxy == 0
+		)
+		if not proxy_refused:
+			# any other failure is final for this url
+			return ''
+		_log("could not connect to proxy 0 - check your connection", "error")
+		time.sleep(5)
+
+	hdr, body = http.get(uri, request_headers)
+	if isinstance(body, unicode):
+		body = body.encode('utf-8')
+	if any(message in body for message in retry_messages):
+		return ''
+
+	return body
+
+def valid_port(port):
+	"""Return True if port is a usable port number, i.e. 1 <= port <= 65535."""
+	# 65535 is the highest 16-bit port number and has to be accepted
+	return 0 < port <= 65535
+
+def is_usable_proxy(proxy):
+	"""Return True if proxy ("a.b.c.d:port") looks like a usable public proxy.
+
+	Rejected are:
+	- strings that are not exactly "ip:port" with four numeric octets
+	- ports refused by valid_port()
+	- a first octet outside 1..254 or any other octet outside 0..255
+	- 10.x.x.x, 127.x.x.x, 192.168.x.x and 172.16.x.x - 172.31.x.x
+
+	Malformed input yields False instead of raising.
+	"""
+	try:
+		ip, port = proxy.split(':')
+		port = int(port)
+		octets = [int(octet) for octet in ip.split('.')]
+	except ValueError:
+		# missing/extra ':' or a non-numeric component
+		return False
+
+	if not valid_port(port): return False
+	# exactly four octets; extra ones must not be silently ignored
+	if len(octets) != 4: return False
+
+	A, B, C, D = octets
+	if A < 1 or A > 254: return False
+	for octet in (B, C, D):
+		if octet < 0 or octet > 255: return False
+
+	# private and loopback ranges
+	if A == 10 or A == 127: return False
+	if A == 192 and B == 168: return False
+	if A == 172 and 16 <= B <= 31: return False
+	return True
+
+# Proxies already seen, as {"ip:port": True}; seeded from the database by
+# extract_proxies() and extended with every newly found proxy.
+_known_proxies = {}
+# "a.b.c.d:port" with a 2-5 digit port that is not followed by another digit.
+# The lookahead also succeeds at the end of the text; a '$' inside a
+# character class would only match a literal dollar sign.
+_proxy_re = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?!\d)')
+def extract_proxies(content, proxydb):
+	"""Find proxies ("ip:port") in content and report the ones not seen before.
+
+	content is passed through cleanhtml() before matching. proxydb must
+	offer execute(sql) returning an object with fetchall() (presumably a
+	sqlite3 connection or cursor - confirm against the caller); it is
+	queried for the `proxy` column of `proxylist` while _known_proxies is
+	still empty.
+
+	Returns a tuple (count, new): count is the number of distinct proxies
+	in content accepted by is_usable_proxy(), new is the list of those not
+	yet in _known_proxies, in order of first appearance. Every proxy in
+	new is added to _known_proxies.
+	"""
+	# de-duplicate while keeping the order of first appearance
+	seen = {}
+	uniques = []
+	for p in _proxy_re.findall(cleanhtml(content)):
+		if p in seen: continue
+		seen[p] = True
+		if is_usable_proxy(p): uniques.append(p)
+
+	# seed the cache from the database as long as it is empty
+	if not _known_proxies:
+		for row in proxydb.execute('SELECT proxy FROM proxylist').fetchall():
+			_known_proxies[row[0]] = True
+
+	new = []
+	for p in uniques:
+		if p not in _known_proxies:
+			new.append(p)
+			_known_proxies[p] = True
+
+	return len(uniques), new
+
+def extract_urls(content, urls = None, urignore=None):
+	"""Collect link targets of <a rel="noreferrer"> anchors found in content.
+
+	content is parsed with soupify(). urls is an optional list of already
+	known links; links are appended to it and it is returned (a missing or
+	empty urls is replaced by a new list). urignore is an optional iterable
+	of regular expressions; a link matching any of them, ignoring case, is
+	skipped.
+
+	Anchors without an href, without 'noreferrer' in rel, or whose href is
+	already in urls are skipped as well.
+	"""
+	urls = [] if not urls else urls
+	# the default urignore=None means "ignore nothing"
+	ignore_patterns = urignore or []
+	soup = soupify(content)
+	body = soup.body
+	# NOTE(review): presumably soup.body is None when the document has no
+	# <body>; return what we have instead of failing on None.find_all
+	if body is None:
+		return urls
+	for a in body.find_all('a'):
+		href = a.attrs.get('href')
+		if href is None or href in urls: continue
+		if 'noreferrer' not in a.attrs.get('rel', ()): continue
+		if any(re.search(pattern, href, re.IGNORECASE) for pattern in ignore_patterns): continue
+		urls.append(href)
+	return urls
+