# ppf/fetch.py

import re, random, time
import rocksock
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log

# Module-wide configuration object; None until set_config() has been called.
config = None
def set_config(cfg):
	"""Store ``cfg`` as the module-level ``config``.

	``_fetch_contents`` reads ``config.ppf.debug``, ``config.ppf.timeout``,
	``config.ppf.http_retries`` and ``config.torhosts`` from it, so this has
	to be called before any fetch is attempted.
	"""
	global config
	config = cfg

# Patterns applied in order by cleanhtml(): markup tags, whitespace runs,
# then runs of two or more colons. Raw strings keep the backslash in ``\s``
# from being treated as an (invalid) string escape.
cleanhtml_re = [
	re.compile(r'<.*?>'),
	re.compile(r'\s+'),
	re.compile(r'::+'),
]
def cleanhtml(raw_html):
	"""Flatten an HTML fragment into colon-separated text.

	``&nbsp;`` entities become spaces, then every tag and every run of
	whitespace is replaced by ``:``, and finally consecutive colons are
	collapsed into one. A table row such as
	``<td>1.2.3.4</td> <td>80</td>`` therefore becomes ``:1.2.3.4:80:``.

	Tags that span several lines are not removed as a unit because ``.``
	does not match newlines in the tag pattern.

	:param raw_html: the markup to flatten (string).
	:return: the flattened string.
	"""
	html = raw_html.replace('&nbsp;', ' ')
	for pattern in cleanhtml_re:
		html = pattern.sub(':', html)
	return html

def fetch_contents(url, head=False, proxy=None):
	"""Fetch ``url``, trying each entry of ``proxy`` in turn.

	:param url: address handed to ``_fetch_contents``.
	:param head: forwarded unchanged to ``_fetch_contents``.
	:param proxy: optional sequence of proxy URLs; each one is tried in
		order until ``_fetch_contents`` returns something other than None.
		When it is None or empty a single attempt is made without an
		additional proxy.
	:return: whatever ``_fetch_contents`` produced, or ``''`` when every
		attempt returned None.
	"""
	if proxy is None or not len(proxy):
		result = _fetch_contents(url, head=head)
	else:
		result = None
		for candidate in proxy:
			result = _fetch_contents(url, head=head, proxy=candidate)
			if result is not None:
				break

	if result is None:
		return ''
	return result

# Response bodies containing any of these strings are treated as a failed fetch.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def _fetch_contents(url, head = False, proxy=None):
	"""Perform a single HTTP request for ``url`` through a proxy chain.

	The chain always starts with a randomly chosen entry of
	``config.torhosts`` (addressed as ``socks4://``); when ``proxy`` is
	given it is appended as a second hop.

	:param url: address to request; split into host/port/ssl/uri by
		``_parse_url``.
	:param head: when true only ``http.head`` is issued and its result is
		returned.
	:param proxy: optional proxy URL appended to the chain.
	:return: the ``http.head`` result when ``head`` is true, otherwise the
		response body; None when the connection fails or the body contains
		one of ``retry_messages``.
	"""
	host, port, ssl, uri = _parse_url(url)
	request_headers = [
		'Accept-Language: en-US,en;q=0.8',
		'Cache-Control: max-age=0',
	]
	if config.ppf.debug:
		_log("connecting to %s... (header: %s)" % (url, str(head)), "debug")

	while True:
		# A fresh chain is built on every attempt so a retry may pick a
		# different entry of config.torhosts.
		chain = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
		if proxy:
			chain.append(rocksock.RocksockProxyFromURL(proxy))

		http = RsHttp(
			host,
			ssl=ssl,
			port=port,
			keep_alive=True,
			timeout=config.ppf.timeout,
			max_tries=config.ppf.http_retries,
			follow_redirects=True,
			auto_set_cookies=True,
			proxies=chain,
			user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
		)
		if http.connect():
			break

		_log("failed to connect to %s"%url, "ppf")
		error = http.get_last_rocksock_exception()
		if not error:
			return None
		error_type = error.get_errortype()
		error_code = error.get_error()
		failed_proxy = error.get_failedproxy()
		first_hop_refused = (
			error_type == rocksock.RS_ET_OWN
			and error_code == rocksock.RS_E_TARGET_CONN_REFUSED
			and failed_proxy == 0
		)
		if not first_hop_refused:
			return None
		# Only a refused connection to the first hop (proxy 0) is retried;
		# wait a little and loop again.
		_log("could not connect to proxy 0 - check your connection", "error")
		time.sleep(5)

	## only request header
	if head:
		return http.head(uri, request_headers)

	_, body = http.get(uri, request_headers)
	if isinstance(body, unicode):
		body = body.encode('utf-8')
	if any(marker in body for marker in retry_messages):
		return None

	return body

def valid_port(port):
	"""Return True when ``port`` lies in the usable port range 1-65535.

	Port 0 is rejected; 65535, the highest 16-bit port number, is accepted.

	:param port: port number as an integer.
	"""
	return 0 < port <= 65535

def is_usable_proxy(proxy):
	"""Tell whether ``proxy`` (``"a.b.c.d:port"``) looks like a usable address.

	Returns False, rather than raising, when the string is malformed: not
	exactly one ``:``, non-numeric port or octets, or an octet count other
	than four. A well-formed address is rejected when

	* the port fails ``valid_port``,
	* the first octet is outside 1-254 or any other octet is outside 0-255,
	* it falls in 10.x.x.x, 127.x.x.x, 192.168.x.x or 172.16-31.x.x.

	:param proxy: ``"ip:port"`` string.
	:return: True when none of the rejection rules apply.
	"""
	try:
		ip, port = proxy.split(':')
		port = int(port)
		octets = [int(octet) for octet in ip.split('.')]
	except ValueError:
		# Wrong number of ':' separated parts or a non-numeric component.
		return False
	if len(octets) != 4:
		return False

	if not valid_port(port): return False

	A, B, C, D = octets
	if A < 1 or A > 254: return False
	if any(octet < 0 or octet > 255 for octet in (B, C, D)): return False

	# Loopback and private ranges cannot be reached as public proxies.
	if A == 10 or A == 127: return False
	if A == 192 and B == 168: return False
	if A == 172 and 16 <= B <= 31: return False
	return True

# Cache of every proxy already seen, keyed by "ip:port"; filled from the
# database on first use and extended with each newly extracted proxy.
_known_proxies = {}
def extract_proxies(content, proxydb):
	"""Extract ``ip:port`` proxy addresses from a fetched page.

	The page is flattened with ``cleanhtml`` and scanned for dotted-quad
	addresses followed by a 2-5 digit port. Leading zeros in octets and
	port are stripped, duplicates are dropped (first occurrence wins) and
	addresses failing ``is_usable_proxy`` are discarded.

	:param content: raw page markup.
	:param proxydb: object whose ``execute(sql).fetchall()`` yields rows
		with the proxy string in column 0; queried only while the
		in-memory cache is empty.
	:return: ``(count, new)`` where ``count`` is the number of unique
		usable proxies on the page and ``new`` lists those not previously
		in ``_known_proxies``.
	"""
	# (?!\d) requires that the port is not followed by another digit. Unlike
	# a consuming ``\D`` it also matches when the address is the very last
	# thing in the text.
	matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?!\d)', cleanhtml(content))

	uniques = []
	seen = set()
	for p in matches:
		ip, port = p.split(':')
		# int() drops leading zeros and, unlike lstrip('0'), copes with an
		# all-zero port such as "00" (later rejected by is_usable_proxy).
		ip = '.'.join( str(int(octet)) for octet in ip.split('.') )
		p = '%s:%d' % (ip, int(port))
		if p in seen: continue
		seen.add(p)
		if is_usable_proxy(p): uniques.append(p)

	if not _known_proxies:
		known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
		for k in known:
			_known_proxies[k[0]] = True

	new = []
	for p in uniques:
		if p not in _known_proxies:
			new.append(p)
			_known_proxies[p] = True

	return len(uniques), new

def extract_urls(content, urls = None, urignore=None):
	"""Collect link targets from the ``<a rel="noreferrer">`` anchors of a page.

	:param content: markup passed to ``soupify``.
	:param urls: optional list of already collected URLs; new ones are
		appended to it. A falsy value starts a fresh list.
	:param urignore: optional iterable of regex patterns; an href matching
		any of them (``re.search``) is skipped. None means no filtering.
	:return: the list of URLs, without duplicates of entries already in it.
	"""
	urls = [] if not urls else urls
	# The default of None used to be iterated directly and raised TypeError.
	urignore = urignore or ()
	soup = soupify(content)
	# NOTE(review): assumes soupify returns a BeautifulSoup-like object whose
	# .body is None when the document has no <body> - confirm.
	body = getattr(soup, 'body', None)
	if body is None:
		return urls
	for a in body.find_all('a'):
		rel = a.attrs.get('rel')
		href = a.attrs.get('href')
		# Anchors without rel="noreferrer" or without any href are ignored.
		if rel is None or 'noreferrer' not in rel or href is None: continue
		if href in urls: continue
		if any(re.search(pattern, href) for pattern in urignore): continue
		urls.append(href)
	return urls