diff --git a/fetch.py b/fetch.py
new file mode 100644
index 0000000..4f54bf6
--- /dev/null
+++ b/fetch.py
@@ -0,0 +1,113 @@
+import re, random, time
+import rocksock
+from http2 import RsHttp, _parse_url
+from soup_parser import soupify
+from misc import _log
+
+config = None
+def set_config(cfg):
+    global config
+    config = cfg
+
+cleanhtml_re = [
+    re.compile('<.*?>'),
+    re.compile('\s+'),
+    re.compile('::+'),
+]
+def cleanhtml(raw_html):
+    html = raw_html.replace(' ', ' ')
+    html = re.sub(cleanhtml_re[0], ':', html)
+    html = re.sub(cleanhtml_re[1], ':', html)
+    html = re.sub(cleanhtml_re[2], ':', html)
+    return html
+
+retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
+def fetch_contents(url):
+    host, port, ssl, uri = _parse_url(url)
+    headers=[
+        'Accept-Language: en-US,en;q=0.8',
+        'Cache-Control: max-age=0',
+    ]
+    if config.ppf.debug:
+        _log("connecting to %s..."%url, "debug")
+    while True:
+        proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
+        http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
+        if not http.connect():
+            _log("failed to connect to %s"%url, "ppf")
+            e = http.get_last_rocksock_exception()
+            if not e:
+                return ''
+            et = e.get_errortype()
+            ee = e.get_error()
+            ef = e.get_failedproxy()
+            if et == rocksock.RS_ET_OWN and \
+               ee == rocksock.RS_E_TARGET_CONN_REFUSED \
+               and ef == 0:
+                _log("could not connect to proxy 0 - check your connection", "error")
+                time.sleep(5)
+                continue
+            return ''
+        break
+    hdr, res = http.get(uri, headers)
+    res = res.encode('utf-8') if isinstance(res, unicode) else res
+    for retry_message in retry_messages:
+        if retry_message in res: return ''
+
+    return res
+
+def valid_port(port):
+    return port > 0 and port < 65535
+
+def is_usable_proxy(proxy):
+    ip, port = proxy.split(':')
+    if not valid_port(int(port)): return False
+
+    octets = ip.split('.')
+    A = int(octets[0])
+    B = int(octets[1])
+    C = int(octets[2])
+    D = int(octets[3])
+
+    if (A < 1 or A > 254 or \
+        B > 255 or C > 255 or D > 255) or \
+       (A == 10 or A == 127) or \
+       (A == 192 and B == 168) or \
+       (A == 172 and B >= 16 and B <= 31): return False
+    return True
+
+_known_proxies = {}
+def extract_proxies(content, proxydb):
+    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
+
+    uniques_dict = {}
+    for p in matches:
+        uniques_dict[p] = True
+
+    uniques = []
+    for p in uniques_dict.keys():
+        if is_usable_proxy(p): uniques.append(p)
+
+    global _known_proxies
+    if len(_known_proxies) == 0:
+        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
+        for k in known:
+            _known_proxies[k[0]] = True
+
+    new = []
+    for p in uniques:
+        if not p in _known_proxies:
+            new.append(p)
+            _known_proxies[p] = True
+
+    return len(uniques), new
+
+def extract_urls(content, urls = None, urignore=None):
+    urls = [] if not urls else urls
+    soup = soupify(content)
+    for a in soup.body.find_all('a'):
+        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
+        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
+        if not len(badurl): urls.append(a.attrs['href'])
+    return urls
+
diff --git a/ppf.py b/ppf.py
index 43aadb2..208af6e 100755
--- a/ppf.py
+++ b/ppf.py
@@ -2,15 +2,12 @@
 import dbs
 import random, time
-import re
 import urllib
 import mysqlite
 import proxywatchd
 from misc import _log
-from soup_parser import soupify
 from config import Config
-from http2 import RsHttp, _parse_url
-import rocksock
+import fetch
 import sys
 
 config = Config()
 
@@ -20,21 +17,8 @@ base_header = {
 }
 
 searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
 
-retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
-cleanhtml_re = [
-    re.compile('<.*?>'),
-    re.compile('\s+'),
-    re.compile('::+'),
-]
-def cleanhtml(raw_html):
-    html = raw_html.replace(' ', ' ')
-    html = re.sub(cleanhtml_re[0], ':', html)
-    html = re.sub(cleanhtml_re[1], ':', html)
-    html = re.sub(cleanhtml_re[2], ':', html)
-    return html
-
 
 def import_from_file(fn, sqlite):
     with open(fn, 'r') as f:
         for u in f.read().split('\n'):
@@ -45,39 +29,6 @@ def import_from_file(fn, sqlite):
            sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added) VALUES (?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0))
     sqlite.commit()
 
-def fetch_contents(url):
-    host, port, ssl, uri = _parse_url(url)
-    headers=[
-        'Accept-Language: en-US,en;q=0.8',
-        'Cache-Control: max-age=0',
-    ]
-    if config.ppf.debug:
-        _log("connecting to %s..."%url, "debug")
-    while True:
-        proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
-        http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
-        if not http.connect():
-            _log("failed to connect to %s"%url, "ppf")
-            e = http.get_last_rocksock_exception()
-            if not e:
-                return ''
-            et = e.get_errortype()
-            ee = e.get_error()
-            ef = e.get_failedproxy()
-            if et == rocksock.RS_ET_OWN and \
-               ee == rocksock.RS_E_TARGET_CONN_REFUSED \
-               and ef == 0:
-                _log("could not connect to proxy 0 - check your connection", "error")
-                time.sleep(5)
-                continue
-            return ''
-        break
-    hdr, res = http.get(uri, headers)
-    res = res.encode('utf-8') if isinstance(res, unicode) else res
-    for retry_message in retry_messages:
-        if retry_message in res: return ''
-
-    return res
 
 def proxyfind(sqlite = None):
     if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
@@ -95,18 +46,10 @@ def proxyfind(sqlite = None):
         random.shuffle(search_args)
         search_arg = '&'.join(search_args)
         for x in range(1,10):
-            content = fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
-            if content: urls = extract_urls(content, urls)
+            content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
+            if content: urls = fetch.extract_urls(content, urls, urignore)
         if len(urls): insert_urls(urls, search_arg, sqlite)
 
-def extract_urls(content, urls = []):
-    soup = soupify(content)
-    for a in soup.body.find_all('a'):
-        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
-        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
-        if not len(badurl): urls.append(a.attrs['href'])
-    return urls
-
 def insert_urls(urls, search, sqlite):
     query = [ 'url=?' for u in urls ]
     known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query),urls).fetchall() ]
@@ -117,25 +60,6 @@ def insert_urls(urls, search, sqlite):
     sqlite.commit()
     _log('+%d item(s) from %s' % (len(new), search), 'added')
 
-def valid_port(port):
-    return port > 0 and port < 65535
-
-def is_usable_proxy(proxy):
-    ip, port = proxy.split(':')
-    if not valid_port(int(port)): return False
-
-    octets = ip.split('.')
-    A = int(octets[0])
-    B = int(octets[1])
-    C = int(octets[2])
-    D = int(octets[3])
-
-    if (A < 1 or A > 254 or \
-        B > 255 or C > 255 or D > 255) or \
-       (A == 10 or A == 127) or \
-       (A == 192 and B == 168) or \
-       (A == 172 and B >= 16 and B <= 31): return False
-    return True
 
 def insert_proxies(proxydb, proxies, url):
     timestamp = int(time.time())
@@ -153,11 +77,11 @@
 
 
 def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
-    try: content = fetch_contents(url)
+    try: content = fetch.fetch_contents(url)
     except KeyboardInterrupt as e: raise e
     except: content = ''
 
-    unique_count, new = extract_proxies(content)
+    unique_count, new = fetch.extract_proxies(content, proxydb)
 
     if retrievals == 0: # new site
         if content != '' and unique_count == 0: # site works but has zero proxy addresses
@@ -180,35 +104,10 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
 
     insert_proxies(proxydb, new, url)
 
-_known_proxies = {}
-def extract_proxies(content):
-    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
-
-    uniques_dict = {}
-    for p in matches:
-        uniques_dict[p] = True
-
-    uniques = []
-    for p in uniques_dict.keys():
-        if is_usable_proxy(p): uniques.append(p)
-
-    global _known_proxies
-    if len(_known_proxies) == 0:
-        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
-        for k in known:
-            _known_proxies[k[0]] = True
-
-    new = []
-    for p in uniques:
-        if not p in _known_proxies:
-            new.append(p)
-            _known_proxies[p] = True
-
-    return len(uniques), new
 
 def import_proxies_from_file(proxydb, fn):
     content = open(fn, 'r').read()
-    unique_count, new = extract_proxies(content)
+    unique_count, new = fetch.extract_proxies(content, proxydb)
     if len(new): insert_proxies(proxydb, new, fn)
     return 0
 
@@ -216,6 +115,7 @@
 
 if __name__ == '__main__':
     config.load()
+    fetch.set_config(config)
    proxies={'http':'socks4://%s' % random.choice(config.torhosts),'https':'socks4://%s' % random.choice(config.torhosts)}
 
    proxydb = mysqlite.mysqlite(config.watchd.database, str)
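
Reviewer note: the net effect of this patch is that all HTTP fetching and proxy/URL extraction moves into the new `fetch` module, while `ppf.py` keeps only the database and scheduling glue. `fetch` receives its settings through `set_config()` instead of importing ppf's globals, and `extract_proxies()` now takes the proxy database handle as an explicit argument rather than relying on a global `proxydb`. A minimal sketch of the resulting call sequence, assuming the project's own `config` and `mysqlite` modules are importable; the proxy-list URL is made up for illustration:

```python
# Sketch of how ppf.py drives the extracted fetch module after this patch.
# The URL below is hypothetical; everything else mirrors names from the diff.
import fetch
import mysqlite
from config import Config

config = Config()
config.load()
fetch.set_config(config)  # fetch reads torhosts, timeout, http_retries and debug from the shared config

proxydb = mysqlite.mysqlite(config.watchd.database, str)

content = fetch.fetch_contents('http://example.com/proxy-list.html')  # hypothetical page
unique_count, new = fetch.extract_proxies(content, proxydb)  # proxydb is passed explicitly now
print('%d unique / %d new proxies' % (unique_count, len(new)))
```

Passing `proxydb` into `extract_proxies()` is what lets the function leave `ppf.py` without dragging the database handle along as hidden module state.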