#!/usr/bin/env python
"""Crawl searx instances and known proxy-list pages (over Tor) for fresh
ip:port proxies and store them in the uris/proxylist sqlite databases."""
import dbs
import random, time
import re
import urllib
import mysqlite
import proxywatchd
from misc import _log
from soup_parser import soupify
from config import Config
from http2 import RsHttp, _parse_url
import rocksock

config = Config()

base_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

searx_instances = (
    'https://searx.me',
    'https://searx.xyz',
    'https://searx.site',
    'https://searx.win',
    'https://searx.ru',
    'https://stemy.me/searx',
    'https://searx.at',
    'https://listi.me',
    'https://searx.dk',
    'https://searx.laquadrature.net',
)

# responses containing these strings are treated as failed fetches
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

cleanhtml_re = [
    re.compile(r'<.*?>'),
    re.compile(r'\s+'),
    re.compile(r'::+'),
]

def cleanhtml(raw_html):
    # reduce markup and whitespace to ':' separators so ip:port pairs
    # can be picked out with a single regex later
    html = raw_html.replace('&nbsp;', ' ')
    html = re.sub(cleanhtml_re[0], ':', html)
    html = re.sub(cleanhtml_re[1], ':', html)
    html = re.sub(cleanhtml_re[2], ':', html)
    return html

def import_from_file(fn, sqlite):
    # seed the uris table from a plain text file, one URL per line
    with open(fn, 'r') as f:
        for u in f.read().split('\n'):
            if not len(u):
                continue
            exists = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE url=?', (u,)).fetchall() ]
            if exists:
                continue
            print('adding "%s"' % u)
            sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count) VALUES (?,?,?,?,?)',
                           (int(time.time()), u, 0, 0, 0))
            sqlite.commit()

def fetch_contents(url):
    # fetch a URL through a random Tor SOCKS4 host; returns '' on failure
    # or when the page looks like a searx error/rate-limit response
    host, port, ssl, uri = _parse_url(url)
    headers = [
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s..." % url, "debug")
    while True:
        proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice(config.torhosts))]
        http = RsHttp(host, ssl=ssl, port=port, keep_alive=True,
                      timeout=config.ppf.timeout, max_tries=config.ppf.http_retries,
                      follow_redirects=True, auto_set_cookies=True, proxies=proxies,
                      user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
        if not http.connect():
            _log("failed to connect to %s" % url, "ppf")
            e = http.get_last_rocksock_exception()
            if not e:
                return ''
            et = e.get_errortype()
            ee = e.get_error()
            ef = e.get_failedproxy()
            if et == rocksock.RS_ET_OWN and \
               ee == rocksock.RS_E_TARGET_CONN_REFUSED and \
               ef == 0:
                # the local Tor SOCKS port refused the connection: wait and retry
                _log("could not connect to proxy 0 - check your connection", "error")
                time.sleep(5)
                continue
            return ''
        break
    hdr, res = http.get(uri, headers)
    res = res.encode('utf-8') if isinstance(res, unicode) else res
    for retry_message in retry_messages:
        if retry_message in res:
            return ''
    return res

def proxyfind(sqlite=None):
    # build a search query (either "site:<known host>" or a random term)
    # and crawl every searx instance for result links
    if not sqlite:
        sqlite = mysqlite.mysqlite(config.ppf.database, str)
    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
    if len(uris) > 0 and random.random() < random.random():
        search = 'site:%s' % random.choice(uris).split('/')[2]
    else:
        search = random.choice(search_terms)
    search = '%s -intitle:pdf' % search
    search_args = [
        'category=general',
        'time_range=day',
        'q=%s' % urllib.quote_plus(search),
    ]
    for srx in searx_instances:
        urls = []
        random.shuffle(search_args)
        search_arg = '&'.join(search_args)
        for x in range(1, 10):
            content = fetch_contents('%s/?%s&pageno=%d' % (srx, search_arg, x))
            if content:
                urls = extract_urls(content, urls)
        if len(urls):
            insert_urls(urls, search_arg, sqlite)

def extract_urls(content, urls=[]):
    # collect external result links (searx marks them rel="noreferrer"),
    # skipping duplicates and anything matching an urignore pattern
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls:
            continue
        badurl = [ i for i in urignore if re.findall(i, a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl):
            urls.append(a.attrs['href'])
    return urls

def insert_urls(urls, search, sqlite):
    # insert previously unseen URLs into the uris table
    query = [ 'url=?' for u in urls ]
    known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query), urls).fetchall() ]
    time_now = int(time.time())
    new = [ (time_now, i, 0, 0, 0) for i in urls if not i in known ]
    if not len(new):
        return
    sqlite.executemany('INSERT INTO uris (added,url,check_time,error,stale_count) values(?,?,?,?,?)', new)
    sqlite.commit()
    _log('+%d item(s) from %s' % (len(new), search), 'added')

def valid_port(port):
    return port > 0 and port < 65535

def is_usable_proxy(proxy):
    # sanity check: valid port and a public (non-loopback, non-RFC1918) IPv4 address
    ip, port = proxy.split(':')
    if not valid_port(int(port)):
        return False
    octets = ip.split('.')
    A = int(octets[0])
    B = int(octets[1])
    C = int(octets[2])
    D = int(octets[3])
    if (A < 1 or A > 254 or
        B > 255 or C > 255 or D > 255) or \
       (A == 10 or A == 127) or \
       (A == 192 and B == 168) or \
       (A == 172 and B >= 16 and B <= 31):
        return False
    return True

def insert_proxies(proxies, sqlite, timestamp):
    # batch-insert proxy addresses into the proxylist table
    new = []
    for p in proxies:
        new.append((timestamp, p, 3, 0, 0, 0))
    if len(new):
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
        sqlite.commit()

_known_proxies = {}

def proxyleech(proxydb, urldb, url, stale_count, error):
    # fetch a candidate page, extract ip:port pairs, store new proxies, and
    # update the error/stale counters of the source URL
    try:
        content = fetch_contents(url)
    except KeyboardInterrupt as e:
        raise e
    except:
        content = ''
    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
    uniques_dict = {}
    for p in matches:
        uniques_dict[p] = True
    uniques = []
    for p in uniques_dict.keys():
        if is_usable_proxy(p):
            uniques.append(p)
    global _known_proxies
    if len(_known_proxies) == 0:
        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        for k in known:
            _known_proxies[k[0]] = True
    new = []
    for p in uniques:
        if not p in _known_proxies:
            new.append(p)
            _known_proxies[p] = True
    if stale_count == 0 and error == 0:
        # new site
        if content != '' and len(uniques) == 0:
            # site works but has zero proxy addresses
            error = 99999
    else:
        if len(new) == 0:
            stale_count += 1
        if content == '':
            error += 1
        else:
            error = 0
    urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=? where url=?',
                  (error, stale_count, int(time.time()), url))
    urldb.commit()
    if not len(new):
        return
    # insert new proxies in batches of 500
    add = []
    time_now = int(time.time())
    for i in new:
        add.append(i)
        if len(add) >= 500:
            insert_proxies(add, proxydb, time_now)
            add = []
    if len(add):
        insert_proxies(add, proxydb, time_now)
    _log('+%d item(s) from %s' % (len(new), url), 'added')

if __name__ == '__main__':
    config.load()
    proxies = {'http': 'socks4://%s' % random.choice(config.torhosts),
               'https': 'socks4://%s' % random.choice(config.torhosts)}
    proxydb = mysqlite.mysqlite(config.watchd.database, str)
    dbs.create_table_if_not_exists(proxydb, 'proxylist')
    urldb = mysqlite.mysqlite(config.ppf.database, str)
    dbs.create_table_if_not_exists(urldb, 'uris')
    import_from_file('import.txt', urldb)
    if config.ppf.search:
        ## load search terms
        with open('search_terms.txt', 'r') as f:
            search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
        ## load bad terms
        with open('urignore.txt', 'r') as f:
            urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
        ## add searx instances as bad terms (avoid loops)
        empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]
    # start proxy watcher
    if config.watchd.threads > 0:
        watcherd = proxywatchd.Proxywatchd()
        watcherd.start()
    else:
        watcherd = None
    while True:
        try:
            ## any site that needs to be checked ?
            rows = [ [i[0],i[1],i[2]] for i in urldb.execute('SELECT url,stale_count,error FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?)