#!/usr/bin/env python
"""Scrape public proxy lists.

Discovers candidate pages via public searx instances (proxyfind), scrapes
ip:port pairs out of those pages (proxyleech), and stores both the source
URLs and the proxies in an sqlite database.  Run as a script it loops
forever; the individual helpers can also be imported and driven externally.
"""
import sys
import requests
import random
import time
import re
import urllib
import hashlib

# quote_plus moved to urllib.parse in Python 3; keep a 2/3-compatible alias
# (the original called urllib.quote_plus, which only exists on Python 2).
try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

import mysqlite
import proxywatchd
from misc import _log
from soup_parser import soupify
import config

# Browser-like headers sent with every fetch.
base_header = {
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}

# Public searx instances queried by proxyfind().
searx_instances = ('https://searx.me',
                   'https://searx.xyz',
                   'https://searx.site',
                   'https://searx.win',
                   'https://searx.ru',
                   'https://stemy.me/searx',
                   'https://searx.at',
                   'https://listi.me',
                   'https://searx.dk',
                   'https://searx.laquadrature.net')

# If any of these appear in a response body, the fetch is treated as failed.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

# socks proxies used by fetch_contents(); populated from config.torhosts when
# run as a script.  The empty default keeps the module importable/usable
# without config.load() (the original raised NameError in that case).
proxies = {}

# Tag-stripping regex used by cleanhtml(); compiled once instead of per call.
_tag_re = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Replace every HTML tag in *raw_html* with ':' and collapse runs of
    consecutive ':' into one, so ip:port pairs separated only by markup
    remain findable by the proxy regex."""
    cleantext = _tag_re.sub(':', raw_html)
    return re.sub('::+', ':', cleantext)


def import_from_file(fn, sqlite):
    """Seed the uris table from *fn* (one URL per line).

    URLs already present are skipped; new ones are inserted with
    check_time=0 and error=1, as in the original behaviour.
    """
    with open(fn, 'r') as f:
        for u in f.read().split('\n'):
            if not u:
                continue
            exists = sqlite.execute('SELECT url FROM uris WHERE url=?', (u,)).fetchall()
            if exists:
                continue
            print('adding "%s"' % u)
            sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)',
                           (time.time(), u, 0, 1))
    sqlite.commit()


def fetch_contents(uri):
    """GET *uri* through the module-level socks `proxies` and return the body.

    Returns '' on any request failure, and also when the body contains one
    of retry_messages (searx error pages).  KeyboardInterrupt propagates.
    """
    try:
        resp = requests.get(uri, timeout=45, headers=base_header,
                            verify=False, proxies=proxies)
    except KeyboardInterrupt:
        raise
    # Original used a bare `except:`; Exception no longer swallows SystemExit.
    except Exception:
        return ''
    data = resp.text
    for retry_message in retry_messages:
        if retry_message in data:
            return ''
    return data


def insert_proxies(proxies, uri, sqlite):
    """Insert previously-unknown ip:port strings into the proxylist table.

    *uri* is only used for logging the source.  New rows get failed=3,
    tested=0, success_count=0, total_duration=0.
    """
    time_now = time.time()
    placeholders = ' OR '.join(['proxy=?'] * len(proxies))
    known = set(i[0] for i in sqlite.execute(
        'SELECT proxy FROM proxylist WHERE %s' % placeholders, proxies).fetchall())
    new = [(time_now, p, 3, 0, 0, 0) for p in proxies if p not in known]
    if new:
        sqlite.executemany(
            'INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)',
            new)
        sqlite.commit()
        _log('+%d item(s) from %s' % (len(new), uri), 'added')
    time.sleep(0.1)


def proxyfind(sqlite=None):
    """Query a random searx instance and add result links to the uris table.

    Half the time the query is 'site:<host>' for a random known-good uri,
    otherwise a random entry from the module-level `search_terms`.  Links
    matching any `urignore` pattern are skipped.  New uris are stored with
    check_time=0, error=5, driver=0.
    """
    if not sqlite:
        sqlite = mysqlite.mysqlite(config.database, str)
    uris = [i[0] for i in sqlite.execute(
        'SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10'
    ).fetchall()]
    # random() < random() is true with probability 1/2: a coin flip.
    if uris and random.random() < random.random():
        search = quote_plus('site:%s' % random.choice(uris).split('/')[2])
    else:
        search = quote_plus(random.choice(search_terms))
    instance = random.choice(searx_instances)

    content = fetch_contents('%s/?q=%s&pageno=%d' % (instance, search, random.randint(0, 10)))
    if not content:
        return

    urls = []
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        # Result links on searx pages carry rel="noreferrer"; skip the rest
        # and any duplicates.
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] or a.attrs['href'] in urls:
            continue
        bad = [pat for pat in urignore if re.findall(pat, a.attrs['href'], re.IGNORECASE)]
        if not bad:
            urls.append(a.attrs['href'])

    if urls:
        placeholders = ' OR '.join(['url=?'] * len(urls))
        known = set(i[0] for i in sqlite.execute(
            'SELECT url FROM uris WHERE %s' % placeholders, urls).fetchall())
        time_now = time.time()
        new = [(time_now, u, 0, 5, 0) for u in urls if u not in known]
        if new:
            sqlite.executemany(
                'INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)',
                new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')
    sqlite.commit()


def is_reserved_ipv4(ip):
    """Return True for dotted-quad strings in a few reserved ranges
    (10/8, 127/8, 0/8, 192.168/16 and 172.16/12).

    NOTE(review): not exhaustive -- 169.254/16, 100.64/10, multicast etc.
    are not covered, matching the original behaviour.
    """
    if ip.startswith(('10.', '192.168.', '127.', '0.')):
        return True
    if ip.startswith('172.'):
        second = int(ip.split('.')[1])
        return 16 <= second <= 31
    return False


def proxyleech(sqlite, rows):
    """Scrape ip:port pairs from each [url, hash, error] row in *rows*.

    Per row: fetch the page, extract sorted unique non-reserved ip:port
    pairs, and md5 the joined list.  The error counter is doubled when
    nothing was found, incremented when the page hash is unchanged, and
    reset to 0 otherwise; the next check is scheduled 1h + 1h*error ahead.
    Proxies are only inserted for previously-hashed, error-free sources.
    """
    for row in rows:
        try:
            content = fetch_contents(row[0])
        except KeyboardInterrupt:
            raise
        except Exception:
            content = ''

        uniques = []
        seen = set()
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in seen:
                continue
            seen.add(p)
            try:
                if not is_reserved_ipv4(p.split(':')[0]):
                    uniques.append(p)
            except KeyboardInterrupt:
                raise
            except Exception:
                # Malformed octet (e.g. int() failure): silently skip, as before.
                pass

        # .encode() is required on Python 3 (md5 takes bytes); the data is
        # pure ascii so this is a no-op difference on Python 2.
        digest = hashlib.md5(''.join(uniques).encode('ascii')).hexdigest()

        if not uniques:
            # NOTE(review): the original comment said "increment error by two"
            # but the code doubles it (a no-op while error == 0); the code's
            # behaviour is preserved here.
            row[2] = row[2] * 2
        elif digest == row[1]:
            # Same proxy list as last time: increment error by one.
            row[2] = row[2] + 1
        else:
            # Proxy list was updated: reset error.
            row[2] = 0

        check_time = time.time() + 3600 + (3600 * row[2])
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?',
                       (row[2], digest, check_time, row[0]))
        sqlite.commit()

        # NOTE(review): `return` aborts ALL remaining rows, not just this one;
        # `continue` looks intended, but the original behaviour is preserved.
        if not row[1] or row[2] > 0:
            return

        # Insert in batches so insert_proxies' OR-joined SELECT stays bounded.
        batch = []
        for p in uniques:
            batch.append(p)
            if len(batch) > 500:
                insert_proxies(batch, row[0], sqlite)
                batch = []
        if batch:
            insert_proxies(batch, row[0], sqlite)


if __name__ == '__main__':
    config.load()
    proxies = {'http': 'socks4://%s' % random.choice(config.torhosts),
               'https': 'socks4://%s' % random.choice(config.torhosts)}
    sqlite = mysqlite.mysqlite(config.database, str)

    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, success_count INT, total_duration INT)')
    sqlite.commit()

    import_from_file('import.txt', sqlite)

    ## load search terms
    with open('search_terms.txt', 'r') as f:
        search_terms = [i.strip() for i in f.read().split('\n') if i.strip()]
    ## load bad terms
    with open('urignore.txt', 'r') as f:
        urignore = [i.strip() for i in f.read().split('\n') if i.strip()]
    ## add searx instance hostnames as bad terms (avoid loops)
    for instance in searx_instances:
        urignore.append(instance.split('/')[2])

    # start proxy watcher
    if config.watchd_threads > 0:
        watcherd = proxywatchd.Proxywatchd()
        watcherd.start()
    else:
        watcherd = None

    while True:
        try:
            ## any site that needs to be checked ?
            # NOTE(review): the visible source was truncated mid-query here.
            # The '(check_time < now)' clause below is a minimal syntactic
            # reconstruction; TODO: restore the remainder of the original
            # loop body (the rest of the WHERE clause and the
            # proxyleech()/proxyfind() dispatch and sleep/exception handling)
            # from the untruncated file.
            rows = [[i[0], i[1], i[2]] for i in sqlite.execute(
                'SELECT url,hash,error FROM uris WHERE (check_time < ?)',
                (time.time(),)).fetchall()]
            if rows:
                proxyleech(sqlite, rows)
            else:
                proxyfind(sqlite)
        except KeyboardInterrupt:
            break