#!/usr/bin/env python
import os
import sys
import socket
import requests
import socks
import random, time
import sqlite3
import re
import urllib
import threading
import hashlib
import ipcalc

from soup_parser import soupify
from ConfigParser import SafeConfigParser
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from dns import resolver
#from selenium import webdriver

# include own classes
sys.path.append('./includes')
import proxywatchd, mysqlite
from misc import _log

base_header = {
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}

database = 'proxies.sqlite'

searx_instances = ('https://searx.me',
                   'https://searx.xyz',
                   'https://searx.site',
                   'https://searx.win',
                   'https://searx.ru',
                   'https://stemy.me/searx',
                   'https://searx.at',
                   'https://listi.me',
                   'https://searx.dk',
                   'https://searx.laquadrature.net')

retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

## route all traffic through the local Tor SOCKS proxy
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050',
    'ftp': 'socks5://127.0.0.1:9050'}

CONFIG = 'config.ini'

def cleanhtml(raw_html):
    ## strip tags and collapse the leftovers into single ':' separators
    cleantext = re.sub(re.compile('<.*?>'), ':', raw_html)
    cleantext = re.sub('::+', ':', cleantext)
    return cleantext

def import_from_file(fn, sqlite):
    ## seed the uris table from a plain-text file, one URL per line
    with open(fn, 'r') as f:
        for u in f.read().split('\n'):
            if not len(u):
                continue
            exists = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE url=?', (u,)).fetchall() ]
            if exists:
                continue
            print('adding "%s"' % u)
            sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)',
                           (time.time(), u, 0, 1))
    sqlite.commit()

def fetch_contents(uri, driver=None):
    headers = base_header
    ## use requests (default)
    if not driver:
        try:
            resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies)
        except Exception:
            return ''
        data = resp.text
    ## phantomjs
    else:
        from selenium import webdriver  # optional dependency, only needed for this branch
        for key, value in base_header.items():
            capability_key = 'phantomjs.page.customHeaders.{}'.format(key)
            webdriver.DesiredCapabilities.PHANTOMJS[capability_key] = value
        service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
        driver = webdriver.PhantomJS(service_args=service_args)
        try:
            driver.implicitly_wait(45)
            driver.set_page_load_timeout(45)
            driver.get(uri)
            data = driver.page_source
        except Exception:
            data = ''
        finally:
            driver.quit()
    ## treat searx error pages as empty responses so the caller retries
    for retry_message in retry_messages:
        if retry_message in data:
            return ''
    return data

def update_proxy_sources(sqlite, proxies, uri):
    ## remember which source URL each proxy was seen on (one table per proxy hash)
    for proxy in proxies:
        md5sum = hashlib.md5(proxy).hexdigest()
        sqlite.execute('CREATE TABLE IF NOT EXISTS "%s" (uri TEXT)' % md5sum)
        sqlite.commit()
        check = sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % md5sum, (uri,)).fetchall()
        if not len(check):
            sqlite.execute('INSERT INTO "%s" (uri) VALUES(?)' % md5sum, (uri,))
            sqlite.commit()
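## NOTE (illustrative only, nothing here is executed): insert_proxies() below
## de-duplicates against the proxylist table by expanding one 'proxy=?'
## placeholder per candidate, so for three candidates the generated query is
##
##   SELECT proxy FROM proxylist WHERE proxy=? OR proxy=? OR proxy=?
##
## SQLite caps the number of bound parameters per statement
## (SQLITE_MAX_VARIABLE_NUMBER, historically 999), which is why proxyleech()
## feeds this function batches of at most 500 proxies at a time.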
def insert_proxies(proxies, uri, sqlite):
    time_now = time.time()
    added = 0
    ## very wasteful
    #update_proxy_sources(sqlite, proxies, uri)
    query = [ 'proxy=?' for p in proxies ]
    known = [ i[0] for i in sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % ' OR '.join(query),
                                           proxies).fetchall() ]
    new = [ (time_now, i, 3, 0) for i in proxies if i not in known ]
    if len(new):
        added = added + len(new)
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested) VALUES (?,?,?,?)', new)
        sqlite.commit()
    if added > 0:
        _log('+%d item(s) from %s' % (added, uri), 'added')

def proxyfind(sqlite=None):
    if not sqlite:
        sqlite = mysqlite.mysqlite(database, str)
    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 AND url NOT LIKE "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
    ## roughly half the time, dig deeper into a known proxy site;
    ## otherwise pick a random term from search_terms.txt
    if len(uris) > 0 and random.random() < random.random():
        search = urllib.quote_plus('site:%s' % random.choice(uris).split('/')[2])
    else:
        search = urllib.quote_plus(random.choice(search_terms))
    choice = random.choice(searx_instances)
    urls = []
    ## searx page numbers are 1-based
    content = fetch_contents('%s/?q=%s&pageno=%d' % (choice, search, random.randint(1, 10)))
    if not content:
        return
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        ## searx marks result links with rel="noreferrer"
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] or a.attrs['href'] in urls:
            continue
        badurl = [ i for i in urignore if re.findall(i, a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl):
            urls.append(a.attrs['href'])
    if len(urls):
        query = [ 'url=?' for u in urls ]
        known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query), urls).fetchall() ]
        time_now = time.time()
        new = [ (time_now, i, 0, 5, 0) for i in urls if i not in known ]
        if len(new):
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) VALUES (?,?,?,?,?)', new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')
    sqlite.commit()
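## A minimal sketch of the extraction step proxyleech() below performs
## (sample values are illustrative only):
##
##   >>> sample = '<td>10.0.0.1:8080</td><td>93.184.216.34:3128</td>'
##   >>> re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(sample))
##   ['10.0.0.1:8080', '93.184.216.34:3128']
##
## 10.0.0.1 is then dropped by the RFC1918 ('PRIVATE') check, the public
## address is kept. Each source is also re-scheduled with a linear backoff,
## check_time = now + 3600 * (1 + error), so a page whose list never changes
## is polled less and less often.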
def proxyleech(sqlite, rows):
    for row in rows:
        try:
            content = fetch_contents(row[0], None)
        except Exception:
            content = ''
        ## collect unique ip:port pairs, skipping private address ranges
        ## (ipcalc's info() reports the IANA allocation, e.g. 'PRIVATE' for RFC1918)
        uniques = []
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in uniques:
                continue
            try:
                if ipcalc.Network(p.split(':')[0]).info() != 'PRIVATE':
                    uniques.append(p)
            except Exception:
                pass
        hash = hashlib.md5(''.join(uniques)).hexdigest()
        ## empty list of proxies: increment error by two
        if not len(uniques):
            row[2] = (row[2] + 2)
        ## same proxy list: increment error by one
        elif hash == row[1]:
            row[2] = (row[2] + 1)
        ## proxylist was updated: error is zero
        else:
            row[2] = 0
        ## back off linearly with the error count
        check_time = (time.time() + 3600 + (3600 * row[2]))
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? WHERE url=?',
                       (row[2], hash, check_time, row[0]))
        sqlite.commit()
        if not row[1] or row[2] > 0:
            continue
        ## hand the proxies over in batches of 500 (see the parameter-limit note above)
        add = []
        for i in uniques:
            add.append(i)
            if len(add) < 500:
                continue
            insert_proxies(add, row[0], sqlite)
            add = []
        if len(add):
            insert_proxies(add, row[0], sqlite)

if __name__ == '__main__':
    ## read the config files
    parser = SafeConfigParser()
    parser.read(CONFIG)
    database = parser.get('global', 'database')
    search = parser.getboolean('proxyfind', 'search')

    sqlite = mysqlite.mysqlite(database, str)
    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, duration INT)')
    sqlite.commit()

    if os.path.exists('import.txt'):
        import_from_file('import.txt', sqlite)

    ## load search terms
    with open('search_terms.txt', 'r') as f:
        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## load bad terms
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## add searx instances as bad terms (avoid loops)
    for i in searx_instances:
        urignore.append(i.split('/')[2])

    # start proxy watcher
    watcherd = proxywatchd.Proxywatchd(CONFIG) if parser.getboolean('watcherd', 'enabled') else None

    while True:
        try:
            ## any site that needs to be checked ?
            rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time