Basically, the issue was that the main loop received the SIGINT and therefore broke out before reaching the parts of the code that bring down the child threads. There is now a finish() method that needs to be called after stop(). Because SQLite databases insist on being used from the thread that created the object, the DB cleanup operations are done from the thread that controls it. For standalone operation, an additional run() method is used to keep the main thread alive; this is not necessary when used via ppf.py.
200 lines
6.6 KiB
Python
Executable File
200 lines
6.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
import sys
|
|
import requests
|
|
import random, time
|
|
import re
|
|
import urllib
|
|
import hashlib
|
|
from requests.packages.urllib3.exceptions import InsecureRequestWarning
|
|
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
|
|
import mysqlite
|
|
import proxywatchd
|
|
from misc import _log
|
|
from soup_parser import soupify
|
|
import config
|
|
|
|
## HTTP headers sent with every request; a fixed desktop-Firefox UA is used so
## search engines and proxy-list sites serve their normal HTML pages.
base_header = {
    'Accept-Language':'en-US,en;q=0.8',
    'Cache-Control':'max-age=0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}

## public searx metasearch instances; proxyfind() picks one at random per query
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )

## response fragments that mark a throttled/empty searx result page;
## fetch_contents() treats a page containing any of these as empty
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
|
|
|
def cleanhtml(raw_html):
    """Strip HTML tags from raw_html, replacing each tag with a ':' separator.

    Runs of consecutive separators are collapsed to a single ':', so the
    result is the page's text content joined by ':' characters.
    """
    tag_pattern = re.compile('<.*?>')
    stripped = tag_pattern.sub(':', raw_html)
    return re.sub('::+', ':', stripped)
|
|
|
|
def import_from_file(fn, sqlite):
    """Seed the uris table from a plain-text file of URLs (one per line).

    URLs already present in the table are skipped; new rows are inserted
    with check_time=0 (due immediately) and error=1.  Commits once after
    the whole file has been processed.
    """
    with open(fn, 'r') as f:
        candidates = f.read().split('\n')
    for url in candidates:
        if not url:
            continue
        already = [ row[0] for row in sqlite.execute('SELECT url FROM uris WHERE url=?', (url,)).fetchall() ]
        if already:
            continue
        print('adding "%s"' % url)
        sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)', (time.time(), url, 0, 1))
    sqlite.commit()
|
|
|
|
def fetch_contents(uri):
    """Download uri through the module-level SOCKS proxies and return the body.

    Returns '' on any request failure, and also when the page contains one
    of the retry_messages markers (rate-limited / empty searx result pages).
    KeyboardInterrupt is re-raised so the caller's main loop can shut down.
    Certificate verification is disabled (scraping arbitrary proxy sites).
    """
    try:
        resp = requests.get(uri, timeout=45, headers=base_header, verify=False, proxies=proxies)
    except KeyboardInterrupt:
        raise
    except:
        return ''
    body = resp.text
    if any(marker in body for marker in retry_messages):
        return ''
    return body
|
|
|
|
def insert_proxies(proxies, uri, sqlite):
    """Insert the scraped 'ip:port' strings from uri into the proxylist table.

    Entries already present in the table are filtered out first; new rows
    start with failed=3 and tested=0.  Logs how many entries were added and
    sleeps briefly to throttle bursts of inserts.
    """
    now = time.time()
    placeholders = ' OR '.join('proxy=?' for _ in proxies)
    rows = sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % placeholders, proxies).fetchall()
    known = set(r[0] for r in rows)
    fresh = [ (now, p, 3, 0) for p in proxies if p not in known ]
    if fresh:
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested) VALUES (?,?,?,?)', fresh)
        sqlite.commit()
        _log('+%d item(s) from %s' % (len(fresh), uri), 'added')
    time.sleep(0.1)
|
|
|
|
def proxyfind(sqlite = None):
    """Search the web via a random searx instance and queue promising URLs
    in the uris table for later leeching.

    sqlite: open mysqlite handle; when omitted a fresh one is created from
    config.database (standalone use).
    Relies on the module globals search_terms and urignore (loaded in the
    __main__ block).
    NOTE(review): urllib.quote_plus is Python-2 only (urllib.parse.quote_plus
    on Python 3), consistent with the rest of this script.
    """
    #print('entering proxyfind...')

    if not sqlite: sqlite = mysqlite.mysqlite(config.database,str)

    ## sample a few known-good source sites (github URLs excluded)
    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]

    ## ~50% of the time (random < random) search for more pages on a known
    ## host via site:, otherwise search for a random term from the list
    if len(uris) > 0 and random.random() < random.random():
        search = urllib.quote_plus('site:%s' % random.choice(uris).split('/')[2])
    else:
        search = urllib.quote_plus(random.choice(search_terms))

    choice = random.choice(searx_instances)
    urls = []

    ## random result page between 0 and 10
    content = fetch_contents('%s/?q=%s&pageno=%d' % (choice, search, random.randint(0,10)))

    if not content: return
    soup = soupify(content)

    ## searx marks external result links with rel="noreferrer"; keep those
    ## that match none of the urignore regexes and were not already collected
    for a in soup.body.find_all('a'):
        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl): urls.append(a.attrs['href'])

    if len(urls):
        ## drop URLs already present in the uris table
        query = [ 'url=?' for u in urls ]
        known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query),urls).fetchall() ]
        time_now = time.time()
        ## new rows: check_time=0 (due immediately), error=5, driver=0
        new = [ (time_now,i,0,5,0) for i in urls if not i in known ]
        if len(new):
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)', new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')

    sqlite.commit()
|
|
|
|
def is_reserved_ipv4(ip):
    """Return True when the dotted-quad string ip lies in a reserved range.

    Covers RFC 1918 private space (10/8, 172.16/12, 192.168/16), loopback
    (127/8), "this network" (0/8), link-local (169.254/16) and CGNAT
    (100.64/10) -- the last two were previously missed, letting unusable
    addresses into the proxy list.  May raise ValueError on a malformed
    address; the only caller wraps this in try/except and skips the entry.
    """
    if ip.startswith(("10.", "127.", "0.", "192.168.", "169.254.")):
        return True
    octets = ip.split(".")
    ## 172.16.0.0/12 spans second octets 16-31
    if ip.startswith("172.") and 16 <= int(octets[1]) <= 31:
        return True
    ## 100.64.0.0/10 (carrier-grade NAT) spans second octets 64-127
    if ip.startswith("100.") and 64 <= int(octets[1]) <= 127:
        return True
    return False
|
|
|
|
def proxyleech(sqlite, rows):
    """Fetch each queued source URL and harvest 'ip:port' proxies from it.

    rows: list of mutable [url, previous_hash, error_count] records from the
    uris table.  For every row the page is fetched, candidate proxies are
    extracted and deduplicated, the row's error counter / content hash /
    next check_time are written back to the uris table, and new proxies are
    handed to insert_proxies() in batches of up to ~500.

    Fixes vs. previous revision:
    - empty result now does `error + 2` (was `error * 2`, which left a zero
      counter stuck at zero, as the comment already said "increment by two")
    - `continue` instead of `return` after a first-fetch/errored row, so the
      remaining rows of the batch are still processed
    - the page hash is encoded before hashing (md5 needs bytes on Python 3;
      .encode() also works on Python 2)
    """
    #print('entering proxyleech...')

    for row in rows:
        try:
            content = fetch_contents(row[0])
        except KeyboardInterrupt:
            raise
        except:
            content = ''

        ## collect unique, non-reserved ip:port candidates in sorted order
        uniques = []
        if content:
            for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
                if p in uniques: continue
                try:
                    if not is_reserved_ipv4(p.split(':')[0]): uniques.append(p)
                except KeyboardInterrupt:
                    raise
                except:
                    pass  # malformed address: skip the candidate

        digest = hashlib.md5(''.join(uniques).encode('utf-8')).hexdigest()
        #print('unique; hash: %s, len: %d' % (digest, len(uniques)))

        ## empty list of proxies: increment error by two
        if not len(uniques): row[2] = (row[2] + 2)
        ## same proxy list as last time: increment error by one
        elif digest == row[1]: row[2] = (row[2] + 1)
        ## proxylist was updated: error is zero
        else: row[2] = 0

        ## back off linearly with the error count: 1h base + 1h per error
        check_time = (time.time() + 3600 + (3600 * row[2]))
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2], digest, check_time, row[0]))
        sqlite.commit()

        ## first fetch (no previous hash) or errored row: nothing to insert
        if not row[1] or row[2] > 0: continue

        ## flush in batches so huge pages don't build one oversized insert
        add = []
        for i in uniques:
            add.append(i)
            if len(add) > 500:
                insert_proxies(add, row[0], sqlite)
                add = []
        if len(add): insert_proxies(add, row[0], sqlite)
|
|
|
|
|
|
if __name__ == '__main__':
    config.load()
    ## route all HTTP(S) traffic over SOCKS4 through a random Tor host
    proxies={'http':'socks4://%s' % random.choice(config.torhosts),'https':'socks4://%s' % random.choice(config.torhosts)}

    sqlite = mysqlite.mysqlite(config.database, str)

    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, duration INT)')
    sqlite.commit()

    ## seed the uris table from a manually maintained URL list
    import_from_file('import.txt', sqlite)

    ## load search terms (module global read by proxyfind)
    with open('search_terms.txt', 'r') as f:
        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]

    ## load bad terms (module global read by proxyfind)
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## add searx instances as bad terms (avoid loops)
    empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]

    # start proxy watcher
    if config.watchd_threads > 0:
        watcherd = proxywatchd.Proxywatchd()
        watcherd.start()
    else:
        watcherd = None

    while True:
        try:
            ## any site that needs to be checked ?
            rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time<? AND error<?) ORDER BY RANDOM() LIMIT 25', (time.time(), 10)).fetchall() ]
            if len(rows): proxyleech(sqlite,rows)
            ## search for new website during free time
            elif config.search: proxyfind(sqlite)
            ## sleep
            else: time.sleep(10)

        except KeyboardInterrupt:
            ## SIGINT: stop() signals the watcher threads, then finish() is
            ## called from this thread to complete cleanup (per the changelog:
            ## SQLite objects must be used from the thread that created them)
            if watcherd:
                watcherd.stop()
                watcherd.finish()
            break

    ## NOTE(review): Python-2 print statement -- the whole script is Python 2
    ## (see urllib.quote_plus in proxyfind)
    print '\r',
|