# ppf/ppf.py — proxy list fetcher/scraper
# (snapshot 2019-01-06 02:58:58 +00:00, 190 lines, 6.5 KiB, executable;
#  header lines below are the original file — this banner replaces web-UI residue)
#!/usr/bin/env python
import sys
import requests
import random, time
import re
import urllib
import hashlib
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import mysqlite
import proxywatchd
from misc import _log
from soup_parser import soupify
import config
# Browser-like request headers sent with every fetch (spoofed Firefox 60 UA,
# DNT set) so scraped sites treat us as a normal desktop client.
base_header = {
    'Accept-Language':'en-US,en;q=0.8',
    'Cache-Control':'max-age=0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}
# Public searx metasearch instances used by proxyfind() to discover new
# proxy-list pages; one is picked at random per search.
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
# Page snippets meaning searx throttled or failed the query; a response
# containing one is treated as an empty fetch (see fetch_contents).
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def cleanhtml(raw_html):
    """Strip HTML tags from *raw_html*, turning each run of tags into one ':'.

    The ':' separator keeps neighbouring ip:port candidates from fusing
    together before the regex scan in proxyleech().
    """
    # Replace every tag with a ':' marker...
    without_tags = re.sub(r'<.*?>', ':', raw_html)
    # ...then squeeze consecutive markers down to a single ':'.
    return re.sub(r'::+', ':', without_tags)
def import_from_file(fn, sqlite):
    """Seed the ``uris`` table from a newline-separated URL list.

    fn     -- path to a text file, one URL per line (blank lines skipped)
    sqlite -- open DB handle exposing execute()/commit() (sqlite3-compatible)

    URLs already present in ``uris`` are skipped; new ones are inserted with
    check_time=0 (due immediately) and error=1 (original seed values kept).
    """
    with open(fn, 'r') as f:
        # Iterate the file lazily instead of read().split('\n').
        for line in f:
            # strip() matches how search_terms.txt / urignore.txt are loaded,
            # so a trailing CR or space no longer produces a distinct URL.
            u = line.strip()
            if not u:
                continue
            # Existence probe: one matching row is enough to decide (was
            # fetchall() over every match).
            if sqlite.execute('SELECT 1 FROM uris WHERE url=?', (u,)).fetchone():
                continue
            print('adding "%s"' % u)
            sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)',
                           (time.time(), u, 0, 1))
            # Commit per row, as before, so an interrupt keeps earlier adds.
            sqlite.commit()
def fetch_contents(uri):
    """GET *uri* and return its body text, or '' on any failure.

    A body containing one of the searx throttling banners in
    ``retry_messages`` is also reported as '', so callers treat a
    rate-limited response exactly like a failed fetch.

    NOTE(review): relies on the module-level ``proxies`` dict, which is only
    bound by the __main__ section — calling this earlier raises NameError.
    """
    try:
        resp = requests.get(uri, timeout=45, headers=base_header,
                            verify=False, proxies=proxies)
    except KeyboardInterrupt as e:
        raise e
    except:  # best-effort fetch: any network/SSL/decode error means "no data"
        return ''
    body = resp.text
    if any(banner in body for banner in retry_messages):
        return ''
    return body
def insert_proxies(proxies, uri, sqlite):
    """Insert previously-unseen ``ip:port`` proxies into ``proxylist``.

    proxies -- list of candidate proxy strings scraped from *uri*
    uri     -- source URL, used only for the log line
    sqlite  -- open DB handle exposing execute()/executemany()/commit()

    Proxies already present in ``proxylist`` are skipped; new rows get
    failed=3, tested=0 seed values (unchanged from the original).
    """
    if not proxies:
        # Guard: joining zero 'proxy=?' placeholders would emit the invalid
        # SQL "... WHERE " and raise — nothing to do anyway.
        return
    time_now = time.time()
    placeholders = ' OR '.join('proxy=?' for _ in proxies)
    # A set makes the membership test below O(1) per proxy (was a list scan).
    known = { row[0] for row in
              sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % placeholders,
                             proxies).fetchall() }
    new = [ (time_now, p, 3, 0, 0, 0) for p in proxies if p not in known ]
    if new:
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
        sqlite.commit()
        _log('+%d item(s) from %s' % (len(new), uri), 'added')
    # Brief pause to pace repeated batch inserts (original behavior, kept).
    time.sleep(0.1)
def proxyfind(sqlite = None):
    # Discover new proxy-list pages by scraping a random searx instance's
    # result links and queueing unseen URLs into the `uris` table.
    # Relies on module globals bound in the __main__ section: `search_terms`,
    # `urignore`, and (via fetch_contents) `proxies`.
    if not sqlite: sqlite = mysqlite.mysqlite(config.database,str)
    choice = random.choice(searx_instances)
    urls = []
    # Up to 10 random known-good source URLs; "github" hosts excluded.
    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
    # Coin flip (random() < random() is true ~50% of the time): either search
    # "site:<host of a known-good source>" to find sibling pages, or use one
    # of the configured search terms.
    if len(uris) > 0 and random.random() < random.random():
        search = 'site:%s' % random.choice(uris).split('/')[2]
    else:
        search = random.choice(search_terms)
    # NOTE(review): urllib.quote_plus is the Python 2 location; under
    # Python 3 this would need urllib.parse.quote_plus.
    content = fetch_contents('%s/?q=%s&pageno=%d' % (choice, urllib.quote_plus(search), random.randint(0,10)))
    if not content: return
    soup = soupify(content)
    # searx marks external result links with rel="noreferrer"; collect each
    # unseen href unless it matches an `urignore` regex (case-insensitive).
    for a in soup.body.find_all('a'):
        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl): urls.append(a.attrs['href'])
    if len(urls):
        # Queue only URLs not already tracked; seed values are check_time=0
        # (due immediately), error=5, driver=0.
        query = [ 'url=?' for u in urls ]
        known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query),urls).fetchall() ]
        time_now = time.time()
        new = [ (time_now,i,0,5,0) for i in urls if not i in known ]
        if len(new):
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)', new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')
    sqlite.commit()
def is_reserved_ipv4(ip):
    """Return True if dotted-quad *ip* lies in a reserved/private IPv4 range.

    Covers RFC 1918 (10/8, 172.16/12, 192.168/16), loopback (127/8),
    "this network" (0/8) and — newly added — link-local 169.254/16
    (RFC 3927), which frequently appears in scraped pages but is never a
    usable proxy.  A malformed 172.x second octet still raises ValueError,
    matching the original; callers treat any exception as "skip this IP".
    """
    # Inputs come from the r'[0-9]+(\.[0-9]+){3}' scrape in proxyleech, so
    # four octets are expected; split once instead of re-parsing per test.
    octets = ip.split('.')
    if octets[0] in ('10', '127', '0'):
        return True
    if octets[0] == '192' and octets[1] == '168':
        return True
    if octets[0] == '169' and octets[1] == '254':
        return True
    # 172.16.0.0/12 spans second octets 16..31 inclusive.
    if octets[0] == '172' and 16 <= int(octets[1]) <= 31:
        return True
    return False
def proxyleech(sqlite, rows):
    """Fetch each candidate source page and harvest ip:port proxies from it.

    rows -- list of mutable [url, last_hash, error] rows from ``uris``.

    Per source: download, extract the sorted unique non-reserved ip:port
    pairs, then update the source's error counter, content hash and next
    check time:
      * empty result        -> error += 2  (page dead or empty)
      * unchanged page hash -> error += 1  (stale list)
      * new hash            -> error  = 0  (fresh content)
    Proxies are inserted only when a previous hash exists and the error
    counter returned to zero, i.e. when known content actually changed.
    """
    for row in rows:
        try:
            content = fetch_contents(row[0])
        except KeyboardInterrupt as e:
            raise e
        except:  # best-effort: any fetch error is treated as an empty page
            content = ''
        uniques = []
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in uniques: continue
            try:
                if not is_reserved_ipv4(p.split(':')[0]):
                    uniques.append(p)
            except KeyboardInterrupt as e:
                raise e
            except:  # malformed address: skip it
                pass
        # encode() so the digest works on Python 3 too (md5 wants bytes);
        # also avoids shadowing the builtin `hash`.
        digest = hashlib.md5(''.join(uniques).encode('utf-8')).hexdigest()
        if not len(uniques):
            ## empty list of proxies: increment error by two
            ## (fixed: was `row[2] * 2`, which left an error of 0 stuck at 0,
            ## so dead sources never aged past the error<10 selection window)
            row[2] = row[2] + 2
        ## same proxy list: increment error by one
        elif digest == row[1]:
            row[2] = row[2] + 1
        ## proxylist was updated: error is zero
        else:
            row[2] = 0
        # Back off linearly with the error count: 1h base + 1h per error.
        check_time = time.time() + 3600 + (3600 * row[2])
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?',
                       (row[2], digest, check_time, row[0]))
        sqlite.commit()
        # First sighting (no stored hash yet) or stale/dead source: record
        # the hash but insert nothing.  Fixed: was `return`, which aborted
        # the remaining rows of the 25-row batch on the first such source.
        if not row[1] or row[2] > 0: continue
        # Insert in chunks of <=500 so huge pages commit incrementally.
        add = []
        for p in uniques:
            add.append(p)
            if len(add) > 500:
                insert_proxies(add, row[0], sqlite)
                add = []
        if len(add): insert_proxies(add, row[0], sqlite)
if __name__ == '__main__':
    config.load()
    # Route all outbound HTTP(S) through a random local Tor SOCKS endpoint;
    # module-level on purpose: fetch_contents() reads this global.
    proxies={'http':'socks4://%s' % random.choice(config.torhosts),'https':'socks4://%s' % random.choice(config.torhosts)}
    sqlite = mysqlite.mysqlite(config.database, str)
    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, success_count INT, total_duration INT)')
    sqlite.commit()
    # Seed source URLs from a local list; NOTE(review): raises if import.txt
    # is absent — presumably the file always ships alongside the script.
    import_from_file('import.txt', sqlite)
    ## load search terms
    with open('search_terms.txt', 'r') as f:
        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## load bad terms
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## add searx instances as bad terms (avoid loops)
    # (list comprehension used only for its append side effect; result unused)
    empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]
    # start proxy watcher
    if config.watchd_threads > 0:
        watcherd = proxywatchd.Proxywatchd()
        watcherd.start()
    else:
        watcherd = None
    # Main loop: leech due sources, otherwise search for new ones, else idle.
    while True:
        try:
            ## any site that needs to be checked ?
            # Up to 25 random due sources: check_time passed, fewer than 10
            # accumulated errors.  Rows are mutable lists because proxyleech
            # updates the error counter in place.
            rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time<? AND error<?) ORDER BY RANDOM() LIMIT 25', (time.time(), 10)).fetchall() ]
            if len(rows): proxyleech(sqlite,rows)
            ## search for new website during free time
            elif config.search: proxyfind(sqlite)
            ## sleep
            else: time.sleep(10)
        except KeyboardInterrupt:
            if watcherd:
                watcherd.stop()
                watcherd.finish()
            break
    # Python 2 print statement (trailing comma suppresses the newline):
    # overwrites the echoed ^C on the terminal before exiting.
    print '\r',