Files
ppf/ppf.py
mickael d7f79708ca misc minor changes
proxywatchd: remove sleep
ppf: update proxies schemes
2019-01-04 20:23:50 +00:00

206 lines
6.6 KiB
Python
Executable File

#!/usr/bin/env python
"""ppf: proxy page finder/leecher.

Searches searx instances for pages that publish proxy lists, extracts
ip:port pairs from those pages and stores them in a local SQLite DB.
Python 2 script (ConfigParser / urllib.quote_plus / print statement).
"""
import sys
import requests
import random, time
import re
import urllib
import hashlib
import ipcalc
from soup_parser import soupify
from ConfigParser import SafeConfigParser
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Pages are fetched with verify=False (many proxy-list sites have broken
# TLS), so silence the per-request warning spam.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

## include own classes
# NOTE: the path append and the mysqlite/_log imports were duplicated
# twice in the original file; collapsed to a single occurrence here.
sys.path.append('./includes')
import mysqlite
from misc import _log

# Default browser-like headers used for every page fetch.
base_header = {
    'Accept-Language':'en-US,en;q=0.8',
    'Cache-Control':'max-age=0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}
# Default DB file; overridden from config.ini in __main__.
database = 'proxies.sqlite'
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
# Body snippets that mean a searx instance is throttling us -> treat as empty.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
# All outbound traffic goes through the local Tor SOCKS proxy.
proxies={'http':'socks5://127.0.0.1:9050','https':'socks5://127.0.0.1:9050'}
import proxywatchd
CONFIG = 'config.ini'
def cleanhtml(raw_html):
    """Strip HTML tags from *raw_html*.

    Every tag is replaced by a ':' separator, then runs of consecutive
    ':' are collapsed to a single one, so 'ip:port' pairs sitting in
    adjacent table cells stay separated for the regex scan downstream.
    """
    # Non-greedy tag match -> one ':' per tag...
    without_tags = re.compile('<.*?>').sub(':', raw_html)
    # ...then squash repeated separators down to a single ':'.
    return re.sub('::+', ':', without_tags)
def import_from_file(fn, sqlite):
    """Seed the 'uris' table from a plain-text file (one URL per line).

    Blank lines are ignored and URLs already present in the table are
    skipped.  New rows get check_time=0 and error=1.  Commits once at
    the end.
    """
    with open(fn, 'r') as f:
        candidates = [line for line in f.read().split('\n') if len(line)]
    for url in candidates:
        # Skip URLs that are already tracked.
        matches = sqlite.execute('SELECT url FROM uris WHERE url=?', (url,)).fetchall()
        if [row[0] for row in matches]:
            continue
        print('adding "%s"' % url)
        sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)',
                       (time.time(), url, 0, 1))
    sqlite.commit()
def fetch_contents(uri):
    """Fetch *uri* through the Tor SOCKS proxy and return the page body.

    Returns '' on any request failure, and also when the body contains
    one of the searx throttle messages in `retry_messages`, so callers
    treat a rate-limited search instance like a failed fetch.
    """
    try:
        resp = requests.get(uri, timeout=45, headers=base_header,
                            verify=False, proxies=proxies)
    # BUG FIX: was a bare 'except:', which also swallowed
    # KeyboardInterrupt/SystemExit and made the daemon hard to stop
    # while a request was in flight.
    except requests.exceptions.RequestException:
        return ''
    data = resp.text
    for retry_message in retry_messages:
        if retry_message in data:
            return ''
    return data
def insert_proxies(proxies, uri, sqlite):
    """Insert not-yet-known 'ip:port' strings into the 'proxylist' table.

    proxies -- sequence of 'ip:port' strings (shadows the module-level
               Tor proxy dict; kept for caller compatibility)
    uri     -- source URL, used only for the log message
    sqlite  -- DB handle providing execute()/executemany()/commit()

    New rows are stored with failed=3 and tested=0.
    """
    # ROBUSTNESS FIX: with an empty sequence the WHERE clause below would
    # be empty ('... WHERE ') and the SELECT would raise a syntax error.
    if not len(proxies):
        return
    time_now = time.time()
    # One positional placeholder per candidate proxy (parameterized, no
    # string interpolation of user data).
    predicate = ' OR '.join('proxy=?' for p in proxies)
    known = [i[0] for i in sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % predicate, proxies).fetchall()]
    new = [(time_now, i, 3, 0) for i in proxies if not i in known]
    added = len(new)
    if added:
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested) VALUES (?,?,?,?)', new)
        sqlite.commit()
    if added > 0:
        _log('+%d item(s) from %s' % (added, uri), 'added')
def proxyfind(sqlite = None):
    """Discover new candidate proxy-list pages via a random searx search.

    Picks either a 'site:<host>' query from a random known-good URI or a
    random term from the module-global `search_terms`, queries a random
    searx instance, and stores result links not matching any `urignore`
    pattern as new rows in the 'uris' table (check_time=0, error=5).

    NOTE(review): relies on `search_terms` and `urignore`, which are
    only assigned in the __main__ block — calling this before that setup
    raises NameError.
    """
    #print('entering proxyfind...')
    if not sqlite: sqlite = mysqlite.mysqlite(database,str)
    # Sample up to 10 healthy URIs (github excluded) as 'site:' candidates.
    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
    # random.random() < random.random() is a 50/50 coin flip: half the time
    # search by a known host, half the time by a generic search term.
    if len(uris) > 0 and random.random() < random.random():
        search = urllib.quote_plus('site:%s' % random.choice(uris).split('/')[2])
    else:
        search = urllib.quote_plus(random.choice(search_terms))
    choice = random.choice(searx_instances)
    urls = []
    content = fetch_contents('%s/?q=%s&pageno=%d' % (choice, search, random.randint(0,10)))
    if not content: return
    soup = soupify(content)
    # searx marks external result links with rel="noreferrer"; anything
    # else (navigation, pagination) is skipped, as are duplicates.
    for a in soup.body.find_all('a'):
        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
        # Drop links matching any ignore pattern (includes the searx
        # instances themselves, to avoid crawling search results loops).
        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl): urls.append(a.attrs['href'])
    if len(urls):
        # One 'url=?' placeholder per result; parameterized dedup query.
        query = [ 'url=?' for u in urls ]
        known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query),urls).fetchall() ]
        time_now = time.time()
        # New rows start at error=5 so they get a few chances before the
        # error<10 cutoff in the main loop retires them.
        new = [ (time_now,i,0,5,0) for i in urls if not i in known ]
        if len(new):
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)', new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')
    sqlite.commit()
def proxyleech(sqlite, rows):
    """Fetch each candidate page, extract 'ip:port' proxies, store new ones.

    rows -- list of mutable [url, previous_md5_hash, error_count] triples
            selected by the main loop; error_count is updated in place and
            written back to the 'uris' table together with the new content
            hash and the next check_time (1h base + 1h per error).
    """
    for row in rows:
        try:
            content = fetch_contents(row[0])
        except:
            content = ''
        # Unique, sorted, public ip:port pairs found in the de-tagged page.
        uniques = []
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in uniques: continue
            try:
                # Drop RFC1918/private addresses; ipcalc raises on
                # malformed input, which we treat as "skip".
                if str(ipcalc.Network(p.split(':')[0]).subnet) != 'PRIVATE':
                    uniques.append(p)
            except:
                pass
        hash = hashlib.md5(''.join(uniques)).hexdigest()
        ## empty list of proxies: increment error by two
        # BUG FIX: was 'row[2] * 2', which keeps an error count of 0 at 0
        # forever, so a permanently-empty page was retried hourly and
        # never reached the error>=10 retirement threshold.
        if not len(uniques): row[2] = (row[2] + 2)
        ## same proxy list: increment error by one
        elif hash == row[1]: row[2] = (row[2] + 1)
        ## proxylist was updated: error is zero
        else: row[2] = 0
        check_time = (time.time() + 3600 + (3600 * row[2]))
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2], hash, check_time, row[0]))
        sqlite.commit()
        ## first-seen url (no previous hash) or errored: don't harvest yet.
        # BUG FIX: was 'return', which silently abandoned every remaining
        # row in the batch as soon as one row was new or errored.
        if not row[1] or row[2] > 0: continue
        # Insert harvested proxies in batches of 500.
        add = []
        for i in uniques:
            add.append(i)
            if len(add) < 500: continue
            insert_proxies(add, row[0], sqlite)
            add = []
        if len(add): insert_proxies(add, row[0], sqlite)
if __name__ == '__main__':
## read the config files
parser = SafeConfigParser()
parser.read(CONFIG)
database = parser.get('global', 'database')
search = parser.getboolean('proxyfind', 'search')
sqlite = mysqlite.mysqlite(database, str)
## create dbs if required
sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, duration INT)')
sqlite.commit()
import_from_file('import.txt', sqlite)
## load search terms
with open('search_terms.txt', 'r') as f:
search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
## load bad terms
with open('urignore.txt', 'r') as f:
urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
## add searx instances as bad terms (avoid loops)
empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]
# start proxy watcher
watcherd = proxywatchd.Proxywatchd(CONFIG) if parser.getboolean('watcherd', 'enabled') else None
while True:
try:
## any site that needs to be checked ?
rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time<? AND error<?) ORDER BY RANDOM() LIMIT 25', (time.time(), 10)).fetchall() ]
if len(rows): proxyleech(sqlite,rows)
## search for new website during free time
elif search: proxyfind(sqlite)
## sleep
else: time.sleep(10)
except KeyboardInterrupt: break
print '\r',
# stop things
if watcherd: watcherd.stop()