201 lines
6.7 KiB
Python
Executable File
201 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
import sys
|
|
import requests
|
|
import random, time
|
|
import re
|
|
import urllib
|
|
import hashlib
|
|
from ConfigParser import SafeConfigParser
|
|
from requests.packages.urllib3.exceptions import InsecureRequestWarning
|
|
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
|
|
import mysqlite
|
|
import proxywatchd
|
|
from misc import _log
|
|
from soup_parser import soupify
|
|
|
|
## Default HTTP headers for every outbound fetch; impersonates a desktop
## Firefox so proxy-list sites and searx instances serve normal HTML.
base_header = {
    'Accept-Language':'en-US,en;q=0.8',
    'Cache-Control':'max-age=0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}

## Public searx metasearch instances queried by proxyfind() to discover
## new proxy-list websites.
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )

## Response fragments that mean "try again later"; fetch_contents() treats
## a page containing any of these as an empty result.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

## Path of the ini file read at startup by the __main__ block.
CONFIG = 'config.ini'
|
|
|
|
def cleanhtml(raw_html):
    """Strip HTML tags from *raw_html*, replacing each tag with ':'.

    Consecutive colons are then collapsed to a single ':' so the result
    can be scanned for ip:port tokens without empty separators.
    """
    without_tags = re.sub(r'<.*?>', ':', raw_html)
    return re.sub(r':{2,}', ':', without_tags)
|
|
|
|
def import_from_file(fn, sqlite):
    """Seed the uris table from *fn* (one URL per line).

    Blank lines and URLs already present in the table are skipped; each
    new URL is inserted with check_time=0 and error=1 and committed.
    """
    with open(fn, 'r') as handle:
        lines = handle.read().split('\n')
    for url in lines:
        if not url:
            continue
        matches = sqlite.execute('SELECT url FROM uris WHERE url=?', (url,)).fetchall()
        if [row[0] for row in matches]:
            continue
        print('adding "%s"' % url)
        sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)',
                       (time.time(), url, 0, 1))
        sqlite.commit()
|
|
|
|
def fetch_contents(uri):
    """Fetch *uri* through the configured SOCKS proxies and return its body.

    Returns '' on any transport/HTTP error or when the page contains one
    of the searx retry messages (retry_messages), so callers can treat
    '' uniformly as "no usable content".
    """
    # copy the shared template so per-request mutation can never leak into it
    headers = dict(base_header)
    try:
        # `proxies` is the module-global socks4 proxy map built in __main__
        resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies)
    except requests.RequestException:
        # was a bare `except:` — narrowed to transport errors so genuine
        # programming errors (NameError, TypeError, ...) are no longer hidden
        return ''
    data = resp.text

    for retry_message in retry_messages:
        if retry_message in data:
            return ''

    return data
|
|
|
|
def insert_proxies(proxies, uri, sqlite):
    """Insert the proxy strings from *proxies* not already in proxylist.

    New entries are committed with failed=3 / tested=0, a line is logged
    crediting *uri* as the source, and a short sleep throttles bursts.
    """
    now = time.time()

    placeholders = ' OR '.join('proxy=?' for _ in proxies)
    rows = sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % placeholders, proxies).fetchall()
    known = set(row[0] for row in rows)
    fresh = [(now, p, 3, 0) for p in proxies if p not in known]

    if fresh:
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested) VALUES (?,?,?,?)', fresh)
        sqlite.commit()
        _log('+%d item(s) from %s' % (len(fresh), uri), 'added')
        time.sleep(0.1)
|
|
|
|
def proxyfind(sqlite = None):
    """Query a random searx instance and harvest result links into uris.

    Roughly half the time (when healthy URLs exist) it searches for more
    pages on a host we already know; otherwise it picks a random term
    from search_terms.  Links carrying rel=noreferrer that don't match
    any urignore pattern are inserted as new candidate proxy-list URLs.
    """
    if not sqlite:
        sqlite = mysqlite.mysqlite(database, str)

    candidates = [row[0] for row in sqlite.execute(
        'SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall()]

    # random.random() < random.random() is a ~50% coin flip between the
    # "revisit a known host" and "fresh search term" strategies
    if candidates and random.random() < random.random():
        host = random.choice(candidates).split('/')[2]
        search = urllib.quote_plus('site:%s' % host)
    else:
        search = urllib.quote_plus(random.choice(search_terms))

    instance = random.choice(searx_instances)
    harvested = []

    content = fetch_contents('%s/?q=%s&pageno=%d' % (instance, search, random.randint(0, 10)))

    if not content:
        return
    soup = soupify(content)

    for anchor in soup.body.find_all('a'):
        attrs = anchor.attrs
        if 'rel' not in attrs or 'noreferrer' not in attrs['rel'] or attrs['href'] in harvested:
            continue
        href = attrs['href']
        blocked = [pat for pat in urignore if re.findall(pat, href, re.IGNORECASE)]
        if not blocked:
            harvested.append(href)

    if harvested:
        placeholders = ' OR '.join('url=?' for _ in harvested)
        known = [row[0] for row in sqlite.execute(
            'SELECT url FROM uris WHERE %s' % placeholders, harvested).fetchall()]
        now = time.time()
        fresh = [(now, u, 0, 5, 0) for u in harvested if u not in known]
        if fresh:
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)', fresh)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(fresh), search), 'added')
            time.sleep(0.1)

    sqlite.commit()
|
|
|
|
def is_reserved_ipv4(ip):
    """Return True if *ip* (dotted-quad string) is in a reserved range.

    Covers 0.0.0.0/8, 10.0.0.0/8, 127.0.0.0/8, 172.16.0.0/12 and
    192.168.0.0/16 — the same ranges as before.  May raise ValueError /
    IndexError on malformed input; callers wrap this in try/except.
    """
    # startswith accepts a tuple: one C-level call replaces the `or` chain
    if ip.startswith(("10.", "192.168.", "127.", "0.")):
        return True
    if ip.startswith("172."):
        # parse the second octet once instead of twice
        second_octet = int(ip.split(".")[1])
        return 16 <= second_octet <= 31
    return False
|
|
|
|
def proxyleech(sqlite, rows):
    """Re-fetch each proxy-list URL in *rows* and harvest new proxies.

    Each row is a mutable [url, previous_md5_hash, error_count] list.
    The error counter and content hash are written back to the uris
    table, and fresh proxies are batch-inserted via insert_proxies().
    """
    #print('entering proxyleech...')

    for row in rows:
        # best-effort fetch; any failure is treated as an empty page
        try: content = fetch_contents(row[0])
        except: content = ''

        # collect unique, sorted ip:port tokens, dropping reserved/private IPs
        uniques = []
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in uniques: continue
            try:
                if not is_reserved_ipv4(p.split(':')[0]): uniques.append(p)
            except:
                # malformed token (e.g. out-of-range octet): just skip it
                pass

        # fingerprint of this page's proxy list, compared to the stored hash
        # NOTE(review): Python 2 — md5 of a str; py3 would need .encode()
        hash = hashlib.md5(''.join(uniques)).hexdigest()
        #print('unique; hash: %s, len: %d' % (hash, len(uniques)))

        ## empty list of proxies: increment error by two
        # NOTE(review): code actually *doubles* the counter (0 stays 0),
        # which disagrees with the original comment above — confirm intent.
        if not len(uniques): row[2] = (row[2] * 2)
        ## same proxy list: increment error by one
        elif hash == row[1]: row[2] = (row[2] + 1)
        ## proxylist was updated: error is zero
        else: row[2] = 0

        # back off: recheck in 1h plus one extra hour per accumulated error
        check_time = (time.time() + 3600 + (3600 * row[2]))
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2],hash, check_time,row[0]))
        sqlite.commit()

        # skip insertion for first-seen URLs (no stored hash) or erroring ones.
        # NOTE(review): this is `return`, not `continue` — it aborts the whole
        # batch and leaves the remaining rows unprocessed; confirm intentional.
        if not row[1] or row[2] > 0: return

        # insert harvested proxies in chunks of ~500 to bound the SELECT size
        # built inside insert_proxies()
        add = []
        for i in uniques:
            add.append(i)
            if len(add) > 500:
                insert_proxies(add, row[0], sqlite)
                add = []
                time.sleep(0.1)
        if len(add): insert_proxies(add, row[0], sqlite)
|
|
|
|
|
|
if __name__ == '__main__':
    ## read the config files
    parser = SafeConfigParser()
    parser.read(CONFIG)

    # global settings consumed by proxyfind()/fetch_contents() below
    database = parser.get('global', 'database')
    search = parser.getboolean('proxyfind', 'search')
    tor_hosts = parser.get('global', 'tor_host').split(',')
    # route all outbound requests through a randomly chosen socks4 host;
    # chosen once at startup, independently for http and https
    proxies={'http':'socks4://%s' % random.choice(tor_hosts),'https':'socks4://%s' % random.choice(tor_hosts)}

    sqlite = mysqlite.mysqlite(database, str)

    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, duration INT)')
    sqlite.commit()

    # seed the url table from a local file of known proxy-list sites
    import_from_file('import.txt', sqlite)

    ## load search terms
    with open('search_terms.txt', 'r') as f:
        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]

    ## load bad terms
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## add searx instances as bad terms (avoid loops)
    # (list comprehension used only for its append side effect)
    empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]

    # start proxy watcher
    watcherd = proxywatchd.Proxywatchd(CONFIG) if parser.getboolean('watcherd', 'enabled') else None

    # main loop: leech URLs that are due, otherwise discover new sites,
    # otherwise idle; Ctrl-C exits cleanly
    while True:
        try:
            ## any site that needs to be checked ?
            rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time<? AND error<?) ORDER BY RANDOM() LIMIT 25', (time.time(), 10)).fetchall() ]
            if len(rows): proxyleech(sqlite,rows)
            ## search for new website during free time
            elif search: proxyfind(sqlite)
            ## sleep
            else: time.sleep(10)

        except KeyboardInterrupt: break

    # Python 2 print statement: overwrite the '^C' echo before shutdown
    print '\r',

    # stop things
    if watcherd: watcherd.stop()
|