initial commit
245
ppf.py
Executable file
@@ -0,0 +1,245 @@
#!/usr/bin/env python

import os
import sys
import socket
import requests
import socks
import random, time
import sqlite3
import re
import urllib
import threading
import hashlib
import ipcalc
from soup_parser import soupify
from ConfigParser import SafeConfigParser
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from dns import resolver
#from selenium import webdriver

sys.path.append('./includes')
import mysqlite
from misc import _log

base_header = {
    'Accept-Language':'en-US,en;q=0.8',
    'Cache-Control':'max-age=0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
}

database = 'proxies.sqlite'

searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )

retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
proxies={ 'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050', 'ftp': 'socks5://127.0.0.1:9050'}


# include own classes
sys.path.append('./includes')
import proxywatchd, mysqlite
from misc import _log

CONFIG = 'config.ini'

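## strip HTML tags, replacing them with ':' and collapsing repeats, so
## ip:port pairs can later be grepped out of the remaining plain text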
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ':', raw_html)
    cleantext = re.sub('::+',':', cleantext)
    return cleantext

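## seed the uris table from a newline-separated list of proxy-list URLs,
## skipping entries that are already present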
def import_from_file(fn, sqlite):
    with open(fn, 'r') as f:
        for u in f.read().split('\n'):
            if not len(u): continue
            exists = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE url=?',(u,)).fetchall() ]
            if exists: continue
            print('adding "%s"' % u)
            sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)', (time.time(),u,0,1))
    sqlite.commit()

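## fetch a page through the local Tor SOCKS proxy with requests; the
## PhantomJS branch is only reachable when a driver is requested and the
## selenium import above is re-enabled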
def fetch_contents(uri, driver=None):
    headers = base_header
    ## use requests (default)
    if not driver:
        try: resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies)
        except: return ''
        data = resp.text

    ## phantomjs
    else:
        for key, value in base_header.items():
            capability_key = 'phantomjs.page.customHeaders.{}'.format(key)
            webdriver.DesiredCapabilities.PHANTOMJS[capability_key] = value

        service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
        driver = webdriver.PhantomJS(service_args=service_args)
        try:
            driver.implicitly_wait(45)
            driver.set_page_load_timeout(45)
            driver.get(uri)
            data = driver.page_source

        except: data = ''
        finally: driver.quit()

    for retry_message in retry_messages:
        if retry_message in data: return ''
    return data

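## record which source URL a given proxy was seen on, one table per proxy
## (md5 of the proxy string); currently disabled in insert_proxies()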
def update_proxy_sources(sqlite, proxies, uri):
    for proxy in proxies:
        md5sum = hashlib.md5(proxy).hexdigest()
        sqlite.execute('CREATE TABLE IF NOT EXISTS "%s" (uri TEXT)' % md5sum)
        sqlite.commit()
        #check = [ i for i in sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % md5sum, (uri,)).fetchall() ]
        check = sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % md5sum, (uri,)).fetchall()
        if not len(check):
            sqlite.execute('INSERT INTO "%s" (uri) VALUES(?)' % md5sum, (uri,))
            sqlite.commit()

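## add newly scraped ip:port entries to the proxylist table, skipping
## proxies that are already known; note the local 'proxies' argument
## shadows the global SOCKS proxy dict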
def insert_proxies(proxies, uri, sqlite):
    time_now = time.time()
    added = 0

    ## very wasteful
    #update_proxy_sources(sqlite, proxies, uri)

    query = [ 'proxy=?' for p in proxies ]
    known = [ i[0] for i in sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % ' OR '.join(query), proxies).fetchall() ]
    new = [ (time_now,i,3,0) for i in proxies if not i in known ]
    if len(new):
        added = added + len(new)
        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested) VALUES (?,?,?,?)', new)
        sqlite.commit()

    if added > 0: _log('+%d item(s) from %s' % (added, uri), 'added')

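## discover new proxy-list pages: query a random searx instance either with
## a site: search on a known good source or with a random search term, then
## queue any result links that pass the urignore filter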
def proxyfind(sqlite = None):
    #print('entering proxyfind...')

    if not sqlite: sqlite = mysqlite.mysqlite(database,str)

    uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]

    if len(uris) > 0 and random.random() < random.random():
        search = urllib.quote_plus('site:%s' % random.choice(uris).split('/')[2])
    else:
        search = urllib.quote_plus(random.choice(search_terms))

    choice = random.choice(searx_instances)
    urls = []

    content = fetch_contents('%s/?q=%s&pageno=%d' % (choice, search, random.randint(0,10)))

    if not content: return
    soup = soupify(content)

    for a in soup.body.find_all('a'):
        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
        badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
        if not len(badurl): urls.append(a.attrs['href'])

    if len(urls):
        query = [ 'url=?' for u in urls ]
        known = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE %s' % ' OR '.join(query),urls).fetchall() ]
        time_now = time.time()
        new = [ (time_now,i,0,5,0) for i in urls if not i in known ]
        if len(new):
            sqlite.executemany('INSERT INTO uris (added,url,check_time,error,driver) values(?,?,?,?,?)', new)
            sqlite.commit()
            _log('+%d item(s) from %s' % (len(new), search), 'added')

    sqlite.commit()

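## fetch each queued source page, extract unique public ip:port pairs,
## track whether the list changed via an md5 of the joined result, and
## push new proxies into the proxylist table in batches of 500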
def proxyleech(sqlite, rows):
    #print('entering proxyleech...')

    for row in rows:
        try: content = fetch_contents(row[0], None)
        except: content = ''

        uniques = []
        for p in sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', cleanhtml(content))):
            if p in uniques: continue
            try:
                ## skip RFC1918 addresses, which ipcalc reports as PRIVATE
                if str(ipcalc.Network(p.split(':')[0]).info()) != 'PRIVATE': uniques.append(p)
            except:
                pass

        hash = hashlib.md5(''.join(uniques)).hexdigest()
        #print('unique; hash: %s, len: %d' % (hash, len(uniques)))

        ## empty list of proxies: double the error counter
        if not len(uniques): row[2] = (row[2] * 2)
        ## same proxy list as last time: increment error by one
        elif hash == row[1]: row[2] = (row[2] + 1)
        ## proxylist was updated: error is zero
        else: row[2] = 0

        check_time = (time.time() + 3600 + (3600 * row[2]))
        sqlite.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2],hash, check_time,row[0]))
        sqlite.commit()

        ## first fetch or errors: skip this source, keep processing the batch
        if not row[1] or row[2] > 0: continue

        add = []
        for i in uniques:
            add.append(i)
            if len(add) < 500: continue
            insert_proxies(add, row[0], sqlite)
            add = []
        if len(add): insert_proxies(add, row[0], sqlite)


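## Expected config.ini layout, inferred from the parser.get*() calls below;
## the values shown are placeholders, not part of this commit:
##
##   [global]
##   database = proxies.sqlite
##
##   [proxyfind]
##   search = true
##
##   [watcherd]
##   enabled = false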
if __name__ == '__main__':
    ## read the config file
    parser = SafeConfigParser()
    parser.read(CONFIG)

    database = parser.get('global', 'database')
    search = parser.getboolean('proxyfind', 'search')

    sqlite = mysqlite.mysqlite(database, str)

    ## create dbs if required
    sqlite.execute('CREATE TABLE IF NOT EXISTS uris (added INT, url TEXT, check_time INT, error INT, driver INT, hash TEXT)')
    sqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, dronebl INT, proto TEXT, duration INT)')
    sqlite.commit()

    import_from_file('import.txt', sqlite)

    ## load search terms
    with open('search_terms.txt', 'r') as f:
        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]

    ## load bad terms
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
    ## add searx instances as bad terms (avoid loops)
    empty = [ urignore.append(i.split('/')[2]) for i in searx_instances ]

    # start proxy watcher
    watcherd = proxywatchd.Proxywatchd(CONFIG) if parser.getboolean('watcherd', 'enabled') else None

    while True:
        try:
            ## any site that needs to be checked ?
            rows = [ [i[0],i[1],i[2]] for i in sqlite.execute('SELECT url,hash,error FROM uris WHERE (check_time<? AND error<?) ORDER BY RANDOM() LIMIT 25', (time.time(), 10)).fetchall() ]
            if len(rows): proxyleech(sqlite,rows)
            ## search for new websites during free time
            elif search: proxyfind(sqlite)
            ## sleep
            else: time.sleep(10)

        except KeyboardInterrupt: break

    print '\r',

    # stop things
    if watcherd: watcherd.stop()