Files
ppf/ppf.py
2019-05-14 19:31:19 +02:00

149 lines
4.0 KiB
Python
Executable File

#!/usr/bin/env python
import dbs
import time
import mysqlite
import proxywatchd
from misc import _log
from config import Config
import fetch
import sys
from bs4 import BeautifulSoup
import re
# Shared configuration object. The __main__ block calls config.load() on it
# and hands it to fetch.set_config().
config = Config()
def import_from_file(fn, sqlite):
    """Queue every non-blank line of a text file as a URL.

    fn     -- path of a file holding one URL per line.
    sqlite -- database handle that is passed on to dbs.insert_urls.

    Each line is stripped of surrounding whitespace (which also removes a
    trailing carriage return) and blank lines are skipped.
    """
    with open(fn, 'r') as f:
        urls = [line.strip() for line in f.read().split('\n') if line.strip()]
    # Write through the handle the caller supplied rather than the
    # module-level urldb, which only exists when the file runs as a script.
    # The source label stays the literal 'import.txt'.
    dbs.insert_urls(urls, 'import.txt', sqlite)
def get_content_type(url):
    """Return the lower-cased Content-Type value reported for url, or ''.

    A header-only request is made with fetch.fetch_contents(url, head=True)
    and the returned text is scanned line by line. The first line whose
    field name is "content-type" (any letter case) wins.
    """
    hdr = fetch.fetch_contents(url, head=True)
    for line in hdr.split('\n'):
        # Split on the first colon only: the value itself may contain ':'
        # and the space after the colon is optional.
        name, sep, value = line.partition(':')
        if sep and name.lower() == 'content-type':
            return value.strip().lower()
    return ''
def is_good_content_type(string):
    """Tell whether a Content-Type value names a format worth downloading.

    The check is a case-insensitive substring test against 'text/html',
    'text/plain' and 'atom+xml'.
    """
    lowered = string.lower()
    accepted = ('text/html', 'text/plain', 'atom+xml')
    return any(marker in lowered for marker in accepted)
# Fetch one source URL, extract proxies from it and record the outcome.
#
# Statement order, as written:
#   1. When no content type was passed in, look it up with get_content_type(url).
#   2. Download the page with fetch.fetch_contents only if
#      is_good_content_type accepts the type; otherwise content is ''.
#   3. fetch.extract_proxies(content, proxydb) yields (unique_count, new).
#   4. error / stale_count / retrievals are adjusted, then the matching row of
#      the uris table is updated and committed.
#   5. If new is non-empty it is handed to dbs.insert_proxies(proxydb, new, url).
#
# NOTE(review): this copy of the file has lost its indentation. Which `if`
# the `else:` branches below belong to, and whether the `if content == ''`
# group runs for every url or only inside a branch, cannot be determined
# from here -- confirm against the repository before changing this logic.
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
# Only ask the server for the type when the caller has none stored.
if not content_type: content_type = get_content_type(url)
if is_good_content_type(content_type):
try: content = fetch.fetch_contents(url)
# Let Ctrl-C propagate to the caller ...
except KeyboardInterrupt as e: raise e
# ... but treat every other failure as "nothing fetched".
except: content = ''
else:
content = ''
unique_count, new = fetch.extract_proxies(content, proxydb)
if retrievals == 0: # new site
if content != '' and unique_count == 0: # site works but has zero proxy addresses
# NOTE(review): presumably chosen to exceed config.ppf.max_fail so the
# `error < ?` filter of the __main__ query stops selecting this url -- confirm.
error = 99999
else:
if len(new) == 0:
stale_count += 1
else:
# The page produced new proxies: also harvest its links.
extract_urls(content, url)
stale_count = 0
if content == '':
error += 1
else:
retrievals += 1
error = 0
# Persist the counters; check_time becomes "now" and proxies_added grows by len(new).
urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url))
urldb.commit()
# Nothing new to store.
if not len(new): return
dbs.insert_proxies(proxydb, new, url)
def is_bad_url(uri, domain=None, samedomain=False):
    """Report whether uri matches any pattern of the global urignore list.

    Every urignore entry is used as a regular expression and may match
    anywhere inside uri. The domain and samedomain arguments are accepted
    but not used.
    """
    return any(re.search(pattern, uri) for pattern in urignore)
def extract_urls(html, url):
    """Collect the links found in html and queue them in the URL database.

    html -- markup of the page that was fetched.
    url  -- absolute address the page came from; it supplies the scheme and
            host for relative links and is passed to dbs.insert_urls as
            their source.

    Every <a href=...> value is turned into an absolute URL, filtered through
    is_bad_url, de-duplicated (first occurrence wins) and finally passed to
    dbs.insert_urls together with the module-level urldb handle.
    """
    proto = url.split(':')[0]
    domain = url.split('/')[2]
    # hrefs that can never become a fetchable page
    skipped_prefixes = ('#', 'mailto:', 'javascript:', 'tel:', 'data:')

    urls = []
    seen = set()
    soup = BeautifulSoup(html, features='lxml')
    for a in soup.find_all('a', href=True):
        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
        item = item.strip()
        if not item or item.lower().startswith(skipped_prefixes):
            continue

        if item.startswith('//'):
            # protocol-relative link: reuse the scheme of the page itself
            item = '%s:%s' % (proto, item)
        elif item.startswith('www.'):
            item = 'http://%s' % item
        elif not item.startswith('http'):
            if not item.startswith('/'):
                item = '/%s' % item
            item = '%s://%s%s' % (proto, domain, item)

        # Filter after normalisation so that relative and www. links are
        # checked against the ignore list too, not only absolute ones.
        if is_bad_url(item) or item in seen:
            continue
        seen.add(item)
        urls.append(item)

    if urls:
        dbs.insert_urls(urls, url, urldb)
def import_proxies_from_file(proxydb, fn):
    """Load proxies from a local file into the proxy database.

    proxydb -- database handle passed to fetch.extract_proxies and
               dbs.insert_proxies.
    fn      -- path of the file to read; also passed to dbs.insert_proxies
               as the source of the proxies.

    Returns 0 when at least one new proxy was inserted and 1 otherwise; the
    __main__ block uses the value as the process exit status.
    """
    # Close the handle deterministically instead of leaving it to the GC.
    with open(fn, 'r') as f:
        content = f.read()
    unique_count, new = fetch.extract_proxies(content, proxydb)
    if not new:
        return 1
    dbs.insert_proxies(proxydb, new, fn)
    return 0
if __name__ == '__main__':
config.load()
fetch.set_config(config)
proxydb = mysqlite.mysqlite(config.watchd.database, str)
dbs.create_table_if_not_exists(proxydb, 'proxylist')
with open('urignore.txt', 'r') as f:
urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
urldb = mysqlite.mysqlite(config.ppf.database, str)
dbs.create_table_if_not_exists(urldb, 'uris')
import_from_file('import.txt', urldb)
if len(sys.argv) == 3 and sys.argv[1] == "--file":
sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
# start proxy watcher
if config.watchd.threads > 0:
watcherd = proxywatchd.Proxywatchd()
watcherd.start()
else:
watcherd = None
while True:
try:
## any site that needs to be checked ?
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
if not len(rows): time.sleep(10)
for row in rows:
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4], row[5])
#time.sleep(10)
except KeyboardInterrupt:
if watcherd:
watcherd.stop()
watcherd.finish()
break
print '\r',