diff --git a/ppf.py b/ppf.py
index 0f21089..b9d1be3 100755
--- a/ppf.py
+++ b/ppf.py
@@ -8,6 +8,8 @@ from misc import _log
 from config import Config
 import fetch
 import sys
+from bs4 import BeautifulSoup
+import re
 
 config = Config()
 
@@ -61,6 +63,7 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
     else:
         retrievals += 1
         error = 0
+        extract_urls(content, url)
 
     urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url))
     urldb.commit()
@@ -69,6 +72,51 @@
     dbs.insert_proxies(proxydb, new, url)
 
 
+def is_bad_url(uri):
+    """Return True if *uri* matches any regex pattern in the urignore list."""
+    for pattern in urignore:
+        if re.search(pattern, uri): return True
+    return False
+
+def extract_urls(html, url):
+    """Extract <a href> links from *html*, absolutize relative links against
+    *url*, and queue previously-unseen ones in batches of 200."""
+    proto = url.split(':')[0]
+    domain = url.split('/')[2]
+    urls = []
+
+    soup = BeautifulSoup(html, features='lxml')
+
+    for a in soup.find_all('a', href=True):
+        item = a['href']  # bs4 yields str on Python 3 -- no py2 unicode branch needed
+
+        if is_bad_url(item):
+            continue
+        elif item.startswith('www.'):
+            item = 'http://%s' % item
+        elif not item.startswith('http'):
+            # relative link: absolutize against the source page's scheme/host
+            if not item.startswith('/'): item = '/%s' % item
+            item = '%s://%s%s' % (proto, domain, item)
+
+        if item not in urls: urls.append(item)
+        if len(urls) < 200: continue
+        insert_if_not_exists(urls)
+        urls = []
+
+    if len(urls): insert_if_not_exists(urls)
+
+def insert_if_not_exists(urls):
+    """Insert any of *urls* not already present in the uris table."""
+    mytime = int(time.time())
+    query = 'SELECT url FROM uris WHERE %s' % ' OR '.join([ 'url=?' for u in urls ])
+    known = [ item[0] for item in urldb.execute(query, urls) ]
+    args = [ [mytime, u, (mytime - 3600), 1, 0, 0, 0] for u in urls if u not in known ]
+    if len(args):
+        print('new items: %s' % args)
+        urldb.executemany('INSERT OR IGNORE INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', args)
+        urldb.commit()
+
 def import_proxies_from_file(proxydb, fn):
 
     content = open(fn, 'r').read()
@@ -85,6 +133,9 @@ if __name__ == '__main__':
     proxydb = mysqlite.mysqlite(config.watchd.database, str)
     dbs.create_table_if_not_exists(proxydb, 'proxylist')
 
+    with open('urignore.txt', 'r') as f:
+        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+
     urldb = mysqlite.mysqlite(config.ppf.database, str)
     dbs.create_table_if_not_exists(urldb, 'uris')
     import_from_file('import.txt', urldb)