From 2bacf77c8cfb249e63227e20891db94a1165eda8 Mon Sep 17 00:00:00 2001
From: rofl0r
Date: Fri, 18 Jan 2019 22:53:35 +0000
Subject: [PATCH] split ppf into two programs, ppf/scraper

---
 ppf.py     | 45 ++--------------------------------
 scraper.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 43 deletions(-)
 create mode 100755 scraper.py

diff --git a/ppf.py b/ppf.py
index ce21129..f6a87df 100755
--- a/ppf.py
+++ b/ppf.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
 
 import dbs
-import random, time
-import urllib
+import time
 import mysqlite
 import proxywatchd
 from misc import _log
@@ -12,9 +11,6 @@ import sys
 
 config = Config()
 
-searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
-
-
 def import_from_file(fn, sqlite):
 	with open(fn, 'r') as f:
 		for u in f.read().split('\n'):
@@ -25,28 +21,6 @@ def import_from_file(fn, sqlite):
 			sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added) VALUES (?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0))
 	sqlite.commit()
 
-
-def proxyfind(sqlite = None):
-	if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
-
-	uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
-	if len(uris) > 0 and random.random() < random.random():
-		search = 'site:%s' % random.choice(uris).split('/')[2]
-	else:
-		search = random.choice(search_terms)
-
-	search = '%s -intitle:pdf' % search
-	search_args = [ 'category=general', 'time_range=day', 'q=%s' % urllib.quote_plus(search) ]
-	for srx in searx_instances:
-		urls = []
-		random.shuffle(search_args)
-		search_arg = '&'.join(search_args)
-		for x in range(1,10):
-			content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
-			if content: urls = fetch.extract_urls(content, urls, urignore)
-		if len(urls): dbs.insert_urls(urls, search_arg, sqlite)
-
-
 def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
 	try: content = fetch.fetch_contents(url)
 	except KeyboardInterrupt as e: raise e
@@ -97,17 +71,6 @@ if __name__ == '__main__':
 	if len(sys.argv) == 3 and sys.argv[1] == "--file":
 		sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
 
-	if config.ppf.search:
-		## load search terms
-		with open('search_terms.txt', 'r') as f:
-			search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
-		## load bad terms
-		with open('urignore.txt', 'r') as f:
-			urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
-		## add searx instances as bad terms (avoid loops)
-		for i in searx_instances:
-			urignore.append(i.split('/')[2])
-
 	# start proxy watcher
 	if config.watchd.threads > 0:
 		watcherd = proxywatchd.Proxywatchd()
@@ -115,7 +78,6 @@ if __name__ == '__main__':
 	else:
 		watcherd = None
 
-
 	while True:
 		try:
 			## any site that needs to be checked ?
@@ -124,10 +86,7 @@ if __name__ == '__main__':
 
 			for row in rows:
 				proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4])
-			## search for new website during free time
-			if config.ppf.search: proxyfind(urldb)
-			## sleep
-			else: time.sleep(10)
+			time.sleep(10)
 
 		except KeyboardInterrupt:
 			if watcherd:
diff --git a/scraper.py b/scraper.py
new file mode 100755
index 0000000..6f60cbf
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import dbs
+import random, time
+import urllib
+import mysqlite
+import proxywatchd
+from misc import _log
+from config import Config
+import fetch
+import sys
+
+config = Config()
+
+searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
+
+def proxyfind(sqlite = None, urignore=None):
+	if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
+
+	uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
+	if len(uris) > 0 and random.random() < random.random():
+		search = 'site:%s' % random.choice(uris).split('/')[2]
+	else:
+		search = random.choice(search_terms)
+
+	search = '%s -intitle:pdf' % search
+	search_args = [ 'category=general', 'time_range=day', 'q=%s' % urllib.quote_plus(search) ]
+	for srx in searx_instances:
+		urls = []
+		random.shuffle(search_args)
+		search_arg = '&'.join(search_args)
+		for x in range(1,10):
+			content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
+			if content: urls = fetch.extract_urls(content, urls, urignore)
+		if len(urls): dbs.insert_urls(urls, search_arg, sqlite)
+
+
+def load_urignore():
+	## load bad terms
+	with open('urignore.txt', 'r') as f:
+		urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+	## add searx instances as bad terms (avoid loops)
+	for i in searx_instances:
+		urignore.append(i.split('/')[2])
+	return urignore
+
+
+if __name__ == '__main__':
+	config.load()
+	fetch.set_config(config)
+
+	proxydb = mysqlite.mysqlite(config.watchd.database, str)
+	dbs.create_table_if_not_exists(proxydb, 'proxylist')
+
+	urldb = mysqlite.mysqlite(config.ppf.database, str)
+	dbs.create_table_if_not_exists(urldb, 'uris')
+
+	## load search terms
+	with open('search_terms.txt', 'r') as f:
+		search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+
+	urignore = load_urignore()
+
+	while True:
+		try:
+			proxyfind(urldb, urignore)
+
+		except KeyboardInterrupt:
+
+			break
+
+	print '\r',