diff --git a/config.py b/config.py
index 8a7b618..a898e13 100644
--- a/config.py
+++ b/config.py
@@ -41,5 +41,6 @@ class Config(ComboParser):
 
         section = 'scraper'
         self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
+        self.add_item(section, 'query', str, 'psw', 'build query using Proxies, Search, Websites', False)
 
         self.aparser.add_argument("--file", help="import a single file containing proxy addrs", type=str, default='', required=False)
diff --git a/scraper.py b/scraper.py
index a75e2d9..d851166 100755
--- a/scraper.py
+++ b/scraper.py
@@ -17,24 +17,26 @@ with open('searx.instances') as h:
 print(searx_instances)
 
 def proxyfind(sqlite = None, urignore=None):
-    random.shuffle(searx_instances)
-    proxydb = mysqlite.mysqlite(config.watchd.database,str)
-    proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ]
     search = ''
-    if len(proxies) and random.random() < random.random():
-        search = ' '.join( random.sample(proxies, random.randint(1,3)))
+    random.shuffle(searx_instances)
+    # Only query the proxy list when 'p' is enabled in config.scraper.query.
+    if 'p' in config.scraper.query:
+        proxydb = mysqlite.mysqlite(config.watchd.database,str)
+        proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ]
+        if len(proxies) and random.random() < random.random():
+            search = ' '.join( random.sample(proxies, random.randint(1,3)))
-    if not len(search) or random.random() < random.random():
+    # NOTE: the 'or' clause must be parenthesized — 'and' binds tighter than
+    # 'or', so without parentheses this branch would still fire on the random
+    # coin flip even when 'w' is not enabled in config.scraper.query.
+    if 'w' in config.scraper.query and (not len(search) or random.random() < random.random()):
         if not sqlite:
             sqlite = mysqlite.mysqlite(config.ppf.database,str)
         uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
         if len(uris) > 0 and random.random() < random.random():
             if len(search):
                 search = '%s ' % search
             search = search + 'site:%s' % random.choice(uris).split('/')[2]
-    if not len(search) or random.random() < random.random():
+    # Same precedence fix as above: gate the search-terms branch on 's'.
+    if 's' in config.scraper.query and (not len(search) or random.random() < random.random()):
         if len(search):
             search = '%s ' % search
         search = search + random.choice(search_terms)
+    # All builders disabled (or all declined): nothing to query — bail out.
+    if not len(search): return
     search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ]
     random.shuffle(search_args)
     search_arg = '&'.join(search_args)