diff --git a/proxywatchd.py b/proxywatchd.py
index 47f9b28..c410f93 100644
--- a/proxywatchd.py
+++ b/proxywatchd.py
@@ -83,10 +83,7 @@ class WorkerJob():
 		]
 		try:
-			if self.isoldies:
-				sock = rocksock.Rocksock(host=srv, port=server_port, ssl=use_ssl, proxies=proxies, timeout=config.watchd.timeout - 2)
-			else:
-				sock = rocksock.Rocksock(host=srv, port=server_port, ssl=use_ssl, proxies=proxies, timeout=config.watchd.timeout)
+			sock = rocksock.Rocksock(host=srv, port=server_port, ssl=use_ssl, proxies=proxies, timeout=config.watchd.timeout)
 			sock.connect()
 			sock.send('NICK\n')
 			return sock, proto, duration, torhost, srvname, 0
@@ -276,11 +273,13 @@ class Proxywatchd():
 		q = 'SELECT proxy,proto,failed,success_count,total_duration,country FROM proxylist WHERE failed >= ? and failed < ? and (tested + ? + (failed * ?)) < ? ORDER BY RANDOM()'
 		rows = self.mysqlite.execute(q, (0, config.watchd.max_fail, config.watchd.checktime, config.watchd.perfail_checktime, time.time())).fetchall()
 		# check oldies ?
-		if len(rows) < config.watchd.threads and config.watchd.oldies:
-			self.isoldies = True
-			## disable tor safeguard for old proxies
-			if self.tor_safeguard: self.tor_safeguard = False
-			rows = self.mysqlite.execute(q, (config.watchd.max_fail, config.watchd.max_fail*2, config.watchd.checktime, config.watchd.oldies_checktime, time.time())).fetchall()
+		if len(rows) < config.watchd.threads:
+			rows = []
+			if config.watchd.oldies:
+				self.isoldies = True
+				## disable tor safeguard for old proxies
+				if self.tor_safeguard: self.tor_safeguard = False
+				rows = self.mysqlite.execute(q, (config.watchd.max_fail, config.watchd.max_fail*2, config.watchd.checktime, config.watchd.oldies_checktime, time.time())).fetchall()
 		return rows
 
 	def prepare_jobs(self):
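Two behavioural notes on the proxywatchd.py hunks above: the oldies-specific `timeout - 2` shortcut is gone (every check now uses `config.watchd.timeout`), and a fresh batch smaller than the thread count is now discarded (`rows = []`) before the optional oldies refill, where previously such a partial batch was returned as-is whenever `config.watchd.oldies` was disabled. A minimal sketch of the new two-tier selection, assuming a plain sqlite3 handle; the table and column names are taken from the query in the diff, the constants are made-up stand-ins for the `config.watchd.*` values:

    import sqlite3, time

    THREADS, MAX_FAIL = 4, 5                    # stand-ins for config.watchd.*
    CHECKTIME, PERFAIL_CHECKTIME, OLDIES_CHECKTIME = 600, 60, 3600
    OLDIES = True

    def fetch_rows(db):
        q = ('SELECT proxy,proto,failed,success_count,total_duration,country '
             'FROM proxylist WHERE failed >= ? and failed < ? '
             'and (tested + ? + (failed * ?)) < ? ORDER BY RANDOM()')
        # tier 1: fresh proxies (failed < MAX_FAIL, short recheck interval)
        rows = db.execute(q, (0, MAX_FAIL, CHECKTIME,
                              PERFAIL_CHECKTIME, time.time())).fetchall()
        if len(rows) < THREADS:
            # not enough fresh work to keep every thread busy: drop the
            # partial batch instead of returning it
            rows = []
            if OLDIES:
                # tier 2: oldies (MAX_FAIL <= failed < 2*MAX_FAIL), retested
                # on the longer oldies interval
                rows = db.execute(q, (MAX_FAIL, MAX_FAIL * 2, CHECKTIME,
                                      OLDIES_CHECKTIME, time.time())).fetchall()
        return rows

    # usage: rows = fetch_rows(sqlite3.connect('watchd.db'))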
diff --git a/scraper.py b/scraper.py
index b8b036b..adf2c68 100755
--- a/scraper.py
+++ b/scraper.py
@@ -13,78 +13,82 @@ import sys
 config = Config()
 
 with open('searx.instances') as h:
-    searx_instances = [ line.strip() for line in h.readlines() if line.lower().startswith('http') ]
-    print(searx_instances)
+	searx_instances = [ line.strip() for line in h.readlines() if line.lower().startswith('http') ]
+	print(searx_instances)
 
 def proxyfind(sqlite = None, urignore=None):
-    search = ''
-    random.shuffle(searx_instances)
-    if 'p' in config.scraper.query:
-        proxydb = mysqlite.mysqlite(config.watchd.database,str)
-        proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ]
-        if len(proxies) and random.random() < random.random():
-            search = ' '.join( random.sample(proxies, random.randint(1,2)))
+	search = ''
+	random.shuffle(searx_instances)
 
-    if 'w' in config.scraper.query and not len(search) or random.random() < random.random():
-        if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
-        uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
-        if len(uris) > 0 and random.random() < random.random():
-            if len(search): search = '%s OR ' % search
-            search = search + 'site:%s' % random.choice(uris).split('/')[2]
+	## search by working proxy
+	if 'p' in config.scraper.query:
+		proxydb = mysqlite.mysqlite(config.watchd.database,str)
+		proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ]
+		if len(proxies) and random.random() < random.random():
+			search = ' '.join( random.sample(proxies, random.randint(1,2)))
 
-    if 's' in config.scraper.query and not len(search) or random.random() < random.random():
-        if len(search): search = '%s OR ' % search
-        search = search + random.choice(search_terms)
+	## search by relative url
+	if 'w' in config.scraper.query and not len(search) or random.random() < random.random():
+		if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
+		uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ]
+		if len(uris) > 0 and random.random() < random.random():
+			if len(search): search = '%s OR ' % search
+			search = search + 'site:%s' % random.choice(uris).split('/')[2]
 
-    if not len(search): return
-    search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ]
-    random.shuffle(search_args)
-    search_arg = '&'.join(search_args)
+	## build string
+	if 's' in config.scraper.query and not len(search) or random.random() < random.random():
+		if len(search): search = '%s OR ' % search
+		search = search + random.choice(search_terms)
 
-    if config.scraper.debug:
-        print('search_arg: %s' % search_arg)
+	if not len(search): return
+	search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ]
+	random.shuffle(search_args)
+	search_arg = '&'.join(search_args)
 
-    for srx in searx_instances:
-        x = 0
-        while 1:
-            urls = []
-            if x > 0: content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
-            else: content = fetch.fetch_contents('%s/?%s' % (srx,search_arg))
-            if content: urls = fetch.extract_urls(content, urls, urignore)
+	if config.scraper.debug:
+		print('search_arg: %s' % search_arg)
 
-            if not len(urls): break
-            dbs.insert_urls(urls, '%s/?%s (pageno: %d)' % (srx.split('/')[2],search_arg,x) , sqlite)
-            x = x + 1
+	for srx in searx_instances:
+		x = 0
+		while 1:
+			urls = []
+			if x > 0: content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x))
+			else: content = fetch.fetch_contents('%s/?%s' % (srx,search_arg))
+			if content: urls = fetch.extract_urls(content, urls, urignore)
+
+			if not len(urls): break
+			dbs.insert_urls(urls, '%s/?%s (pageno: %d)' % (srx.split('/')[2],search_arg,x) , sqlite)
+			x = x + 1
 
 def load_urignore():
-    ## load bad terms
-    with open('urignore.txt', 'r') as f:
-        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
-    ## add searx instances as bad terms (avoid loops)
-    for i in searx_instances:
-        urignore.append(i.split('/')[2])
-    return urignore
+	## load bad terms
+	with open('urignore.txt', 'r') as f:
+		urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+	## add searx instances as bad terms (avoid loops)
+	for i in searx_instances:
+		urignore.append(i.split('/')[2])
+	return urignore
 
 if __name__ == '__main__':
-    config.load()
-    fetch.set_config(config)
+	config.load()
+	fetch.set_config(config)
 
-    proxydb = mysqlite.mysqlite(config.watchd.database, str)
-    dbs.create_table_if_not_exists(proxydb, 'proxylist')
+	proxydb = mysqlite.mysqlite(config.watchd.database, str)
+	dbs.create_table_if_not_exists(proxydb, 'proxylist')
 
-    urldb = mysqlite.mysqlite(config.ppf.database, str)
-    dbs.create_table_if_not_exists(urldb, 'uris')
+	urldb = mysqlite.mysqlite(config.ppf.database, str)
+	dbs.create_table_if_not_exists(urldb, 'uris')
 
-    ## load search terms
-    with open('search_terms.txt', 'r') as f:
-        search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+	## load search terms
+	with open('search_terms.txt', 'r') as f:
+		search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
 
-    urignore = load_urignore()
+	urignore = load_urignore()
 
-    while True:
-        try: proxyfind(urldb, urignore)
-        except KeyboardInterrupt: break
+	while True:
+		try: proxyfind(urldb, urignore)
+		except KeyboardInterrupt: break
 
-    print '\r',
+	print '\r',
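One reading note on the rewritten guards in proxyfind(): Python's `and` binds tighter than `or`, so `'w' in config.scraper.query and not len(search) or random.random() < random.random()` parses as `('w' in ... and not len(search)) or (coin flip)`, meaning the coin flip alone can enable a block even when its letter was never requested. The diff keeps this shape, so it is presumably deliberate randomization, but it is easy to misread; a tiny self-contained demonstration (all values hypothetical):

    import random

    query, search = 'ps', ''   # 'w' deliberately absent from the query config

    hits = 0
    for _ in range(10000):
        # as written in proxyfind(); random.random() < random.random()
        # is true roughly half the time
        if 'w' in query and not len(search) or random.random() < random.random():
            hits += 1
    print(hits)  # ~5000: the 'w' branch fires although 'w' was not requested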
diff --git a/search_terms.txt b/search_terms.txt
index f1e1966..19ea398 100644
--- a/search_terms.txt
+++ b/search_terms.txt
@@ -1,6 +1,8 @@
-site:github.com :8081 :8888 :8080
-site:github.com :4444 :1234 :3124
-site:github.com proxylist
+elite proxylist
+elite http proxies
+elite socks proxies
+anonymous proxies
+anonymous proxylist
 hourly http proxy
 hourly socks proxy
 daily http proxy
@@ -11,3 +13,7 @@ updated http proxy list
 updated socks proxy list
 download http proxy
 download socks proxy
+доверенное лицо
+свежий список прокси
+http прокси
+socks прокси
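On the search_terms.txt rewrite: the GitHub port dorks are replaced by generic proxy-list phrases plus four Russian terms: свежий список прокси ("fresh proxy list"), http прокси ("http proxy"), socks прокси ("socks proxy"), and доверенное лицо, which means "authorized/trusted person" (a proxy in the legal sense) and may therefore pull in off-topic results. Because scraper.py loads the file with a plain `open()`, the Cyrillic terms reach `urllib.quote_plus()` as raw UTF-8 byte strings, which Python 2 percent-encodes byte by byte; a quick sanity check:

    # -*- coding: utf-8 -*-
    import urllib  # Python 2, matching scraper.py

    term = 'свежий список прокси'  # a UTF-8 byte string, as read from the file
    print(urllib.quote_plus(term))
    # %D1%81%D0%B2%D0%B5%D0%B6%D0%B8%D0%B9+%D1%81%D0%BF%D0%B8%D1%81%D0%BE%D0%BA+%D0%BF%D1%80%D0%BE%D0%BA%D1%81%D0%B8

    # NB: quote_plus() on a *unicode* object containing non-ASCII raises
    # KeyError on Python 2, so keep reading search_terms.txt as bytes.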