diff --git a/config.py b/config.py index 8b028f3..7ba49b8 100644 --- a/config.py +++ b/config.py @@ -30,16 +30,16 @@ class Config(ComboParser): self.add_item(section, 'tor_safeguard', bool, True, 'enable tor safeguard (default: True)', False) section = 'httpd' - self.add_item(section, 'listenip', str, '127.0.0.1', 'address for the httpd to listen to (default: 127.0.0.1)', True) - self.add_item(section, 'port', int, 8081, 'port for the httpd to listen to (default: 8081)', True) - self.add_item(section, 'enabled', bool, False, 'start httpd (default: False)', True) + self.add_item(section, 'listenip', str, '127.0.0.1', 'address for the httpd to listen to (default: 127.0.0.1)', True) + self.add_item(section, 'port', int, 8081, 'port for the httpd to listen to (default: 8081)', True) + self.add_item(section, 'enabled', bool, False, 'start httpd (default: False)', True) section = 'ppf' self.add_item(section, 'debug', bool, False, 'whether to print additional debug info', False) self.add_item(section, 'search', bool, True, 'whether to use searx search engine to find new proxy lists', False) self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False) self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False) - self.add_item(section, 'threads', int, 1, 'number of threads to run (default: 1)', False) + self.add_item(section, 'threads', int, 1, 'number of threads to run (default: 1)', False) self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False) self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False) self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False) @@ -52,32 +52,32 @@ class Config(ComboParser): self.aparser.add_argument("--file", help="import a single file containing proxy addrs", type=str, default='', required=False) - section = 'flood' - self.add_item(section, 'server', str, None, 'irc server address', False) - self.add_item(section, 'target', str, None, 'target to flood', False) - self.add_item(section, 'nickserv', str, 'nickserv', "nickserv's nickname", False) - self.add_item(section, 'message', str, None, 'message', False) - self.add_item(section, 'threads', int, 1, '# of threads', False) - self.add_item(section, 'register', int, 0, 'register nickname when required', False) + section = 'flood' + self.add_item(section, 'server', str, None, 'irc server address', False) + self.add_item(section, 'target', str, None, 'target to flood', False) + self.add_item(section, 'nickserv', str, 'nickserv', "nickserv's nickname", False) + self.add_item(section, 'message', str, None, 'message', False) + self.add_item(section, 'threads', int, 1, '# of threads', False) + self.add_item(section, 'register', int, 0, 'register nickname when required', False) - self.add_item(section, 'wait', int, 0, 'wait prior sending messages', False) - self.add_item(section, 'once', int, 0, 'quit as soon as possible', False) - self.add_item(section, 'hilight', int, 0, 'try to hilight all nicks?', False) - self.add_item(section, 'waitonsuccess', int, 0, 'wait for a while on success', False) - self.add_item(section, 'debug', int, 0, 'use debug', False) - self.add_item(section, 'duration', int, 180, 'maximum time to run', False) - self.add_item(section, 'delay', str, 14400, 'if waitonsuccess, wait for $delay before sending other bots', False) - self.add_item(section, 'nick', str, None, 'specify nickname to use', False) - self.add_item(section, 'use_ssl', int, 2, 'Use ssl? (0: false, 1: true, 2: random)', False) - self.add_item(section, 'cycle', int, 0, 'cycle flood', False) - self.add_item(section, 'change_nick', int, 0, 'Change nick between messages (useful when flooding privates)', False) - self.add_item(section, 'use_timeout', int, 0, 'make connexions quit through timeout', False) - self.add_item(section, 'clones', int, 1, 'Number of connexion repeat to run', False) - self.add_item(section, 'query', bool, False, 'also flood in query', False) - self.add_item(section, 'noquerybefore', int, 10, 'do not send query before x secs being connected', False) - self.add_item(section, 'oper', bool, False, 'piss of opers', False) - self.add_item(section, 'whois', bool, False, 'piss of opers with /whois', False) - self.add_item(section, 'modex', bool, False, 'make +/- x mode', False) - self.add_item(section, 'os', bool, False, 'piss off opers with /os', False) - self.add_item(section, 'file', str, None, 'read flood content from file', False) - self.add_item(section, 'failid', str, None, 'generate nickserv warn. about IDENTIFY attempts', False) + self.add_item(section, 'wait', int, 0, 'wait prior sending messages', False) + self.add_item(section, 'once', int, 0, 'quit as soon as possible', False) + self.add_item(section, 'hilight', int, 0, 'try to hilight all nicks?', False) + self.add_item(section, 'waitonsuccess', int, 0, 'wait for a while on success', False) + self.add_item(section, 'debug', int, 0, 'use debug', False) + self.add_item(section, 'duration', int, 180, 'maximum time to run', False) + self.add_item(section, 'delay', str, 14400, 'if waitonsuccess, wait for $delay before sending other bots', False) + self.add_item(section, 'nick', str, None, 'specify nickname to use', False) + self.add_item(section, 'use_ssl', int, 2, 'Use ssl? (0: false, 1: true, 2: random)', False) + self.add_item(section, 'cycle', int, 0, 'cycle flood', False) + self.add_item(section, 'change_nick', int, 0, 'Change nick between messages (useful when flooding privates)', False) + self.add_item(section, 'use_timeout', int, 0, 'make connexions quit through timeout', False) + self.add_item(section, 'clones', int, 1, 'Number of connexion repeat to run', False) + self.add_item(section, 'query', bool, False, 'also flood in query', False) + self.add_item(section, 'noquerybefore', int, 10, 'do not send query before x secs being connected', False) + self.add_item(section, 'oper', bool, False, 'piss of opers', False) + self.add_item(section, 'whois', bool, False, 'piss of opers with /whois', False) + self.add_item(section, 'modex', bool, False, 'make +/- x mode', False) + self.add_item(section, 'os', bool, False, 'piss off opers with /os', False) + self.add_item(section, 'file', str, None, 'read flood content from file', False) + self.add_item(section, 'failid', str, None, 'generate nickserv warn. about IDENTIFY attempts', False) diff --git a/fetch.py b/fetch.py index 1750d08..be4c829 100644 --- a/fetch.py +++ b/fetch.py @@ -6,119 +6,119 @@ from misc import _log config = None def set_config(cfg): - global config - config = cfg + global config + config = cfg cleanhtml_re = [ - re.compile('<.*?>'), - re.compile('\s+'), - re.compile('::+'), + re.compile('<.*?>'), + re.compile('\s+'), + re.compile('::+'), ] def cleanhtml(raw_html): - html = raw_html.replace(' ', ' ') - html = re.sub(cleanhtml_re[0], ':', html) - html = re.sub(cleanhtml_re[1], ':', html) - html = re.sub(cleanhtml_re[2], ':', html) - return html + html = raw_html.replace(' ', ' ') + html = re.sub(cleanhtml_re[0], ':', html) + html = re.sub(cleanhtml_re[1], ':', html) + html = re.sub(cleanhtml_re[2], ':', html) + return html retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded') def fetch_contents(url, head = False): - host, port, ssl, uri = _parse_url(url) - headers=[ - 'Accept-Language: en-US,en;q=0.8', - 'Cache-Control: max-age=0', - ] - if config.ppf.debug: - _log("connecting to %s... (header: %s)" % (url, str(head)), "debug") - while True: - proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))] - http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0') - if not http.connect(): - _log("failed to connect to %s"%url, "ppf") - e = http.get_last_rocksock_exception() - if not e: - return '' - et = e.get_errortype() - ee = e.get_error() - ef = e.get_failedproxy() - if et == rocksock.RS_ET_OWN and \ - ee == rocksock.RS_E_TARGET_CONN_REFUSED \ - and ef == 0: - _log("could not connect to proxy 0 - check your connection", "error") - time.sleep(5) - continue - return '' - break + host, port, ssl, uri = _parse_url(url) + headers=[ + 'Accept-Language: en-US,en;q=0.8', + 'Cache-Control: max-age=0', + ] + if config.ppf.debug: + _log("connecting to %s... (header: %s)" % (url, str(head)), "debug") + while True: + proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))] + http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0') + if not http.connect(): + _log("failed to connect to %s"%url, "ppf") + e = http.get_last_rocksock_exception() + if not e: + return '' + et = e.get_errortype() + ee = e.get_error() + ef = e.get_failedproxy() + if et == rocksock.RS_ET_OWN and \ + ee == rocksock.RS_E_TARGET_CONN_REFUSED \ + and ef == 0: + _log("could not connect to proxy 0 - check your connection", "error") + time.sleep(5) + continue + return '' + break - ## only request header - if head: - hdr = http.head(uri, headers) - return hdr + ## only request header + if head: + hdr = http.head(uri, headers) + return hdr - hdr, res = http.get(uri, headers) - res = res.encode('utf-8') if isinstance(res, unicode) else res - for retry_message in retry_messages: - if retry_message in res: return '' + hdr, res = http.get(uri, headers) + res = res.encode('utf-8') if isinstance(res, unicode) else res + for retry_message in retry_messages: + if retry_message in res: return '' - return res + return res def valid_port(port): - return port > 0 and port < 65535 + return port > 0 and port < 65535 def is_usable_proxy(proxy): - ip, port = proxy.split(':') - if not valid_port(int(port)): return False + ip, port = proxy.split(':') + if not valid_port(int(port)): return False - octets = ip.split('.') - A = int(octets[0]) - B = int(octets[1]) - C = int(octets[2]) - D = int(octets[3]) + octets = ip.split('.') + A = int(octets[0]) + B = int(octets[1]) + C = int(octets[2]) + D = int(octets[3]) - if (A < 1 or A > 254 or \ - B > 255 or C > 255 or D > 255) or \ - (A == 10 or A == 127) or \ - (A == 192 and B == 168) or \ - (A == 172 and B >= 16 and B <= 31): return False - return True + if (A < 1 or A > 254 or \ + B > 255 or C > 255 or D > 255) or \ + (A == 10 or A == 127) or \ + (A == 192 and B == 168) or \ + (A == 172 and B >= 16 and B <= 31): return False + return True _known_proxies = {} def extract_proxies(content, proxydb): - matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) + matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) - uniques_dict = {} - for p in matches: - uniques_dict[p] = True + uniques_dict = {} + for p in matches: + uniques_dict[p] = True - uniques = [] - for p in uniques_dict.keys(): - if is_usable_proxy(p): uniques.append(p) + uniques = [] + for p in uniques_dict.keys(): + if is_usable_proxy(p): uniques.append(p) - global _known_proxies - if len(_known_proxies) == 0: - known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() - for k in known: - _known_proxies[k[0]] = True + global _known_proxies + if len(_known_proxies) == 0: + known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() + for k in known: + _known_proxies[k[0]] = True - new = [] - for p in uniques: - if not p in _known_proxies: - new.append(p) - _known_proxies[p] = True + new = [] + for p in uniques: + if not p in _known_proxies: + new.append(p) + _known_proxies[p] = True - return len(uniques), new + return len(uniques), new def extract_urls(content, urls = None, urignore=None): - urls = [] if not urls else urls - soup = soupify(content) - for a in soup.body.find_all('a'): - if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue - bad = False - href = a.attrs['href'] - for i in urignore: - if re.findall(i, href): - bad = True - break - if not bad: urls.append(href) - return urls + urls = [] if not urls else urls + soup = soupify(content) + for a in soup.body.find_all('a'): + if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue + bad = False + href = a.attrs['href'] + for i in urignore: + if re.findall(i, href): + bad = True + break + if not bad: urls.append(href) + return urls diff --git a/ppf.py b/ppf.py index ccfc1dd..571f7b4 100755 --- a/ppf.py +++ b/ppf.py @@ -17,370 +17,370 @@ config = Config() _known_proxies = {} def import_from_file(fn, sqlite): - with open(fn, 'r') as f: - urls = [ url for url in f.read().split('\n') if url != '' ] - cinc = 0 - while True: - chunk = urls[cinc:cinc+200] - if len(chunk): dbs.insert_urls(chunk, 'import.txt', urldb) - else: break - cinc = cinc + 200 + with open(fn, 'r') as f: + urls = [ url for url in f.read().split('\n') if url != '' ] + cinc = 0 + while True: + chunk = urls[cinc:cinc+200] + if len(chunk): dbs.insert_urls(chunk, 'import.txt', urldb) + else: break + cinc = cinc + 200 def get_content_type(url): - hdr = fetch.fetch_contents(url, head=True) + hdr = fetch.fetch_contents(url, head=True) - for h in hdr.split('\n'): - if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip() + for h in hdr.split('\n'): + if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip() - return '' + return '' def is_good_content_type(string): - allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ] - for ct in allowed_ct: - if ct.lower() in string.lower(): return True - return False + allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ] + for ct in allowed_ct: + if ct.lower() in string.lower(): return True + return False def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): - if not content_type: content_type = get_content_type(url) + if not content_type: content_type = get_content_type(url) - if is_good_content_type(content_type): - try: content = fetch.fetch_contents(url) - except KeyboardInterrupt as e: raise e - except: content = '' - else: - content = '' + if is_good_content_type(content_type): + try: content = fetch.fetch_contents(url) + except KeyboardInterrupt as e: raise e + except: content = '' + else: + content = '' - unique_count, new = fetch.extract_proxies(content, proxydb) + unique_count, new = fetch.extract_proxies(content, proxydb) - if retrievals == 0: # new site - if content != '' and unique_count == 0: # site works but has zero proxy addresses - error = 99999 - else: - if len(new) == 0: - stale_count += 1 - else: - stale_count = 0 - if content == '': - error += 1 - else: - retrievals += 1 - error = 0 - if unique_count: - extract_urls(content, url) + if retrievals == 0: # new site + if content != '' and unique_count == 0: # site works but has zero proxy addresses + error = 99999 + else: + if len(new) == 0: + stale_count += 1 + else: + stale_count = 0 + if content == '': + error += 1 + else: + retrievals += 1 + error = 0 + if unique_count: + extract_urls(content, url) - urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)) - urldb.commit() + urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)) + urldb.commit() - if not len(new): return + if not len(new): return - dbs.insert_proxies(proxydb, new, url) + dbs.insert_proxies(proxydb, new, url) def is_bad_url(uri, domain=None, samedomain=False): - # if uri needs to be from same domain and domains missmatch - if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower(): - return True - for u in urignore: - if re.findall(u, uri): return True - return False + # if uri needs to be from same domain and domains missmatch + if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower(): + return True + for u in urignore: + if re.findall(u, uri): return True + return False def extract_urls(html, url): - mytime = int(time.time()) - proto = url.split(':')[0] - domain = url.split('/')[2] - urls = [] + mytime = int(time.time()) + proto = url.split(':')[0] + domain = url.split('/')[2] + urls = [] - soup = BeautifulSoup(html, features='lxml') + soup = BeautifulSoup(html, features='lxml') - for a in soup.find_all('a', href=True): - item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] - item = item.strip() + for a in soup.find_all('a', href=True): + item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] + item = item.strip() - if item.startswith('www.'): - item = 'http://%s' % item - elif not item.startswith('http'): - if not item.startswith('/'): item = '/%s' % item - item = '%s://%s%s' % (proto,domain,item) + if item.startswith('www.'): + item = 'http://%s' % item + elif not item.startswith('http'): + if not item.startswith('/'): item = '/%s' % item + item = '%s://%s%s' % (proto,domain,item) - elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain): - continue - if not item in urls: urls.append(item) + elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain): + continue + if not item in urls: urls.append(item) - if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) + if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) def import_proxies_from_file(proxydb, fn): - content = open(fn, 'r').read() - unique_count, new = fetch.extract_proxies(content, proxydb) - if len(new): - dbs.insert_proxies(proxydb, new, fn) - return 0 - return 1 + content = open(fn, 'r').read() + unique_count, new = fetch.extract_proxies(content, proxydb) + if len(new): + dbs.insert_proxies(proxydb, new, fn) + return 0 + return 1 def serve_loop(hs, done): - client_threads = [] - while not done.is_set(): - c = hs.wait_client() + client_threads = [] + while not done.is_set(): + c = hs.wait_client() - evt_done = threading.Event() - cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done)) - cthread.daemon = True - cthread.start() + evt_done = threading.Event() + cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done)) + cthread.daemon = True + cthread.start() - ctrm = [] - for ct, ct_done in client_threads: - if ct_done.is_set(): - ctrm.append((ct,ct_done)) - ct.join() + ctrm = [] + for ct, ct_done in client_threads: + if ct_done.is_set(): + ctrm.append((ct,ct_done)) + ct.join() - if len(ctrm): - client_threads = [ x for x in client_threads if not x in ctrm ] + if len(ctrm): + client_threads = [ x for x in client_threads if not x in ctrm ] - client_threads.append((cthread, evt_done)) + client_threads.append((cthread, evt_done)) def forbidden_page(): - return ( - '\n' - ' \n' - ' \n' - ' Forbidden\n' - ' \n' - ' \n' - '
🖕
\n' - ' \n' - '') + return ( + '\n' + ' \n' + ' \n' + ' Forbidden\n' + ' \n' + ' \n' + '
🖕
\n' + ' \n' + '') def httpsrv_client_thread(c, evt_done): - req = c.read_request() - if req is None: pass - elif len(watchlist) == 0: - c.redirect('/config.html') - elif os.path.isdir(req['url'][1:]): - c.send(403,'Forbidden', forbidden_page()) - elif req['url'] == '/': - c.redirect('/index.html') - elif req['url'].startswith('/index.html'): - variables = variables_from_request(req) - r, redir = render_site(variables) - if redir is not "": - c.redirect(redir) - else: - if r == '': r = render_empty(variables=variables) - c.send(200, "OK", r) - elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']): - c.serve_file(os.getcwd() + req['url']) - elif req['url'] == '/robots.txt': - c.send(200, "OK", "User-agent: *\nDisallow: /") + req = c.read_request() + if req is None: pass + elif len(watchlist) == 0: + c.redirect('/config.html') + elif os.path.isdir(req['url'][1:]): + c.send(403,'Forbidden', forbidden_page()) + elif req['url'] == '/': + c.redirect('/index.html') + elif req['url'].startswith('/index.html'): + variables = variables_from_request(req) + r, redir = render_site(variables) + if redir is not "": + c.redirect(redir) + else: + if r == '': r = render_empty(variables=variables) + c.send(200, "OK", r) + elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']): + c.serve_file(os.getcwd() + req['url']) + elif req['url'] == '/robots.txt': + c.send(200, "OK", "User-agent: *\nDisallow: /") - elif req['url'].startswith('/config.html'): - if args.config > 0: - variables=variables_from_request(req) - r, redir = configpage(req,variables) - else: - redir = '/index.html' - if redir is not "": - c.redirect(redir) - else: - if r == '': r = render_empty(variables=variables) - c.send(200, "OK", r) + elif req['url'].startswith('/config.html'): + if args.config > 0: + variables=variables_from_request(req) + r, redir = configpage(req,variables) + else: + redir = '/index.html' + if redir is not "": + c.redirect(redir) + else: + if r == '': r = render_empty(variables=variables) + c.send(200, "OK", r) - else: - c.send(404, "not exist", "the reqested file not exist!!!1") - c.disconnect() - evt_done.set() + else: + c.send(404, "not exist", "the reqested file not exist!!!1") + c.disconnect() + evt_done.set() def start_server(ip, port): - done = threading.Event() - from httpsrv import HttpSrv - hs = HttpSrv(ip, port) - try: - hs.setup() - except socket.error as e: - if e.errno == errno.EADDRINUSE: - sys.stderr.write(( - "ERROR: server socket address in use\n" - "wait a couple seconds and try again.\n" - "in case you're in pdb, you need to quit it\n")) - sys.exit(1) - else: - raise e + done = threading.Event() + from httpsrv import HttpSrv + hs = HttpSrv(ip, port) + try: + hs.setup() + except socket.error as e: + if e.errno == errno.EADDRINUSE: + sys.stderr.write(( + "ERROR: server socket address in use\n" + "wait a couple seconds and try again.\n" + "in case you're in pdb, you need to quit it\n")) + sys.exit(1) + else: + raise e - t = threading.Thread(target=serve_loop, args=(hs, done)) - t.daemon = True - t.start() - return t, done + t = threading.Thread(target=serve_loop, args=(hs, done)) + t.daemon = True + t.start() + return t, done def extract_proxies(content): - matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', fetch.cleanhtml(content)) - uniques_dict = {} - for p in matches: - uniques_dict[p] = True + matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', fetch.cleanhtml(content)) + uniques_dict = {} + for p in matches: + uniques_dict[p] = True - uniques = [] - for p in uniques_dict.keys(): - if fetch.is_usable_proxy(p): uniques.append(p) + uniques = [] + for p in uniques_dict.keys(): + if fetch.is_usable_proxy(p): uniques.append(p) - return uniques + return uniques class Leechered(threading.Thread): - #def __init__(self, proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): - def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type): - self.status = 'nok' - self.proxylist = [] - self.running = True - self.url = url - self.stale_count = stale_count - self.error = error - self.retrievals = retrievals - self.proxies_added = proxies_added - self.content_type = content_type - self.execute = '' - threading.Thread.__init__(self) + #def __init__(self, proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): + def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type): + self.status = 'nok' + self.proxylist = [] + self.running = True + self.url = url + self.stale_count = stale_count + self.error = error + self.retrievals = retrievals + self.proxies_added = proxies_added + self.content_type = content_type + self.execute = '' + threading.Thread.__init__(self) - def retrieve(self): - return self.url, self.proxylist, self.stale_count, self.error, self.retrievals, self.content_type, self.proxies_added, self.execute - def status(self): - return self.status + def retrieve(self): + return self.url, self.proxylist, self.stale_count, self.error, self.retrievals, self.content_type, self.proxies_added, self.execute + def status(self): + return self.status - def run(self): - self.status = 'nok' + def run(self): + self.status = 'nok' - if not self.content_type: self.content_type = get_content_type(self.url) + if not self.content_type: self.content_type = get_content_type(self.url) - if is_good_content_type(self.content_type): - try: content = fetch.fetch_contents(self.url) - except KeyboardInterrupt as e: raise e - except: content = '' - else: - content = '' + if is_good_content_type(self.content_type): + try: content = fetch.fetch_contents(self.url) + except KeyboardInterrupt as e: raise e + except: content = '' + else: + content = '' - unique = extract_proxies(content) - self.proxylist = [ proxy for proxy in unique if not proxy in _known_proxies ] - proxy_count = len(self.proxylist) + unique = extract_proxies(content) + self.proxylist = [ proxy for proxy in unique if not proxy in _known_proxies ] + proxy_count = len(self.proxylist) - if self.retrievals == 0: # new site - if content != '' and len(self.proxylist) == 0: # site works but has zero proxy addresses - self.error += 1 - self.stale_count += 1 - elif proxy_count > 0: - self.error = 0 - self.stale_count = 0 - else: - self.error += 2 - self.stale_count += 2 - else: # not a new site - # proxylist is empty - if proxy_count == 0: - self.stale_count += 1 - # proxylist is not empty: site is working - else: - self.stale_count = 0 - self.error = 0 - # site has no content - if content == '': - self.error += 1 - self.stale_count += 1 - #else: - # self.retrievals += 1 - # self.error = 0 - # self.stale_count = 0 - # site has proxies - if proxy_count: - self.error = 0 - self.stale_count = 0 - extract_urls(content, self.url) + if self.retrievals == 0: # new site + if content != '' and len(self.proxylist) == 0: # site works but has zero proxy addresses + self.error += 1 + self.stale_count += 1 + elif proxy_count > 0: + self.error = 0 + self.stale_count = 0 + else: + self.error += 2 + self.stale_count += 2 + else: # not a new site + # proxylist is empty + if proxy_count == 0: + self.stale_count += 1 + # proxylist is not empty: site is working + else: + self.stale_count = 0 + self.error = 0 + # site has no content + if content == '': + self.error += 1 + self.stale_count += 1 + #else: + # self.retrievals += 1 + # self.error = 0 + # self.stale_count = 0 + # site has proxies + if proxy_count: + self.error = 0 + self.stale_count = 0 + extract_urls(content, self.url) - self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url) - self.status = 'ok' + self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url) + self.status = 'ok' if __name__ == '__main__': - config.load() - fetch.set_config(config) + config.load() + fetch.set_config(config) - proxydb = mysqlite.mysqlite(config.watchd.database, str) - dbs.create_table_if_not_exists(proxydb, 'proxylist') - known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() - for k in known: - _known_proxies[k[0]] = True + proxydb = mysqlite.mysqlite(config.watchd.database, str) + dbs.create_table_if_not_exists(proxydb, 'proxylist') + known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() + for k in known: + _known_proxies[k[0]] = True - with open('urignore.txt', 'r') as f: - urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] + with open('urignore.txt', 'r') as f: + urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] - urldb = mysqlite.mysqlite(config.ppf.database, str) - dbs.create_table_if_not_exists(urldb, 'uris') - import_from_file('import.txt', urldb) - if len(sys.argv) == 3 and sys.argv[1] == "--file": - sys.exit(import_proxies_from_file(proxydb, sys.argv[2])) + urldb = mysqlite.mysqlite(config.ppf.database, str) + dbs.create_table_if_not_exists(urldb, 'uris') + import_from_file('import.txt', urldb) + if len(sys.argv) == 3 and sys.argv[1] == "--file": + sys.exit(import_proxies_from_file(proxydb, sys.argv[2])) - # start proxy watcher - if config.watchd.threads > 0: - watcherd = proxywatchd.Proxywatchd() - watcherd.start() - else: - watcherd = None + # start proxy watcher + if config.watchd.threads > 0: + watcherd = proxywatchd.Proxywatchd() + watcherd.start() + else: + watcherd = None - #start_server(config.httpd.listenip, config.httpd.port) - - #qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) 180: - _log('running %d thread(s) over %d' % (len(threads), config.ppf.threads), 'ppf') - statusmsg = time.time() - if not len(rows): - if (time.time() - reqtime) > 3: - rows = urldb.execute(qurl, (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall() - reqtime = time.time() - if len(rows) < config.ppf.threads: - time.sleep(60) - rows = [] - else: - _log('handing %d job(s) to %d thread(s)' % ( len(rows), config.ppf.threads ), 'ppf') - #nao = time.time() - #args = [ (nao, row[0]) for row in rows ] - #urldb.executemany('UPDATE uris SET check_time=? where url=?', args) - #urldb.commit() + #start_server(config.httpd.listenip, config.httpd.port) + + #qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) 180: + _log('running %d thread(s) over %d' % (len(threads), config.ppf.threads), 'ppf') + statusmsg = time.time() + if not len(rows): + if (time.time() - reqtime) > 3: + rows = urldb.execute(qurl, (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall() + reqtime = time.time() + if len(rows) < config.ppf.threads: + time.sleep(60) + rows = [] + else: + _log('handing %d job(s) to %d thread(s)' % ( len(rows), config.ppf.threads ), 'ppf') + #nao = time.time() + #args = [ (nao, row[0]) for row in rows ] + #urldb.executemany('UPDATE uris SET check_time=? where url=?', args) + #urldb.commit() - for thread in threads: - if thread.status == 'ok': - url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve() - new = [] - for p in proxylist: - if not p in _known_proxies: - new.append(p) - _known_proxies[p]=1 - execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url) - urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', execute) - urldb.commit() - if len(new): dbs.insert_proxies(proxydb, new, url) + for thread in threads: + if thread.status == 'ok': + url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve() + new = [] + for p in proxylist: + if not p in _known_proxies: + new.append(p) + _known_proxies[p]=1 + execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url) + urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', execute) + urldb.commit() + if len(new): dbs.insert_proxies(proxydb, new, url) - threads = [ thread for thread in threads if thread.is_alive() ] - if len(threads) < config.ppf.threads and len(rows): - row = random.choice(rows) - urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0])) - urldb.commit() - rows.remove(row) - t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5]) - threads.append(t) - t.start() - #time.sleep(random.random()/100) + threads = [ thread for thread in threads if thread.is_alive() ] + if len(threads) < config.ppf.threads and len(rows): + row = random.choice(rows) + urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0])) + urldb.commit() + rows.remove(row) + t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5]) + threads.append(t) + t.start() + #time.sleep(random.random()/100) - except KeyboardInterrupt: - if watcherd: - watcherd.stop() - watcherd.finish() - break + except KeyboardInterrupt: + if watcherd: + watcherd.stop() + watcherd.finish() + break - print '\r', + print '\r', diff --git a/proxywatchd.py b/proxywatchd.py index 10487a7..82831be 100644 --- a/proxywatchd.py +++ b/proxywatchd.py @@ -3,10 +3,10 @@ import threading import time, random, string, re, copy try: - from geoip import geolite2 - geolite = True + from geoip import geolite2 + geolite = True except: - geolite = False + geolite = False from config import Config @@ -20,415 +20,415 @@ _run_standalone = False cached_dns = dict() def try_div(a, b): - if b != 0: return a/float(b) - return 0 + if b != 0: return a/float(b) + return 0 def socks4_resolve(srvname, server_port): - srv = srvname - if srv in cached_dns: - srv = cached_dns[srvname] - if config.watchd.debug: - _log("using cached ip (%s) for %s"%(srv, srvname), "debug") - else: - dns_fail = False - try: - af, sa = rocksock.resolve(rocksock.RocksockHostinfo(srvname, server_port), want_v4=True) - if sa is not None: - cached_dns[srvname] = sa[0] - srv = sa[0] - else: dns_fail = True - except rocksock.RocksockException as e: - assert(e.get_errortype() == rocksock.RS_ET_GAI) - dns_fail = True - if dns_fail: - fail_inc = 0 - _log("could not resolve connection target %s"%srvname, "ERROR") - return False - return srv + srv = srvname + if srv in cached_dns: + srv = cached_dns[srvname] + if config.watchd.debug: + _log("using cached ip (%s) for %s"%(srv, srvname), "debug") + else: + dns_fail = False + try: + af, sa = rocksock.resolve(rocksock.RocksockHostinfo(srvname, server_port), want_v4=True) + if sa is not None: + cached_dns[srvname] = sa[0] + srv = sa[0] + else: dns_fail = True + except rocksock.RocksockException as e: + assert(e.get_errortype() == rocksock.RS_ET_GAI) + dns_fail = True + if dns_fail: + fail_inc = 0 + _log("could not resolve connection target %s"%srvname, "ERROR") + return False + return srv class WorkerJob(): - def __init__(self, proxy, proto, failcount, success_count, total_duration, country, oldies = False): - self.proxy = proxy - self.proto = proto - self.failcount = failcount - self.checktime = None - self.success_count = success_count - self.total_duration = total_duration - self.country = country - self.isoldies = oldies + def __init__(self, proxy, proto, failcount, success_count, total_duration, country, oldies = False): + self.proxy = proxy + self.proto = proto + self.failcount = failcount + self.checktime = None + self.success_count = success_count + self.total_duration = total_duration + self.country = country + self.isoldies = oldies - def connect_socket(self): - srvname = random.choice(config.servers).strip() - protos = ['http', 'socks5', 'socks4'] if self.proto is None else [self.proto] - use_ssl = random.choice([0,1]) if config.watchd.use_ssl == 2 else config.watchd.use_ssl - server_port = 6697 if use_ssl else 6667 + def connect_socket(self): + srvname = random.choice(config.servers).strip() + protos = ['http', 'socks5', 'socks4'] if self.proto is None else [self.proto] + use_ssl = random.choice([0,1]) if config.watchd.use_ssl == 2 else config.watchd.use_ssl + server_port = 6697 if use_ssl else 6667 - fail_inc = 1 + fail_inc = 1 - for proto in protos: - torhost = random.choice(config.torhosts) - # socks4 (without 4a) requires a raw ip address - # rocksock automatically resolves if needed, but it's more - # efficient to cache the result. - if proto == 'socks4': srv = socks4_resolve(srvname, server_port) - else: srv = srvname - ## skip socks4 failed resolution - if not srv: continue + for proto in protos: + torhost = random.choice(config.torhosts) + # socks4 (without 4a) requires a raw ip address + # rocksock automatically resolves if needed, but it's more + # efficient to cache the result. + if proto == 'socks4': srv = socks4_resolve(srvname, server_port) + else: srv = srvname + ## skip socks4 failed resolution + if not srv: continue - duration = time.time() - #rocksock.RocksockProxyFromURL('socks4://%s' % torhost), - proxies = [ - rocksock.RocksockProxyFromURL('socks5://%s' % torhost), - rocksock.RocksockProxyFromURL('%s://%s' % (proto, self.proxy)), - ] + duration = time.time() + #rocksock.RocksockProxyFromURL('socks4://%s' % torhost), + proxies = [ + rocksock.RocksockProxyFromURL('socks5://%s' % torhost), + rocksock.RocksockProxyFromURL('%s://%s' % (proto, self.proxy)), + ] - try: - sock = rocksock.Rocksock(host=srv, port=server_port, ssl=use_ssl, proxies=proxies, timeout=config.watchd.timeout) - sock.connect() - sock.send('NICK\n') - return sock, proto, duration, torhost, srvname, 0 - except rocksock.RocksockException as e: - if config.watchd.debug: - _log("proxy failed: %s://%s: %s"%(proto, self.proxy, e.get_errormessage()), 'debug') + try: + sock = rocksock.Rocksock(host=srv, port=server_port, ssl=use_ssl, proxies=proxies, timeout=config.watchd.timeout) + sock.connect() + sock.send('NICK\n') + return sock, proto, duration, torhost, srvname, 0 + except rocksock.RocksockException as e: + if config.watchd.debug: + _log("proxy failed: %s://%s: %s"%(proto, self.proxy, e.get_errormessage()), 'debug') - et = e.get_errortype() - err = e.get_error() - fp = e.get_failedproxy() + et = e.get_errortype() + err = e.get_error() + fp = e.get_failedproxy() - sock.disconnect() + sock.disconnect() - if et == rocksock.RS_ET_OWN: - if fp == 1 and \ - err == rocksock.RS_E_REMOTE_DISCONNECTED or \ - err == rocksock.RS_E_HIT_TIMEOUT: - # proxy is not online, so don't waste time trying all possible protocols - break - elif fp == 0 and \ - err == rocksock.RS_E_TARGET_CONN_REFUSED: - fail_inc = 0 - if random.randint(0, (config.watchd.threads-1)/2) == 0: - _log("could not connect to proxy 0, sleep 5s", "ERROR") - time.sleep(5) - elif et == rocksock.RS_ET_GAI: - assert(0) - fail_inc = 0 - _log("could not resolve connection target %s"%srvname, "ERROR") - break + if et == rocksock.RS_ET_OWN: + if fp == 1 and \ + err == rocksock.RS_E_REMOTE_DISCONNECTED or \ + err == rocksock.RS_E_HIT_TIMEOUT: + # proxy is not online, so don't waste time trying all possible protocols + break + elif fp == 0 and \ + err == rocksock.RS_E_TARGET_CONN_REFUSED: + fail_inc = 0 + if random.randint(0, (config.watchd.threads-1)/2) == 0: + _log("could not connect to proxy 0, sleep 5s", "ERROR") + time.sleep(5) + elif et == rocksock.RS_ET_GAI: + assert(0) + fail_inc = 0 + _log("could not resolve connection target %s"%srvname, "ERROR") + break - except KeyboardInterrupt as e: - raise(e) + except KeyboardInterrupt as e: + raise(e) - return None, None, None, None, None, fail_inc + return None, None, None, None, None, fail_inc - def run(self): - self.checktime = int(time.time()) + def run(self): + self.checktime = int(time.time()) - sock, proto, duration, tor, srv, failinc = self.connect_socket() - if not sock: - self.failcount += failinc - return - try: - recv = sock.recv(6) - #recv = sock.recvline() + sock, proto, duration, tor, srv, failinc = self.connect_socket() + if not sock: + self.failcount += failinc + return + try: + recv = sock.recv(6) + #recv = sock.recvline() - # good data - if re.match('^(:|NOTICE|ERROR)', recv, re.IGNORECASE): - duration = (time.time() - duration) + # good data + if re.match('^(:|NOTICE|ERROR)', recv, re.IGNORECASE): + duration = (time.time() - duration) - if geolite and not self.country or self.country == 'unknown' or self.country == 'N/A': - match = geolite2.lookup(self.proxy.split(':')[0]) - if match is not None: self.country = match.country - else: self.country = 'N/A' + if geolite and not self.country or self.country == 'unknown' or self.country == 'N/A': + match = geolite2.lookup(self.proxy.split(':')[0]) + if match is not None: self.country = match.country + else: self.country = 'N/A' - self.proto = proto - self.failcount = 0 - self.success_count = self.success_count + 1 - self.total_duration += int(duration*1000) - torstats = "" if len(config.torhosts)==1 else ' tor: %s;'%tor - recvstats = "".join([x if x in string.printable and ord(x) > 32 else '.' for x in recv]) - _log('%s://%s (%s) d: %.2f sec(s);%s srv: %s; recv: %s' % (proto, self.proxy, self.country, duration, torstats, srv, recvstats), 'xxxxx') - else: - self.failcount += 1 - except KeyboardInterrupt as e: - raise e - except rocksock.RocksockException as e: - self.failcount += 1 - finally: - sock.disconnect() + self.proto = proto + self.failcount = 0 + self.success_count = self.success_count + 1 + self.total_duration += int(duration*1000) + torstats = "" if len(config.torhosts)==1 else ' tor: %s;'%tor + recvstats = "".join([x if x in string.printable and ord(x) > 32 else '.' for x in recv]) + _log('%s://%s (%s) d: %.2f sec(s);%s srv: %s; recv: %s' % (proto, self.proxy, self.country, duration, torstats, srv, recvstats), 'xxxxx') + else: + self.failcount += 1 + except KeyboardInterrupt as e: + raise e + except rocksock.RocksockException as e: + self.failcount += 1 + finally: + sock.disconnect() class WorkerThread(): - def __init__ (self, id): - self.id = id - self.done = threading.Event() - self.thread = None - self.workqueue = [] - self.workdone = [] - self.lock = threading.Lock() - def stop(self): - self.done.set() - def term(self): - if self.thread: self.thread.join() - def add_jobs(self, jobs): - with self.lock: - self.workqueue.extend(jobs) - def return_jobs(self): - with self.lock: - jobs = self.workqueue - self.workqueue = [] - return jobs - def jobcount(self): - return len(self.workqueue) - def collect(self): - wd = copy.copy(self.workdone) - self.workdone = [] - return wd - def start_thread(self): - self.thread = threading.Thread(target=self.workloop) - self.thread.start() - def pop_if_possible(self): - with self.lock: - if len(self.workqueue): - job = self.workqueue.pop() - else: - job = None - return job - def workloop(self): - success_count = 0 - job_count = 0 - duration_total = 0 - duration_success_total = 0 - while True: - job = self.pop_if_possible() - if job: - nao = time.time() - job.run() - spent = time.time() - nao - if job.failcount == 0: - duration_success_total += spent - success_count += 1 - job_count += 1 - duration_total += spent - self.workdone.append(job) - elif not self.thread: - break - if self.done.is_set(): break - time.sleep( random.random() / 100) - if self.thread: - succ_rate = try_div(success_count, job_count)*100 - avg_succ_t = try_div(duration_success_total, success_count) - avg_fail_t = try_div(duration_total-duration_success_total, job_count-success_count) - avg_t = try_div(duration_total, job_count) - _log("terminated, %d/%d (%.2f%%), avg.time S/F/T %.2f, %.2f, %.2f" \ - % (success_count, job_count, succ_rate, avg_succ_t, avg_fail_t, avg_t) \ - , self.id) + def __init__ (self, id): + self.id = id + self.done = threading.Event() + self.thread = None + self.workqueue = [] + self.workdone = [] + self.lock = threading.Lock() + def stop(self): + self.done.set() + def term(self): + if self.thread: self.thread.join() + def add_jobs(self, jobs): + with self.lock: + self.workqueue.extend(jobs) + def return_jobs(self): + with self.lock: + jobs = self.workqueue + self.workqueue = [] + return jobs + def jobcount(self): + return len(self.workqueue) + def collect(self): + wd = copy.copy(self.workdone) + self.workdone = [] + return wd + def start_thread(self): + self.thread = threading.Thread(target=self.workloop) + self.thread.start() + def pop_if_possible(self): + with self.lock: + if len(self.workqueue): + job = self.workqueue.pop() + else: + job = None + return job + def workloop(self): + success_count = 0 + job_count = 0 + duration_total = 0 + duration_success_total = 0 + while True: + job = self.pop_if_possible() + if job: + nao = time.time() + job.run() + spent = time.time() - nao + if job.failcount == 0: + duration_success_total += spent + success_count += 1 + job_count += 1 + duration_total += spent + self.workdone.append(job) + elif not self.thread: + break + if self.done.is_set(): break + time.sleep( random.random() / 100) + if self.thread: + succ_rate = try_div(success_count, job_count)*100 + avg_succ_t = try_div(duration_success_total, success_count) + avg_fail_t = try_div(duration_total-duration_success_total, job_count-success_count) + avg_t = try_div(duration_total, job_count) + _log("terminated, %d/%d (%.2f%%), avg.time S/F/T %.2f, %.2f, %.2f" \ + % (success_count, job_count, succ_rate, avg_succ_t, avg_fail_t, avg_t) \ + , self.id) class Proxywatchd(): - def stop(self): - _log('halting... (%d thread(s))' % len([item for item in self.threads if True]), 'watchd') - self.stopping.set() + def stop(self): + _log('halting... (%d thread(s))' % len([item for item in self.threads if True]), 'watchd') + self.stopping.set() - def _cleanup(self): - for wt in self.threads: - wt.stop() - for wt in self.threads: - wt.term() - self.collect_work() - self.submit_collected() - self.stopped.set() + def _cleanup(self): + for wt in self.threads: + wt.stop() + for wt in self.threads: + wt.term() + self.collect_work() + self.submit_collected() + self.stopped.set() - def finish(self): - if not self.in_background: self._cleanup() - while not self.stopped.is_set(): time.sleep(0.1) - success_rate = try_div(self.totals['success'], self.totals['submitted']) * 100 - _log("total results: %d/%d (%.2f%%)"%(self.totals['success'], self.totals['submitted'], success_rate), "watchd") + def finish(self): + if not self.in_background: self._cleanup() + while not self.stopped.is_set(): time.sleep(0.1) + success_rate = try_div(self.totals['success'], self.totals['submitted']) * 100 + _log("total results: %d/%d (%.2f%%)"%(self.totals['success'], self.totals['submitted'], success_rate), "watchd") - def _prep_db(self): - self.mysqlite = mysqlite.mysqlite(config.watchd.database, str) - def _close_db(self): - if self.mysqlite: - self.mysqlite.close() - self.mysqlite = None - def __init__(self): - config.load() - self.in_background = False - self.threads = [] - self.stopping = threading.Event() - self.stopped = threading.Event() + def _prep_db(self): + self.mysqlite = mysqlite.mysqlite(config.watchd.database, str) + def _close_db(self): + if self.mysqlite: + self.mysqlite.close() + self.mysqlite = None + def __init__(self): + config.load() + self.in_background = False + self.threads = [] + self.stopping = threading.Event() + self.stopped = threading.Event() - # create table if needed - self._prep_db() - self.mysqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, source BLOB, dronebl INT, proto TEXT, success_count INT, total_duration INT)') - self.mysqlite.commit() - self._close_db() + # create table if needed + self._prep_db() + self.mysqlite.execute('CREATE TABLE IF NOT EXISTS proxylist (proxy BLOB, country BLOB, added INT, failed INT, tested INT, source BLOB, dronebl INT, proto TEXT, success_count INT, total_duration INT)') + self.mysqlite.commit() + self._close_db() - self.submit_after = config.watchd.submit_after # number of collected jobs before writing db - self.jobs = [] - self.collected = [] - self.totals = { - 'submitted':0, - 'success':0, - } + self.submit_after = config.watchd.submit_after # number of collected jobs before writing db + self.jobs = [] + self.collected = [] + self.totals = { + 'submitted':0, + 'success':0, + } - def fetch_rows(self): - self.isoldies = False - q = 'SELECT proxy,proto,failed,success_count,total_duration,country FROM proxylist WHERE failed >= ? and failed < ? and (tested + ? + (failed * ?)) < ? ORDER BY RANDOM()' - rows = self.mysqlite.execute(q, (0, config.watchd.max_fail, config.watchd.checktime, config.watchd.perfail_checktime, time.time())).fetchall() - # check oldies ? - if len(rows) < config.watchd.threads: - rows = [] - if config.watchd.oldies: - self.isoldies = True - ## disable tor safeguard for old proxies - if self.tor_safeguard: self.tor_safeguard = False - rows = self.mysqlite.execute(q, (config.watchd.max_fail, config.watchd.max_fail + round( config.watchd.max_fail/2), config.watchd.checktime, config.watchd.oldies_checktime, time.time())).fetchall() - return rows + def fetch_rows(self): + self.isoldies = False + q = 'SELECT proxy,proto,failed,success_count,total_duration,country FROM proxylist WHERE failed >= ? and failed < ? and (tested + ? + (failed * ?)) < ? ORDER BY RANDOM()' + rows = self.mysqlite.execute(q, (0, config.watchd.max_fail, config.watchd.checktime, config.watchd.perfail_checktime, time.time())).fetchall() + # check oldies ? + if len(rows) < config.watchd.threads: + rows = [] + if config.watchd.oldies: + self.isoldies = True + ## disable tor safeguard for old proxies + if self.tor_safeguard: self.tor_safeguard = False + rows = self.mysqlite.execute(q, (config.watchd.max_fail, config.watchd.max_fail + round( config.watchd.max_fail/2), config.watchd.checktime, config.watchd.oldies_checktime, time.time())).fetchall() + return rows - def prepare_jobs(self): - self._prep_db() - ## enable tor safeguard by default - self.tor_safeguard = config.watchd.tor_safeguard - rows = self.fetch_rows() - #print('preparing jobbs, oldies: %s' % str(self.isoldies)) - for row in rows: - job = WorkerJob(row[0], row[1], row[2], row[3], row[4], row[5], self.isoldies) - self.jobs.append(job) - self._close_db() + def prepare_jobs(self): + self._prep_db() + ## enable tor safeguard by default + self.tor_safeguard = config.watchd.tor_safeguard + rows = self.fetch_rows() + #print('preparing jobbs, oldies: %s' % str(self.isoldies)) + for row in rows: + job = WorkerJob(row[0], row[1], row[2], row[3], row[4], row[5], self.isoldies) + self.jobs.append(job) + self._close_db() - def collect_work(self): - for wt in self.threads: - self.collected.extend(wt.collect()) + def collect_work(self): + for wt in self.threads: + self.collected.extend(wt.collect()) - def collect_unfinished(self): - for wt in self.threads: - jobs = wt.return_jobs() - self.jobs.extend(jobs) - if len(self.jobs): - _log("collected %d unfinished jobs"%len(self.jobs), "watchd") + def collect_unfinished(self): + for wt in self.threads: + jobs = wt.return_jobs() + self.jobs.extend(jobs) + if len(self.jobs): + _log("collected %d unfinished jobs"%len(self.jobs), "watchd") - def submit_collected(self): - if len(self.collected) == 0: return True - sc = 0 - args = [] - for job in self.collected: - if job.failcount == 0: sc += 1 - args.append( (job.failcount, job.checktime, 1, job.country, job.proto, job.success_count, job.total_duration, job.proxy) ) + def submit_collected(self): + if len(self.collected) == 0: return True + sc = 0 + args = [] + for job in self.collected: + if job.failcount == 0: sc += 1 + args.append( (job.failcount, job.checktime, 1, job.country, job.proto, job.success_count, job.total_duration, job.proxy) ) - success_rate = (float(sc) / len(self.collected)) * 100 - ret = True - if len(self.collected) >= 100 and success_rate <= config.watchd.outage_threshold and self.tor_safeguard: - _log("WATCHD %.2f%% SUCCESS RATE - tor circuit blocked? won't submit fails"%success_rate, "ERROR") - if sc == 0: return False - args = [] - for job in self.collected: - if job.failcount == 0: - args.append( (job.failcount, job.checktime, 1, job.country, job.proto, job.success_count, job.total_duration, job.proxy) ) - ret = False + success_rate = (float(sc) / len(self.collected)) * 100 + ret = True + if len(self.collected) >= 100 and success_rate <= config.watchd.outage_threshold and self.tor_safeguard: + _log("WATCHD %.2f%% SUCCESS RATE - tor circuit blocked? won't submit fails"%success_rate, "ERROR") + if sc == 0: return False + args = [] + for job in self.collected: + if job.failcount == 0: + args.append( (job.failcount, job.checktime, 1, job.country, job.proto, job.success_count, job.total_duration, job.proxy) ) + ret = False - _log("updating %d DB entries (success rate: %.2f%%)"%(len(self.collected), success_rate), 'watchd') - self._prep_db() - query = 'UPDATE proxylist SET failed=?,tested=?,dronebl=?,country=?,proto=?,success_count=?,total_duration=? WHERE proxy=?' - self.mysqlite.executemany(query, args) - self.mysqlite.commit() - self._close_db() - self.collected = [] - self.totals['submitted'] += len(args) - self.totals['success'] += sc - return ret + _log("updating %d DB entries (success rate: %.2f%%)"%(len(self.collected), success_rate), 'watchd') + self._prep_db() + query = 'UPDATE proxylist SET failed=?,tested=?,dronebl=?,country=?,proto=?,success_count=?,total_duration=? WHERE proxy=?' + self.mysqlite.executemany(query, args) + self.mysqlite.commit() + self._close_db() + self.collected = [] + self.totals['submitted'] += len(args) + self.totals['success'] += sc + return ret - def start(self): - if config.watchd.threads == 1 and _run_standalone: - return self._run() - else: - return self._run_background() + def start(self): + if config.watchd.threads == 1 and _run_standalone: + return self._run() + else: + return self._run_background() - def run(self): - if self.in_background: - while 1: time.sleep(1) + def run(self): + if self.in_background: + while 1: time.sleep(1) - def _run_background(self): - self.in_background = True - t = threading.Thread(target=self._run) - t.start() + def _run_background(self): + self.in_background = True + t = threading.Thread(target=self._run) + t.start() - def _run(self): - _log('starting...', 'watchd') + def _run(self): + _log('starting...', 'watchd') - for i in range(config.watchd.threads): - threadid = ''.join( [ random.choice(string.letters) for x in range(5) ] ) - wt = WorkerThread(threadid) - if self.in_background: - wt.start_thread() - self.threads.append(wt) - #time.sleep( (random.random()/100) ) - time.sleep( (random.random()/10) ) + for i in range(config.watchd.threads): + threadid = ''.join( [ random.choice(string.letters) for x in range(5) ] ) + wt = WorkerThread(threadid) + if self.in_background: + wt.start_thread() + self.threads.append(wt) + #time.sleep( (random.random()/100) ) + time.sleep( (random.random()/10) ) - sleeptime = 0 - while True: + sleeptime = 0 + while True: - if self.stopping.is_set(): - print('stopping is_set') - if self.in_background: self._cleanup() - break + if self.stopping.is_set(): + print('stopping is_set') + if self.in_background: self._cleanup() + break - if sleeptime == 0: - sleeptime = 1 - else: - time.sleep(1) - sleeptime -= 1 - continue + if sleeptime == 0: + sleeptime = 1 + else: + time.sleep(1) + sleeptime -= 1 + continue - if self.threads[random.choice(xrange(len(self.threads)))].jobcount() == 0: - self.collect_unfinished() - if not len(self.jobs): - self.collect_work() - if not self.submit_collected() and self.tor_safeguard: - _log("zzZzZzzZ sleeping 1 minute(s) due to tor issues - consider decreasing thread number!", "watchd") - self.collect_unfinished() - sleeptime = 1*60 - else: - self.prepare_jobs() - else: - if len(self.jobs) < len(self.threads): - # allow threads enough time to consume the jobs - sleeptime = 10 - #if len(self.jobs) >= len(self.threads): - if len(self.jobs): - _log("handing out %d jobs to %d thread(s)"% (len(self.jobs), len(self.threads)), 'watchd') - jpt = len(self.jobs)/len(self.threads) - if len(self.jobs)/float(len(self.threads)) - jpt > 0.0: jpt += 1 - for tid in xrange(len(self.threads)): - self.threads[tid].add_jobs(self.jobs[tid*jpt:tid*jpt+jpt]) - self.jobs = [] + if self.threads[random.choice(xrange(len(self.threads)))].jobcount() == 0: + self.collect_unfinished() + if not len(self.jobs): + self.collect_work() + if not self.submit_collected() and self.tor_safeguard: + _log("zzZzZzzZ sleeping 1 minute(s) due to tor issues - consider decreasing thread number!", "watchd") + self.collect_unfinished() + sleeptime = 1*60 + else: + self.prepare_jobs() + else: + if len(self.jobs) < len(self.threads): + # allow threads enough time to consume the jobs + sleeptime = 10 + #if len(self.jobs) >= len(self.threads): + if len(self.jobs): + _log("handing out %d jobs to %d thread(s)"% (len(self.jobs), len(self.threads)), 'watchd') + jpt = len(self.jobs)/len(self.threads) + if len(self.jobs)/float(len(self.threads)) - jpt > 0.0: jpt += 1 + for tid in xrange(len(self.threads)): + self.threads[tid].add_jobs(self.jobs[tid*jpt:tid*jpt+jpt]) + self.jobs = [] - if not self.in_background: # single_thread scenario - self.threads[0].workloop() + if not self.in_background: # single_thread scenario + self.threads[0].workloop() - self.collect_work() + self.collect_work() - if len(self.collected) > self.submit_after: - if not self.submit_collected() and self.tor_safeguard: - _log("zzZzZzzZ sleeping 1 minute(s) due to tor issues - consider decreasing thread number!", "watchd") - self.collect_unfinished() - sleeptime = 1*60 + if len(self.collected) > self.submit_after: + if not self.submit_collected() and self.tor_safeguard: + _log("zzZzZzzZ sleeping 1 minute(s) due to tor issues - consider decreasing thread number!", "watchd") + self.collect_unfinished() + sleeptime = 1*60 - time.sleep(1) - sleeptime -= 1 + time.sleep(1) + sleeptime -= 1 if __name__ == '__main__': - _run_standalone = True + _run_standalone = True - config.load() + config.load() - w = Proxywatchd() - try: - w.start() - w.run() - except KeyboardInterrupt as e: - pass - finally: - w.stop() - w.finish() + w = Proxywatchd() + try: + w.start() + w.run() + except KeyboardInterrupt as e: + pass + finally: + w.stop() + w.finish() diff --git a/scraper.py b/scraper.py index 5108fd1..49b2c8d 100755 --- a/scraper.py +++ b/scraper.py @@ -13,83 +13,83 @@ import sys config = Config() with open('searx.instances') as h: - searx_instances = [ line.strip() for line in h.readlines() if line.lower().startswith('http') ] - print(searx_instances) + searx_instances = [ line.strip() for line in h.readlines() if line.lower().startswith('http') ] + print(searx_instances) def proxyfind(sqlite = None, urignore=None): - search = '' - random.shuffle(searx_instances) + search = '' + random.shuffle(searx_instances) - ## search by working proxy - if 'p' in config.scraper.query: - proxydb = mysqlite.mysqlite(config.watchd.database,str) - proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ] - if len(proxies) and random.random() < random.random(): - search = ' '.join( random.sample(proxies, random.randint(1,2))) + ## search by working proxy + if 'p' in config.scraper.query: + proxydb = mysqlite.mysqlite(config.watchd.database,str) + proxies = [ i[0] for i in proxydb.execute('SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10').fetchall() ] + if len(proxies) and random.random() < random.random(): + search = ' '.join( random.sample(proxies, random.randint(1,2))) - ## search by relative url - if 'w' in config.scraper.query and not len(search) or random.random() < random.random(): - if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str) - uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ] - if len(uris) > 0 and random.random() < random.random(): - if len(search): search = '%s OR ' % search - search = search + 'site:%s' % random.choice(uris).split('/')[2] + ## search by relative url + if 'w' in config.scraper.query and not len(search) or random.random() < random.random(): + if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str) + uris = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE error=0 and url not like "%github%" ORDER BY RANDOM() LIMIT 10').fetchall() ] + if len(uris) > 0 and random.random() < random.random(): + if len(search): search = '%s OR ' % search + search = search + 'site:%s' % random.choice(uris).split('/')[2] - ## build string - if 's' in config.scraper.query and not len(search) or random.random() < random.random(): - if len(search): search = '%s OR ' % search - search = search + random.choice(search_terms) + ## build string + if 's' in config.scraper.query and not len(search) or random.random() < random.random(): + if len(search): search = '%s OR ' % search + search = search + random.choice(search_terms) - if not len(search): return - #search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ] - search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week']), 'q=%s' % urllib.quote_plus(search) ] - random.shuffle(search_args) - search_arg = '&'.join(search_args) + if not len(search): return + #search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week','month','year']), 'q=%s' % urllib.quote_plus(search) ] + search_args = [ 'category=general', 'time_range=%s' % random.choice(['day','week']), 'q=%s' % urllib.quote_plus(search) ] + random.shuffle(search_args) + search_arg = '&'.join(search_args) - if config.scraper.debug: - print('search_arg: %s' % search_arg) + if config.scraper.debug: + print('search_arg: %s' % search_arg) - for srx in searx_instances: - x = 0 - while 1: - urls = [] - if x > 0: content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x)) - else: content = fetch.fetch_contents('%s/?%s' % (srx,search_arg)) - if content: urls = fetch.extract_urls(content, urls, urignore) + for srx in searx_instances: + x = 0 + while 1: + urls = [] + if x > 0: content = fetch.fetch_contents('%s/?%s&pageno=%d' % (srx,search_arg,x)) + else: content = fetch.fetch_contents('%s/?%s' % (srx,search_arg)) + if content: urls = fetch.extract_urls(content, urls, urignore) - if not len(urls): break - dbs.insert_urls(urls, '%s/?%s (pageno: %d)' % (srx.split('/')[2],search_arg,x) , sqlite) - x = x + 1 + if not len(urls): break + dbs.insert_urls(urls, '%s/?%s (pageno: %d)' % (srx.split('/')[2],search_arg,x) , sqlite) + x = x + 1 def load_urignore(): - ## load bad terms - with open('urignore.txt', 'r') as f: - urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] - ## add searx instances as bad terms (avoid loops) - for i in searx_instances: - urignore.append(i.split('/')[2]) - return urignore + ## load bad terms + with open('urignore.txt', 'r') as f: + urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] + ## add searx instances as bad terms (avoid loops) + for i in searx_instances: + urignore.append(i.split('/')[2]) + return urignore if __name__ == '__main__': - config.load() - fetch.set_config(config) + config.load() + fetch.set_config(config) - proxydb = mysqlite.mysqlite(config.watchd.database, str) - dbs.create_table_if_not_exists(proxydb, 'proxylist') + proxydb = mysqlite.mysqlite(config.watchd.database, str) + dbs.create_table_if_not_exists(proxydb, 'proxylist') - urldb = mysqlite.mysqlite(config.ppf.database, str) - dbs.create_table_if_not_exists(urldb, 'uris') + urldb = mysqlite.mysqlite(config.ppf.database, str) + dbs.create_table_if_not_exists(urldb, 'uris') - ## load search terms - with open('search_terms.txt', 'r') as f: - search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] + ## load search terms + with open('search_terms.txt', 'r') as f: + search_terms = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ] - urignore = load_urignore() + urignore = load_urignore() - while True: - try: proxyfind(urldb, urignore) - except KeyboardInterrupt: break + while True: + try: proxyfind(urldb, urignore) + except KeyboardInterrupt: break - print '\r', + print '\r',