diff --git a/config.py b/config.py index 8b028f3..7ba49b8 100644 --- a/config.py +++ b/config.py @@ -30,16 +30,16 @@ class Config(ComboParser): self.add_item(section, 'tor_safeguard', bool, True, 'enable tor safeguard (default: True)', False) section = 'httpd' - self.add_item(section, 'listenip', str, '127.0.0.1', 'address for the httpd to listen to (default: 127.0.0.1)', True) - self.add_item(section, 'port', int, 8081, 'port for the httpd to listen to (default: 8081)', True) - self.add_item(section, 'enabled', bool, False, 'start httpd (default: False)', True) + self.add_item(section, 'listenip', str, '127.0.0.1', 'address for the httpd to listen to (default: 127.0.0.1)', True) + self.add_item(section, 'port', int, 8081, 'port for the httpd to listen to (default: 8081)', True) + self.add_item(section, 'enabled', bool, False, 'start httpd (default: False)', True) section = 'ppf' self.add_item(section, 'debug', bool, False, 'whether to print additional debug info', False) self.add_item(section, 'search', bool, True, 'whether to use searx search engine to find new proxy lists', False) self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False) self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False) - self.add_item(section, 'threads', int, 1, 'number of threads to run (default: 1)', False) + self.add_item(section, 'threads', int, 1, 'number of threads to run (default: 1)', False) self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False) self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False) self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False) @@ -52,32 +52,32 @@ class Config(ComboParser): self.aparser.add_argument("--file", help="import a single file containing proxy addrs", type=str, default='', required=False) - section = 'flood' - self.add_item(section, 'server', str, None, 'irc server address', False) - self.add_item(section, 'target', str, None, 'target to flood', False) - self.add_item(section, 'nickserv', str, 'nickserv', "nickserv's nickname", False) - self.add_item(section, 'message', str, None, 'message', False) - self.add_item(section, 'threads', int, 1, '# of threads', False) - self.add_item(section, 'register', int, 0, 'register nickname when required', False) + section = 'flood' + self.add_item(section, 'server', str, None, 'irc server address', False) + self.add_item(section, 'target', str, None, 'target to flood', False) + self.add_item(section, 'nickserv', str, 'nickserv', "nickserv's nickname", False) + self.add_item(section, 'message', str, None, 'message', False) + self.add_item(section, 'threads', int, 1, '# of threads', False) + self.add_item(section, 'register', int, 0, 'register nickname when required', False) - self.add_item(section, 'wait', int, 0, 'wait prior sending messages', False) - self.add_item(section, 'once', int, 0, 'quit as soon as possible', False) - self.add_item(section, 'hilight', int, 0, 'try to hilight all nicks?', False) - self.add_item(section, 'waitonsuccess', int, 0, 'wait for a while on success', False) - self.add_item(section, 'debug', int, 0, 'use debug', False) - self.add_item(section, 'duration', int, 180, 'maximum time to run', False) - self.add_item(section, 'delay', str, 14400, 'if waitonsuccess, wait for $delay before sending other bots', False) - self.add_item(section, 'nick', str, None, 'specify nickname to use', False) - self.add_item(section, 'use_ssl', int, 2, 'Use ssl? (0: false, 1: true, 2: random)', False) - self.add_item(section, 'cycle', int, 0, 'cycle flood', False) - self.add_item(section, 'change_nick', int, 0, 'Change nick between messages (useful when flooding privates)', False) - self.add_item(section, 'use_timeout', int, 0, 'make connexions quit through timeout', False) - self.add_item(section, 'clones', int, 1, 'Number of connexion repeat to run', False) - self.add_item(section, 'query', bool, False, 'also flood in query', False) - self.add_item(section, 'noquerybefore', int, 10, 'do not send query before x secs being connected', False) - self.add_item(section, 'oper', bool, False, 'piss of opers', False) - self.add_item(section, 'whois', bool, False, 'piss of opers with /whois', False) - self.add_item(section, 'modex', bool, False, 'make +/- x mode', False) - self.add_item(section, 'os', bool, False, 'piss off opers with /os', False) - self.add_item(section, 'file', str, None, 'read flood content from file', False) - self.add_item(section, 'failid', str, None, 'generate nickserv warn. about IDENTIFY attempts', False) + self.add_item(section, 'wait', int, 0, 'wait prior sending messages', False) + self.add_item(section, 'once', int, 0, 'quit as soon as possible', False) + self.add_item(section, 'hilight', int, 0, 'try to hilight all nicks?', False) + self.add_item(section, 'waitonsuccess', int, 0, 'wait for a while on success', False) + self.add_item(section, 'debug', int, 0, 'use debug', False) + self.add_item(section, 'duration', int, 180, 'maximum time to run', False) + self.add_item(section, 'delay', str, 14400, 'if waitonsuccess, wait for $delay before sending other bots', False) + self.add_item(section, 'nick', str, None, 'specify nickname to use', False) + self.add_item(section, 'use_ssl', int, 2, 'Use ssl? (0: false, 1: true, 2: random)', False) + self.add_item(section, 'cycle', int, 0, 'cycle flood', False) + self.add_item(section, 'change_nick', int, 0, 'Change nick between messages (useful when flooding privates)', False) + self.add_item(section, 'use_timeout', int, 0, 'make connexions quit through timeout', False) + self.add_item(section, 'clones', int, 1, 'Number of connexion repeat to run', False) + self.add_item(section, 'query', bool, False, 'also flood in query', False) + self.add_item(section, 'noquerybefore', int, 10, 'do not send query before x secs being connected', False) + self.add_item(section, 'oper', bool, False, 'piss of opers', False) + self.add_item(section, 'whois', bool, False, 'piss of opers with /whois', False) + self.add_item(section, 'modex', bool, False, 'make +/- x mode', False) + self.add_item(section, 'os', bool, False, 'piss off opers with /os', False) + self.add_item(section, 'file', str, None, 'read flood content from file', False) + self.add_item(section, 'failid', str, None, 'generate nickserv warn. about IDENTIFY attempts', False) diff --git a/fetch.py b/fetch.py index 1750d08..be4c829 100644 --- a/fetch.py +++ b/fetch.py @@ -6,119 +6,119 @@ from misc import _log config = None def set_config(cfg): - global config - config = cfg + global config + config = cfg cleanhtml_re = [ - re.compile('<.*?>'), - re.compile('\s+'), - re.compile('::+'), + re.compile('<.*?>'), + re.compile('\s+'), + re.compile('::+'), ] def cleanhtml(raw_html): - html = raw_html.replace(' ', ' ') - html = re.sub(cleanhtml_re[0], ':', html) - html = re.sub(cleanhtml_re[1], ':', html) - html = re.sub(cleanhtml_re[2], ':', html) - return html + html = raw_html.replace(' ', ' ') + html = re.sub(cleanhtml_re[0], ':', html) + html = re.sub(cleanhtml_re[1], ':', html) + html = re.sub(cleanhtml_re[2], ':', html) + return html retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded') def fetch_contents(url, head = False): - host, port, ssl, uri = _parse_url(url) - headers=[ - 'Accept-Language: en-US,en;q=0.8', - 'Cache-Control: max-age=0', - ] - if config.ppf.debug: - _log("connecting to %s... (header: %s)" % (url, str(head)), "debug") - while True: - proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))] - http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0') - if not http.connect(): - _log("failed to connect to %s"%url, "ppf") - e = http.get_last_rocksock_exception() - if not e: - return '' - et = e.get_errortype() - ee = e.get_error() - ef = e.get_failedproxy() - if et == rocksock.RS_ET_OWN and \ - ee == rocksock.RS_E_TARGET_CONN_REFUSED \ - and ef == 0: - _log("could not connect to proxy 0 - check your connection", "error") - time.sleep(5) - continue - return '' - break + host, port, ssl, uri = _parse_url(url) + headers=[ + 'Accept-Language: en-US,en;q=0.8', + 'Cache-Control: max-age=0', + ] + if config.ppf.debug: + _log("connecting to %s... (header: %s)" % (url, str(head)), "debug") + while True: + proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))] + http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0') + if not http.connect(): + _log("failed to connect to %s"%url, "ppf") + e = http.get_last_rocksock_exception() + if not e: + return '' + et = e.get_errortype() + ee = e.get_error() + ef = e.get_failedproxy() + if et == rocksock.RS_ET_OWN and \ + ee == rocksock.RS_E_TARGET_CONN_REFUSED \ + and ef == 0: + _log("could not connect to proxy 0 - check your connection", "error") + time.sleep(5) + continue + return '' + break - ## only request header - if head: - hdr = http.head(uri, headers) - return hdr + ## only request header + if head: + hdr = http.head(uri, headers) + return hdr - hdr, res = http.get(uri, headers) - res = res.encode('utf-8') if isinstance(res, unicode) else res - for retry_message in retry_messages: - if retry_message in res: return '' + hdr, res = http.get(uri, headers) + res = res.encode('utf-8') if isinstance(res, unicode) else res + for retry_message in retry_messages: + if retry_message in res: return '' - return res + return res def valid_port(port): - return port > 0 and port < 65535 + return port > 0 and port < 65535 def is_usable_proxy(proxy): - ip, port = proxy.split(':') - if not valid_port(int(port)): return False + ip, port = proxy.split(':') + if not valid_port(int(port)): return False - octets = ip.split('.') - A = int(octets[0]) - B = int(octets[1]) - C = int(octets[2]) - D = int(octets[3]) + octets = ip.split('.') + A = int(octets[0]) + B = int(octets[1]) + C = int(octets[2]) + D = int(octets[3]) - if (A < 1 or A > 254 or \ - B > 255 or C > 255 or D > 255) or \ - (A == 10 or A == 127) or \ - (A == 192 and B == 168) or \ - (A == 172 and B >= 16 and B <= 31): return False - return True + if (A < 1 or A > 254 or \ + B > 255 or C > 255 or D > 255) or \ + (A == 10 or A == 127) or \ + (A == 192 and B == 168) or \ + (A == 172 and B >= 16 and B <= 31): return False + return True _known_proxies = {} def extract_proxies(content, proxydb): - matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) + matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content)) - uniques_dict = {} - for p in matches: - uniques_dict[p] = True + uniques_dict = {} + for p in matches: + uniques_dict[p] = True - uniques = [] - for p in uniques_dict.keys(): - if is_usable_proxy(p): uniques.append(p) + uniques = [] + for p in uniques_dict.keys(): + if is_usable_proxy(p): uniques.append(p) - global _known_proxies - if len(_known_proxies) == 0: - known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() - for k in known: - _known_proxies[k[0]] = True + global _known_proxies + if len(_known_proxies) == 0: + known = proxydb.execute('SELECT proxy FROM proxylist').fetchall() + for k in known: + _known_proxies[k[0]] = True - new = [] - for p in uniques: - if not p in _known_proxies: - new.append(p) - _known_proxies[p] = True + new = [] + for p in uniques: + if not p in _known_proxies: + new.append(p) + _known_proxies[p] = True - return len(uniques), new + return len(uniques), new def extract_urls(content, urls = None, urignore=None): - urls = [] if not urls else urls - soup = soupify(content) - for a in soup.body.find_all('a'): - if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue - bad = False - href = a.attrs['href'] - for i in urignore: - if re.findall(i, href): - bad = True - break - if not bad: urls.append(href) - return urls + urls = [] if not urls else urls + soup = soupify(content) + for a in soup.body.find_all('a'): + if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue + bad = False + href = a.attrs['href'] + for i in urignore: + if re.findall(i, href): + bad = True + break + if not bad: urls.append(href) + return urls diff --git a/ppf.py b/ppf.py index ccfc1dd..571f7b4 100755 --- a/ppf.py +++ b/ppf.py @@ -17,370 +17,370 @@ config = Config() _known_proxies = {} def import_from_file(fn, sqlite): - with open(fn, 'r') as f: - urls = [ url for url in f.read().split('\n') if url != '' ] - cinc = 0 - while True: - chunk = urls[cinc:cinc+200] - if len(chunk): dbs.insert_urls(chunk, 'import.txt', urldb) - else: break - cinc = cinc + 200 + with open(fn, 'r') as f: + urls = [ url for url in f.read().split('\n') if url != '' ] + cinc = 0 + while True: + chunk = urls[cinc:cinc+200] + if len(chunk): dbs.insert_urls(chunk, 'import.txt', urldb) + else: break + cinc = cinc + 200 def get_content_type(url): - hdr = fetch.fetch_contents(url, head=True) + hdr = fetch.fetch_contents(url, head=True) - for h in hdr.split('\n'): - if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip() + for h in hdr.split('\n'): + if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip() - return '' + return '' def is_good_content_type(string): - allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ] - for ct in allowed_ct: - if ct.lower() in string.lower(): return True - return False + allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ] + for ct in allowed_ct: + if ct.lower() in string.lower(): return True + return False def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): - if not content_type: content_type = get_content_type(url) + if not content_type: content_type = get_content_type(url) - if is_good_content_type(content_type): - try: content = fetch.fetch_contents(url) - except KeyboardInterrupt as e: raise e - except: content = '' - else: - content = '' + if is_good_content_type(content_type): + try: content = fetch.fetch_contents(url) + except KeyboardInterrupt as e: raise e + except: content = '' + else: + content = '' - unique_count, new = fetch.extract_proxies(content, proxydb) + unique_count, new = fetch.extract_proxies(content, proxydb) - if retrievals == 0: # new site - if content != '' and unique_count == 0: # site works but has zero proxy addresses - error = 99999 - else: - if len(new) == 0: - stale_count += 1 - else: - stale_count = 0 - if content == '': - error += 1 - else: - retrievals += 1 - error = 0 - if unique_count: - extract_urls(content, url) + if retrievals == 0: # new site + if content != '' and unique_count == 0: # site works but has zero proxy addresses + error = 99999 + else: + if len(new) == 0: + stale_count += 1 + else: + stale_count = 0 + if content == '': + error += 1 + else: + retrievals += 1 + error = 0 + if unique_count: + extract_urls(content, url) - urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)) - urldb.commit() + urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)) + urldb.commit() - if not len(new): return + if not len(new): return - dbs.insert_proxies(proxydb, new, url) + dbs.insert_proxies(proxydb, new, url) def is_bad_url(uri, domain=None, samedomain=False): - # if uri needs to be from same domain and domains missmatch - if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower(): - return True - for u in urignore: - if re.findall(u, uri): return True - return False + # if uri needs to be from same domain and domains missmatch + if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower(): + return True + for u in urignore: + if re.findall(u, uri): return True + return False def extract_urls(html, url): - mytime = int(time.time()) - proto = url.split(':')[0] - domain = url.split('/')[2] - urls = [] + mytime = int(time.time()) + proto = url.split(':')[0] + domain = url.split('/')[2] + urls = [] - soup = BeautifulSoup(html, features='lxml') + soup = BeautifulSoup(html, features='lxml') - for a in soup.find_all('a', href=True): - item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] - item = item.strip() + for a in soup.find_all('a', href=True): + item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] + item = item.strip() - if item.startswith('www.'): - item = 'http://%s' % item - elif not item.startswith('http'): - if not item.startswith('/'): item = '/%s' % item - item = '%s://%s%s' % (proto,domain,item) + if item.startswith('www.'): + item = 'http://%s' % item + elif not item.startswith('http'): + if not item.startswith('/'): item = '/%s' % item + item = '%s://%s%s' % (proto,domain,item) - elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain): - continue - if not item in urls: urls.append(item) + elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain): + continue + if not item in urls: urls.append(item) - if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) + if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) def import_proxies_from_file(proxydb, fn): - content = open(fn, 'r').read() - unique_count, new = fetch.extract_proxies(content, proxydb) - if len(new): - dbs.insert_proxies(proxydb, new, fn) - return 0 - return 1 + content = open(fn, 'r').read() + unique_count, new = fetch.extract_proxies(content, proxydb) + if len(new): + dbs.insert_proxies(proxydb, new, fn) + return 0 + return 1 def serve_loop(hs, done): - client_threads = [] - while not done.is_set(): - c = hs.wait_client() + client_threads = [] + while not done.is_set(): + c = hs.wait_client() - evt_done = threading.Event() - cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done)) - cthread.daemon = True - cthread.start() + evt_done = threading.Event() + cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done)) + cthread.daemon = True + cthread.start() - ctrm = [] - for ct, ct_done in client_threads: - if ct_done.is_set(): - ctrm.append((ct,ct_done)) - ct.join() + ctrm = [] + for ct, ct_done in client_threads: + if ct_done.is_set(): + ctrm.append((ct,ct_done)) + ct.join() - if len(ctrm): - client_threads = [ x for x in client_threads if not x in ctrm ] + if len(ctrm): + client_threads = [ x for x in client_threads if not x in ctrm ] - client_threads.append((cthread, evt_done)) + client_threads.append((cthread, evt_done)) def forbidden_page(): - return ( - '\n' - '
\n' - ' \n' - '