# Reconstructed post-patch ("+") side of a unified diff that had been collapsed
# onto a few physical lines; reformatted to standard 4-space indentation.
# `dbs`, `fetch`, `urldb`, `urignore`, `config`, `BeautifulSoup` and
# `httpsrv_client_thread` are defined elsewhere in the original module.

def import_from_file(fn, sqlite):
    """Read newline-separated URLs from file *fn* and insert them into urldb.

    NOTE(review): the `sqlite` parameter is unused; the insert targets the
    module-level `urldb` -- confirm whether `sqlite` was meant to be used.
    """
    with open(fn, 'r') as f:
        urls = [url for url in f.read().split('\n') if url != '']
        dbs.insert_urls(urls, 'import.txt', urldb)


def get_content_type(url):
    """HEAD-request *url* and return its lowercased Content-Type value.

    Returns '' when no Content-Type header is present in the response.
    """
    hdr = fetch.fetch_contents(url, head=True)
    for h in hdr.split('\n'):
        if h.lower().startswith('content-type: '):
            return h.lower().split(':')[1].strip()
    return ''


def is_good_content_type(string):
    """Return True if *string* names a content type worth parsing for proxies."""
    allowed_ct = ['text/html', 'text/plain', 'atom+xml']
    lowered = string.lower()
    return any(ct in lowered for ct in allowed_ct)


def proxyleech(proxydb, urldb, url, stale_count, error, retrievals,
               proxies_added, content_type):
    """Fetch *url*, harvest proxy addresses from it, and update its DB row.

    Updates the `uris` row for *url* with new error/stale/retrieval counters,
    then inserts any newly found proxies into *proxydb*.
    """
    if not content_type:
        content_type = get_content_type(url)

    if is_good_content_type(content_type):
        try:
            content = fetch.fetch_contents(url)
        except KeyboardInterrupt:
            raise
        except Exception:  # was a bare except; narrowed so SystemExit etc. propagate
            content = ''
    else:
        content = ''

    unique_count, new = fetch.extract_proxies(content, proxydb)

    if retrievals == 0:  # new site
        if content != '' and unique_count == 0:  # site works but has zero proxy addresses
            error = 99999
    else:
        if len(new) == 0:
            stale_count += 1
        else:
            extract_urls(content, url)
            stale_count = 0

    # NOTE(review): when content != '' the branch below resets error to 0,
    # clobbering the 99999 "no proxies" marker set above -- confirm intent.
    if content == '':
        error += 1
    else:
        retrievals += 1
        error = 0

    urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?',
                  (error, stale_count, int(time.time()), retrievals, proxies_added + len(new), content_type, url))
    urldb.commit()

    if not len(new):
        return

    dbs.insert_proxies(proxydb, new, url)


def is_bad_url(uri, domain=None, samedomain=False):
    """Return True if *uri* should be skipped (foreign domain or urignore match)."""
    # if uri needs to be from same domain and domains mismatch
    if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
        return True
    for u in urignore:
        if re.findall(u, uri):
            return True
    return False


def extract_urls(html, url):
    """Extract <a href> links from *html*, absolutize them, and store them."""
    proto = url.split(':')[0]
    domain = url.split('/')[2]
    urls = []

    soup = BeautifulSoup(html, features='lxml')

    for a in soup.find_all('a', href=True):
        # Python-2-era code: `unicode` hrefs are encoded to utf-8 byte strings.
        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
        item = item.strip()

        if item.startswith('www.'):
            item = 'http://%s' % item
        elif not item.startswith('http'):
            if not item.startswith('/'):
                item = '/%s' % item
            item = '%s://%s%s' % (proto, domain, item)
        elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
            # NOTE(review): only absolute http(s) links reach is_bad_url here;
            # relative links bypass the filter entirely -- confirm intent.
            continue

        if not item in urls:
            urls.append(item)

    if len(urls):
        dbs.insert_urls(urls, url, urldb)  # insert_if_not_exists(urls)


def import_proxies_from_file(proxydb, fn):
    """Harvest proxies from local file *fn*; return 0 if any were new, else 1."""
    # was an unclosed open(fn, 'r').read() -- leaked the file handle
    with open(fn, 'r') as f:
        content = f.read()
    unique_count, new = fetch.extract_proxies(content, proxydb)
    if len(new):
        dbs.insert_proxies(proxydb, new, fn)
        return 0
    return 1


def serve_loop(hs, done):
    """Accept clients on *hs* until *done* is set, one daemon thread per client."""
    client_threads = []
    while not done.is_set():
        c = hs.wait_client()

        evt_done = threading.Event()
        cthread = threading.Thread(target=httpsrv_client_thread, args=(c, evt_done))
        cthread.daemon = True
        cthread.start()

        # Reap client threads that have signalled completion.
        ctrm = []
        for ct, ct_done in client_threads:
            if ct_done.is_set():
                ctrm.append((ct, ct_done))
                ct.join()

        if len(ctrm):
            client_threads = [x for x in client_threads if not x in ctrm]

        client_threads.append((cthread, evt_done))
\n' - ' \n' - '