diff --git a/ppf.py b/ppf.py
index bb2cdb5..d559604 100755
--- a/ppf.py
+++ b/ppf.py
@@ -15,231 +15,231 @@ import threading
 config = Config()
 
 def import_from_file(fn, sqlite):
-	with open(fn, 'r') as f:
-		urls = [ url for url in f.read().split('\n') if url != '' ]
-		dbs.insert_urls(urls, 'import.txt', urldb)
+    with open(fn, 'r') as f:
+        urls = [ url for url in f.read().split('\n') if url != '' ]
+        dbs.insert_urls(urls, 'import.txt', urldb)
 
 def get_content_type(url):
-	hdr = fetch.fetch_contents(url, head=True)
+    hdr = fetch.fetch_contents(url, head=True)
 
-	for h in hdr.split('\n'):
-		if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip()
+    for h in hdr.split('\n'):
+        if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip()
 
-	return ''
+    return ''
 
 def is_good_content_type(string):
-	allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ]
-	for ct in allowed_ct:
-		if ct.lower() in string.lower(): return True
-	return False
+    allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ]
+    for ct in allowed_ct:
+        if ct.lower() in string.lower(): return True
+    return False
 
 def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
-	if not content_type: content_type = get_content_type(url)
+    if not content_type: content_type = get_content_type(url)
 
-	if is_good_content_type(content_type):
-		try: content = fetch.fetch_contents(url)
-		except KeyboardInterrupt as e: raise e
-		except: content = ''
-	else:
-		content = ''
+    if is_good_content_type(content_type):
+        try: content = fetch.fetch_contents(url)
+        except KeyboardInterrupt as e: raise e
+        except: content = ''
+    else:
+        content = ''
 
-	unique_count, new = fetch.extract_proxies(content, proxydb)
+    unique_count, new = fetch.extract_proxies(content, proxydb)
 
-	if retrievals == 0: # new site
-		if content != '' and unique_count == 0: # site works but has zero proxy addresses
-			error = 99999
-	else:
-		if len(new) == 0:
-			stale_count += 1
-		else:
-			extract_urls(content, url)
-			stale_count = 0
-		if content == '':
-			error += 1
-		else:
-			retrievals += 1
-			error = 0
+    if retrievals == 0: # new site
+        if content != '' and unique_count == 0: # site works but has zero proxy addresses
+            error = 99999
+    else:
+        if len(new) == 0:
+            stale_count += 1
+        else:
+            extract_urls(content, url)
+            stale_count = 0
+        if content == '':
+            error += 1
+        else:
+            retrievals += 1
+            error = 0
 
-	urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url))
-	urldb.commit()
+    urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url))
+    urldb.commit()
 
-	if not len(new): return
+    if not len(new): return
 
-	dbs.insert_proxies(proxydb, new, url)
+    dbs.insert_proxies(proxydb, new, url)
 
 def is_bad_url(uri, domain=None, samedomain=False):
-	# if uri needs to be from same domain and domains missmatch
-	if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
-		return True
-	for u in urignore:
-		if re.findall(u, uri): return True
-	return False
+    # if uri needs to be from same domain and domains missmatch
+    if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
+        return True
+    for u in urignore:
+        if re.findall(u, uri): return True
+    return False
 
 def extract_urls(html, url):
-	mytime = int(time.time())
-	proto = url.split(':')[0]
-	domain = url.split('/')[2]
-	urls = []
+    mytime = int(time.time())
+    proto = url.split(':')[0]
+    domain = url.split('/')[2]
+    urls = []
 
-	soup = BeautifulSoup(html, features='lxml')
+    soup = BeautifulSoup(html, features='lxml')
 
-	for a in soup.find_all('a', href=True):
-		item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
-		item = item.strip()
+    for a in soup.find_all('a', href=True):
+        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
+        item = item.strip()
 
-		if item.startswith('www.'):
-			item = 'http://%s' % item
-		elif not item.startswith('http'):
-			if not item.startswith('/'): item = '/%s' % item
-			item = '%s://%s%s' % (proto,domain,item)
-		elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
-			continue
+        if item.startswith('www.'):
+            item = 'http://%s' % item
+        elif not item.startswith('http'):
+            if not item.startswith('/'): item = '/%s' % item
+            item = '%s://%s%s' % (proto,domain,item)
+        elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
+            continue
 
-		if not item in urls: urls.append(item)
+        if not item in urls: urls.append(item)
 
-	if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
+    if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
 
 def import_proxies_from_file(proxydb, fn):
-	content = open(fn, 'r').read()
-	unique_count, new = fetch.extract_proxies(content, proxydb)
-	if len(new):
-		dbs.insert_proxies(proxydb, new, fn)
-		return 0
-	return 1
+    content = open(fn, 'r').read()
+    unique_count, new = fetch.extract_proxies(content, proxydb)
+    if len(new):
+        dbs.insert_proxies(proxydb, new, fn)
+        return 0
+    return 1
 
 def serve_loop(hs, done):
-	client_threads = []
-	while not done.is_set():
-		c = hs.wait_client()
+    client_threads = []
+    while not done.is_set():
+        c = hs.wait_client()
 
-		evt_done = threading.Event()
-		cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done))
-		cthread.daemon = True
-		cthread.start()
+        evt_done = threading.Event()
+        cthread = threading.Thread(target=httpsrv_client_thread, args=(c,evt_done))
+        cthread.daemon = True
+        cthread.start()
 
-		ctrm = []
-		for ct, ct_done in client_threads:
-			if ct_done.is_set():
-				ctrm.append((ct,ct_done))
-				ct.join()
+        ctrm = []
+        for ct, ct_done in client_threads:
+            if ct_done.is_set():
+                ctrm.append((ct,ct_done))
+                ct.join()
 
-		if len(ctrm):
-			client_threads = [ x for x in client_threads if not x in ctrm ]
+        if len(ctrm):
+            client_threads = [ x for x in client_threads if not x in ctrm ]
 
-		client_threads.append((cthread, evt_done))
+        client_threads.append((cthread, evt_done))
 
 def forbidden_page():
-	return (
-		'\n'
-		' \n'
-		' \n'
-		' Forbidden\n'
-		' \n'
-		' \n'
-		'🖕\n'
-		' \n'
-		'')
+    return (
+        '\n'
+        ' \n'
+        ' \n'
+        ' Forbidden\n'
+        ' \n'
+        ' \n'
+        '🖕\n'
+        ' \n'
+        '')
 
 def httpsrv_client_thread(c, evt_done):
-	req = c.read_request()
-	if req is None: pass
-	elif len(watchlist) == 0:
-		c.redirect('/config.html')
-	elif os.path.isdir(req['url'][1:]):
-		c.send(403,'Forbidden', forbidden_page())
-	elif req['url'] == '/':
-		c.redirect('/index.html')
-	elif req['url'].startswith('/index.html'):
-		variables = variables_from_request(req)
-		r, redir = render_site(variables)
-		if redir is not "":
-			c.redirect(redir)
-		else:
-			if r == '': r = render_empty(variables=variables)
-			c.send(200, "OK", r)
-	elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']):
-		c.serve_file(os.getcwd() + req['url'])
-	elif req['url'] == '/robots.txt':
-		c.send(200, "OK", "User-agent: *\nDisallow: /")
+    req = c.read_request()
+    if req is None: pass
+    elif len(watchlist) == 0:
+        c.redirect('/config.html')
+    elif os.path.isdir(req['url'][1:]):
+        c.send(403,'Forbidden', forbidden_page())
+    elif req['url'] == '/':
+        c.redirect('/index.html')
+    elif req['url'].startswith('/index.html'):
+        variables = variables_from_request(req)
+        r, redir = render_site(variables)
+        if redir is not "":
+            c.redirect(redir)
+        else:
+            if r == '': r = render_empty(variables=variables)
+            c.send(200, "OK", r)
+    elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']):
+        c.serve_file(os.getcwd() + req['url'])
+    elif req['url'] == '/robots.txt':
+        c.send(200, "OK", "User-agent: *\nDisallow: /")
 
-	elif req['url'].startswith('/config.html'):
-		if args.config > 0:
-			variables=variables_from_request(req)
-			r, redir = configpage(req,variables)
-		else:
-			redir = '/index.html'
-		if redir is not "":
-			c.redirect(redir)
-		else:
-			if r == '': r = render_empty(variables=variables)
-			c.send(200, "OK", r)
+    elif req['url'].startswith('/config.html'):
+        if args.config > 0:
+            variables=variables_from_request(req)
+            r, redir = configpage(req,variables)
+        else:
+            redir = '/index.html'
+        if redir is not "":
+            c.redirect(redir)
+        else:
+            if r == '': r = render_empty(variables=variables)
+            c.send(200, "OK", r)
 
-	else:
-		c.send(404, "not exist", "the reqested file not exist!!!1")
-	c.disconnect()
-	evt_done.set()
+    else:
+        c.send(404, "not exist", "the reqested file not exist!!!1")
+    c.disconnect()
+    evt_done.set()
 
 def start_server(ip, port):
-	done = threading.Event()
-	from httpsrv import HttpSrv
-	hs = HttpSrv(ip, port)
-	try:
-		hs.setup()
-	except socket.error as e:
-		if e.errno == errno.EADDRINUSE:
-			sys.stderr.write((
-				"ERROR: server socket address in use\n"
-				"wait a couple seconds and try again.\n"
-				"in case you're in pdb, you need to quit it\n"))
-			sys.exit(1)
-		else:
-			raise e
+    done = threading.Event()
+    from httpsrv import HttpSrv
+    hs = HttpSrv(ip, port)
+    try:
+        hs.setup()
+    except socket.error as e:
+        if e.errno == errno.EADDRINUSE:
+            sys.stderr.write((
+                "ERROR: server socket address in use\n"
+                "wait a couple seconds and try again.\n"
+                "in case you're in pdb, you need to quit it\n"))
+            sys.exit(1)
+        else:
+            raise e
 
-	t = threading.Thread(target=serve_loop, args=(hs, done))
-	t.daemon = True
-	t.start()
-	return t, done
+    t = threading.Thread(target=serve_loop, args=(hs, done))
+    t.daemon = True
+    t.start()
+    return t, done
 
 if __name__ == '__main__':
-	config.load()
-	fetch.set_config(config)
+    config.load()
+    fetch.set_config(config)
 
-	proxydb = mysqlite.mysqlite(config.watchd.database, str)
-	dbs.create_table_if_not_exists(proxydb, 'proxylist')
+    proxydb = mysqlite.mysqlite(config.watchd.database, str)
+    dbs.create_table_if_not_exists(proxydb, 'proxylist')
 
-	with open('urignore.txt', 'r') as f:
-		urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
+    with open('urignore.txt', 'r') as f:
+        urignore = [ i.strip() for i in f.read().split('\n') if len(i.strip()) ]
 
-	urldb = mysqlite.mysqlite(config.ppf.database, str)
-	dbs.create_table_if_not_exists(urldb, 'uris')
-	import_from_file('import.txt', urldb)
-	if len(sys.argv) == 3 and sys.argv[1] == "--file":
-		sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
+    urldb = mysqlite.mysqlite(config.ppf.database, str)
+    dbs.create_table_if_not_exists(urldb, 'uris')
+    import_from_file('import.txt', urldb)
+    if len(sys.argv) == 3 and sys.argv[1] == "--file":
+        sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
 
-	# start proxy watcher
-	if config.watchd.threads > 0:
-		watcherd = proxywatchd.Proxywatchd()
-		watcherd.start()
-	else:
-		watcherd = None
+    # start proxy watcher
+    if config.watchd.threads > 0:
+        watcherd = proxywatchd.Proxywatchd()
+        watcherd.start()
+    else:
+        watcherd = None
 
-	start_server(config.httpd.listenip, config.httpd.port)
+    start_server(config.httpd.listenip, config.httpd.port)
 
-	while True:
-		try:
-			## any site that needs to be checked ?
-			rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?)