diff --git a/ppf.py b/ppf.py index d42fde6..0eada2f 100755 --- a/ppf.py +++ b/ppf.py @@ -67,34 +67,34 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde dbs.insert_proxies(proxydb, new, url) -def is_bad_url(uri): - for u in urignore: - if re.findall(u, uri): return True - return False +def is_bad_url(uri, domain=None, samedomain=False): + for u in urignore: + if re.findall(u, uri): return True + return False def extract_urls(html, url): - mytime = int(time.time()) - proto = url.split(':')[0] - domain = url.split('/')[2] - urls = [] + mytime = int(time.time()) + proto = url.split(':')[0] + domain = url.split('/')[2] + urls = [] - soup = BeautifulSoup(html, features='lxml') + soup = BeautifulSoup(html, features='lxml') - for a in soup.find_all('a', href=True): - item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] + for a in soup.find_all('a', href=True): + item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href'] item = item.strip() - if is_bad_url(item): - continue - elif item.startswith('www.'): - item = 'http://%s' % item - elif not item.startswith('http'): - if not item.startswith('/'): item = '/%s' % item - item = '%s://%s%s' % (proto,domain,item) + if is_bad_url(item): + continue + elif item.startswith('www.'): + item = 'http://%s' % item + elif not item.startswith('http'): + if not item.startswith('/'): item = '/%s' % item + item = '%s://%s%s' % (proto,domain,item) - if not item in urls: urls.append(item) + if not item in urls: urls.append(item) - if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) + if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls) def import_proxies_from_file(proxydb, fn): content = open(fn, 'r').read()