space2tab

This commit is contained in:
Mickaël Serneels
2019-05-14 19:27:09 +02:00
parent bcaf7af0e7
commit eeae849e12

40
ppf.py
View File

@@ -67,34 +67,34 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
dbs.insert_proxies(proxydb, new, url)
def is_bad_url(uri, domain=None, samedomain=False):
    """Return True if ``uri`` matches any pattern in ``urignore``.

    Each entry of the module-level ``urignore`` sequence is treated as a
    regular expression and searched for anywhere inside ``uri``.

    ``domain`` and ``samedomain`` are accepted so that callers may pass
    them, but this implementation does not consult them.
    """
    # Only a yes/no answer is needed, so stop at the first hit instead of
    # collecting every match the way re.findall would.
    return any(re.search(pattern, uri) for pattern in urignore)
def extract_urls(html, url):
    """Collect the links found in ``html`` and hand them to ``dbs.insert_urls``.

    html: page markup; parsed with BeautifulSoup using the lxml backend.
    url:  address the page came from. Its scheme (text before the first
          ':') and host (third '/'-separated field) are used to make
          relative links absolute, so it is expected to look like
          ``scheme://host/...``.

    Links rejected by ``is_bad_url`` are skipped. Nothing is returned; the
    de-duplicated list, in first-seen order, is passed to
    ``dbs.insert_urls`` only when it is non-empty.
    """
    proto = url.split(':')[0]
    domain = url.split('/')[2]
    urls = []
    seen = set()  # constant-time duplicate check; ``urls`` keeps the order

    soup = BeautifulSoup(html, features='lxml')

    for a in soup.find_all('a', href=True):
        # Work on byte strings throughout (Python 2 ``unicode`` -> utf-8).
        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
        item = item.strip()

        if is_bad_url(item):
            continue

        if item.startswith('www.'):
            item = 'http://%s' % item
        elif item.startswith('//'):
            # Protocol-relative link: it already names its own host, so only
            # the scheme of the current page has to be prepended.
            item = '%s:%s' % (proto, item)
        elif not item.startswith('http'):
            # Everything else is treated as a path on the current host.
            if not item.startswith('/'):
                item = '/%s' % item
            item = '%s://%s%s' % (proto, domain, item)

        if item not in seen:
            seen.add(item)
            urls.append(item)

    # ``urldb`` is not a parameter here; it is resolved from the enclosing
    # (module) scope, exactly as in the original code.
    if urls:
        dbs.insert_urls(urls, url, urldb)
def import_proxies_from_file(proxydb, fn):
content = open(fn, 'r').read()