space2tab
This commit is contained in:
40
ppf.py
40
ppf.py
@@ -67,34 +67,34 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
|
||||
|
||||
dbs.insert_proxies(proxydb, new, url)
|
||||
|
||||
def is_bad_url(uri):
|
||||
for u in urignore:
|
||||
if re.findall(u, uri): return True
|
||||
return False
|
||||
def is_bad_url(uri, domain=None, samedomain=False):
|
||||
for u in urignore:
|
||||
if re.findall(u, uri): return True
|
||||
return False
|
||||
|
||||
def extract_urls(html, url):
|
||||
mytime = int(time.time())
|
||||
proto = url.split(':')[0]
|
||||
domain = url.split('/')[2]
|
||||
urls = []
|
||||
mytime = int(time.time())
|
||||
proto = url.split(':')[0]
|
||||
domain = url.split('/')[2]
|
||||
urls = []
|
||||
|
||||
soup = BeautifulSoup(html, features='lxml')
|
||||
soup = BeautifulSoup(html, features='lxml')
|
||||
|
||||
for a in soup.find_all('a', href=True):
|
||||
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
||||
for a in soup.find_all('a', href=True):
|
||||
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
||||
item = item.strip()
|
||||
|
||||
if is_bad_url(item):
|
||||
continue
|
||||
elif item.startswith('www.'):
|
||||
item = 'http://%s' % item
|
||||
elif not item.startswith('http'):
|
||||
if not item.startswith('/'): item = '/%s' % item
|
||||
item = '%s://%s%s' % (proto,domain,item)
|
||||
if is_bad_url(item):
|
||||
continue
|
||||
elif item.startswith('www.'):
|
||||
item = 'http://%s' % item
|
||||
elif not item.startswith('http'):
|
||||
if not item.startswith('/'): item = '/%s' % item
|
||||
item = '%s://%s%s' % (proto,domain,item)
|
||||
|
||||
if not item in urls: urls.append(item)
|
||||
if not item in urls: urls.append(item)
|
||||
|
||||
if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
||||
if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
||||
|
||||
def import_proxies_from_file(proxydb, fn):
|
||||
content = open(fn, 'r').read()
|
||||
|
||||
Reference in New Issue
Block a user