space2tab
This commit is contained in:
40
ppf.py
40
ppf.py
@@ -67,34 +67,34 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
|
|||||||
|
|
||||||
dbs.insert_proxies(proxydb, new, url)
|
dbs.insert_proxies(proxydb, new, url)
|
||||||
|
|
||||||
def is_bad_url(uri):
|
def is_bad_url(uri, domain=None, samedomain=False):
|
||||||
for u in urignore:
|
for u in urignore:
|
||||||
if re.findall(u, uri): return True
|
if re.findall(u, uri): return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def extract_urls(html, url):
|
def extract_urls(html, url):
|
||||||
mytime = int(time.time())
|
mytime = int(time.time())
|
||||||
proto = url.split(':')[0]
|
proto = url.split(':')[0]
|
||||||
domain = url.split('/')[2]
|
domain = url.split('/')[2]
|
||||||
urls = []
|
urls = []
|
||||||
|
|
||||||
soup = BeautifulSoup(html, features='lxml')
|
soup = BeautifulSoup(html, features='lxml')
|
||||||
|
|
||||||
for a in soup.find_all('a', href=True):
|
for a in soup.find_all('a', href=True):
|
||||||
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
|
||||||
if is_bad_url(item):
|
if is_bad_url(item):
|
||||||
continue
|
continue
|
||||||
elif item.startswith('www.'):
|
elif item.startswith('www.'):
|
||||||
item = 'http://%s' % item
|
item = 'http://%s' % item
|
||||||
elif not item.startswith('http'):
|
elif not item.startswith('http'):
|
||||||
if not item.startswith('/'): item = '/%s' % item
|
if not item.startswith('/'): item = '/%s' % item
|
||||||
item = '%s://%s%s' % (proto,domain,item)
|
item = '%s://%s%s' % (proto,domain,item)
|
||||||
|
|
||||||
if not item in urls: urls.append(item)
|
if not item in urls: urls.append(item)
|
||||||
|
|
||||||
if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
||||||
|
|
||||||
def import_proxies_from_file(proxydb, fn):
|
def import_proxies_from_file(proxydb, fn):
|
||||||
content = open(fn, 'r').read()
|
content = open(fn, 'r').read()
|
||||||
|
|||||||
Reference in New Issue
Block a user