Check whether the URL is bad *after* building the URL
This commit is contained in:
6
ppf.py
6
ppf.py
@@ -84,13 +84,13 @@ def extract_urls(html, url):
|
|||||||
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
|
||||||
if is_bad_url(item):
|
if item.startswith('www.'):
|
||||||
continue
|
|
||||||
elif item.startswith('www.'):
|
|
||||||
item = 'http://%s' % item
|
item = 'http://%s' % item
|
||||||
elif not item.startswith('http'):
|
elif not item.startswith('http'):
|
||||||
if not item.startswith('/'): item = '/%s' % item
|
if not item.startswith('/'): item = '/%s' % item
|
||||||
item = '%s://%s%s' % (proto,domain,item)
|
item = '%s://%s%s' % (proto,domain,item)
|
||||||
|
elif is_bad_url(item):
|
||||||
|
continue
|
||||||
|
|
||||||
if not item in urls: urls.append(item)
|
if not item in urls: urls.append(item)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user