Check whether the URL is bad *after* building the full URL

This commit is contained in:
Mickaël Serneels
2019-05-14 19:31:19 +02:00
parent eeae849e12
commit b226bc0b03

6
ppf.py
View File

@@ -84,13 +84,13 @@ def extract_urls(html, url):
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
item = item.strip()
if is_bad_url(item):
continue
elif item.startswith('www.'):
if item.startswith('www.'):
item = 'http://%s' % item
elif not item.startswith('http'):
if not item.startswith('/'): item = '/%s' % item
item = '%s://%s%s' % (proto,domain,item)
elif is_bad_url(item):
continue
if not item in urls: urls.append(item)