fetch.py: improve readability of extract_urls
This commit is contained in:
8
fetch.py
8
fetch.py
@@ -107,7 +107,11 @@ def extract_urls(content, urls = None, urignore=None):
|
|||||||
soup = soupify(content)
|
soup = soupify(content)
|
||||||
for a in soup.body.find_all('a'):
|
for a in soup.body.find_all('a'):
|
||||||
if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
|
if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
|
||||||
badurl = [ i for i in urignore if re.findall(i,a.attrs['href'], re.IGNORECASE) ]
|
bad = False
|
||||||
if not len(badurl): urls.append(a.attrs['href'])
|
for i in urignore:
|
||||||
|
if re.findall(i,a.attrs['href'], re.IGNORECASE):
|
||||||
|
bad = True
|
||||||
|
break
|
||||||
|
if not bad: urls.append(a.attrs['href'])
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user