Extract URLs only from the same domain? (default: False)

Setting this option makes ppf not follow external links when extracting URIs.
This commit is contained in:
Mickaël Serneels
2019-05-14 21:24:29 +02:00
parent b226bc0b03
commit eeedf9d0a1
2 changed files with 5 additions and 1 deletions

5
ppf.py
View File

@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
dbs.insert_proxies(proxydb, new, url)
def is_bad_url(uri, domain=None, samedomain=False):
    """Return True when `uri` should be skipped, False otherwise.

    uri        -- the URI string to check.
    domain     -- host the URI is expected to belong to; only consulted
                  when `samedomain` is true.
    samedomain -- when true, reject any URI whose host part (the text
                  between the second and third '/') differs from `domain`,
                  compared case-insensitively.

    Regardless of `samedomain`, the URI is rejected when any pattern in
    the module-level `urignore` iterable matches it.
    """
    if samedomain:
        # For 'scheme://host/path' the host is the third '/'-separated piece.
        parts = uri.split('/')
        # A URI too short to carry a host cannot be confirmed as belonging
        # to `domain`, so reject it instead of raising IndexError.
        if len(parts) < 3:
            return True
        # if uri needs to be from same domain and domains mismatch
        if str(parts[2]).lower() != str(domain).lower():
            return True
    for u in urignore:
        # re.search stops at the first hit; truthiness matches re.findall.
        if re.search(u, uri):
            return True
    return False
@@ -89,7 +92,7 @@ def extract_urls(html, url):
elif not item.startswith('http'):
if not item.startswith('/'): item = '/%s' % item
item = '%s://%s%s' % (proto,domain,item)
elif is_bad_url(item):
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
continue
if not item in urls: urls.append(item)