extract URLs only from the same domain? (default: False)

Setting this option makes ppf skip external links (links pointing to a different domain) when extracting URIs.
This commit is contained in:
Mickaël Serneels
2019-05-14 21:24:29 +02:00
parent b226bc0b03
commit eeedf9d0a1
2 changed files with 5 additions and 1 deletions

View File

@@ -38,6 +38,7 @@ class Config(ComboParser):
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)
section = 'scraper'
self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)

5
ppf.py
View File

@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
dbs.insert_proxies(proxydb, new, url)
def is_bad_url(uri, domain=None, samedomain=False):
    """Return True if `uri` should be skipped, False otherwise.

    uri        -- the URI string to check.
    domain     -- host the URI is expected to belong to; only consulted
                  when `samedomain` is true.
    samedomain -- when true, reject any URI whose host part differs
                  (case-insensitively) from `domain`.

    Independently of the domain check, a URI is rejected when any regex
    in the module-level `urignore` collection matches it.
    """
    if samedomain:
        # For 'scheme://host/path' the host is the third '/'-separated
        # field (it still carries ':port' if one is present). A URI with
        # fewer fields (e.g. a relative link such as 'http-list.html')
        # has no host at all, so it cannot match and is rejected instead
        # of raising IndexError.
        parts = uri.split('/')
        host = parts[2] if len(parts) > 2 else ''
        if host.lower() != str(domain).lower():
            return True
    for pattern in urignore:
        # re.search is enough: only the existence of a match matters.
        if re.search(pattern, uri):
            return True
    return False
@@ -89,7 +92,7 @@ def extract_urls(html, url):
elif not item.startswith('http'):
if not item.startswith('/'): item = '/%s' % item
item = '%s://%s%s' % (proto,domain,item)
elif is_bad_url(item):
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
continue
if not item in urls: urls.append(item)