diff --git a/config.py b/config.py
index a898e13..ed29ece 100644
--- a/config.py
+++ b/config.py
@@ -38,6 +38,7 @@ class Config(ComboParser):
         self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
         self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
         self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
+        self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)
 
         section = 'scraper'
         self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
diff --git a/ppf.py b/ppf.py
index a23cbf8..e1f67c0 100755
--- a/ppf.py
+++ b/ppf.py
@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
         dbs.insert_proxies(proxydb, new, url)
 def is_bad_url(uri, domain=None, samedomain=False):
+    # if uri needs to be from same domain and domains mismatch
+    if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
+        return True
     for u in urignore:
         if re.findall(u, uri):
             return True
     return False
@@ -89,7 +92,7 @@ def extract_urls(html, url):
         elif not item.startswith('http'):
             if not item.startswith('/'): item = '/%s' % item
             item = '%s://%s%s' % (proto,domain,item)
-        elif is_bad_url(item):
+        elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
             continue
         if not item in urls:
             urls.append(item)