extract urls only from the same domain? (default: False)
Setting this option makes ppf skip external links when extracting URIs.
This commit is contained in:
@@ -38,6 +38,7 @@ class Config(ComboParser):
|
||||
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
|
||||
self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
|
||||
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
|
||||
self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)
|
||||
|
||||
section = 'scraper'
|
||||
self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
|
||||
|
||||
5
ppf.py
5
ppf.py
@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
|
||||
dbs.insert_proxies(proxydb, new, url)
|
||||
|
||||
def is_bad_url(uri, domain=None, samedomain=False):
    """Return True if `uri` should be skipped when extracting links.

    uri        -- url to check; the host is taken as the third '/'-separated
                  piece, i.e. the form scheme://host/path is expected
    domain     -- host that `uri` is compared against when `samedomain` is set
    samedomain -- when True, reject any uri whose host differs from `domain`
                  (case-insensitive comparison)

    Independently of `samedomain`, a uri is rejected when any regex in
    `urignore` matches it.
    """
    if samedomain:
        # 'scheme://host/path'.split('/') -> ['scheme:', '', 'host', 'path'].
        # A uri with fewer than three pieces has no host part, so it cannot
        # be on the same domain: reject it instead of raising IndexError.
        parts = uri.split('/')
        if len(parts) < 3 or parts[2].lower() != str(domain).lower():
            return True
    for u in urignore:
        # search() stops at the first hit; only the truthiness is needed
        if re.search(u, uri): return True
    return False
|
||||
@@ -89,7 +92,7 @@ def extract_urls(html, url):
|
||||
elif not item.startswith('http'):
|
||||
if not item.startswith('/'): item = '/%s' % item
|
||||
item = '%s://%s%s' % (proto,domain,item)
|
||||
elif is_bad_url(item):
|
||||
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
|
||||
continue
|
||||
|
||||
if not item in urls: urls.append(item)
|
||||
|
||||
Reference in New Issue
Block a user