Extract URLs only from the same domain? (default: False)
Setting this option makes ppf skip external links when extracting URIs.
This commit is contained in:
@@ -38,6 +38,7 @@ class Config(ComboParser):
|
|||||||
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
|
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
|
||||||
self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
|
self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
|
||||||
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
|
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
|
||||||
|
self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)
|
||||||
|
|
||||||
section = 'scraper'
|
section = 'scraper'
|
||||||
self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
|
self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
|
||||||
|
|||||||
5
ppf.py
5
ppf.py
@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
|
|||||||
dbs.insert_proxies(proxydb, new, url)
|
dbs.insert_proxies(proxydb, new, url)
|
||||||
|
|
||||||
def is_bad_url(uri, domain=None, samedomain=False):
|
def is_bad_url(uri, domain=None, samedomain=False):
|
||||||
|
# if the uri must be from the same domain and the domains mismatch
|
||||||
|
if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
|
||||||
|
return True
|
||||||
for u in urignore:
|
for u in urignore:
|
||||||
if re.findall(u, uri): return True
|
if re.findall(u, uri): return True
|
||||||
return False
|
return False
|
||||||
@@ -89,7 +92,7 @@ def extract_urls(html, url):
|
|||||||
elif not item.startswith('http'):
|
elif not item.startswith('http'):
|
||||||
if not item.startswith('/'): item = '/%s' % item
|
if not item.startswith('/'): item = '/%s' % item
|
||||||
item = '%s://%s%s' % (proto,domain,item)
|
item = '%s://%s%s' % (proto,domain,item)
|
||||||
elif is_bad_url(item):
|
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not item in urls: urls.append(item)
|
if not item in urls: urls.append(item)
|
||||||
|
|||||||
Reference in New Issue
Block a user