Extract URLs only from the same domain? (default: False)

Setting this option makes ppf skip external links when extracting URIs.
2019-05-14 21:24:29 +02:00
parent b226bc0b03
commit eeedf9d0a1
2 changed files with 5 additions and 1 deletions
--- a/config.py
+++ b/config.py
@@ -38,6 +38,7 @@ class Config(ComboParser):
 		self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
 		self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
 		self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
+		self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)

 		section = 'scraper'
 		self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
--- a/ppf.py
+++ b/ppf.py
@@ -68,6 +68,9 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde
 	dbs.insert_proxies(proxydb, new, url)

 def is_bad_url(uri, domain=None, samedomain=False):
+	# reject the uri if it must be from the same domain and the domains mismatch
+	if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
+		return True
 	for u in urignore:
 		if re.findall(u, uri): return True
 	return False
@@ -89,7 +92,7 @@ def extract_urls(html, url):
 		elif not item.startswith('http'):
 			if not item.startswith('/'): item = '/%s' % item
 			item = '%s://%s%s' % (proto,domain,item)
-		elif is_bad_url(item):
+		elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
 			continue

 		if not item in urls: urls.append(item)