ppf: check content-type (once) before trying to download/extract proxies

Avoid trying to extract proxies from PDFs and other non-text formats (only accept text/* content types).

REQUIRES:
sqlite3 websites.sqlite "alter table uris add content_type text"

Don't test known uris:
sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"
This commit is contained in:
Mickaël Serneels
2019-04-22 21:45:13 +02:00
parent e19c473514
commit 0155c6f2ad
2 changed files with 26 additions and 7 deletions

View File

@@ -22,7 +22,7 @@ def cleanhtml(raw_html):
return html
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def fetch_contents(url):
def fetch_contents(url, head = False):
host, port, ssl, uri = _parse_url(url)
headers=[
'Accept-Language: en-US,en;q=0.8',
@@ -49,6 +49,12 @@ def fetch_contents(url):
continue
return ''
break
## only request header
if head:
hdr = http.head(uri, headers)
return hdr
hdr, res = http.get(uri, headers)
res = res.encode('utf-8') if isinstance(res, unicode) else res
for retry_message in retry_messages:

25
ppf.py
View File

@@ -21,10 +21,23 @@ def import_from_file(fn, sqlite):
sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
sqlite.commit()
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
def get_content_type(url):
    ## Fetch only the response headers for `url` (HEAD request) and return
    ## the lowercased Content-Type value, e.g. 'text/html; charset=utf-8'.
    ## Returns '' when no Content-Type header is present.
    hdr = fetch.fetch_contents(url, head=True)
    for h in hdr.split('\n'):
        ## match 'Content-Type:' with or without a space after the colon;
        ## requiring exactly 'content-type: ' misses legal headers and would
        ## wrongly reject the URL as non-text
        if h.lower().startswith('content-type:'):
            ## split only on the first ':' so a value containing ':' is kept whole
            return h.split(':', 1)[1].strip().lower()
    return ''
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
if not content_type: content_type = get_content_type(url)
if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower():
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
else:
print('WRONG CONTENT_TYPE: %s (%s)' % (url, content_type))
content = ''
unique_count, new = fetch.extract_proxies(content, proxydb)
@@ -81,10 +94,10 @@ if __name__ == '__main__':
while True:
try:
## any site that needs to be checked ?
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
for row in rows:
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4])
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4], row[5])
time.sleep(10)