ppf: check content-type (once) before trying to download/extract proxies

Avoid trying to extract proxies from PDFs and other non-text responses (only text/* and atom+xml content types are accepted).

REQUIRES:
sqlite3 websites.sqlite "alter table uris add content_type text"

Optional: skip the content-type check for already-known working URIs (error=0) by pre-filling their content_type:
sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"
This commit is contained in:
Mickaël Serneels
2019-04-22 21:45:13 +02:00
parent e19c473514
commit 0155c6f2ad
2 changed files with 26 additions and 7 deletions

25
ppf.py
View File

@@ -21,10 +21,23 @@ def import_from_file(fn, sqlite):
sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
sqlite.commit()
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
def get_content_type(url):
    """Return the lower-cased Content-Type header value for *url*, or ''.

    Performs a header-only request through
    ``fetch.fetch_contents(url, head=True)`` and scans the result line by
    line for a ``Content-Type`` field.  The whole field value is returned
    (including any parameters such as ``; charset=...``), stripped of
    surrounding whitespace and lower-cased.

    An empty string is returned when no header text is available or when
    no Content-Type field is present.  Exceptions raised by
    ``fetch.fetch_contents`` are not caught here and propagate to the
    caller.

    NOTE(review): assumes ``fetch.fetch_contents(..., head=True)`` returns
    the raw response headers as newline-separated text -- confirm against
    the fetch module.
    """
    hdr = fetch.fetch_contents(url, head=True)
    # Nothing usable came back (empty string / None): report "unknown".
    if not hdr:
        return ''
    for line in hdr.split('\n'):
        # Split on the FIRST colon only, so a value that itself contains
        # a colon is kept intact.
        name, sep, value = line.partition(':')
        # Field names are case-insensitive and the whitespace after the
        # colon is optional, so compare the name alone and strip the value
        # (this also drops a trailing '\r' from CRLF line endings).
        if sep and name.lower() == 'content-type':
            return value.strip().lower()
    return ''
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
if not content_type: content_type = get_content_type(url)
if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower():
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
else:
print('WRONG CONTENT_TYPE: %s (%s)' % (url, content_type))
content = ''
unique_count, new = fetch.extract_proxies(content, proxydb)
@@ -81,10 +94,10 @@ if __name__ == '__main__':
while True:
try:
## any site that needs to be checked ?
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
for row in rows:
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4])
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4], row[5])
time.sleep(10)