ppf: check content-type (once) before trying to download/extract proxies

Avoid trying to extract proxies from PDFs and other non-text content (only text/* content types are accepted).

REQUIRES:
sqlite3 websites.sqlite "alter table uris add content_type text"

To skip the content-type check for URIs already known to be error-free (error=0), pre-fill their content_type:
sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"
This commit is contained in:
Mickaël Serneels
2019-04-22 21:45:13 +02:00
parent e19c473514
commit 0155c6f2ad
2 changed files with 26 additions and 7 deletions

View File

@@ -22,7 +22,7 @@ def cleanhtml(raw_html):
return html
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def fetch_contents(url):
def fetch_contents(url, head = False):
host, port, ssl, uri = _parse_url(url)
headers=[
'Accept-Language: en-US,en;q=0.8',
@@ -49,6 +49,12 @@ def fetch_contents(url):
continue
return ''
break
## only request header
if head:
hdr = http.head(uri, headers)
return hdr
hdr, res = http.get(uri, headers)
res = res.encode('utf-8') if isinstance(res, unicode) else res
for retry_message in retry_messages: