ppf: check content-type (once) before trying to download/extract proxies
Avoid trying to extract anything from PDFs and similar non-text content (only text/* is accepted). REQUIRES: sqlite3 websites.sqlite "alter table uris add content_type text". To skip testing already-known URIs, run: sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"
This commit is contained in:
8
fetch.py
8
fetch.py
@@ -22,7 +22,7 @@ def cleanhtml(raw_html):
|
||||
return html
|
||||
|
||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||
def fetch_contents(url):
|
||||
def fetch_contents(url, head = False):
|
||||
host, port, ssl, uri = _parse_url(url)
|
||||
headers=[
|
||||
'Accept-Language: en-US,en;q=0.8',
|
||||
@@ -49,6 +49,12 @@ def fetch_contents(url):
|
||||
continue
|
||||
return ''
|
||||
break
|
||||
|
||||
## only request header
|
||||
if head:
|
||||
hdr = http.head(uri, headers)
|
||||
return hdr
|
||||
|
||||
hdr, res = http.get(uri, headers)
|
||||
res = res.encode('utf-8') if isinstance(res, unicode) else res
|
||||
for retry_message in retry_messages:
|
||||
|
||||
Reference in New Issue
Block a user