ppf: check content-type (once) before trying to download/extract proxies

Avoid trying to extract proxies from PDFs and other non-text formats (only accept text/* content types).

REQUIRES:
sqlite3 websites.sqlite "alter table uris add content_type text"

Don't test known uris:
sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"
This commit is contained in:
Mickaël Serneels
2019-04-22 21:45:13 +02:00
parent e19c473514
commit 0155c6f2ad
2 changed files with 26 additions and 7 deletions

View File

@@ -22,7 +22,7 @@ def cleanhtml(raw_html):
return html
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def fetch_contents(url):
def fetch_contents(url, head = False):
host, port, ssl, uri = _parse_url(url)
headers=[
'Accept-Language: en-US,en;q=0.8',
@@ -49,6 +49,12 @@ def fetch_contents(url):
continue
return ''
break
## only request header
if head:
hdr = http.head(uri, headers)
return hdr
hdr, res = http.get(uri, headers)
res = res.encode('utf-8') if isinstance(res, unicode) else res
for retry_message in retry_messages:

25
ppf.py
View File

@@ -21,10 +21,23 @@ def import_from_file(fn, sqlite):
sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
sqlite.commit()
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
def get_content_type(url):
    ## Fetch only the response headers for `url` (HEAD request) and return
    ## the lowercased Content-Type value, e.g. 'text/html; charset=utf-8'.
    ## Returns '' when no Content-Type header is present.
    hdr = fetch.fetch_contents(url, head=True)
    for h in hdr.split('\n'):
        ## match 'Content-Type:' with or without a space after the colon;
        ## requiring exactly 'content-type: ' misses legal headers and would
        ## wrongly reject the URL as non-text
        if h.lower().startswith('content-type:'):
            ## split only on the first ':' so a value containing ':' is kept whole
            return h.split(':', 1)[1].strip().lower()
    return ''
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
if not content_type: content_type = get_content_type(url)
if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower():
try: content = fetch.fetch_contents(url)
except KeyboardInterrupt as e: raise e
except: content = ''
else:
print('WRONG CONTENT_TYPE: %s (%s)' % (url, content_type))
content = ''
unique_count, new = fetch.extract_proxies(content, proxydb)
@@ -81,10 +94,10 @@ if __name__ == '__main__':
while True:
try:
## any site that needs to be checked ?
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
for row in rows:
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4])
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4], row[5])
time.sleep(10)