ppf: check content-type (once) before trying to download/extract proxies
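Avoid trying to extract proxies from PDFs and other non-text responses: only content types starting with text/ (or containing atom+xml) are accepted.
REQUIRES (schema migration): sqlite3 websites.sqlite "alter table uris add content_type text"
Optional, to skip the content-type check for already-known URIs: sqlite3 websites.sqlite "update uris set content_type='text/manual' WHERE error=0"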
This commit is contained in:
8
fetch.py
8
fetch.py
@@ -22,7 +22,7 @@ def cleanhtml(raw_html):
|
|||||||
return html
|
return html
|
||||||
|
|
||||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||||
def fetch_contents(url):
|
def fetch_contents(url, head = False):
|
||||||
host, port, ssl, uri = _parse_url(url)
|
host, port, ssl, uri = _parse_url(url)
|
||||||
headers=[
|
headers=[
|
||||||
'Accept-Language: en-US,en;q=0.8',
|
'Accept-Language: en-US,en;q=0.8',
|
||||||
@@ -49,6 +49,12 @@ def fetch_contents(url):
|
|||||||
continue
|
continue
|
||||||
return ''
|
return ''
|
||||||
break
|
break
|
||||||
|
|
||||||
|
## only request header
|
||||||
|
if head:
|
||||||
|
hdr = http.head(uri, headers)
|
||||||
|
return hdr
|
||||||
|
|
||||||
hdr, res = http.get(uri, headers)
|
hdr, res = http.get(uri, headers)
|
||||||
res = res.encode('utf-8') if isinstance(res, unicode) else res
|
res = res.encode('utf-8') if isinstance(res, unicode) else res
|
||||||
for retry_message in retry_messages:
|
for retry_message in retry_messages:
|
||||||
|
|||||||
25
ppf.py
25
ppf.py
@@ -21,10 +21,23 @@ def import_from_file(fn, sqlite):
|
|||||||
sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
|
sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
|
||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
|
|
||||||
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added):
|
def get_content_type(url):
|
||||||
try: content = fetch.fetch_contents(url)
|
hdr = fetch.fetch_contents(url, head=True)
|
||||||
except KeyboardInterrupt as e: raise e
|
|
||||||
except: content = ''
|
for h in hdr.split('\n'):
|
||||||
|
if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip()
|
||||||
|
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
|
||||||
|
if not content_type: content_type = get_content_type(url)
|
||||||
|
if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower():
|
||||||
|
try: content = fetch.fetch_contents(url)
|
||||||
|
except KeyboardInterrupt as e: raise e
|
||||||
|
except: content = ''
|
||||||
|
else:
|
||||||
|
print('WRONG CONTENT_TYPE: %s (%s)' % (url, content_type))
|
||||||
|
content = ''
|
||||||
|
|
||||||
unique_count, new = fetch.extract_proxies(content, proxydb)
|
unique_count, new = fetch.extract_proxies(content, proxydb)
|
||||||
|
|
||||||
@@ -81,10 +94,10 @@ if __name__ == '__main__':
|
|||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
## any site that needs to be checked ?
|
## any site that needs to be checked ?
|
||||||
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
|
rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
|
||||||
|
|
||||||
for row in rows:
|
for row in rows:
|
||||||
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4])
|
proxyleech(proxydb, urldb, row[0], row[1], row[2], row[3], row[4], row[5])
|
||||||
|
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user