diff --git a/fetch.py b/fetch.py index 8ebdd90..6b42dda 100644 --- a/fetch.py +++ b/fetch.py @@ -22,7 +22,7 @@ def cleanhtml(raw_html): return html retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded') -def fetch_contents(url): +def fetch_contents(url, head = False): host, port, ssl, uri = _parse_url(url) headers=[ 'Accept-Language: en-US,en;q=0.8', @@ -49,6 +49,12 @@ def fetch_contents(url): continue return '' break + + ## only request header + if head: + hdr = http.head(uri, headers) + return hdr + hdr, res = http.get(uri, headers) res = res.encode('utf-8') if isinstance(res, unicode) else res for retry_message in retry_messages: diff --git a/ppf.py b/ppf.py index f7907d4..2108a09 100755 --- a/ppf.py +++ b/ppf.py @@ -21,10 +21,23 @@ def import_from_file(fn, sqlite): sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0)) sqlite.commit() -def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added): - try: content = fetch.fetch_contents(url) - except KeyboardInterrupt as e: raise e - except: content = '' +def get_content_type(url): + hdr = fetch.fetch_contents(url, head=True) + + for h in hdr.split('\n'): + if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip() + + return '' + +def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): + if not content_type: content_type = get_content_type(url) + if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower(): + try: content = fetch.fetch_contents(url) + except KeyboardInterrupt as e: raise e + except: content = '' + else: + print('WRONG CONTENT_TYPE: %s (%s)' % (url, content_type)) + content = '' unique_count, new = fetch.extract_proxies(content, proxydb) @@ -81,10 +94,10 @@ if __name__ == '__main__': while True: try: ## any site that needs to be checked ? - rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?)