diff --git a/ppf.py b/ppf.py index 2108a09..c4e2785 100755 --- a/ppf.py +++ b/ppf.py @@ -21,6 +21,7 @@ def import_from_file(fn, sqlite): sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0)) sqlite.commit() + def get_content_type(url): hdr = fetch.fetch_contents(url, head=True) @@ -29,9 +30,17 @@ def get_content_type(url): return '' +def is_good_content_type(string): + allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ] + for ct in allowed_ct: + if ct.lower() in string.lower(): return True + return False + def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type): if not content_type: content_type = get_content_type(url) - if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower(): + + #if content_type.lower().startswith('text/') or 'atom+xml' in content_type.lower(): + if is_good_content_type(content_type): try: content = fetch.fetch_contents(url) except KeyboardInterrupt as e: raise e except: content = '' @@ -55,7 +64,7 @@ def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_adde retrievals += 1 error = 0 - urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), url)) + urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)) urldb.commit() if not len(new): return @@ -96,10 +105,12 @@ if __name__ == '__main__': ## any site that needs to be checked ? rows = urldb.execute('SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?)