diff --git a/config.ini.sample b/config.ini.sample
index 04cb8c2..201a1be 100644
--- a/config.ini.sample
+++ b/config.ini.sample
@@ -13,9 +13,10 @@ perfail_checktime = 3600
 database = proxies.sqlite
 
 [ppf]
+max_fail = 5
 search = true
 timeout = 30
 http_retries = 1
 checktime = 3600
-perfail_checktime = 3600
+perfail_checktime = 16000
 database = websites.sqlite
diff --git a/config.py b/config.py
index 772e5a5..dd5d688 100644
--- a/config.py
+++ b/config.py
@@ -28,5 +28,6 @@ class Config(ComboParser):
         self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False)
         self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False)
         self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False)
-        self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per experienced failure', False)
+        self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
+        self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
         self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
diff --git a/ppf.py b/ppf.py
index 8700e47..d91eff0 100755
--- a/ppf.py
+++ b/ppf.py
@@ -4,7 +4,6 @@ import dbs
 import random, time
 import re
 import urllib
-import hashlib
 import mysqlite
 import proxywatchd
 from misc import _log
@@ -79,31 +78,6 @@ def fetch_contents(url):
 
     return res
 
-def valid_port(proxy):
-    ip, port = proxy.split(':')
-    port = int(port)
-    return port > 0 and port < 65535
-
-_known_proxies = {}
-def insert_proxies(proxies, uri, sqlite, timestamp):
-    global _known_proxies
-    if len(_known_proxies) == 0:
-        known = sqlite.execute('SELECT proxy FROM proxylist').fetchall()
-        for k in known:
-            _known_proxies[k[0]] = True
-
-    new = []
-    for p in proxies:
-        if not p in _known_proxies:
-            if not valid_port(p): continue
-            new.append((timestamp,p,3,0,0,0))
-            _known_proxies[p] = True
-
-    if len(new):
-        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
-        sqlite.commit()
-        _log('+%d item(s) from %s' % (len(new), uri), 'added')
-
 def proxyfind(sqlite = None):
     if not sqlite:
         sqlite = mysqlite.mysqlite(config.ppf.database,str)
@@ -142,8 +116,14 @@ def insert_urls(urls, search, sqlite):
         sqlite.commit()
         _log('+%d item(s) from %s' % (len(new), search), 'added')
 
+def valid_port(port):
+    return port > 0 and port < 65535
+
 def is_usable_proxy(proxy):
-    octets = proxy.split(':')[0].split('.')
+    ip, port = proxy.split(':')
+    if not valid_port(int(port)): return False
+
+    octets = ip.split('.')
     A = int(octets[0])
     B = int(octets[1])
     C = int(octets[2])
@@ -156,48 +136,67 @@
        (A == 172 and B >= 16 and B <= 31): return False
     return True
 
-def proxyleech(proxydb, urldb, rows):
-    for row in rows:
-        try: content = fetch_contents(row[0])
-        except KeyboardInterrupt as e: raise e
-        except: content = ''
+def insert_proxies(proxies, sqlite, timestamp):
+    new = []
+    for p in proxies:
+        new.append((timestamp,p,3,0,0,0))
 
-        matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
+    if len(new):
+        sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
+        sqlite.commit()
 
-        uniques_dict = {}
-        for p in matches:
-            uniques_dict[p] = True
+_known_proxies = {}
+def proxyleech(proxydb, urldb, url, stale_count, error):
+    try: content = fetch_contents(url)
+    except KeyboardInterrupt as e: raise e
+    except: content = ''
 
-        uniques = []
-        for p in uniques_dict.keys():
-            if is_usable_proxy(p): uniques.append(p)
+    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
 
-        hash = hashlib.md5(''.join(uniques)).hexdigest()
+    uniques_dict = {}
+    for p in matches:
+        uniques_dict[p] = True
 
-        ## empty list of proxies: multiply error by two
-        if not len(uniques):
-            if row[1]: row[2] = (row[2] * 2)
-            else: row[2] = 99999
+    uniques = []
+    for p in uniques_dict.keys():
+        if is_usable_proxy(p): uniques.append(p)
 
-        ## same proxy list: increment error by one
-        elif hash == row[1]: row[2] = (row[2] + 1)
-        ## proxylist was updated: error is zero
-        else: row[2] = 0
+    global _known_proxies
+    if len(_known_proxies) == 0:
+        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
+        for k in known:
+            _known_proxies[k[0]] = True
 
-        urldb.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2],hash, int(time.time()),row[0]))
-        urldb.commit()
+    new = []
+    for p in uniques:
+        if not p in _known_proxies:
+            new.append(p)
+            _known_proxies[p] = True
 
-        if not row[1] or row[2] > 0: return
+    if stale_count == 0 and error == 0: # new site
+        if content != '' and len(uniques) == 0: # site works but has zero proxy addresses
+            error = 99999
+    else:
+        if len(new) == 0: stale_count += 1
+        if content == '':
+            error += 1
+        else:
+            error = 0
 
 
-        add = []
-        time_now = int(time.time())
-        for i in uniques:
-            add.append(i)
-            if len(add) > 500:
-                insert_proxies(add, row[0], proxydb, time_now)
-                add = []
-        if len(add): insert_proxies(add, row[0], proxydb, time_now)
+    urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=? where url=?', (error, stale_count, int(time.time()), url))
+    urldb.commit()
+    if not len(new): return
+
+    add = []
+    time_now = int(time.time())
+    for i in new:
+        add.append(i)
+        if len(add) >= 500:
+            insert_proxies(add, proxydb, time_now)
+            add = []
+    if len(add): insert_proxies(add, proxydb, time_now)
+    _log('+%d item(s) from %s' % (len(new), url), 'added')
 
 
 if __name__ == '__main__':
@@ -233,9 +232,11 @@ if __name__ == '__main__':
     while True:
         try:
             ## any site that needs to be checked ?
-            rows = [ [i[0],i[1],i[2]] for i in urldb.execute('SELECT url,hash,error FROM uris WHERE (check_time+?+(error*?)