ppf.py: start using stale_count
This commit is contained in:
@@ -13,9 +13,10 @@ perfail_checktime = 3600
|
|||||||
database = proxies.sqlite
|
database = proxies.sqlite
|
||||||
|
|
||||||
[ppf]
|
[ppf]
|
||||||
|
max_fail = 5
|
||||||
search = true
|
search = true
|
||||||
timeout = 30
|
timeout = 30
|
||||||
http_retries = 1
|
http_retries = 1
|
||||||
checktime = 3600
|
checktime = 3600
|
||||||
perfail_checktime = 3600
|
perfail_checktime = 16000
|
||||||
database = websites.sqlite
|
database = websites.sqlite
|
||||||
|
|||||||
@@ -28,5 +28,6 @@ class Config(ComboParser):
|
|||||||
self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False)
|
self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False)
|
||||||
self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False)
|
self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False)
|
||||||
self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False)
|
self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False)
|
||||||
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per experienced failure', False)
|
self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
|
||||||
|
self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
|
||||||
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
|
self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
|
||||||
|
|||||||
123
ppf.py
123
ppf.py
@@ -4,7 +4,6 @@ import dbs
|
|||||||
import random, time
|
import random, time
|
||||||
import re
|
import re
|
||||||
import urllib
|
import urllib
|
||||||
import hashlib
|
|
||||||
import mysqlite
|
import mysqlite
|
||||||
import proxywatchd
|
import proxywatchd
|
||||||
from misc import _log
|
from misc import _log
|
||||||
@@ -79,31 +78,6 @@ def fetch_contents(url):
|
|||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def valid_port(proxy):
|
|
||||||
ip, port = proxy.split(':')
|
|
||||||
port = int(port)
|
|
||||||
return port > 0 and port < 65535
|
|
||||||
|
|
||||||
_known_proxies = {}
|
|
||||||
def insert_proxies(proxies, uri, sqlite, timestamp):
|
|
||||||
global _known_proxies
|
|
||||||
if len(_known_proxies) == 0:
|
|
||||||
known = sqlite.execute('SELECT proxy FROM proxylist').fetchall()
|
|
||||||
for k in known:
|
|
||||||
_known_proxies[k[0]] = True
|
|
||||||
|
|
||||||
new = []
|
|
||||||
for p in proxies:
|
|
||||||
if not p in _known_proxies:
|
|
||||||
if not valid_port(p): continue
|
|
||||||
new.append((timestamp,p,3,0,0,0))
|
|
||||||
_known_proxies[p] = True
|
|
||||||
|
|
||||||
if len(new):
|
|
||||||
sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
|
|
||||||
sqlite.commit()
|
|
||||||
_log('+%d item(s) from %s' % (len(new), uri), 'added')
|
|
||||||
|
|
||||||
def proxyfind(sqlite = None):
|
def proxyfind(sqlite = None):
|
||||||
if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
|
if not sqlite: sqlite = mysqlite.mysqlite(config.ppf.database,str)
|
||||||
|
|
||||||
@@ -142,8 +116,14 @@ def insert_urls(urls, search, sqlite):
|
|||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
_log('+%d item(s) from %s' % (len(new), search), 'added')
|
_log('+%d item(s) from %s' % (len(new), search), 'added')
|
||||||
|
|
||||||
|
def valid_port(port):
|
||||||
|
return port > 0 and port < 65535
|
||||||
|
|
||||||
def is_usable_proxy(proxy):
|
def is_usable_proxy(proxy):
|
||||||
octets = proxy.split(':')[0].split('.')
|
ip, port = proxy.split(':')
|
||||||
|
if not valid_port(int(port)): return False
|
||||||
|
|
||||||
|
octets = ip.split('.')
|
||||||
A = int(octets[0])
|
A = int(octets[0])
|
||||||
B = int(octets[1])
|
B = int(octets[1])
|
||||||
C = int(octets[2])
|
C = int(octets[2])
|
||||||
@@ -156,48 +136,67 @@ def is_usable_proxy(proxy):
|
|||||||
(A == 172 and B >= 16 and B <= 31): return False
|
(A == 172 and B >= 16 and B <= 31): return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def proxyleech(proxydb, urldb, rows):
|
def insert_proxies(proxies, sqlite, timestamp):
|
||||||
for row in rows:
|
new = []
|
||||||
try: content = fetch_contents(row[0])
|
for p in proxies:
|
||||||
except KeyboardInterrupt as e: raise e
|
new.append((timestamp,p,3,0,0,0))
|
||||||
except: content = ''
|
|
||||||
|
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
if len(new):
|
||||||
|
sqlite.executemany('INSERT INTO proxylist (added,proxy,failed,tested,success_count,total_duration) VALUES (?,?,?,?,?,?)', new)
|
||||||
|
sqlite.commit()
|
||||||
|
|
||||||
uniques_dict = {}
|
_known_proxies = {}
|
||||||
for p in matches:
|
def proxyleech(proxydb, urldb, url, stale_count, error):
|
||||||
uniques_dict[p] = True
|
try: content = fetch_contents(url)
|
||||||
|
except KeyboardInterrupt as e: raise e
|
||||||
|
except: content = ''
|
||||||
|
|
||||||
uniques = []
|
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
||||||
for p in uniques_dict.keys():
|
|
||||||
if is_usable_proxy(p): uniques.append(p)
|
|
||||||
|
|
||||||
hash = hashlib.md5(''.join(uniques)).hexdigest()
|
uniques_dict = {}
|
||||||
|
for p in matches:
|
||||||
|
uniques_dict[p] = True
|
||||||
|
|
||||||
## empty list of proxies: multiply error by two
|
uniques = []
|
||||||
if not len(uniques):
|
for p in uniques_dict.keys():
|
||||||
if row[1]: row[2] = (row[2] * 2)
|
if is_usable_proxy(p): uniques.append(p)
|
||||||
else: row[2] = 99999
|
|
||||||
|
|
||||||
## same proxy list: increment error by one
|
global _known_proxies
|
||||||
elif hash == row[1]: row[2] = (row[2] + 1)
|
if len(_known_proxies) == 0:
|
||||||
## proxylist was updated: error is zero
|
known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
|
||||||
else: row[2] = 0
|
for k in known:
|
||||||
|
_known_proxies[k[0]] = True
|
||||||
|
|
||||||
urldb.execute('UPDATE uris SET error=?,hash=?,check_time=? where url=?', (row[2],hash, int(time.time()),row[0]))
|
new = []
|
||||||
urldb.commit()
|
for p in uniques:
|
||||||
|
if not p in _known_proxies:
|
||||||
|
new.append(p)
|
||||||
|
_known_proxies[p] = True
|
||||||
|
|
||||||
if not row[1] or row[2] > 0: return
|
if stale_count == 0 and error == 0: # new site
|
||||||
|
if content != '' and len(uniques) == 0: # site works but has zero proxy addresses
|
||||||
|
error = 99999
|
||||||
|
else:
|
||||||
|
if len(new) == 0: stale_count += 1
|
||||||
|
if content == '':
|
||||||
|
error += 1
|
||||||
|
else:
|
||||||
|
error = 0
|
||||||
|
|
||||||
add = []
|
urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=? where url=?', (error, stale_count, int(time.time()), url))
|
||||||
time_now = int(time.time())
|
urldb.commit()
|
||||||
for i in uniques:
|
|
||||||
add.append(i)
|
|
||||||
if len(add) > 500:
|
|
||||||
insert_proxies(add, row[0], proxydb, time_now)
|
|
||||||
add = []
|
|
||||||
if len(add): insert_proxies(add, row[0], proxydb, time_now)
|
|
||||||
|
|
||||||
|
if not len(new): return
|
||||||
|
|
||||||
|
add = []
|
||||||
|
time_now = int(time.time())
|
||||||
|
for i in new:
|
||||||
|
add.append(i)
|
||||||
|
if len(add) >= 500:
|
||||||
|
insert_proxies(add, proxydb, time_now)
|
||||||
|
add = []
|
||||||
|
if len(add): insert_proxies(add, proxydb, time_now)
|
||||||
|
_log('+%d item(s) from %s' % (len(new), url), 'added')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@@ -233,9 +232,11 @@ if __name__ == '__main__':
|
|||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
## any site that needs to be checked ?
|
## any site that needs to be checked ?
|
||||||
rows = [ [i[0],i[1],i[2]] for i in urldb.execute('SELECT url,hash,error FROM uris WHERE (check_time+?+(error*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall() ]
|
rows = [ [i[0],i[1],i[2]] for i in urldb.execute('SELECT url,stale_count,error FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM() LIMIT 25', (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall() ]
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
proxyleech(proxydb, urldb, row[0], row[1], row[2])
|
||||||
|
|
||||||
if len(rows): proxyleech(proxydb, urldb, rows)
|
|
||||||
## search for new website during free time
|
## search for new website during free time
|
||||||
if config.ppf.search: proxyfind(urldb)
|
if config.ppf.search: proxyfind(urldb)
|
||||||
## sleep
|
## sleep
|
||||||
|
|||||||
Reference in New Issue
Block a user