#!/usr/bin/env python2
|
|
|
|
import cProfile
|
|
import pstats
|
|
import signal
|
|
import dbs
|
|
import time
|
|
import mysqlite
|
|
import proxywatchd
|
|
from misc import _log
|
|
from config import Config
|
|
import fetch
|
|
import sys
|
|
from soup_parser import set_nobs
|
|
import re
|
|
import threading
|
|
import random
|
|
|
|
# Global profiler for signal handler access
|
|
_profiler = None  # cProfile.Profile instance; created in __main__ when profiling is on, flushed by sigterm_handler
|
|
|
|
# Handle SIGTERM gracefully (for container stop)
|
|
def sigterm_handler(signum, frame):
    """Translate SIGTERM (e.g. container stop) into KeyboardInterrupt.

    When profiling is active, the collected stats are flushed to disk
    before raising, so a terminated run still yields usable profile data.
    """
    global _profiler
    prof = _profiler
    if prof:
        prof.disable()
        prof.dump_stats('data/profile.stats')
        _log('profile stats written to data/profile.stats (SIGTERM)', 'info')
    raise KeyboardInterrupt
|
|
|
|
# Registered at import time so SIGTERM is converted to KeyboardInterrupt even before main() starts
signal.signal(signal.SIGTERM, sigterm_handler)
|
|
|
|
config = Config()  # shared module-level config; populated via config.load() in __main__
|
|
|
|
|
|
def format_duration(seconds):
    """Render *seconds* as a compact human-readable duration string.

    Uses the two most significant units at most, e.g. '45s', '3m',
    '2h 5m', '1d 3h'; a zero lesser unit is omitted ('2h', '1d').
    """
    if seconds < 60:
        return '%ds' % seconds
    if seconds < 3600:
        return '%dm' % (seconds // 60)
    if seconds < 86400:
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        if minutes:
            return '%dh %dm' % (hours, minutes)
        return '%dh' % hours
    days = seconds // 86400
    hours = (seconds % 86400) // 3600
    if hours:
        return '%dd %dh' % (days, hours)
    return '%dd' % days
|
|
|
|
|
|
def import_from_file(fn, urldb):
    """Import URLs from a text file into the database.

    Blank lines are skipped; URLs are inserted in batches of 200.
    A missing file is silently ignored.
    """
    try:
        with open(fn, 'r') as fh:
            stripped = [line.strip() for line in fh]
    except IOError:
        return  # File not found, silently skip
    urls = [u for u in stripped if u]
    batch_size = 200
    start = 0
    while start < len(urls):
        dbs.insert_urls(urls[start:start + batch_size], 'import.txt', urldb)
        start += batch_size
|
|
|
|
|
|
def get_content_type(url, proxy):
    """Return the lowercased Content-Type header value for *url*, or ''.

    Issues a HEAD request (optionally through *proxy*) and scans the raw
    response headers line by line for a 'Content-Type: ' header.
    """
    hdr = fetch.fetch_contents(url, head=True, proxy=proxy)

    for h in hdr.split('\n'):
        low = h.lower()
        if low.startswith('content-type: '):
            # split(':', 1) so a value containing another ':' is not truncated
            # (the old split(':')[1] dropped everything after a second colon)
            return low.split(':', 1)[1].strip()

    return ''
|
|
|
|
def is_good_content_type(string):
    """Return True when *string* names a content type worth parsing for proxies."""
    haystack = string.lower()
    # Substring match so parameters like '; charset=utf-8' still qualify
    return any(ct in haystack for ct in ('text/html', 'text/plain', 'atom+xml'))
|
|
|
|
def import_proxies_from_file(proxydb, fn):
    """Import proxies from local file *fn* into *proxydb*.

    Returns 0 when new proxies were inserted, 1 otherwise — shell-style
    status used directly as the sys.exit() code in --file mode.
    """
    # with-block ensures the handle is closed (the old open().read() leaked it)
    with open(fn, 'r') as f:
        content = f.read()
    # Detect protocol from filename (e.g., socks5.txt, http-proxies.txt)
    proto = fetch.detect_proto_from_path(fn)
    unique_count, new = fetch.extract_proxies(content, proxydb, proto=proto)
    if new:
        dbs.insert_proxies(proxydb, new, fn)
        return 0
    return 1
|
|
|
|
class Leechered(threading.Thread):
    """Worker thread that fetches one proxy-source URL and extracts proxies.

    Results are communicated through instance attributes read by the main
    loop: ``status`` ('ok' once finished successfully, else 'nok'),
    ``proxylist`` (new (address, proto) tuples), ``new_hash`` (hash of the
    extracted proxy list for change detection), ``hash_unchanged`` and
    ``execute`` (parameter tuple for the uris UPDATE statement).

    NOTE: a ``status()`` method existed here previously but was dead code —
    ``__init__`` assigns the instance attribute ``self.status``, which
    shadows the method on every instance (and its body returned itself,
    not a value). It has been removed; callers use the attribute.
    """

    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, content_hash, proxy):
        # 'ok' once run() finished and self.execute holds valid UPDATE params
        self.status = 'nok'
        # newly discovered (address, proto) tuples, filled in by run()
        self.proxylist = []
        self.running = True
        self.url = url
        # counters carried over from this URL's row in the uris table
        self.stale_count = stale_count
        self.error = error
        self.retrievals = retrievals
        self.proxies_added = proxies_added
        self.content_type = content_type
        # hash of the proxy list from the previous fetch (change detection)
        self.content_hash = content_hash
        self.new_hash = None
        self.hash_unchanged = False
        # list of proxy URLs to route the fetch through, or None for direct
        self.proxy = proxy
        # parameter tuple for the uris UPDATE statement, set by run()
        self.execute = ''
        threading.Thread.__init__(self)

    def retrieve(self):
        """Return the result tuple consumed by the main loop."""
        return self.url, self.proxylist, self.stale_count, self.error, self.retrievals, self.content_type, self.proxies_added, self.execute

    def run(self):
        self.status = 'nok'

        try:
            # HEAD request to learn the content type if we don't know it yet
            if not self.content_type: self.content_type = get_content_type(self.url, self.proxy)

            if is_good_content_type(self.content_type):
                try:
                    content = fetch.fetch_contents(self.url, proxy=self.proxy)
                except KeyboardInterrupt as e:
                    raise e
                except Exception as e:
                    # Build an ascii-safe error message (py2: repr() may be unicode)
                    try:
                        err_msg = repr(e)
                        if isinstance(err_msg, unicode):
                            err_msg = err_msg.encode('ascii', 'backslashreplace')
                    except:
                        err_msg = type(e).__name__
                    _log('%s: fetch error: %s' % (self.url.split('/')[2], err_msg), 'error')
                    content = ''
            else:
                # Unusable content type: treat as an empty page
                content = ''

            # Detect protocol from source URL (e.g., .../socks5/list.txt)
            proto = fetch.detect_proto_from_path(self.url)
            unique = fetch.extract_proxies(content, filter_known=False, proto=proto)

            # Compute hash of all extracted proxies for change detection
            self.new_hash = dbs.compute_proxy_list_hash(unique)

            # Check if content unchanged (same proxies as last time)
            if self.new_hash and self.content_hash and self.new_hash == self.content_hash:
                self.hash_unchanged = True
                self.proxylist = []
                self.stale_count += 1
                next_check = config.ppf.checktime + (self.error + self.stale_count) * config.ppf.perfail_checktime
                _log('%s: unchanged (hash match), next in %s' % (self.url.split('/')[2], format_duration(next_check)), 'stale')
                # Content unchanged - increment stale_count, update check_time
                self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
                self.status = 'ok'
                return

            # Content changed or first fetch - reset stale_count, proceed with normal processing
            self.stale_count = 0
            # unique is list of (address, proto) tuples; filter by address, keep tuple
            self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)]
            proxy_count = len(self.proxylist)

            if self.retrievals == 0: # new site
                if content and not self.proxylist: # site works but has zero proxy addresses
                    self.error += 1
                    self.stale_count += 1
                elif proxy_count:
                    self.error = 0
                    self.stale_count = 0
                else:
                    self.error += 2
                    self.stale_count += 2
            else: # not a new site
                # proxylist is empty
                if not proxy_count:
                    self.stale_count += 1
                # proxylist is not empty: site is working
                else:
                    self.stale_count = 0
                    self.error = 0
                # site has no content
                if not content:
                    self.error += 1
                    self.stale_count += 1
                # site has proxies
                if proxy_count:
                    self.error = 0
                    self.stale_count = 0

            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url)
            self.status = 'ok'

        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Last-resort guard: never let a worker die silently
            try:
                host = self.url.split('/')[2] if '/' in self.url else self.url
                err_msg = repr(e)
                if isinstance(err_msg, unicode):
                    err_msg = err_msg.encode('ascii', 'backslashreplace')
            except:
                host = 'unknown'
                err_msg = type(e).__name__
            _log('%s: thread error: %s' % (host, err_msg), 'error')
            # Set error state so site gets retried later
            self.error += 1
            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
            self.status = 'nok'
|
|
|
|
|
|
def main():
    """Main entry point: open the databases, start helper threads, then run
    the fetch/dispatch loop until KeyboardInterrupt."""
    global config

    # Proxy database: every proxy address discovered so far
    proxydb = mysqlite.mysqlite(config.watchd.database, str)
    dbs.create_table_if_not_exists(proxydb, 'proxylist')
    fetch.init_known_proxies(proxydb)

    # NOTE(review): urignore is built here but never used in this function --
    # looks like dead code or a value meant to be passed along; confirm.
    with open('urignore.txt', 'r') as f:
        urignore = [ i.strip() for i in f.read().split('\n') if i.strip() ]

    # URL database: the proxy-source sites to fetch
    urldb = mysqlite.mysqlite(config.ppf.database, str)
    dbs.create_table_if_not_exists(urldb, 'uris')
    dbs.seed_proxy_sources(urldb)
    import_from_file('import.txt', urldb)
    # One-shot mode: import proxies from a local file, then exit with its status
    if len(sys.argv) == 3 and sys.argv[1] == "--file":
        sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))

    # start proxy watcher
    if config.watchd.threads > 0:
        watcherd = proxywatchd.Proxywatchd()
        watcherd.start()
    else:
        watcherd = None

    # start scraper threads if enabled
    scrapers = []
    if config.scraper.enabled:
        import scraper
        for i in range(config.scraper.threads):
            s = scraper.Scraper(config)
            s.start()
            scrapers.append(s)
        _log('started %d scraper thread(s)' % len(scrapers), 'info')

    # Select URLs whose (error+stale_count)-scaled backoff has expired, random order
    qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type,content_hash FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
    threads = []
    rows = []
    reqtime = time.time() - 3600  # backdated so the first query runs immediately
    statusmsg = time.time()
    while True:
        try:
            # Small random sleep: avoids busy-spinning and staggers thread starts
            time.sleep(random.random()/10)
            # Periodic status line every 3 minutes
            if (time.time() - statusmsg) > 180:
                _log('running %d thread(s) over %d' % (len(threads), config.ppf.threads), 'ppf')
                statusmsg = time.time()
            # Refill the work queue, re-querying at most every 3 seconds
            if not rows:
                if (time.time() - reqtime) > 3:
                    rows = urldb.execute(qurl, (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
                    reqtime = time.time()
                    # Too few due URLs to keep the pool busy: back off a minute
                    if len(rows) < config.ppf.threads:
                        time.sleep(60)
                        rows = []
                    else:
                        _log('handing %d job(s) to %d thread(s)' % ( len(rows), config.ppf.threads ), 'ppf')

            # Refresh the pool of working proxies to route fetches through
            _proxylist = [ '%s://%s' % (p[0], p[1]) for p in proxydb.execute('SELECT proto,proxy from proxylist where failed=0').fetchall() ]
            if not _proxylist: _proxylist = None

            # Harvest results from workers that finished successfully
            for thread in threads:
                if thread.status == 'ok':
                    # NOTE(review): status stays 'ok' until the thread is reaped
                    # below; a still-alive 'ok' thread could be processed twice
                    # across iterations -- confirm run() returns promptly after
                    # setting status.
                    url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
                    # proxylist is list of (address, proto) tuples
                    new = [(addr, pr) for addr, pr in proxylist if not fetch.is_known_proxy(addr)]
                    if new:
                        fetch.add_known_proxies([addr for addr, pr in new])
                    # Update content_hash if we have a new one
                    new_hash = thread.new_hash
                    execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, new_hash, url)
                    urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=?,content_hash=? where url=?', execute)
                    urldb.commit()
                    if new: dbs.insert_proxies(proxydb, new, url)

            # Drop finished threads, then top the pool back up with new jobs
            threads = [ thread for thread in threads if thread.is_alive() ]
            if len(threads) < config.ppf.threads and rows:
                # Hand each worker a small random sample of proxies (or None = direct)
                p = random.sample(_proxylist, min(5, len(_proxylist))) if _proxylist else None
                row = random.choice(rows)
                # Stamp check_time now so the URL is not re-selected while in flight
                urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
                urldb.commit()
                rows.remove(row)
                # row: url, stale_count, error, retrievals, proxies_added, content_type, content_hash
                t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], row[6], p)
                threads.append(t)
                t.start()

        except KeyboardInterrupt:
            # Graceful shutdown: stop helper threads, then leave the loop
            for s in scrapers:
                s.stop()
            if watcherd:
                watcherd.stop()
                watcherd.finish()
            break

    _log('ppf stopped', 'info')
|
|
|
|
|
|
if __name__ == '__main__':
    # Load and validate configuration before anything else; abort on errors
    config.load()
    errors = config.validate()
    if errors:
        for e in errors:
            _log(e, 'error')
        sys.exit(1)
    fetch.set_config(config)

    # handle flags
    if config.args.nobs:
        # presumably "nobs" = disable soup-based parsing -- confirm in soup_parser
        set_nobs(True)

    if config.args.profile or config.common.profiling:
        _log('profiling enabled, output to data/profile.stats', 'info')
        _profiler = cProfile.Profile()
        try:
            _profiler.enable()
            main()
        finally:
            # Always dump and print stats, even when main() exits via exception
            _profiler.disable()
            _profiler.dump_stats('data/profile.stats')
            _log('profile stats written to data/profile.stats', 'info')
            stats = pstats.Stats('data/profile.stats')
            stats.strip_dirs().sort_stats('cumulative').print_stats(20)
    else:
        main()
|