Files
ppf/stats.py
Username 0d7d2dce70
All checks were successful
CI / syntax-check (push) Successful in 3s
CI / memory-leak-check (push) Successful in 11s
refactor: extract modules from proxywatchd.py
Extract focused modules to reduce proxywatchd.py complexity:
- stats.py: JudgeStats, Stats, regexes, ssl_targets (557 lines)
- mitm.py: MITMCertStats, cert extraction functions (239 lines)
- dns.py: socks4_resolve with TTL caching (86 lines)
- job.py: PriorityJobQueue, calculate_priority (103 lines)

proxywatchd.py reduced from 2488 to 1591 lines (-36%).
2025-12-28 15:45:24 +01:00

558 lines
23 KiB
Python

#!/usr/bin/env python2
"""Statistics tracking for PPF proxy validation."""
from __future__ import division
import threading
import time
from misc import _log
def try_div(a, b):
    """Divide a by b as floats, returning 0 when b is zero."""
    if b == 0:
        return 0
    return a / float(b)
class JudgeStats():
    """Track per-judge success/failure rates for reliability scoring.

    Judges that frequently block or rate-limit are temporarily avoided.
    Stats decay over time to allow recovery.
    """

    def __init__(self, cooldown_seconds=300, block_threshold=3):
        self.lock = threading.Lock()
        # judge -> {'success': n, 'fail': n, 'block': n, 'last_block': timestamp}
        self.stats = {}
        self.cooldown_seconds = cooldown_seconds  # seconds to avoid blocked judges
        self.block_threshold = block_threshold    # consecutive blocks before cooldown

    def _entry(self, judge):
        """Return the stats dict for judge, creating a zeroed entry if missing.

        Caller must hold self.lock.
        """
        if judge not in self.stats:
            self.stats[judge] = {'success': 0, 'fail': 0, 'block': 0, 'last_block': 0}
        return self.stats[judge]

    def record_success(self, judge):
        """Record successful judge response."""
        with self.lock:
            s = self._entry(judge)
            s['success'] += 1
            # A success proves the judge responds again; clear consecutive blocks.
            s['block'] = 0

    def record_failure(self, judge):
        """Record judge failure (proxy failed, not judge block)."""
        with self.lock:
            self._entry(judge)['fail'] += 1

    def record_block(self, judge):
        """Record judge blocking the proxy (403, captcha, rate-limit)."""
        with self.lock:
            s = self._entry(judge)
            s['block'] += 1
            s['last_block'] = time.time()

    def is_available(self, judge):
        """Check if judge is available (not in cooldown)."""
        with self.lock:
            if judge not in self.stats:
                return True
            s = self.stats[judge]
            # In cooldown only while the block streak is at/over threshold AND
            # the last block was recent enough.
            if s['block'] >= self.block_threshold:
                if (time.time() - s['last_block']) < self.cooldown_seconds:
                    return False
                # Cooldown expired, reset block count
                s['block'] = 0
            return True

    def get_available_judges(self, judge_list):
        """Return list of judges not in cooldown."""
        return [j for j in judge_list if self.is_available(j)]

    def status_line(self):
        """Return status summary for logging."""
        with self.lock:
            # Hoist the clock read: one timestamp for the whole scan
            # (matches get_stats, which already does this).
            now = time.time()
            total = len(self.stats)
            blocked = sum(1 for s in self.stats.values()
                          if s['block'] >= self.block_threshold and
                          (now - s['last_block']) < self.cooldown_seconds)
            return 'judges: %d total, %d in cooldown' % (total, blocked)

    def get_stats(self):
        """Return statistics dict for API/dashboard."""
        with self.lock:
            now = time.time()
            total = len(self.stats)
            in_cooldown = sum(1 for s in self.stats.values()
                              if s['block'] >= self.block_threshold and
                              (now - s['last_block']) < self.cooldown_seconds)
            available = total - in_cooldown
            # Get top judges by success count
            top = []
            for judge, s in self.stats.items():
                total_tests = s['success'] + s['fail']
                if total_tests > 0:
                    success_pct = (s['success'] * 100.0) / total_tests
                    top.append({'judge': judge, 'success': s['success'],
                                'tests': total_tests, 'rate': round(success_pct, 1)})
            top.sort(key=lambda x: x['success'], reverse=True)
            return {'total': total, 'available': available, 'in_cooldown': in_cooldown, 'top': top}
# HTTP targets for plain-HTTP validation: each hostname maps to a response
# header whose presence confirms we really reached that site (and were not
# served a captive-portal / interception page by the proxy).
# NOTE: despite the name, the values here are header names, not regexes.
regexes = {
    'www.facebook.com': 'X-FB-Debug',
    'www.fbcdn.net': 'X-FB-Debug',
    'www.reddit.com': 'x-clacks-overhead',
    'www.twitter.com': 'x-connection-hash',
    't.co': 'x-connection-hash',
    'www.msn.com': 'x-aspnetmvc-version',
    'www.bing.com': 'p3p',
    'www.ask.com': 'x-served-by',
    'www.hotmail.com': 'x-msedge-ref',
    'www.bbc.co.uk': 'x-bbc-edge-cache-status',
    'www.skype.com': 'X-XSS-Protection',
    'www.alibaba.com': 'object-status',
    'www.mozilla.org': 'cf-ray',
    'www.cloudflare.com': 'cf-ray',
    'www.wikimedia.org': 'x-client-ip',
    'www.vk.com': 'x-frontend',
    'www.tinypic.com': 'x-amz-cf-pop',
    'www.netflix.com': 'X-Netflix.proxy.execution-time',
    'www.amazon.de': 'x-amz-cf-id',
    'www.reuters.com': 'x-amz-cf-id',
    'www.ikea.com': 'x-frame-options',
    'www.twitpic.com': 'timing-allow-origin',
    'www.digg.com': 'cf-request-id',
    'www.wikia.com': 'x-served-by',
    'www.wp.com': 'x-ac',
    'www.last.fm': 'x-timer',
    'www.usps.com': 'x-ruleset-version',
    'www.linkedin.com': 'x-li-uuid',
    'www.vimeo.com': 'x-timer',
    'www.yelp.com': 'x-timer',
    'www.ebay.com': 'x-envoy-upstream-service-time',
    'www.wikihow.com': 'x-c',
    'www.archive.org': 'referrer-policy',
    'www.pandora.tv': 'X-UA-Compatible',
    'www.w3.org': 'x-backend',
    'www.time.com': 'x-amz-cf-pop'
}
# SSL targets - verify TLS handshake only (MITM detection).
# These hosts are expected to present well-known, pinnable certificate
# chains; a mismatch suggests the proxy is intercepting TLS.
ssl_targets = [
    'www.google.com',
    'www.microsoft.com',
    'www.apple.com',
    'www.amazon.com',
    'www.cloudflare.com',
    'www.github.com',
    'www.mozilla.org',
    'www.wikipedia.org',
    'www.reddit.com',
    'www.twitter.com',
    'x.com',
    'www.facebook.com',
    'www.linkedin.com',
    'www.paypal.com',
    'www.stripe.com',
    'www.digicert.com',
    'www.letsencrypt.org',
]
class Stats():
    """Track and report comprehensive runtime statistics.

    Single shared instance guarded by an RLock; reader methods re-acquire
    the lock, which is why get_full_stats can call them while holding it.
    """
    HISTORY_SIZE = 120  # 10 min at 5s intervals
    LATENCY_BUCKETS = [100, 250, 500, 1000, 2000, 5000, 10000]  # ms thresholds

    def __init__(self):
        self.lock = threading.RLock()  # RLock for reentrant access (get_runtime_stats)
        self.tested = 0
        self.passed = 0
        self.failed = 0
        self.start_time = time.time()
        self.last_report = time.time()
        # Failure category tracking
        self.fail_categories = {}
        # Protocol tracking (tested, passed, and failed separately)
        self.proto_tested = {'http': 0, 'socks4': 0, 'socks5': 0}
        self.proto_passed = {'http': 0, 'socks4': 0, 'socks5': 0}
        self.proto_failed = {'http': {}, 'socks4': {}, 'socks5': {}}  # Failures by category per proto
        self.by_proto = self.proto_passed  # Alias for dashboard API (same dict object, not a copy)
        # Time series history (5s intervals)
        self.rate_history = []
        self.success_rate_history = []
        self.latency_history = []
        self.last_history_time = time.time()
        self.last_history_tested = 0
        self.last_history_passed = 0
        # Peak values (delayed measurement to avoid startup anomalies)
        self.peak_rate = 0.0
        self.peak_success_rate = 0.0
        self.peak_grace_period = 30  # seconds before recording peaks
        self.min_latency = float('inf')
        self.max_latency = 0.0
        # Latency tracking with percentiles
        self.latency_sum = 0.0
        self.latency_count = 0
        self.latency_samples = []  # Recent samples for percentiles (capped at 1000 in record())
        self.latency_buckets = {b: 0 for b in self.LATENCY_BUCKETS + [float('inf')]}
        # Recent window (last 60s)
        self.recent_tested = 0
        self.recent_passed = 0
        self.recent_start = time.time()
        # Country/ASN tracking (top N)
        self.country_passed = {}
        self.asn_passed = {}
        # Hourly aggregates
        self.hourly_tested = 0
        self.hourly_passed = 0
        self.hourly_start = time.time()
        self.hours_data = []  # Last 24 hours
        # SSL/TLS tracking
        self.ssl_tested = 0
        self.ssl_passed = 0
        self.ssl_failed = 0
        self.ssl_fail_categories = {}  # Track SSL failures by category
        self.mitm_detected = 0
        self.cert_errors = 0

    def record(self, success, category=None, proto=None, latency_ms=None, country=None, asn=None,
               ssl_test=False, mitm=False, cert_error=False):
        """Record one test result, updating all counters in one locked pass.

        success    -- whether the proxy passed the test
        category   -- failure category name (counted only on failure)
        proto      -- 'http' / 'socks4' / 'socks5'; unknown values are ignored
        latency_ms -- latency of a successful test; <= 0 / None is skipped
        country/asn-- geo attribution, counted only on success
        ssl_test   -- this result came from an SSL/TLS check
        mitm       -- TLS interception detected
        cert_error -- certificate validation error
        """
        with self.lock:
            self.tested += 1
            self.recent_tested += 1
            self.hourly_tested += 1
            # Track protocol tests
            if proto and proto in self.proto_tested:
                self.proto_tested[proto] += 1
            if success:
                self.passed += 1
                self.recent_passed += 1
                self.hourly_passed += 1
                if proto and proto in self.proto_passed:
                    self.proto_passed[proto] += 1
                if latency_ms and latency_ms > 0:
                    self.latency_sum += latency_ms
                    self.latency_count += 1
                    # Track min/max
                    if latency_ms < self.min_latency:
                        self.min_latency = latency_ms
                    if latency_ms > self.max_latency:
                        self.max_latency = latency_ms
                    # Keep recent samples for percentiles (max 1000)
                    self.latency_samples.append(latency_ms)
                    if len(self.latency_samples) > 1000:
                        self.latency_samples.pop(0)
                    # Bucket for histogram: first threshold >= latency wins;
                    # for/else falls through to the overflow bucket.
                    for bucket in self.LATENCY_BUCKETS:
                        if latency_ms <= bucket:
                            self.latency_buckets[bucket] += 1
                            break
                    else:
                        self.latency_buckets[float('inf')] += 1
                # Track country/ASN
                if country:
                    self.country_passed[country] = self.country_passed.get(country, 0) + 1
                if asn:
                    self.asn_passed[asn] = self.asn_passed.get(asn, 0) + 1
            else:
                self.failed += 1
                if category:
                    self.fail_categories[category] = self.fail_categories.get(category, 0) + 1
                    # Track failures by protocol
                    if proto and proto in self.proto_failed:
                        self.proto_failed[proto][category] = self.proto_failed[proto].get(category, 0) + 1
                # Log failure category breakdown every 1000 failures
                if self.failed % 1000 == 0:
                    top_cats = sorted(self.fail_categories.items(), key=lambda x: -x[1])[:5]
                    cats_str = ', '.join(['%s:%d' % (c, n) for c, n in top_cats])
                    _log('fail breakdown (%d total): %s' % (self.failed, cats_str), 'diag')
            # SSL/TLS tracking
            if ssl_test:
                self.ssl_tested += 1
                if success:
                    self.ssl_passed += 1
                else:
                    self.ssl_failed += 1
                    # Track which error caused the SSL failure
                    if category:
                        self.ssl_fail_categories[category] = self.ssl_fail_categories.get(category, 0) + 1
            # NOTE(review): mitm/cert_error are counted regardless of ssl_test —
            # confirm callers only set them on SSL tests.
            if mitm:
                self.mitm_detected += 1
            if cert_error:
                self.cert_errors += 1

    def update_history(self):
        """Update time series history (call periodically).

        Appends a rate / success-rate / latency sample every >=5s, resets the
        60s recent window, and rolls the hourly aggregates every >=3600s.
        """
        now = time.time()
        with self.lock:
            elapsed = now - self.last_history_time
            if elapsed >= 5:  # Update every 5 seconds
                # Rate - with sanity checks
                tests_delta = self.tested - self.last_history_tested
                if tests_delta < 0:
                    # Counter wrapped or corrupted - reset baseline
                    self.last_history_tested = self.tested
                    tests_delta = 0
                rate = tests_delta / elapsed if elapsed > 0 else 0
                # Cap at reasonable max (100/s is generous for proxy testing)
                if rate > 100:
                    rate = 0  # Discard bogus value
                self.rate_history.append(round(rate, 2))
                if len(self.rate_history) > self.HISTORY_SIZE:
                    self.rate_history.pop(0)
                # Only record peaks after grace period (avoid startup anomalies)
                uptime = now - self.start_time
                if uptime >= self.peak_grace_period and rate > self.peak_rate and rate <= 100:
                    self.peak_rate = rate
                # Success rate - with sanity checks
                passed_delta = self.passed - self.last_history_passed
                if passed_delta < 0:
                    self.last_history_passed = self.passed
                    passed_delta = 0
                sr = (passed_delta / tests_delta * 100) if tests_delta > 0 else 0
                sr = min(sr, 100.0)  # Cap at 100%
                self.success_rate_history.append(round(sr, 1))
                if len(self.success_rate_history) > self.HISTORY_SIZE:
                    self.success_rate_history.pop(0)
                if uptime >= self.peak_grace_period and sr > self.peak_success_rate:
                    self.peak_success_rate = sr
                # Average latency for this interval (cumulative average, not
                # interval-local — see get_avg_latency)
                avg_lat = self.get_avg_latency()
                self.latency_history.append(round(avg_lat, 0))
                if len(self.latency_history) > self.HISTORY_SIZE:
                    self.latency_history.pop(0)
                self.last_history_time = now
                self.last_history_tested = self.tested
                self.last_history_passed = self.passed
            # Reset recent window every 60s
            if now - self.recent_start >= 60:
                self.recent_tested = 0
                self.recent_passed = 0
                self.recent_start = now
            # Hourly aggregation
            if now - self.hourly_start >= 3600:
                self.hours_data.append({
                    'tested': self.hourly_tested,
                    'passed': self.hourly_passed,
                    # NOTE(review): 'rate' divides passed (not tested) by a fixed
                    # 3600s while gating on tested — confirm this is intended.
                    'rate': self.hourly_passed / 3600.0 if self.hourly_tested > 0 else 0,
                    'success_rate': (self.hourly_passed / self.hourly_tested * 100) if self.hourly_tested > 0 else 0,
                })
                if len(self.hours_data) > 24:
                    self.hours_data.pop(0)
                self.hourly_tested = 0
                self.hourly_passed = 0
                self.hourly_start = now

    def get_recent_rate(self):
        """Get rate for last 60 seconds."""
        with self.lock:
            elapsed = time.time() - self.recent_start
            if elapsed > 0:
                return self.recent_tested / elapsed
            return 0.0

    def get_recent_success_rate(self):
        """Get success rate for last 60 seconds."""
        with self.lock:
            if self.recent_tested > 0:
                return (self.recent_passed / self.recent_tested) * 100
            return 0.0

    def get_avg_latency(self):
        """Get average latency in ms (cumulative over all recorded samples)."""
        with self.lock:
            if self.latency_count > 0:
                return self.latency_sum / self.latency_count
            return 0.0

    def get_latency_percentiles(self):
        """Get latency percentiles (p50, p90, p99) from the recent sample window."""
        with self.lock:
            if not self.latency_samples:
                return {'p50': 0, 'p90': 0, 'p99': 0}
            sorted_samples = sorted(self.latency_samples)
            n = len(sorted_samples)
            return {
                'p50': sorted_samples[int(n * 0.50)] if n > 0 else 0,
                'p90': sorted_samples[int(n * 0.90)] if n > 0 else 0,
                # min() clamp guards the index for small n
                'p99': sorted_samples[min(int(n * 0.99), n - 1)] if n > 0 else 0,
            }

    def get_latency_histogram(self):
        """Get latency distribution histogram as a list of range/count/pct dicts."""
        with self.lock:
            total = sum(self.latency_buckets.values())
            if total == 0:
                return []
            result = []
            prev = 0
            for bucket in self.LATENCY_BUCKETS:
                count = self.latency_buckets[bucket]
                result.append({
                    'range': '%d-%d' % (prev, bucket),
                    'count': count,
                    'pct': round(count / total * 100, 1),
                })
                prev = bucket
            # Over max bucket
            over = self.latency_buckets[float('inf')]
            if over > 0:
                result.append({
                    'range': '>%d' % self.LATENCY_BUCKETS[-1],
                    'count': over,
                    'pct': round(over / total * 100, 1),
                })
            return result

    def get_proto_stats(self):
        """Get protocol-specific success rates and failure breakdown."""
        with self.lock:
            result = {}
            for proto in ['http', 'socks4', 'socks5']:
                tested = self.proto_tested[proto]
                passed = self.proto_passed[proto]
                failed = sum(self.proto_failed[proto].values())
                result[proto] = {
                    'tested': tested,
                    'passed': passed,
                    'failed': failed,
                    'success_rate': round(passed / tested * 100, 1) if tested > 0 else 0,
                    'fail_reasons': dict(self.proto_failed[proto]) if self.proto_failed[proto] else {},
                }
            return result

    def get_top_countries(self, limit=10):
        """Get top countries by working proxy count as (country, count) pairs."""
        with self.lock:
            sorted_countries = sorted(self.country_passed.items(), key=lambda x: -x[1])
            return sorted_countries[:limit]

    def get_top_asns(self, limit=10):
        """Get top ASNs by working proxy count as (asn, count) pairs."""
        with self.lock:
            sorted_asns = sorted(self.asn_passed.items(), key=lambda x: -x[1])
            return sorted_asns[:limit]

    def get_hourly_data(self):
        """Get last 24 hours of hourly data (shallow copy of the list)."""
        with self.lock:
            return list(self.hours_data)

    def load_state(self, state):
        """Load persisted state from a dict (from database).

        Args:
            state: dict from dbs.load_session_state(); falsy values are a no-op.
        """
        if not state:
            return
        with self.lock:
            self.tested = state.get('tested', 0)
            self.passed = state.get('passed', 0)
            self.failed = state.get('failed', 0)
            self.ssl_tested = state.get('ssl_tested', 0)
            self.ssl_passed = state.get('ssl_passed', 0)
            self.ssl_failed = state.get('ssl_failed', 0)
            self.mitm_detected = state.get('mitm_detected', 0)
            self.cert_errors = state.get('cert_errors', 0)
            self.proto_tested['http'] = state.get('proto_http_tested', 0)
            self.proto_passed['http'] = state.get('proto_http_passed', 0)
            self.proto_tested['socks4'] = state.get('proto_socks4_tested', 0)
            self.proto_passed['socks4'] = state.get('proto_socks4_passed', 0)
            self.proto_tested['socks5'] = state.get('proto_socks5_tested', 0)
            self.proto_passed['socks5'] = state.get('proto_socks5_passed', 0)
            # Note: peak_rate is per-session, not restored (avoids stale/corrupt values)
            # Note: start_time is NOT restored - uptime reflects current session
            # Restore failure categories
            if state.get('fail_categories'):
                self.fail_categories = dict(state['fail_categories'])
            # Restore SSL failure categories
            if state.get('ssl_fail_categories'):
                self.ssl_fail_categories = dict(state['ssl_fail_categories'])
            # Restore protocol failure categories
            if state.get('proto_failed'):
                for proto in ['http', 'socks4', 'socks5']:
                    if proto in state['proto_failed']:
                        self.proto_failed[proto] = dict(state['proto_failed'][proto])
            # Restore geo tracking
            if state.get('country_passed'):
                self.country_passed = dict(state['country_passed'])
            if state.get('asn_passed'):
                # Convert string keys back to int for ASN (JSON round-trip
                # stringifies int keys); assumes keys are str — TODO confirm.
                self.asn_passed = {int(k) if k.isdigit() else k: v
                                   for k, v in state['asn_passed'].items()}
            _log('restored session: %d tested, %d passed' % (self.tested, self.passed), 'info')

    def should_report(self, interval):
        """True when at least `interval` seconds elapsed since the last report."""
        return (time.time() - self.last_report) >= interval

    def report(self):
        """Build a one-line progress summary and reset the report timer."""
        with self.lock:
            self.last_report = time.time()
            elapsed = time.time() - self.start_time
            rate = try_div(self.tested, elapsed)
            pct = try_div(self.passed * 100.0, self.tested)
            base = 'tested=%d passed=%d (%.1f%%) rate=%.2f/s uptime=%dm' % (
                self.tested, self.passed, pct, rate, int(elapsed / 60))
            # Add failure breakdown if there are failures
            if self.fail_categories:
                cats = ' '.join('%s=%d' % (k, v) for k, v in sorted(self.fail_categories.items()))
                return '%s [%s]' % (base, cats)
            return base

    def get_full_stats(self):
        """Get comprehensive stats dict for API.

        Holds the lock while calling other getters; safe because the lock is
        an RLock (reentrant for the same thread).
        """
        with self.lock:
            elapsed = time.time() - self.start_time
            return {
                'tested': self.tested,
                'passed': self.passed,
                'failed': self.failed,
                'success_rate': round(self.passed / self.tested * 100, 1) if self.tested > 0 else 0,
                'rate': round(self.tested / elapsed, 2) if elapsed > 0 else 0,
                'pass_rate': round(self.passed / elapsed, 2) if elapsed > 0 else 0,
                'recent_rate': self.get_recent_rate(),
                'recent_success_rate': self.get_recent_success_rate(),
                'peak_rate': self.peak_rate,
                'peak_success_rate': self.peak_success_rate,
                'uptime_seconds': int(elapsed),
                'rate_history': list(self.rate_history),
                'success_rate_history': list(self.success_rate_history),
                'latency_history': list(self.latency_history),
                'avg_latency': self.get_avg_latency(),
                # inf sentinel means "no samples yet" — report 0 instead
                'min_latency': self.min_latency if self.min_latency != float('inf') else 0,
                'max_latency': self.max_latency,
                'latency_percentiles': self.get_latency_percentiles(),
                'latency_histogram': self.get_latency_histogram(),
                'by_proto': dict(self.proto_passed),
                'proto_stats': self.get_proto_stats(),
                'failures': dict(self.fail_categories),
                'top_countries': self.get_top_countries(),
                'top_asns': self.get_top_asns(),
                'hourly_data': self.get_hourly_data(),
            }