ppf/connection_pool.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Tor connection pooling for improved proxy testing throughput.

This module provides connection pooling to reduce the overhead of
establishing Tor circuits for each proxy test.

Key features:
- Worker-Tor affinity (consistent host assignment per thread)
- Connection pre-warming (establish circuits at startup)
- Health monitoring (detect and skip unresponsive Tor hosts)
- Usage statistics for monitoring
"""

import threading
import time
import random
import socket

try:
    import Queue
except ImportError:
    import queue as Queue

import rocksock
from misc import _log


class TorHostState(object):
    """Track state and health of a single Tor SOCKS host."""

    def __init__(self, host_addr):
        self.host_addr = host_addr
        self.lock = threading.Lock()
        # Health tracking
        self.consecutive_failures = 0
        self.last_success = 0
        self.last_failure = 0
        self.total_successes = 0
        self.total_failures = 0
        # Backoff
        self.backoff_until = 0
        # Latency tracking (rolling average)
        self.latency_samples = []
        self.max_samples = 20

    def record_success(self, latency=0):
        """Record a successful connection through this host."""
        with self.lock:
            self.consecutive_failures = 0
            self.last_success = time.time()
            self.total_successes += 1
            self.backoff_until = 0
            if latency > 0:
                self.latency_samples.append(latency)
                if len(self.latency_samples) > self.max_samples:
                    self.latency_samples.pop(0)

    def record_failure(self):
        """Record a failed connection through this host."""
        with self.lock:
            self.consecutive_failures += 1
            self.last_failure = time.time()
            self.total_failures += 1
            # Exponential backoff: 5s, 10s, 20s, 40s, max 60s
            delay = min(5 * (2 ** (self.consecutive_failures - 1)), 60)
            self.backoff_until = time.time() + delay

    def is_available(self):
        """Check if host is available (not in backoff)."""
        with self.lock:
            return time.time() >= self.backoff_until

    def avg_latency(self):
        """Get average connection latency in seconds."""
        with self.lock:
            if not self.latency_samples:
                return 0
            return sum(self.latency_samples) / len(self.latency_samples)

    def success_rate(self):
        """Get success rate as percentage."""
        with self.lock:
            total = self.total_successes + self.total_failures
            if total == 0:
                return 100.0
            return (float(self.total_successes) / total) * 100


class TorConnectionPool(object):
    """Pool of Tor SOCKS connections with health monitoring.

    Provides:
    - Consistent worker-to-host assignment (circuit affinity)
    - Health-based host selection
    - Connection pre-warming
    - Usage statistics
    """

    def __init__(self, tor_hosts, warmup=True, warmup_target='www.google.com'):
        """Initialize the connection pool.

        Args:
            tor_hosts: List of Tor SOCKS addresses (host:port)
            warmup: Whether to pre-warm connections at startup
            warmup_target: Target host for warmup connections
        """
        self.tor_hosts = tor_hosts
        self.host_states = {}
        self.lock = threading.Lock()

        # Worker affinity mapping: thread_id -> tor_host
        self.worker_assignments = {}

        # Statistics
        self.stats_lock = threading.Lock()
        self.total_requests = 0
        self.total_successes = 0
        self.warmup_complete = False

        # Initialize host states
        for host in tor_hosts:
            self.host_states[host] = TorHostState(host)

        # Pre-warm connections
        if warmup and len(tor_hosts) > 0:
            self._warmup_connections(warmup_target)

    def _warmup_connections(self, target_host, target_port=80, timeout=10):
        """Pre-warm Tor circuits by making test connections."""
        _log('warming up %d tor host(s)...' % len(self.tor_hosts), 'pool')
        warmed = 0

        for host_addr in self.tor_hosts:
            try:
                start = time.time()
                proxy = rocksock.RocksockProxyFromURL('socks5://%s' % host_addr)
                sock = rocksock.Rocksock(
                    host=target_host,
                    port=target_port,
                    proxies=[proxy],
                    timeout=timeout
                )
                sock.connect()
                sock.disconnect()
                elapsed = time.time() - start
                self.host_states[host_addr].record_success(elapsed)
                warmed += 1
            except Exception as e:
                self.host_states[host_addr].record_failure()
                _log('warmup failed for %s: %s' % (host_addr, str(e)), 'debug')

        self.warmup_complete = True
        _log('warmup complete: %d/%d hosts ready' % (warmed, len(self.tor_hosts)), 'pool')

    def get_tor_host(self, worker_id=None):
        """Get a Tor host for a worker, with affinity and health-awareness.

        Args:
            worker_id: Optional worker identifier for consistent assignment

        Returns:
            Tor host address string, or None if all hosts unavailable
        """
        with self.stats_lock:
            self.total_requests += 1

        # Check for existing assignment with affinity
        if worker_id is not None:
            with self.lock:
                if worker_id in self.worker_assignments:
                    assigned = self.worker_assignments[worker_id]
                    if self.host_states[assigned].is_available():
                        return assigned

        # Get available hosts sorted by success rate
        available = []
        for host_addr, state in self.host_states.items():
            if state.is_available():
                available.append((host_addr, state.success_rate(), state.avg_latency()))

        if not available:
            # All hosts in backoff, return random one anyway
            return random.choice(self.tor_hosts) if self.tor_hosts else None

        # Sort by success rate (desc), then latency (asc)
        available.sort(key=lambda x: (-x[1], x[2]))

        # Pick from top performers with some randomness
        top_count = max(1, len(available) // 2)
        chosen = random.choice(available[:top_count])[0]

        # Record affinity
        if worker_id is not None:
            with self.lock:
                self.worker_assignments[worker_id] = chosen

        return chosen

    def record_success(self, host_addr, latency=0):
        """Record successful use of a Tor host."""
        if host_addr in self.host_states:
            self.host_states[host_addr].record_success(latency)
            with self.stats_lock:
                self.total_successes += 1

    def record_failure(self, host_addr):
        """Record failed use of a Tor host."""
        if host_addr in self.host_states:
            self.host_states[host_addr].record_failure()

    def get_stats(self):
        """Get pool statistics."""
        with self.stats_lock:
            total = self.total_requests
            successes = self.total_successes

        available_count = sum(1 for s in self.host_states.values() if s.is_available())
        host_stats = []
        for host_addr, state in self.host_states.items():
            host_stats.append({
                'host': host_addr,
                'available': state.is_available(),
                'success_rate': state.success_rate(),
                'avg_latency': state.avg_latency(),
                'total_successes': state.total_successes,
                'total_failures': state.total_failures,
            })

        return {
            'total_requests': total,
            'total_successes': successes,
            'success_rate': (float(successes) / total * 100) if total > 0 else 0,
            'available_hosts': available_count,
            'total_hosts': len(self.tor_hosts),
            'warmup_complete': self.warmup_complete,
            'hosts': host_stats,
        }

    def status_line(self):
        """Get a compact status line for logging."""
        stats = self.get_stats()
        return 'tor pool: %d/%d hosts, %.1f%% success, %d requests' % (
            stats['available_hosts'],
            stats['total_hosts'],
            stats['success_rate'],
            stats['total_requests']
        )


# Global pool instance (initialized by proxywatchd)
_pool = None


def init_pool(tor_hosts, warmup=True):
    """Initialize the global connection pool."""
    global _pool
    _pool = TorConnectionPool(tor_hosts, warmup=warmup)
    return _pool


def get_pool():
    """Get the global connection pool."""
    return _pool