196 lines
5.3 KiB
Python
196 lines
5.3 KiB
Python
#!/usr/bin/env python2
|
|
# -*- coding: utf-8 -*-
|
|
"""Logging and utility functions."""
|
|
|
|
import time
|
|
import sys
|
|
import random
|
|
import string
|
|
|
|
# Log levels: lower number = more verbose
LOG_LEVELS = {
    'debug': 0,
    'warn': 2,
    'error': 3,
    'none': 99,  # suppress all
}
# 'info' and the named channels that share its level:
# rate limiting, scraper, statistics and diagnostic sampling.
LOG_LEVELS.update(dict.fromkeys(
    ('info', 'rate', 'scraper', 'stats', 'diag'), 1))
|
|
|
|
# Failure categories for proxy errors (see categorize_error for the mapping)
FAIL_TIMEOUT = 'timeout'          # generic/read/write/connect timeout hit
FAIL_REFUSED = 'refused'          # connection refused
FAIL_AUTH = 'auth'                # proxy authentication failure
FAIL_UNREACHABLE = 'unreachable'  # net/host unreachable or TTL expired
FAIL_DNS = 'dns'                  # DNS resolution failure
FAIL_SSL = 'ssl'                  # SSL error
FAIL_CLOSED = 'closed'            # remote side disconnected
FAIL_PROXY = 'proxy'              # proxy-specific failure
FAIL_OTHER = 'other'              # anything not classified above

# SSL errors - proxy is actively intercepting (still working for MITM proxies)
SSL_ERRORS = frozenset([FAIL_SSL])

# Connection errors - proxy might be dead, need secondary verification
CONN_ERRORS = frozenset([
    FAIL_TIMEOUT,
    FAIL_REFUSED,
    FAIL_UNREACHABLE,
    FAIL_CLOSED,
    FAIL_DNS,
])
|
|
|
|
# SSL protocol errors - proxy doesn't support SSL, no fallback needed.
# These indicate protocol mismatch, not certificate issues.
# Every entry is a lowercase substring; matching is done against the
# lowercased error reason.
SSL_PROTOCOL_ERROR_PATTERNS = (
    'wrong version number',
    'unsupported protocol',
    'no protocols available',
    'protocol is shutdown',
    'unexpected eof',
    'eof occurred',
    'alert protocol version',
    'alert handshake failure',
    'http request',  # Sent HTTP to HTTPS port
    'wrong ssl version',
    'no ciphers available',
    'unknown protocol',
    'record layer failure',
    'bad record mac',
    'decryption failed',
    'packet length too long',
)


def is_ssl_protocol_error(reason):
    """Tell whether an SSL error reason points at protocol incompatibility.

    The reason is lowercased and searched for any of the substrings in
    SSL_PROTOCOL_ERROR_PATTERNS.

    Args:
        reason: SSL error reason string (from failedproxy). May be empty
            or None.

    Returns:
        True if this is a protocol error (proxy doesn't support SSL),
        False if it might be a cert or other error where fallback makes
        sense, or if no reason was given.
    """
    if reason:
        lowered = reason.lower()
        return any(token in lowered for token in SSL_PROTOCOL_ERROR_PATTERNS)
    return False
|
|
|
|
|
|
# Level names whose messages are written to stderr
STDERR_LEVELS = ('warn', 'error')

# Current threshold; starts out at the 'info' level
_log_level = LOG_LEVELS['info']
|
|
|
|
|
|
def set_log_level(level):
    """Set the minimum log level; messages below it are suppressed.

    Args:
        level: An int threshold, stored as given, or a level name that is
            looked up in LOG_LEVELS. Names missing from LOG_LEVELS fall
            back to 1 (the 'info' level).
    """
    global _log_level
    _log_level = level if isinstance(level, int) else LOG_LEVELS.get(level, 1)
|
|
|
|
|
|
def get_log_level():
    """Return the numeric log threshold currently in effect."""
    current = _log_level
    return current
|
|
|
|
|
|
def _log(msg, level='info'):
    """Write a timestamped message if it meets the current log threshold.

    Each emitted line consists of a carriage return, the UTC timestamp,
    a slash, the level name, a tab, the message and a newline. Levels
    listed in STDERR_LEVELS go to stderr, everything else to stdout.

    Args:
        msg: Message to log.
        level: Level name, normally a key of LOG_LEVELS (debug, info,
            warn, error, ...). Names not in LOG_LEVELS are treated as
            level 1 (info).
    """
    if LOG_LEVELS.get(level, 1) < _log_level:
        return

    output = sys.stderr if level in STDERR_LEVELS else sys.stdout
    # A single write() hands the text and its trailing newline to the
    # stream together, and unlike the Python 2 print statement it also
    # runs unchanged on Python 3.
    output.write('\r%s/%s\t%s\n' % (timestamp(), level, msg))
    output.flush()  # Force flush for container logs
|
|
|
|
|
|
def timestamp():
    """Return the current UTC time of day formatted as HH:MM:SS."""
    utc_now = time.gmtime()
    return time.strftime('%H:%M:%S', utc_now)
|
|
|
|
|
|
def categorize_error(exc):
    """Map a RocksockException onto one of the FAIL_* categories.

    The checks run in a fixed order: DNS (by error type), SSL (by error
    type or SSL error code), timeout, refused, auth, unreachable, closed,
    then proxy-specific codes. The first match wins.

    Args:
        exc: RocksockException instance. An object without a
            ``get_error`` method is classified as FAIL_OTHER.

    Returns:
        One of the FAIL_* constants; FAIL_OTHER when nothing matches.
    """
    # Imported here to avoid a circular dependency
    import rocksock as rs

    if not hasattr(exc, 'get_error'):
        return FAIL_OTHER

    code = exc.get_error()
    kind = exc.get_errortype()

    if kind == rs.RS_ET_GAI:
        # DNS resolution failures
        category = FAIL_DNS
    elif kind == rs.RS_ET_SSL or code in (rs.RS_E_SSL_GENERIC,
                                          rs.RS_E_SSL_CERTIFICATE_ERROR):
        # SSL errors, identified by error type or by SSL error code
        category = FAIL_SSL
    elif code in (rs.RS_E_HIT_TIMEOUT, rs.RS_E_HIT_READTIMEOUT,
                  rs.RS_E_HIT_WRITETIMEOUT, rs.RS_E_HIT_CONNECTTIMEOUT):
        # Timeout errors
        category = FAIL_TIMEOUT
    elif code in (rs.RS_E_TARGET_CONN_REFUSED,
                  rs.RS_E_TARGETPROXY_CONN_REFUSED):
        # Connection refused
        category = FAIL_REFUSED
    elif code in (rs.RS_E_PROXY_AUTH_FAILED, rs.RS_E_SOCKS4_NOAUTH):
        # Authentication failures
        category = FAIL_AUTH
    elif code in (rs.RS_E_TARGET_NET_UNREACHABLE,
                  rs.RS_E_TARGETPROXY_NET_UNREACHABLE,
                  rs.RS_E_TARGET_HOST_UNREACHABLE,
                  rs.RS_E_TARGETPROXY_HOST_UNREACHABLE,
                  rs.RS_E_TARGET_TTL_EXPIRED,
                  rs.RS_E_TARGETPROXY_TTL_EXPIRED):
        # Unreachable
        category = FAIL_UNREACHABLE
    elif code == rs.RS_E_REMOTE_DISCONNECTED:
        # Connection closed
        category = FAIL_CLOSED
    elif code in (rs.RS_E_PROXY_UNEXPECTED_RESPONSE,
                  rs.RS_E_TARGETPROXY_CONNECT_FAILED,
                  rs.RS_E_PROXY_GENERAL_FAILURE,
                  rs.RS_E_PROXY_COMMAND_NOT_SUPPORTED,
                  rs.RS_E_PROXY_ADDRESSTYPE_NOT_SUPPORTED):
        # Proxy-specific errors
        category = FAIL_PROXY
    else:
        category = FAIL_OTHER

    return category
|
|
|
|
|
|
def tor_proxy_url(torhost):
    """Build the SOCKS5 proxy URL for a Tor endpoint.

    No credentials are embedded in the URL, so Tor can reuse circuits
    naturally; this avoids circuit exhaustion from IsolateSOCKSAuth
    behavior.

    Args:
        torhost: Value placed after the ``socks5://`` scheme.

    Returns:
        The string ``socks5://`` followed by torhost.
    """
    url_template = 'socks5://%s'
    return url_template % torhost
|