# Proxy-list scraper helpers: page fetching (via tor/rocksock) and
# proxy-address extraction/validation utilities.
import re, random, time
|
|
import threading
|
|
import rocksock
|
|
import network_stats
|
|
from http2 import RsHttp, _parse_url
|
|
from soup_parser import soupify
|
|
from misc import _log
|
|
|
|
config = None
|
|
_last_fail_log = 0
|
|
_fail_log_interval = 60
|
|
|
|
def set_config(cfg):
    """Install the module-wide configuration object used by the fetchers."""
    global config
    config = cfg
|
|
|
|
# Pre-compiled regex patterns (compiled once at module load)
cleanhtml_re = [
    re.compile(r'<.*?>'),  # HTML tags
    re.compile(r'\s+'),    # whitespace runs
    re.compile(r'::+'),    # collapse repeated ':' separators
]

# Proxy extraction pattern: IP:PORT (four dotted digit groups, colon,
# 2-5 digit port) followed by a non-digit or the end of the input.
# BUG FIX: the previous pattern ended in the character class [\D$], where
# '$' is a *literal* dollar sign (not an anchor) and the class consumes a
# character -- so an address at the very end of the input never matched.
# A negative lookahead matches "non-digit or end" without consuming.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?![0-9])')
|
|
def cleanhtml(raw_html):
    """Reduce HTML to a ':'-separated token stream.

    Non-breaking-space entities are dropped, then tags, whitespace runs
    and repeated separators are each replaced by a single ':'.
    """
    text = raw_html.replace('&nbsp;', ' ')
    for pattern in cleanhtml_re:
        text = pattern.sub(':', text)
    return text
|
|
|
|
def fetch_contents(url, head=False, proxy=None):
    """Fetch *url*, trying each proxy in *proxy* until one succeeds.

    Args:
        url: full URL to request.
        head: when True only the response header is requested.
        proxy: optional sequence of proxy URLs to attempt in order.

    Returns:
        The fetched content (or header), or '' when every attempt failed.
    """
    result = None
    if proxy:
        for candidate in proxy:
            result = _fetch_contents(url, head=head, proxy=candidate)
            if result is not None:
                break
    else:
        result = _fetch_contents(url, head=head)
    return '' if result is None else result
|
|
|
|
# Response substrings that mark a useless (rate-limited) search result;
# such a response is treated as a failed fetch so callers can retry.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

def _fetch_contents(url, head = False, proxy=None):
    """Fetch *url* through a randomly chosen tor socks4 hop.

    Args:
        url: full URL to request.
        head: when True, issue a HEAD request and return only the header.
        proxy: optional extra proxy URL chained after the tor hop.

    Returns:
        The response body (utf-8 bytes), the header when head=True, or
        None on connection failure / rate-limited response.
    """
    network_stats.set_category('scraper')
    host, port, ssl, uri = _parse_url(url)
    headers=[
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
    tor_retries = 0
    max_tor_retries = 1
    http = None
    try:
        while True:
            # First hop is always a random tor socks4 endpoint; a
            # caller-supplied proxy, if any, is chained after it.
            proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
            if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))

            http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', log_errors=False)
            if not http.connect():
                # Rate-limit the failure log to one message per interval
                # (module-level state shared across scraper threads).
                global _last_fail_log
                now = time.time()
                if (now - _last_fail_log) >= _fail_log_interval:
                    _log("failed to connect to %s"%url, "ppf")
                    _last_fail_log = now
                e = http.get_last_rocksock_exception()
                if not e:
                    return None
                et = e.get_errortype()
                ee = e.get_error()
                ef = e.get_failedproxy()
                # NOTE(review): failedproxy == 0 presumably identifies the
                # first hop (the tor endpoint) -- confirm against rocksock.
                # A refused connection there means tor itself is down, so
                # retry once with a freshly picked tor host.
                if et == rocksock.RS_ET_OWN and \
                    ee == rocksock.RS_E_TARGET_CONN_REFUSED \
                    and ef == 0:
                    http.disconnect()
                    http = None
                    tor_retries += 1
                    if tor_retries >= max_tor_retries:
                        _log("tor proxy failed after %d retries" % tor_retries, "error")
                        return None
                    _log("tor proxy retry %d/%d" % (tor_retries, max_tor_retries), "warn")
                    time.sleep(5)
                    continue
                return None
            break

        ## only request header
        if head:
            hdr = http.head(uri, headers)
            return hdr

        hdr, res = http.get(uri, headers)
        # Python 2 compat: normalize unicode responses to utf-8 bytes so
        # the substring checks below are consistent.
        res = res.encode('utf-8') if isinstance(res, unicode) else res
        for retry_message in retry_messages:
            if retry_message in res: return None

        return res
    finally:
        # Always release the connection, even on the early returns above.
        if http:
            http.disconnect()
|
|
|
|
def valid_port(port):
    """Return True when *port* lies in the valid TCP range 1-65535."""
    return 1 <= port <= 65535
|
|
|
|
|
|
def is_usable_proxy(proxy):
    """Return True when *proxy* is a well-formed, publicly routable ip:port.

    Rejects:
        - Malformed strings (not ip:port format)
        - Invalid port (0, >65535) and invalid IP octets (>255)
        - Private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
        - Loopback 127.0.0.0/8, link-local 169.254.0.0/16, CGNAT 100.64.0.0/10
        - Multicast 224.0.0.0/4, reserved 240.0.0.0/4, unspecified 0.0.0.0/8
    """
    try:
        if ':' not in proxy:
            return False
        host, port_text = proxy.rsplit(':', 1)
        if not valid_port(int(port_text)):
            return False

        pieces = host.split('.')
        if len(pieces) != 4:
            return False
        a, b, c, d = (int(piece) for piece in pieces)
        if min(a, b, c, d) < 0 or max(a, b, c, d) > 255:
            return False

        # Unspecified (0/8), private 10/8, loopback 127/8.
        if a in (0, 10, 127):
            return False
        # Private / link-local / CGNAT ranges keyed on the second octet.
        if a == 172 and 16 <= b <= 31:
            return False
        if a == 192 and b == 168:
            return False
        if a == 169 and b == 254:
            return False
        if a == 100 and 64 <= b <= 127:
            return False
        # Multicast (224/4) and reserved/future (240/4) collapse to >= 224.
        if a >= 224:
            return False
        return True
    except (ValueError, AttributeError, IndexError):
        return False
|
|
|
|
_known_proxies = {}
|
|
_known_proxies_lock = threading.Lock()
|
|
|
|
def init_known_proxies(proxydb):
    """Populate the known-proxy cache from the proxylist table (runs once).

    Subsequent calls are no-ops while the cache is non-empty.
    """
    global _known_proxies
    with _known_proxies_lock:
        if _known_proxies:
            return
        rows = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        _known_proxies.update((row[0], True) for row in rows)
|
|
|
|
def add_known_proxies(proxies):
    """Record every address in *proxies* in the known-proxy cache."""
    global _known_proxies
    with _known_proxies_lock:
        _known_proxies.update(dict.fromkeys(proxies, True))
|
|
|
|
def is_known_proxy(proxy):
    """Return True when *proxy* has already been recorded in the cache."""
    with _known_proxies_lock:
        return proxy in _known_proxies
|
|
|
|
def detect_proto_from_path(url):
    """Guess the proxy protocol a source URL advertises in its path.

    Many proxy lists encode the protocol in their path:
        - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
        - /socks4/, /socks4a/, /socks4.txt -> socks4
        - /http/, /http.txt, http-proxies.txt -> http
        - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Args:
        url: Source URL path or full URL

    Returns:
        'socks5', 'socks4', 'http', or None when nothing matched.
    """
    lowered = url.lower()
    # socks5 first, then socks4/socks4a -- the order matters since a URL
    # could in principle mention several protocol keywords.
    for keyword, result in (('socks5', 'socks5'), ('socks4', 'socks4')):
        if keyword in lowered:
            return result
    http_markers = ('/http', 'http-', 'http_', 'http.', '/https', '/ssl', '/connect')
    if any(marker in lowered for marker in http_markers):
        return 'http'
    return None
|
|
|
|
|
|
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only
        proto: Protocol to assign to all extracted proxies (from source URL)

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
            new_proxies is list of (address, proto) tuples
        If not filter_known: list of (address, proto) tuples

    Note:
        Result order follows dict iteration order of the de-dup table, so
        it is not guaranteed to match the order of appearance in *content*.
    """
    # cleanhtml collapses the page to ':'-separated tokens before matching.
    matches = PROXY_PATTERN.findall(cleanhtml(content))

    # De-duplicate after canonicalizing, so '01.2.3.4:080' and '1.2.3.4:80'
    # count as the same proxy.
    uniques_dict = {}
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets)
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port (remove leading zeros, handle empty case)
        port = int(port.lstrip('0') or '0')
        p = '%s:%s' % (ip, port)
        uniques_dict[p] = True

    # Drop unusable addresses (private/reserved ranges, bad octets/ports).
    uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]

    if not filter_known:
        return uniques

    # Initialize known proxies from DB if needed
    if proxydb is not None:
        init_known_proxies(proxydb)

    # Keep only addresses not yet in the cache, registering each as we go
    # so duplicates within this batch are also filtered.
    new = []
    for p, pr in uniques:
        if not is_known_proxy(p):
            new.append((p, pr))
            add_known_proxies([p])

    return len(uniques), new
|
|
|
|
def extract_urls(content, urls = None, urignore=None):
    """Collect hrefs of rel="noreferrer" links from HTML content.

    Args:
        content: HTML document to scan.
        urls: optional list of already-collected URLs; newly found links
            are appended to it (a fresh list is created when omitted).
        urignore: optional iterable of regex patterns; any href matching
            one of them is skipped.

    Returns:
        The list of collected URLs (the *urls* argument when given).
    """
    urls = [] if not urls else urls
    # BUG FIX: urignore defaulted to None but was iterated unconditionally,
    # raising TypeError whenever the argument was omitted.
    urignore = [] if not urignore else urignore
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        # Only follow explicitly noreferrer-tagged links we have not seen.
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] or a.attrs['href'] in urls:
            continue
        href = a.attrs['href']
        if not any(re.findall(pattern, href) for pattern in urignore):
            urls.append(href)
    return urls
|
|
|