# ppf/fetch.py

import re, random, time
import rocksock
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log

# Module-wide configuration object; stays None until set_config() is called.
config = None
def set_config(cfg):
    """Store cfg as the module-level ``config`` read by the fetch helpers."""
    global config
    config = cfg

# Patterns applied in order by cleanhtml(): HTML tags, whitespace runs, and
# finally runs of the ':' separator left behind by the first two passes.
# Raw strings keep regex escapes such as \s out of Python's own escape handling.
cleanhtml_re = [
    re.compile(r'<.*?>'),
    re.compile(r'\s+'),
    re.compile(r'::+'),
]
def cleanhtml(raw_html):
    """Flatten HTML into a ':'-separated token string.

    ``&nbsp;`` entities become spaces, then every tag and every run of
    whitespace is replaced by ':' and consecutive ':' are collapsed into one.

    Args:
        raw_html: markup or plain text to flatten.

    Returns:
        The flattened string.
    """
    html = raw_html.replace('&nbsp;', ' ')
    for pattern in cleanhtml_re:
        html = pattern.sub(':', html)
    return html

def fetch_contents(url, head=False, proxy=None):
    """Fetch url, optionally trying a list of proxies one after another.

    Args:
        url: address to fetch.
        head: forwarded to _fetch_contents (request headers only).
        proxy: optional sequence of proxy URLs; each one is tried in order
            until _fetch_contents returns something other than None.
            When None or empty, a single attempt without extra proxy is made.

    Returns:
        Whatever _fetch_contents returned, or '' if every attempt gave None.
    """
    if proxy is None or not len(proxy):
        result = _fetch_contents(url, head=head)
    else:
        result = None
        for candidate in proxy:
            result = _fetch_contents(url, head=head, proxy=candidate)
            if result is not None:
                break

    if result is None:
        return ''
    return result

# A response body containing any of these markers is reported as a failed
# fetch (None) by _fetch_contents().
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def _fetch_contents(url, head=False, proxy=None):
    """Perform one fetch of url through a proxy chain.

    The chain always starts with a socks4 hop to a random entry of
    ``config.torhosts``; when proxy is given it is appended as a second hop.

    Args:
        url: address to fetch.
        head: when true, issue a HEAD request and return its result.
        proxy: optional proxy URL appended to the chain.

    Returns:
        The HEAD result when head is true, otherwise the response body
        (utf-8 encoded if it was unicode). None when the connection fails
        or the body contains one of ``retry_messages``.
    """
    host, port, ssl, uri = _parse_url(url)
    request_headers = [
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'
    while True:
        # The chain is rebuilt on every attempt, so a retry may pick a
        # different first hop.
        chain = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice(config.torhosts))]
        if proxy:
            chain.append(rocksock.RocksockProxyFromURL(proxy))

        http = RsHttp(
            host,
            ssl=ssl,
            port=port,
            keep_alive=True,
            timeout=config.ppf.timeout,
            max_tries=config.ppf.http_retries,
            follow_redirects=True,
            auto_set_cookies=True,
            proxies=chain,
            user_agent=user_agent
        )
        if http.connect():
            break

        _log("failed to connect to %s" % url, "ppf")
        error = http.get_last_rocksock_exception()
        if not error:
            return None
        error_type = error.get_errortype()
        error_code = error.get_error()
        failed_hop = error.get_failedproxy()
        first_hop_refused = (
            error_type == rocksock.RS_ET_OWN
            and error_code == rocksock.RS_E_TARGET_CONN_REFUSED
            and failed_hop == 0
        )
        if not first_hop_refused:
            return None
        # Only a refused connection to hop 0 is retried: wait, then loop.
        _log("could not connect to proxy 0 - check your connection", "error")
        time.sleep(5)

    # Header-only request.
    if head:
        return http.head(uri, request_headers)

    hdr, body = http.get(uri, request_headers)
    if isinstance(body, unicode):
        body = body.encode('utf-8')
    if any(marker in body for marker in retry_messages):
        return None
    return body

def valid_port(port):
    """Return True when port is a usable TCP port number (1..65535 inclusive)."""
    # 65535 is the highest valid port; the upper bound must be inclusive.
    return 0 < port <= 65535

def is_usable_proxy(proxy):
    """Return True when proxy is an acceptable ``a.b.c.d:port`` address.

    Rejected are: malformed strings, ports refused by valid_port(), a first
    octet outside 1..254, any other octet outside 0..255, and the
    10.0.0.0/8, 127.0.0.0/8, 192.168.0.0/16 and 172.16.0.0/12 ranges.

    Args:
        proxy: string of the form ``ip:port``.

    Returns:
        True if the address passes all checks, False otherwise.
    """
    try:
        ip, port = proxy.split(':')
        port = int(port)
        octets = [int(octet) for octet in ip.split('.')]
    except ValueError:
        # Wrong number of ':' fields, or a non-numeric port/octet.
        return False

    if len(octets) != 4 or not valid_port(port):
        return False

    a, b, c, d = octets
    if not 1 <= a <= 254:
        return False
    if not (0 <= b <= 255 and 0 <= c <= 255 and 0 <= d <= 255):
        return False

    # Loopback and private ranges are never reachable as public proxies.
    if a == 10 or a == 127:
        return False
    if a == 192 and b == 168:
        return False
    if a == 172 and 16 <= b <= 31:
        return False
    return True

# Cache of proxy strings already seen; every stored value is True.
_known_proxies = {}

def init_known_proxies(proxydb):
    """Seed the known-proxy cache from the ``proxylist`` table of proxydb.

    The query only runs while the cache is empty: once the cache holds any
    entry (from an earlier load or from add_known_proxies) the call does
    nothing.
    """
    if not _known_proxies:
        rows = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        _known_proxies.update((row[0], True) for row in rows)

def add_known_proxies(proxies):
    """Record every item of proxies in the known-proxy cache."""
    _known_proxies.update((entry, True) for entry in proxies)

def is_known_proxy(proxy):
    """Return True if proxy is a key of the module-level known-proxy cache."""
    return proxy in _known_proxies

def extract_proxies(content, proxydb=None, filter_known=True):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
        If not filter_known: list of all unique valid proxies
        Proxies are listed in order of first appearance in content.
    """
    # The port must not be followed by another digit. A lookahead is used
    # instead of consuming a trailing non-digit so that an address sitting at
    # the very end of the content is matched too.
    matches = re.findall(
        r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?![0-9])', cleanhtml(content))

    seen = set()
    uniques = []
    for match in matches:
        ip, port = match.split(':')
        # Normalize IP and port (int() drops leading zeros).
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        proxy = '%s:%d' % (ip, int(port))
        if proxy in seen:
            continue
        seen.add(proxy)
        if is_usable_proxy(proxy):
            uniques.append(proxy)

    if not filter_known:
        return uniques

    # Initialize known proxies from DB if needed
    if proxydb is not None:
        init_known_proxies(proxydb)

    new = [proxy for proxy in uniques if not is_known_proxy(proxy)]
    # uniques holds no duplicates, so registering the batch afterwards is
    # equivalent to registering each proxy as it is found.
    add_known_proxies(new)

    return len(uniques), new

def extract_urls(content, urls = None, urignore=None):
    """Collect hrefs of ``rel="noreferrer"`` anchors found in content.

    Args:
        content: HTML to parse with soupify().
        urls: optional list of already collected URLs; new hrefs are appended
            to it and duplicates are skipped. A falsy value starts a new list.
        urignore: optional iterable of regex patterns; an href matching any
            of them is skipped. None means no pattern.

    Returns:
        The list of URLs (the passed-in list when it was non-empty).
    """
    urls = [] if not urls else urls
    ignore_patterns = urignore if urignore is not None else ()

    soup = soupify(content)
    body = soup.body
    if body is None:
        # Parsed document has no <body>: nothing to scan.
        return urls

    for a in body.find_all('a'):
        href = a.attrs.get('href')
        # Anchors without an href cannot yield a URL.
        if href is None:
            continue
        if 'noreferrer' not in a.attrs.get('rel', ()) or href in urls:
            continue
        if any(re.search(pattern, href) for pattern in ignore_patterns):
            continue
        urls.append(href)
    return urls