Files
ppf/fetch.py
2025-12-25 11:13:20 +01:00

298 lines
9.1 KiB
Python

import re, random, time
import threading
import rocksock
import network_stats
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log
# Module-wide configuration object; must be populated via set_config()
# before any fetch helper is used.
config = None
# Timestamp of the last logged connection failure, and the minimum interval
# (seconds) between such log lines — keeps repeated failures from flooding
# the log (see _fetch_contents).
_last_fail_log = 0
_fail_log_interval = 60
def set_config(cfg):
    """Install the module-wide configuration object used by the fetchers."""
    global config
    config = cfg
# Pre-compiled regex patterns (compiled once at module load), applied in
# order by cleanhtml(): replace HTML tags with ':', replace whitespace runs
# with ':', then collapse the resulting runs of colons to a single ':'.
cleanhtml_re = [
    re.compile(r'<.*?>'),
    re.compile(r'\s+'),
    re.compile(r'::+'),
]
# Proxy extraction pattern: IP:PORT followed by a non-digit or end of input.
# Pattern: digit octet, dot, repeated 3 times, colon, 2-5 digit port.
# The trailing check is a lookahead so a proxy that ends the input is still
# captured; the previous character class [\D$] treated '$' as a literal
# character (inside a class it is not an anchor), which forced one trailing
# non-digit and silently dropped the last entry of a proxy list.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?=\D|$)')
def cleanhtml(raw_html):
    """Collapse markup and whitespace into ':' separators.

    The output feeds PROXY_PATTERN.findall(), which relies on non-digit
    separators around the ip:port tokens, so tags and whitespace are all
    turned into colons and colon runs are collapsed.
    """
    html = raw_html.replace('&nbsp;', ' ')
    for pattern in cleanhtml_re:
        html = pattern.sub(':', html)
    return html
def fetch_contents(url, head=False, proxy=None):
    """Fetch *url*, trying each entry of *proxy* (a list) until one succeeds.

    Returns the response body (or the header when head=True); an empty
    string signals that every attempt failed.
    """
    if proxy is not None and len(proxy):
        # Walk the proxy list; the first non-None result wins.
        for candidate in proxy:
            result = _fetch_contents(url, head=head, proxy=candidate)
            if result is not None:
                return result
        return ''
    result = _fetch_contents(url, head=head)
    return result if result is not None else ''
# Body substrings that indicate a throttled/blocked response; when any of
# these appears in a fetched page, _fetch_contents treats the fetch as
# failed and returns None so the caller can retry elsewhere.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
def _fetch_contents(url, head = False, proxy=None):
    """Fetch a single URL through a random Tor SOCKS host (plus an optional
    extra proxy chained behind it).

    Returns the response body (or only the header when head=True), or None
    on connection failure / when the body contains one of retry_messages.
    """
    network_stats.set_category('scraper')
    host, port, ssl, uri = _parse_url(url)
    headers=[
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
    tor_retries = 0
    # NOTE(review): with max_tor_retries = 1 the retry branch below (sleep +
    # continue) can never run — the first tor failure already exhausts the
    # budget. Confirm whether a value > 1 was intended.
    max_tor_retries = 1
    http = None
    try:
        while True:
            # Every request is routed through a randomly chosen Tor SOCKS
            # host from the config; a caller-supplied proxy is appended to
            # the chain when given.
            proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
            if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))
            http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', log_errors=False)
            if not http.connect():
                global _last_fail_log
                now = time.time()
                # Rate-limit the "failed to connect" log line to one per
                # _fail_log_interval seconds.
                if (now - _last_fail_log) >= _fail_log_interval:
                    _log("failed to connect to %s"%url, "ppf")
                    _last_fail_log = now
                e = http.get_last_rocksock_exception()
                if not e:
                    return None
                et = e.get_errortype()
                ee = e.get_error()
                ef = e.get_failedproxy()
                # Connection refused reported with failedproxy == 0 —
                # presumably the first hop, i.e. the Tor proxy itself, is
                # down (TODO confirm against rocksock docs) — so retry with
                # a freshly picked Tor host.
                if et == rocksock.RS_ET_OWN and \
                ee == rocksock.RS_E_TARGET_CONN_REFUSED \
                and ef == 0:
                    http.disconnect()
                    http = None
                    tor_retries += 1
                    if tor_retries >= max_tor_retries:
                        _log("tor proxy failed after %d retries" % tor_retries, "error")
                        return None
                    _log("tor proxy retry %d/%d" % (tor_retries, max_tor_retries), "warn")
                    time.sleep(5)
                    continue
                return None
            break
        ## only request header
        if head:
            hdr = http.head(uri, headers)
            return hdr
        hdr, res = http.get(uri, headers)
        # Python 2: normalize a unicode body to a utf-8 byte string so the
        # substring checks below operate on one type.
        res = res.encode('utf-8') if isinstance(res, unicode) else res
        # Known rate-limit / block pages count as failures.
        for retry_message in retry_messages:
            if retry_message in res: return None
        return res
    finally:
        # Always close the connection, whichever path returned above.
        if http:
            http.disconnect()
def valid_port(port):
    """Return True when *port* lies in the valid TCP range 1-65535."""
    return 1 <= port <= 65535
def is_usable_proxy(proxy):
    """Validate proxy string format and reject unusable addresses.
    Rejects:
    - Malformed strings (not ip:port format)
    - Invalid port (0, >65535)
    - Invalid IP octets (>255)
    - Private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
    - Loopback: 127.0.0.0/8
    - Link-local: 169.254.0.0/16
    - CGNAT: 100.64.0.0/10
    - Multicast: 224.0.0.0/4
    - Reserved: 240.0.0.0/4
    - Unspecified: 0.0.0.0
    """
    try:
        if ':' not in proxy:
            return False
        host, port_text = proxy.rsplit(':', 1)
        # Port check (valid_port inlined): TCP ports live in 1-65535.
        port = int(port_text)
        if port < 1 or port > 65535:
            return False
        pieces = host.split('.')
        if len(pieces) != 4:
            return False
        a, b, c, d = (int(piece) for piece in pieces)
        # Every octet must fit in a byte.
        if min(a, b, c, d) < 0 or max(a, b, c, d) > 255:
            return False
        # Blanket first-octet rejections: unspecified/invalid (0/8),
        # private (10/8), loopback (127/8), multicast (224/4) and
        # reserved (240/4).
        if a in (0, 10, 127) or a >= 224:
            return False
        # Two-octet special ranges: RFC1918 172.16/12 and 192.168/16,
        # link-local 169.254/16, CGNAT 100.64/10.
        if a == 172 and 16 <= b <= 31:
            return False
        if a == 192 and b == 168:
            return False
        if a == 169 and b == 254:
            return False
        if a == 100 and 64 <= b <= 127:
            return False
        return True
    except (ValueError, AttributeError, IndexError):
        return False
# In-process cache of proxies already present in the database, keyed by
# "ip:port" string (value is always True); guarded by _known_proxies_lock
# because the scraper touches it from multiple threads.
_known_proxies = {}
_known_proxies_lock = threading.Lock()
def init_known_proxies(proxydb):
    """Initialize known proxies cache from database (first call only)."""
    global _known_proxies
    with _known_proxies_lock:
        # A non-empty cache means an earlier call already loaded it.
        if _known_proxies:
            return
        rows = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        _known_proxies.update((row[0], True) for row in rows)
def add_known_proxies(proxies):
    """Add proxies to the known-proxy cache."""
    global _known_proxies
    with _known_proxies_lock:
        _known_proxies.update(dict.fromkeys(proxies, True))
def is_known_proxy(proxy):
    """Return True when *proxy* is already in the known-proxy cache."""
    with _known_proxies_lock:
        found = proxy in _known_proxies
    return found
def detect_proto_from_path(url):
    """Detect proxy protocol from URL path.
    Many proxy lists indicate protocol in their path:
    - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
    - /socks4/, /socks4a/, /socks4.txt -> socks4
    - /http/, /http.txt, http-proxies.txt -> http
    - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)
    Args:
        url: Source URL path or full URL
    Returns:
        Protocol string ('http', 'socks4', 'socks5') or None if not detected
    """
    lowered = url.lower()
    # SOCKS markers take precedence over the generic http markers, and
    # socks5 is checked before socks4 (most specific first).
    for socks_marker in ('socks5', 'socks4'):
        if socks_marker in lowered:
            return socks_marker
    # Any of these substrings marks an HTTP(S)/CONNECT proxy list.
    for http_marker in ('/http', 'http-', 'http_', 'http.', '/https', '/ssl', '/connect'):
        if http_marker in lowered:
            return 'http'
    return None
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
    """Extract and normalize proxy addresses from content.
    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only
        proto: Protocol to assign to all extracted proxies (from source URL)
    Returns:
        If filter_known: (unique_count, new_proxies) tuple
            new_proxies is list of (address, proto) tuples
        If not filter_known: list of (address, proto) tuples
    """
    # Dedupe while preserving encounter order via a dict.
    seen = {}
    for raw in PROXY_PATTERN.findall(cleanhtml(content)):
        addr, port_text = raw.split(':')
        # Normalize the IP (drop leading zeros from each octet) and the
        # port (drop leading zeros, empty -> 0).
        addr = '.'.join(str(int(part)) for part in addr.split('.'))
        normalized = '%s:%s' % (addr, int(port_text.lstrip('0') or '0'))
        seen[normalized] = True
    usable = [(addr, proto) for addr in seen.keys() if is_usable_proxy(addr)]
    if not filter_known:
        return usable
    # Lazily load the known-proxy cache from the DB when one is supplied.
    if proxydb is not None:
        init_known_proxies(proxydb)
    fresh = []
    for addr, pr in usable:
        if is_known_proxy(addr):
            continue
        fresh.append((addr, pr))
        add_known_proxies([addr])
    return len(usable), fresh
def extract_urls(content, urls = None, urignore=None):
    """Collect hrefs of rel="noreferrer" anchors from an HTML document.

    Args:
        content: HTML text to parse.
        urls: optional list of already-collected URLs; new hrefs are
            appended to it and duplicates against it are skipped.
        urignore: optional iterable of regex patterns; hrefs matching any
            pattern are dropped.
    Returns:
        The (possibly shared) list of collected URLs.
    """
    urls = [] if not urls else urls
    # Treat a missing ignore list as "ignore nothing" — previously the loop
    # below raised TypeError when urignore was left at its default None.
    urignore = urignore if urignore is not None else []
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        # .get avoids the KeyError the old code hit on a rel="noreferrer"
        # anchor that carries no href attribute.
        href = a.attrs.get('href')
        if href is None or 'rel' not in a.attrs \
                or 'noreferrer' not in a.attrs['rel'] or href in urls:
            continue
        if not any(re.findall(pattern, href) for pattern in urignore):
            urls.append(href)
    return urls