# Proxy-list scraper helpers: page fetching (via tor/rocksock) and
# proxy-address extraction/validation utilities.
import re, random, time
|
|
import threading
|
|
import rocksock
|
|
import network_stats
|
|
from http2 import RsHttp, _parse_url
|
|
from soup_parser import soupify
|
|
from misc import _log
|
|
|
|
config = None
|
|
_last_fail_log = 0
|
|
_fail_log_interval = 60
|
|
|
|
def set_config(cfg):
    """Install the module-wide configuration object used by the fetchers."""
    global config
    config = cfg
|
|
|
|
# Pre-compiled regex patterns (compiled once at module load)
cleanhtml_re = [
    re.compile(r'<.*?>'),  # HTML tags
    re.compile(r'\s+'),    # whitespace runs
    re.compile(r'::+'),    # collapse repeated ':' separators
]

# Proxy extraction pattern: IP:PORT (four dotted digit groups, colon,
# 2-5 digit port) followed by a non-digit or the end of the input.
# BUG FIX: the previous pattern ended in the character class [\D$], where
# '$' is a *literal* dollar sign (not an anchor) and the class consumes a
# character -- so an address at the very end of the input never matched.
# A negative lookahead matches "non-digit or end" without consuming.
PROXY_PATTERN = re.compile(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?![0-9])')
|
|
def cleanhtml(raw_html):
    """Reduce HTML to a ':'-separated token stream.

    Non-breaking-space entities are dropped, then tags, whitespace runs
    and repeated separators are each replaced by a single ':'.
    """
    text = raw_html.replace('&nbsp;', ' ')
    for pattern in cleanhtml_re:
        text = pattern.sub(':', text)
    return text
|
|
|
|
def fetch_contents(url, head=False, proxy=None):
    """Fetch *url*, trying each proxy in *proxy* until one succeeds.

    Args:
        url: full URL to request.
        head: when True only the response header is requested.
        proxy: optional sequence of proxy URLs to attempt in order.

    Returns:
        The fetched content (or header), or '' when every attempt failed.
    """
    result = None
    if proxy:
        for candidate in proxy:
            result = _fetch_contents(url, head=head, proxy=candidate)
            if result is not None:
                break
    else:
        result = _fetch_contents(url, head=head)
    return '' if result is None else result
|
|
|
|
# Response substrings that mark a useless (rate-limited) search result;
# such a response is treated as a failed fetch so callers can retry.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')

def _fetch_contents(url, head = False, proxy=None):
    """Fetch *url* through a randomly chosen tor socks4 hop.

    Args:
        url: full URL to request.
        head: when True, issue a HEAD request and return only the header.
        proxy: optional extra proxy URL chained after the tor hop.

    Returns:
        The response body (utf-8 bytes), the header when head=True, or
        None on connection failure / rate-limited response.
    """
    network_stats.set_category('scraper')
    host, port, ssl, uri = _parse_url(url)
    headers=[
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
    tor_retries = 0
    max_tor_retries = 1
    http = None
    try:
        while True:
            # First hop is always a random tor socks4 endpoint; a
            # caller-supplied proxy, if any, is chained after it.
            proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
            if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))

            http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0', log_errors=False)
            if not http.connect():
                # Rate-limit the failure log to one message per interval
                # (module-level state shared across scraper threads).
                global _last_fail_log
                now = time.time()
                if (now - _last_fail_log) >= _fail_log_interval:
                    _log("failed to connect to %s"%url, "ppf")
                    _last_fail_log = now
                e = http.get_last_rocksock_exception()
                if not e:
                    return None
                et = e.get_errortype()
                ee = e.get_error()
                ef = e.get_failedproxy()
                # NOTE(review): failedproxy == 0 presumably identifies the
                # first hop (the tor endpoint) -- confirm against rocksock.
                # A refused connection there means tor itself is down, so
                # retry once with a freshly picked tor host.
                if et == rocksock.RS_ET_OWN and \
                    ee == rocksock.RS_E_TARGET_CONN_REFUSED \
                    and ef == 0:
                    http.disconnect()
                    http = None
                    tor_retries += 1
                    if tor_retries >= max_tor_retries:
                        _log("tor proxy failed after %d retries" % tor_retries, "error")
                        return None
                    _log("tor proxy retry %d/%d" % (tor_retries, max_tor_retries), "warn")
                    time.sleep(5)
                    continue
                return None
            break

        ## only request header
        if head:
            hdr = http.head(uri, headers)
            return hdr

        hdr, res = http.get(uri, headers)
        # Python 2 compat: normalize unicode responses to utf-8 bytes so
        # the substring checks below are consistent.
        res = res.encode('utf-8') if isinstance(res, unicode) else res
        for retry_message in retry_messages:
            if retry_message in res: return None

        return res
    finally:
        # Always release the connection, even on the early returns above.
        if http:
            http.disconnect()
|
|
|
|
def valid_port(port):
    """Return True when *port* lies in the valid TCP range 1-65535."""
    return 1 <= port <= 65535
|
|
|
|
|
|
def is_usable_proxy(proxy):
    """Return True when *proxy* is a well-formed, publicly routable ip:port.

    Rejects:
        - Malformed strings (not ip:port format)
        - Invalid port (0, >65535) and invalid IP octets (>255)
        - Private ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
        - Loopback 127.0.0.0/8, link-local 169.254.0.0/16, CGNAT 100.64.0.0/10
        - Multicast 224.0.0.0/4, reserved 240.0.0.0/4, unspecified 0.0.0.0/8
    """
    try:
        if ':' not in proxy:
            return False
        host, port_text = proxy.rsplit(':', 1)
        if not valid_port(int(port_text)):
            return False

        pieces = host.split('.')
        if len(pieces) != 4:
            return False
        a, b, c, d = (int(piece) for piece in pieces)
        if min(a, b, c, d) < 0 or max(a, b, c, d) > 255:
            return False

        # Unspecified (0/8), private 10/8, loopback 127/8.
        if a in (0, 10, 127):
            return False
        # Private / link-local / CGNAT ranges keyed on the second octet.
        if a == 172 and 16 <= b <= 31:
            return False
        if a == 192 and b == 168:
            return False
        if a == 169 and b == 254:
            return False
        if a == 100 and 64 <= b <= 127:
            return False
        # Multicast (224/4) and reserved/future (240/4) collapse to >= 224.
        if a >= 224:
            return False
        return True
    except (ValueError, AttributeError, IndexError):
        return False
|
|
|
|
_known_proxies = {}
|
|
_known_proxies_lock = threading.Lock()
|
|
|
|
def init_known_proxies(proxydb):
    """Populate the known-proxy cache from the proxylist table (runs once).

    Subsequent calls are no-ops while the cache is non-empty.
    """
    global _known_proxies
    with _known_proxies_lock:
        if _known_proxies:
            return
        rows = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
        _known_proxies.update((row[0], True) for row in rows)
|
|
|
|
def add_known_proxies(proxies):
    """Record every address in *proxies* in the known-proxy cache."""
    global _known_proxies
    with _known_proxies_lock:
        _known_proxies.update(dict.fromkeys(proxies, True))
|
|
|
|
def is_known_proxy(proxy):
    """Return True when *proxy* has already been recorded in the cache."""
    with _known_proxies_lock:
        return proxy in _known_proxies
|
|
|
|
def detect_proto_from_path(url):
    """Guess the proxy protocol a source URL advertises in its path.

    Many proxy lists encode the protocol in their path:
        - /socks5/, /socks5.txt, socks5-proxies.txt -> socks5
        - /socks4/, /socks4a/, /socks4.txt -> socks4
        - /http/, /http.txt, http-proxies.txt -> http
        - /https/, /ssl/ -> http (HTTPS proxies use HTTP CONNECT)

    Args:
        url: Source URL path or full URL

    Returns:
        'socks5', 'socks4', 'http', or None when nothing matched.
    """
    lowered = url.lower()
    # socks5 first, then socks4/socks4a -- the order matters since a URL
    # could in principle mention several protocol keywords.
    for keyword, result in (('socks5', 'socks5'), ('socks4', 'socks4')):
        if keyword in lowered:
            return result
    http_markers = ('/http', 'http-', 'http_', 'http.', '/https', '/ssl', '/connect')
    if any(marker in lowered for marker in http_markers):
        return 'http'
    return None
|
|
|
|
|
|
def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only
        proto: Protocol to assign to all extracted proxies (from source URL)

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
            new_proxies is list of (address, proto) tuples
        If not filter_known: list of (address, proto) tuples

    Note:
        Result order follows dict iteration order of the de-dup table, so
        it is not guaranteed to match the order of appearance in *content*.
    """
    # cleanhtml collapses the page to ':'-separated tokens before matching.
    matches = PROXY_PATTERN.findall(cleanhtml(content))

    # De-duplicate after canonicalizing, so '01.2.3.4:080' and '1.2.3.4:80'
    # count as the same proxy.
    uniques_dict = {}
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets)
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port (remove leading zeros, handle empty case)
        port = int(port.lstrip('0') or '0')
        p = '%s:%s' % (ip, port)
        uniques_dict[p] = True

    # Drop unusable addresses (private/reserved ranges, bad octets/ports).
    uniques = [(p, proto) for p in uniques_dict.keys() if is_usable_proxy(p)]

    if not filter_known:
        return uniques

    # Initialize known proxies from DB if needed
    if proxydb is not None:
        init_known_proxies(proxydb)

    # Keep only addresses not yet in the cache, registering each as we go
    # so duplicates within this batch are also filtered.
    new = []
    for p, pr in uniques:
        if not is_known_proxy(p):
            new.append((p, pr))
            add_known_proxies([p])

    return len(uniques), new
|
|
|
|
def extract_urls(content, urls = None, urignore=None):
    """Collect hrefs of rel="noreferrer" links from HTML content.

    Args:
        content: HTML document to scan.
        urls: optional list of already-collected URLs; newly found links
            are appended to it (a fresh list is created when omitted).
        urignore: optional iterable of regex patterns; any href matching
            one of them is skipped.

    Returns:
        The list of collected URLs (the *urls* argument when given).
    """
    urls = [] if not urls else urls
    # BUG FIX: urignore defaulted to None but was iterated unconditionally,
    # raising TypeError whenever the argument was omitted.
    urignore = [] if not urignore else urignore
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        # Only follow explicitly noreferrer-tagged links we have not seen.
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] or a.attrs['href'] in urls:
            continue
        href = a.attrs['href']
        if not any(re.findall(pattern, href) for pattern in urignore):
            urls.append(href)
    return urls
|
|
|