import re
import random
import time

import rocksock

from http2 import RsHttp, _parse_url
from soup_parser import soupify
from misc import _log

# Module-wide configuration object, injected by the application via set_config().
config = None


def set_config(cfg):
    """Install the module-wide configuration object used by the fetch helpers."""
    global config
    config = cfg


# Patterns used by cleanhtml(), compiled once at import time:
#   [0] HTML tags, [1] whitespace runs, [2] repeated ':' separators.
cleanhtml_re = [
    re.compile(r'<.*?>'),
    re.compile(r'\s+'),
    re.compile(r'::+'),
]


def cleanhtml(raw_html):
    """Strip tags and collapse whitespace in *raw_html*, joining tokens with ':'.

    The ':' separator keeps "ip : port" fragments adjacent so the proxy
    pattern in extract_proxies() can still match them after cleaning.
    """
    # NOTE(review): the first replace() argument renders as a plain space here;
    # it was most likely a non-breaking space (U+00A0) in the original file —
    # verify against the upstream source.
    html = raw_html.replace(' ', ' ')
    html = cleanhtml_re[0].sub(':', html)
    html = cleanhtml_re[1].sub(':', html)
    html = cleanhtml_re[2].sub(':', html)
    return html


def fetch_contents(url, head=False, proxy=None):
    """Fetch *url*, optionally trying each proxy in *proxy* until one succeeds.

    Args:
        url: URL to retrieve.
        head: if True, return only the response header.
        proxy: optional iterable of proxy URLs to try in order.

    Returns:
        The response body (or header when head=True), or '' if every
        attempt failed.
    """
    content = None
    if proxy:
        for p in proxy:
            content = _fetch_contents(url, head=head, proxy=p)
            if content is not None:
                break
    else:
        content = _fetch_contents(url, head=head)
    return content if content is not None else ''


# Engine responses containing any of these markers are treated as failures
# so the caller can retry through a different proxy.
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')


def _fetch_contents(url, head=False, proxy=None):
    """Perform one HTTP(S) fetch of *url* through a random Tor host.

    An extra *proxy* (URL string) is chained after the Tor hop when given.
    Returns the response body/header, or None on connection failure or
    when the body contains one of retry_messages.
    """
    host, port, ssl, uri = _parse_url(url)
    headers = [
        'Accept-Language: en-US,en;q=0.8',
        'Cache-Control: max-age=0',
    ]
    if config.ppf.debug:
        _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
    while True:
        # First hop: a randomly chosen local Tor SOCKS endpoint.
        proxies = [rocksock.RocksockProxyFromURL(
            'socks4://%s' % random.choice(config.torhosts))]
        if proxy:
            proxies.append(rocksock.RocksockProxyFromURL(proxy))
        http = RsHttp(host, ssl=ssl, port=port, keep_alive=True,
                      timeout=config.ppf.timeout,
                      max_tries=config.ppf.http_retries,
                      follow_redirects=True, auto_set_cookies=True,
                      proxies=proxies,
                      user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) '
                                 'Gecko/20100101 Firefox/60.0')
        if not http.connect():
            _log("failed to connect to %s" % url, "ppf")
            e = http.get_last_rocksock_exception()
            if not e:
                return None
            et = e.get_errortype()
            ee = e.get_error()
            ef = e.get_failedproxy()
            # Proxy 0 is the local Tor hop: a refused connection there means
            # our own connectivity is broken, so wait and retry instead of
            # giving up on the URL.
            if (et == rocksock.RS_ET_OWN and
                    ee == rocksock.RS_E_TARGET_CONN_REFUSED and
                    ef == 0):
                _log("could not connect to proxy 0 - check your connection",
                     "error")
                time.sleep(5)
                continue
            return None
        break
    # Only request the header when asked.
    if head:
        return http.head(uri, headers)
    hdr, res = http.get(uri, headers)
    # FIX: the original used the Python 2-only name `unicode` (NameError on
    # Python 3) and encoded str -> bytes, which would break the substring
    # checks below. Normalize to text instead.
    if isinstance(res, bytes):
        res = res.decode('utf-8', 'replace')
    for retry_message in retry_messages:
        if retry_message in res:
            return None
    return res


def valid_port(port):
    """Return True if *port* is a valid TCP port number (1-65535)."""
    # FIX: 65535 is a valid port; the original used `< 65535`.
    return 0 < port <= 65535


def is_usable_proxy(proxy):
    """Return True if *proxy* ("ip:port") looks like a usable public address.

    Rejects invalid ports, out-of-range octets, and private/reserved
    ranges (10/8, 127/8, 192.168/16, 172.16/12). Expects a string already
    matched by the extract_proxies() pattern (four dotted octets + port).
    """
    ip, port = proxy.split(':')
    if not valid_port(int(port)):
        return False
    octets = ip.split('.')
    A = int(octets[0])
    B = int(octets[1])
    C = int(octets[2])
    D = int(octets[3])
    if (A < 1 or A > 254 or B > 255 or C > 255 or D > 255) or \
            (A == 10 or A == 127) or \
            (A == 192 and B == 168) or \
            (A == 172 and 16 <= B <= 31):
        return False
    return True


# Cache of proxies already present in the database, keyed by "ip:port".
_known_proxies = {}


def init_known_proxies(proxydb):
    """Initialize known proxies cache from database (no-op if already loaded)."""
    global _known_proxies
    if _known_proxies:
        return
    known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
    for k in known:
        _known_proxies[k[0]] = True


def add_known_proxies(proxies):
    """Add proxies to known cache."""
    global _known_proxies
    for p in proxies:
        _known_proxies[p] = True


def is_known_proxy(proxy):
    """Check if proxy is in known cache."""
    return proxy in _known_proxies


def extract_proxies(content, proxydb=None, filter_known=True):
    """Extract and normalize proxy addresses from content.

    Args:
        content: HTML/text content to parse
        proxydb: Database connection for known proxy lookup (optional)
        filter_known: If True, filter out known proxies and return new only

    Returns:
        If filter_known: (unique_count, new_proxies) tuple
        If not filter_known: list of all unique valid proxies
    """
    # FIX: the original pattern ended with `[\D$]`, which consumes a
    # character and treats '$' as a literal — so a proxy at the very end of
    # the content was never matched. A zero-width lookahead fixes both.
    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})(?![0-9])',
                         cleanhtml(content))
    uniques_dict = {}
    for p in matches:
        ip, port = p.split(':')
        # Normalize IP (remove leading zeros from octets).
        ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
        # Normalize port (int() handles leading zeros).
        port = int(port)
        uniques_dict['%s:%s' % (ip, port)] = True
    uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
    if not filter_known:
        return uniques
    # Initialize known proxies from DB if needed.
    if proxydb is not None:
        init_known_proxies(proxydb)
    new = []
    for p in uniques:
        if not is_known_proxy(p):
            new.append(p)
            add_known_proxies([p])
    return len(uniques), new


def extract_urls(content, urls=None, urignore=None):
    """Collect hrefs of rel="noreferrer" anchors in *content*.

    Args:
        content: HTML to parse.
        urls: optional list to append into (also used for de-duplication);
              appended to in place when provided.
        urignore: optional iterable of regex patterns; matching hrefs are
              skipped.

    Returns:
        The (possibly shared) list of collected URLs.
    """
    urls = [] if not urls else urls
    # FIX: the original iterated urignore unconditionally, crashing with
    # TypeError when the default None was used.
    urignore = urignore or []
    soup = soupify(content)
    for a in soup.body.find_all('a'):
        if 'rel' not in a.attrs or 'noreferrer' not in a.attrs['rel'] \
                or a.attrs['href'] in urls:
            continue
        href = a.attrs['href']
        if not any(re.findall(pattern, href) for pattern in urignore):
            urls.append(href)
    return urls