diff --git a/fetch.py b/fetch.py
index 6b42dda..78c6455 100644
--- a/fetch.py
+++ b/fetch.py
@@ -6,119 +6,119 @@ from misc import _log
 config = None
 def set_config(cfg):
-	global config
-	config = cfg
+    global config
+    config = cfg
 
 cleanhtml_re = [
-	re.compile('<.*?>'),
-	re.compile('\s+'),
-	re.compile('::+'),
+    re.compile('<.*?>'),
+    re.compile('\s+'),
+    re.compile('::+'),
 ]
 
 def cleanhtml(raw_html):
-	html = raw_html.replace('&nbsp;', ' ')
-	html = re.sub(cleanhtml_re[0], ':', html)
-	html = re.sub(cleanhtml_re[1], ':', html)
-	html = re.sub(cleanhtml_re[2], ':', html)
-	return html
+    html = raw_html.replace('&nbsp;', ' ')
+    html = re.sub(cleanhtml_re[0], ':', html)
+    html = re.sub(cleanhtml_re[1], ':', html)
+    html = re.sub(cleanhtml_re[2], ':', html)
+    return html
 
 retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
 
 def fetch_contents(url, head = False):
-	host, port, ssl, uri = _parse_url(url)
-	headers=[
-		'Accept-Language: en-US,en;q=0.8',
-		'Cache-Control: max-age=0',
-	]
-	if config.ppf.debug:
-		_log("connecting to %s..."%url, "debug")
-	while True:
-		proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
-		http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
-		if not http.connect():
-			_log("failed to connect to %s"%url, "ppf")
-			e = http.get_last_rocksock_exception()
-			if not e:
-				return ''
-			et = e.get_errortype()
-			ee = e.get_error()
-			ef = e.get_failedproxy()
-			if et == rocksock.RS_ET_OWN and \
-				ee == rocksock.RS_E_TARGET_CONN_REFUSED \
-				and ef == 0:
-				_log("could not connect to proxy 0 - check your connection", "error")
-				time.sleep(5)
-				continue
-			return ''
-		break
+    host, port, ssl, uri = _parse_url(url)
+    headers=[
+        'Accept-Language: en-US,en;q=0.8',
+        'Cache-Control: max-age=0',
+    ]
+    if config.ppf.debug:
+        _log("connecting to %s..."%url, "debug")
+    while True:
+        proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
+        http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
+        if not http.connect():
+            _log("failed to connect to %s"%url, "ppf")
+            e = http.get_last_rocksock_exception()
+            if not e:
+                return ''
+            et = e.get_errortype()
+            ee = e.get_error()
+            ef = e.get_failedproxy()
+            if et == rocksock.RS_ET_OWN and \
+                ee == rocksock.RS_E_TARGET_CONN_REFUSED \
+                and ef == 0:
+                _log("could not connect to proxy 0 - check your connection", "error")
+                time.sleep(5)
+                continue
+            return ''
+        break
 
-	## only request header
-	if head:
-		hdr = http.head(uri, headers)
-		return hdr
+    ## only request header
+    if head:
+        hdr = http.head(uri, headers)
+        return hdr
 
-	hdr, res = http.get(uri, headers)
-	res = res.encode('utf-8') if isinstance(res, unicode) else res
-	for retry_message in retry_messages:
-		if retry_message in res: return ''
+    hdr, res = http.get(uri, headers)
+    res = res.encode('utf-8') if isinstance(res, unicode) else res
+    for retry_message in retry_messages:
+        if retry_message in res: return ''
 
-	return res
+    return res
 
 def valid_port(port):
-	return port > 0 and port < 65535
+    return port > 0 and port < 65535
 
 def is_usable_proxy(proxy):
-	ip, port = proxy.split(':')
-	if not valid_port(int(port)): return False
+    ip, port = proxy.split(':')
+    if not valid_port(int(port)): return False
 
-	octets = ip.split('.')
-	A = int(octets[0])
-	B = int(octets[1])
-	C = int(octets[2])
-	D = int(octets[3])
+    octets = ip.split('.')
+    A = int(octets[0])
+    B = int(octets[1])
+    C = int(octets[2])
+    D = int(octets[3])
 
-	if (A < 1 or A > 254 or \
-		B > 255 or C > 255 or D > 255) or \
-		(A == 10 or A == 127) or \
-		(A == 192 and B == 168) or \
-		(A == 172 and B >= 16 and B <= 31): return False
-	return True
+    if (A < 1 or A > 254 or \
+        B > 255 or C > 255 or D > 255) or \
+        (A == 10 or A == 127) or \
+        (A == 192 and B == 168) or \
+        (A == 172 and B >= 16 and B <= 31): return False
+    return True
 
 _known_proxies = {}
 
 def extract_proxies(content, proxydb):
-	matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
+    matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
 
-	uniques_dict = {}
-	for p in matches:
-		uniques_dict[p] = True
+    uniques_dict = {}
+    for p in matches:
+        uniques_dict[p] = True
 
-	uniques = []
-	for p in uniques_dict.keys():
-		if is_usable_proxy(p): uniques.append(p)
+    uniques = []
+    for p in uniques_dict.keys():
+        if is_usable_proxy(p): uniques.append(p)
 
-	global _known_proxies
-	if len(_known_proxies) == 0:
-		known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
-		for k in known:
-			_known_proxies[k[0]] = True
+    global _known_proxies
+    if len(_known_proxies) == 0:
+        known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
+        for k in known:
+            _known_proxies[k[0]] = True
 
-	new = []
-	for p in uniques:
-		if not p in _known_proxies:
-			new.append(p)
-			_known_proxies[p] = True
+    new = []
+    for p in uniques:
+        if not p in _known_proxies:
+            new.append(p)
+            _known_proxies[p] = True
 
-	return len(uniques), new
+    return len(uniques), new
 
 def extract_urls(content, urls = None, urignore=None):
-	urls = [] if not urls else urls
-	soup = soupify(content)
-	for a in soup.body.find_all('a'):
-		if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
-		bad = False
-		href = a.attrs['href']
-		for i in urignore:
-			if re.findall(i, href):
-				bad = True
-				break
-		if not bad: urls.append(href)
-	return urls
+    urls = [] if not urls else urls
+    soup = soupify(content)
+    for a in soup.body.find_all('a'):
+        if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
+        bad = False
+        href = a.attrs['href']
+        for i in urignore:
+            if re.findall(i, href):
+                bad = True
+                break
+        if not bad: urls.append(href)
+    return urls
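
For reference, a minimal self-contained sketch of the extraction pipeline the reindented helpers implement. This is not part of the patch: the sample HTML and the standalone re-implementation below are illustrative only, and the logic is a Python 3 restatement of cleanhtml/valid_port/is_usable_proxy as they appear in the patched fetch.py.

    import re

    cleanhtml_re = [
        re.compile('<.*?>'),   # strip tags
        re.compile(r'\s+'),    # collapse whitespace
        re.compile('::+'),     # collapse runs of the ':' separator
    ]

    def cleanhtml(raw_html):
        # Replace non-breaking-space entities, then reduce the markup to a
        # ':'-separated token stream that the proxy regex can scan.
        html = raw_html.replace('&nbsp;', ' ')
        for rx in cleanhtml_re:
            html = re.sub(rx, ':', html)
        return html

    def valid_port(port):
        return 0 < port < 65535

    def is_usable_proxy(proxy):
        # Reject invalid ports, malformed octets, and private/loopback
        # ranges (10/8, 127/8, 192.168/16, 172.16/12), as in the patch.
        ip, port = proxy.split(':')
        if not valid_port(int(port)):
            return False
        a, b, c, d = (int(o) for o in ip.split('.'))
        if a < 1 or a > 254 or b > 255 or c > 255 or d > 255:
            return False
        if a in (10, 127) or (a == 192 and b == 168) or (a == 172 and 16 <= b <= 31):
            return False
        return True

    if __name__ == '__main__':
        page = '<tr><td>1.2.3.4:8080</td></tr>&nbsp;<tr><td>192.168.0.1:3128</td></tr>'
        candidates = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(page))
        print([p for p in candidates if is_usable_proxy(p)])  # ['1.2.3.4:8080']

The demo shows why cleanhtml normalizes everything to ':' separators: the findall pattern requires a non-digit after the port, and the trailing ':' left by the tag-stripping pass guarantees one even at the end of the document. The private-range check then drops 192.168.0.1:3128, leaving only the publicly routable candidate.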