From ee481ea31e6e5651872deff9c274ee8fcee3e50e Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Tue, 27 Jul 2021 22:35:39 +0200
Subject: [PATCH] ppf: make scraper use extra proxies if available

fetch_contents() becomes a thin wrapper that walks an optional list of
extra proxy URLs and falls back to the tor-only path when none are
given.  _fetch_contents() now returns None (instead of '') on
connect/rate-limit failure so the wrapper can tell "try the next proxy"
apart from a genuinely empty response; the wrapper still returns '' to
its callers, so their contract is unchanged.  The main loop samples up
to 5 known-good proxies from the proxy DB (fewer if fewer exist) and
hands them to each Leechered worker.  The new proxy parameters default
to None so existing call sites keep working.
---
 fetch.py | 22 ++++++++++++++++++----
 ppf.py   | 25 +++++++++++++++----------
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/fetch.py b/fetch.py
index be4c829..ab71cbf 100644
--- a/fetch.py
+++ b/fetch.py
@@ -21,8 +21,20 @@ def cleanhtml(raw_html):
     html = re.sub(cleanhtml_re[2], ':', html)
     return html
 
+def fetch_contents(url, head=False, proxy=None):
+    content = None
+    if proxy:
+        for p in proxy:
+            content = _fetch_contents(url, head=head, proxy=p)
+            if content is not None: break
+
+    else:
+        content = _fetch_contents(url, head=head)
+
+    return content if content is not None else ''
+
 retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
-def fetch_contents(url, head = False):
+def _fetch_contents(url, head = False, proxy=None):
     host, port, ssl, uri = _parse_url(url)
     headers=[
         'Accept-Language: en-US,en;q=0.8',
@@ -32,12 +44,14 @@ def fetch_contents(url, head = False):
     _log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
     while True:
         proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
+        if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))
+
         http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
         if not http.connect():
             _log("failed to connect to %s"%url, "ppf")
             e = http.get_last_rocksock_exception()
             if not e:
-                return ''
+                return None
             et = e.get_errortype()
             ee = e.get_error()
             ef = e.get_failedproxy()
@@ -47,7 +61,7 @@ def fetch_contents(url, head = False):
                 _log("could not connect to proxy 0 - check your connection", "error")
                 time.sleep(5)
                 continue
-            return ''
+            return None
         break
 
     ## only request header
@@ -58,7 +72,7 @@ def fetch_contents(url, head = False):
         hdr, res = http.get(uri, headers)
     res = res.encode('utf-8') if isinstance(res, unicode) else res
     for retry_message in retry_messages:
-        if retry_message in res: return ''
+        if retry_message in res: return None
 
     return res
 
diff --git a/ppf.py b/ppf.py
index 571f7b4..985e9ab 100755
--- a/ppf.py
+++ b/ppf.py
@@ -27,8 +27,8 @@ def import_from_file(fn, sqlite):
         cinc = cinc + 200
 
 
-def get_content_type(url):
-    hdr = fetch.fetch_contents(url, head=True)
+def get_content_type(url, proxy=None):
+    hdr = fetch.fetch_contents(url, head=True, proxy=proxy)
     for h in hdr.split('\n'):
         if h.lower().startswith('content-type: '):
             return h.lower().split(':')[1].strip()
@@ -41,13 +41,13 @@ def is_good_content_type(string):
         if ct.lower() in string.lower(): return True
     return False
 
-def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
-    if not content_type: content_type = get_content_type(url)
+def proxyleech(proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type, proxy=None):
+    if not content_type: content_type = get_content_type(url, proxy=proxy)
 
     if is_good_content_type(content_type):
-        try: content = fetch.fetch_contents(url)
+        try: content = fetch.fetch_contents(url, proxy=proxy)
         except KeyboardInterrupt as e: raise e
         except: content = ''
     else:
         content = ''
 
@@ -225,7 +225,7 @@ def extract_proxies(content):
 
 class Leechered(threading.Thread):
     #def __init__(self, proxydb, urldb, url, stale_count, error, retrievals, proxies_added, content_type):
-    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type):
+    def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy=None):
         self.status = 'nok'
         self.proxylist = []
         self.running = True
@@ -235,6 +235,7 @@ class Leechered(threading.Thread):
         self.retrievals = retrievals
         self.proxies_added = proxies_added
         self.content_type = content_type
+        self.proxy = proxy
         self.execute = ''
         threading.Thread.__init__(self)
 
@@ -246,10 +247,10 @@ class Leechered(threading.Thread):
 
     def run(self):
         self.status = 'nok'
-        if not self.content_type: self.content_type = get_content_type(self.url)
+        if not self.content_type: self.content_type = get_content_type(self.url, proxy=self.proxy)
 
         if is_good_content_type(self.content_type):
-            try: content = fetch.fetch_contents(self.url)
+            try: content = fetch.fetch_contents(self.url, proxy=self.proxy)
             except KeyboardInterrupt as e: raise e
             except: content = ''
         else:
@@ -353,6 +354,9 @@ if __name__ == '__main__':
 
     #urldb.commit()
 
+    _proxylist = [ '%s://%s' % (p[0], p[1]) for p in proxydb.execute('SELECT proto,proxy from proxylist where failed=0').fetchall() ]
+    if len(_proxylist) == 0: _proxylist = None
+
     for thread in threads:
         if thread.status == 'ok':
             url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
@@ -368,11 +372,12 @@ if __name__ == '__main__':
         threads = [ thread for thread in threads if thread.is_alive() ]
 
         if len(threads) < config.ppf.threads and len(rows):
+            p = random.sample(_proxylist, min(5, len(_proxylist))) if _proxylist is not None else None
            row = random.choice(rows)
             urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
             urldb.commit()
             rows.remove(row)
-            t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5])
+            t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], p)
             threads.append(t)
             t.start()
             #time.sleep(random.random()/100)