From f0b7e2dc2f986f0abfe0c4c4c99a3633e8edf754 Mon Sep 17 00:00:00 2001 From: mickael Date: Thu, 3 Jan 2019 21:49:17 +0000 Subject: [PATCH] cleansing --- config.ini | 23 -------------- includes/misc.py | 69 ----------------------------------------- includes/proxywatchd.py | 1 - ppf.py | 51 ++++-------------------------- striphtml.py | 41 ------------------------ 5 files changed, 6 insertions(+), 179 deletions(-) delete mode 100644 config.ini delete mode 100644 striphtml.py diff --git a/config.ini b/config.ini deleted file mode 100644 index 7706dcb..0000000 --- a/config.ini +++ /dev/null @@ -1,23 +0,0 @@ -[global] -tor_host = 127.0.0.1:9050 -i2p_host = 127.0.0.1:4444 -common_proxy_ports = 80, 1080, 3124, 3128, 4145, 4444, 8080, 8081, 8118, 8888, 9999 -proxylist_reload_every = 180 -database = proxylist.sqlite -proxy_max_fail = 5 - -[watcherd] -enabled = true -proxy_file = false -checktime = 1800 -threads = 10 -timeout = 15 -read_timeout = 20 -max_fail = 5 - -[proxyfind] -enabled = true -search = true -maxfail = 10 -timeout = 30 -threads = 3 diff --git a/includes/misc.py b/includes/misc.py index 3c62fda..7d1a91d 100644 --- a/includes/misc.py +++ b/includes/misc.py @@ -16,72 +16,3 @@ def random_string(strlen=20): def _log(strng, level='info'): print '%s/%s\t%s' % (timestamp(), level, strng) -def option_matches_options(strng, items): - try: return [item for item in items if re.match(strng, item)] - except: return False - -def prepare_socksocket(self, destination, path, path_item): - if path_item in self.paths and self.paths[path_item]['path'] == path: - self.paths[path_item]['path'] = False - - #socks.setdefaultproxy() - # relay to i2p http proxy if *.i2p domain - if destination.endswith('i2p'): - proxy = random.choice(self.i2p_host).split(':') - path = False - # or go with tor - else: - proxies = [ rocksock.RocksockProxyFromURL('socks5://%s' % random.choice(self.tor_host)) ] - #socks.adddefaultproxy(*socks.parseproxy('tor://%s' % random.choice(self.tor_host))) - # add 'clearnet' proxies to the chain ? - if self.proxify and (not destination.endswith('onion') and not destination.endswith('.exit')): - - # get a proxy path - path = build_path(self, path_item, path) - - # if path isn't long enough, break - if not len(path): return False, False, False - - # add chain... - #for inc in xrange(len(path) - 1): socks.adddefaultproxy(*socks.parseproxy('http://%s' % path[inc])) - #for inc in xrange(len(path) - 1): socks.adddefaultproxy(*socks.parseproxy('%s://%s' % (path[inc][1], path[inc][0]))) - for inc in xrange(len(path)): proxies.append( rocksock.RocksockProxyFromURL('%s://%s' % (path[inc][1], path[inc][0]))) - - #return True, socks.socksocket, path - return True, proxies, path - -def build_path(self, path_item, path): - - chainlen = random.randint( self.path_len, (self.path_len + self.path_randomlen)) - # if not enough proxies - # FIXME: try to get a proxylist from database - if len(self.proxylist) < chainlen: return [] - - # valid path already available - elif (path_item in self.paths and - self.paths[path_item]['path'] and - (time.time() - self.paths[path_item]['ticks']) < self.path_duration): - - # take available path if any - if path != self.paths[path_item]['path']: path = self.paths[path_item]['path'] - - # or nope, none available - # build a new one from scratch - else: - path = [] - avail = [] - - # dec chainlen if we have to select the exit proxy - if self.exitcountry is not None: chainlen -= 1 - #avail = [item[0] for item in self.proxylist if not item[0] in avail and item[1] != str(self.exitcountry).upper()] - avail = [[item[0],item[2]] for item in self.proxylist if not item[0] in avail and item[1] != str(self.exitcountry).upper()] - path = random.sample(avail, chainlen) - - # choose the exit proxy - if self.exitcountry is not None: - #avail = [item[0] for item in self.proxylist if not item[0] in path and item[1] == str(self.exitcountry).upper()] - avail = [[item[0],item[2]] for item in self.proxylist if not item[0] in path and item[1] == str(self.exitcountry).upper()] - if not len(avail): return [] - path.append(random.choice(avail)) - - return path diff --git a/includes/proxywatchd.py b/includes/proxywatchd.py index 95839e7..96d32af 100644 --- a/includes/proxywatchd.py +++ b/includes/proxywatchd.py @@ -3,7 +3,6 @@ from threading import Thread import threading, commands import socket, time, random, sys, string, re -#import sockschain as socks import requests #from geoip import geolite2 diff --git a/ppf.py b/ppf.py index d430ceb..95b0671 100755 --- a/ppf.py +++ b/ppf.py @@ -1,23 +1,16 @@ #!/usr/bin/env python -import os import sys -import socket import requests -import socks import random, time -import sqlite3 import re import urllib -import threading import hashlib import ipcalc from soup_parser import soupify from ConfigParser import SafeConfigParser from requests.packages.urllib3.exceptions import InsecureRequestWarning requests.packages.urllib3.disable_warnings(InsecureRequestWarning) -from dns import resolver -#from selenium import webdriver sys.path.append('./includes') import mysqlite @@ -63,53 +56,21 @@ def import_from_file(fn, sqlite): sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)', (time.time(),u,0,1)) sqlite.commit() -def fetch_contents(uri, driver=None): +def fetch_contents(uri): headers = base_header - ## use requests (default) - if not driver: - try: resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies) - except: return '' - data = resp.text - - ## phantomjs - else: - for key, value in enumerate(base_header): - capability_key = 'phantomjs.page.customHeaders.{}'.format(key) - webdriver.DesiredCapabilities.PHANTOMJS[capability_key] = value - - service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5'] - driver = webdriver.PhantomJS() - try: - driver.implicitly_wait(45) - driver.set_page_load_timeout(45) - driver.get(uri) - data = driver.page_source - - except: data = '' - finally:driver.quit() + try: resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies) + except: return '' + data = resp.text for retry_message in retry_messages: if retry_message in data: return '' - return data -def update_proxy_sources(sqlite, proxies, uri): - for proxy in proxies: - md5sum = hashlib.md5(proxy).hexdigest() - sqlite.execute('CREATE TABLE IF NOT EXISTS "%s" (uri TEXT)' % md5sum) - sqlite.commit() - #check = [ i for i in sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % md5sum, (uri,)).fetchall() ] - check = sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % md5sum, (uri,)).fetchall() - if not len(check): - sqlite.execute('INSERT INTO "%s" (uri) VALUES(?)' % md5sum, (uri,)) - sqlite.commit() + return data def insert_proxies(proxies, uri, sqlite): time_now = time.time() added = 0 - ## very wasteful - #update_proxy_sources(sqlite, proxies, uri) - query = [ 'proxy=?' for p in proxies ] known = [ i[0] for i in sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % ' OR '.join(query), proxies).fetchall() ] new = [ (time_now,i,3,0) for i in proxies if not i in known ] @@ -161,7 +122,7 @@ def proxyleech(sqlite, rows): #print('entering proxyleech...') for row in rows: - try: content = fetch_contents(row[0], None) + try: content = fetch_contents(row[0]) except: content = '' uniques = [] diff --git a/striphtml.py b/striphtml.py deleted file mode 100644 index ef195c4..0000000 --- a/striphtml.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -from HTMLParser import HTMLParser -import requests -import re -from selenium.webdriver.common.proxy import * -from selenium import webdriver -from selenium.webdriver.common.by import By -phantomjs_path = '/home/mickael/bin/phantomjs' -def cleanhtml(raw_html): - cleanr = re.compile('<.*?>') - cleantext = re.sub(cleanr, ':', raw_html) - cleantext = re.sub('::+',':', cleantext) - return cleantext - -class MLStripper(HTMLParser): - def __init__(self): - self.reset() - self.fed = [] - def handle_data(self, d): - self.fed.append(d) - def get_data(self): - return ''.join(self.fed) - -def strip_tags(html): - s = MLStripper() - s.feed(html) - return s.get_data() - -service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5'] -driver = webdriver.PhantomJS(phantomjs_path,service_args=service_args) -try: driver.get('http://www.proxz.com/proxy_list_fr_1_ext.html') -except: sys.exit(0) -html = driver.page_source -driver.quit() - -text = cleanhtml(html) -proxies = sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', text)) - -print(text) -print(proxies)