cleansing
This commit is contained in:
23
config.ini
23
config.ini
@@ -1,23 +0,0 @@
|
||||
[global]
|
||||
tor_host = 127.0.0.1:9050
|
||||
i2p_host = 127.0.0.1:4444
|
||||
common_proxy_ports = 80, 1080, 3124, 3128, 4145, 4444, 8080, 8081, 8118, 8888, 9999
|
||||
proxylist_reload_every = 180
|
||||
database = proxylist.sqlite
|
||||
proxy_max_fail = 5
|
||||
|
||||
[watcherd]
|
||||
enabled = true
|
||||
proxy_file = false
|
||||
checktime = 1800
|
||||
threads = 10
|
||||
timeout = 15
|
||||
read_timeout = 20
|
||||
max_fail = 5
|
||||
|
||||
[proxyfind]
|
||||
enabled = true
|
||||
search = true
|
||||
maxfail = 10
|
||||
timeout = 30
|
||||
threads = 3
|
||||
@@ -16,72 +16,3 @@ def random_string(strlen=20):
|
||||
def _log(strng, level='info'):
    """Print one log line to stdout.

    The line has the form ``<timestamp>/<level><TAB><message>``; the
    timestamp is whatever the module's ``timestamp()`` helper returns.

    strng -- message text.
    level -- label printed after the timestamp (default ``'info'``).
    """
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('%s/%s\t%s' % (timestamp(), level, strng))
|
||||
|
||||
def option_matches_options(strng, items):
    """Return the entries of *items* that match the regular expression *strng*.

    Matching uses ``re.match``, so the pattern is anchored at the start of
    each entry.

    strng -- regular expression source.
    items -- iterable of strings to test.

    Returns a (possibly empty) list of the matching entries, or ``False``
    when matching fails with an ordinary error (invalid pattern, *items* not
    iterable, non-string entries).
    """
    try:
        return [item for item in items if re.match(strng, item)]
    except Exception:
        # Still best-effort, but no longer a bare ``except:``, which also
        # swallowed KeyboardInterrupt and SystemExit.
        return False
|
||||
|
||||
def prepare_socksocket(self, destination, path, path_item):
    """Build the list of rocksock proxies used to reach *destination*.

    self        -- object providing ``paths``, ``i2p_host``, ``tor_host`` and
                   ``proxify`` (plus whatever ``build_path`` reads from it).
    destination -- target host name.
    path        -- proxy path the caller is currently holding for
                   *path_item*, or a false value.
    path_item   -- key into ``self.paths``.

    Returns ``(True, proxies, path)`` on success; *path* is ``False`` for
    ``*.i2p`` destinations. Returns ``(False, False, False)`` when clearnet
    proxies were requested but ``build_path`` produced an empty path.
    """
    # The caller is still holding the path stored for this item: clear the
    # stored copy so build_path() does not hand the same one out again.
    if path_item in self.paths and self.paths[path_item]['path'] == path:
        self.paths[path_item]['path'] = False

    # *.i2p destinations are relayed through the I2P HTTP proxy only.
    if destination.endswith('i2p'):
        # Previously this branch never assigned `proxies`, so the final
        # return raised UnboundLocalError for every i2p destination.
        # NOTE(review): assumes rocksock accepts an http:// proxy URL -- confirm.
        proxies = [rocksock.RocksockProxyFromURL('http://%s' % random.choice(self.i2p_host))]
        return True, proxies, False

    # Everything else enters the chain through one of the Tor hosts.
    proxies = [rocksock.RocksockProxyFromURL('socks5://%s' % random.choice(self.tor_host))]

    # Optionally append 'clearnet' proxies, except for .onion/.exit targets.
    # NOTE(review): indentation was lost in the reviewed copy; this step is
    # assumed to belong to the non-i2p branch -- confirm against the file.
    if self.proxify and not destination.endswith(('onion', '.exit')):
        path = build_path(self, path_item, path)

        # No usable path could be built: report failure to the caller.
        if not path:
            return False, False, False

        # Each hop is indexed as (address, protocol); the protocol becomes
        # the URL scheme.
        for hop in path:
            proxies.append(rocksock.RocksockProxyFromURL('%s://%s' % (hop[1], hop[0])))

    return True, proxies, path
|
||||
|
||||
def build_path(self, path_item, path):
    """Return the proxy path to use for *path_item*.

    self      -- object providing ``path_len``, ``path_randomlen``,
                 ``proxylist``, ``paths``, ``path_duration`` and
                 ``exitcountry``. Rows of ``proxylist`` are indexed as
                 ``item[0]`` (address), ``item[1]`` (country code, compared
                 against ``exitcountry`` upper-cased) and ``item[2]``
                 (protocol).
    path_item -- key into ``self.paths``.
    path      -- path the caller currently holds, or a false value.

    Returns the stored path when one exists for *path_item* and is younger
    than ``path_duration`` seconds; otherwise a freshly drawn list of
    ``[address, protocol]`` pairs, ending with a proxy from ``exitcountry``
    when that is set. Returns ``[]`` when no such path can be built.
    """
    chainlen = random.randint(self.path_len, self.path_len + self.path_randomlen)

    # Not enough proxies for a chain of the drawn length.
    # FIXME: try to get a proxylist from database
    if len(self.proxylist) < chainlen:
        return []

    # A stored, unexpired path is available: reuse it.
    if (path_item in self.paths and
            self.paths[path_item]['path'] and
            (time.time() - self.paths[path_item]['ticks']) < self.path_duration):
        if path != self.paths[path_item]['path']:
            path = self.paths[path_item]['path']
        return path

    # Otherwise build a new path from scratch.
    exit_wanted = self.exitcountry is not None
    exit_code = str(self.exitcountry).upper()

    # One slot of the chain is reserved for the exit proxy.
    if exit_wanted:
        chainlen -= 1

    # Split the proxy list into relay and exit candidates, keeping each
    # address once. (The old `not item[0] in avail` / `not item[0] in path`
    # guards compared against an empty list and against [addr, proto] pairs,
    # so they never filtered anything.)
    relays = []
    exits = []
    seen = set()
    for item in self.proxylist:
        if item[0] in seen:
            continue
        seen.add(item[0])
        hop = [item[0], item[2]]
        if item[1] == exit_code:
            exits.append(hop)
        else:
            relays.append(hop)

    # random.sample() raises ValueError when asked for more hops than there
    # are candidates (or for a negative count); report "no path" instead.
    if chainlen < 0 or len(relays) < chainlen:
        return []
    if exit_wanted and not exits:
        return []

    path = random.sample(relays, chainlen)
    if exit_wanted:
        path.append(random.choice(exits))
    return path
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
from threading import Thread
|
||||
import threading, commands
|
||||
import socket, time, random, sys, string, re
|
||||
#import sockschain as socks
|
||||
import requests
|
||||
#from geoip import geolite2
|
||||
|
||||
|
||||
51
ppf.py
51
ppf.py
@@ -1,23 +1,16 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import socket
|
||||
import requests
|
||||
import socks
|
||||
import random, time
|
||||
import sqlite3
|
||||
import re
|
||||
import urllib
|
||||
import threading
|
||||
import hashlib
|
||||
import ipcalc
|
||||
from soup_parser import soupify
|
||||
from ConfigParser import SafeConfigParser
|
||||
from requests.packages.urllib3.exceptions import InsecureRequestWarning
|
||||
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
|
||||
from dns import resolver
|
||||
#from selenium import webdriver
|
||||
|
||||
sys.path.append('./includes')
|
||||
import mysqlite
|
||||
@@ -63,53 +56,21 @@ def import_from_file(fn, sqlite):
|
||||
sqlite.execute('INSERT INTO uris (added,url,check_time,error) VALUES (?,?,?,?)', (time.time(),u,0,1))
|
||||
sqlite.commit()
|
||||
|
||||
def fetch_contents(uri, driver=None):
|
||||
def fetch_contents(uri):
|
||||
headers = base_header
|
||||
## use requests (default)
|
||||
if not driver:
|
||||
try: resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies)
|
||||
except: return ''
|
||||
data = resp.text
|
||||
|
||||
## phantomjs
|
||||
else:
|
||||
for key, value in enumerate(base_header):
|
||||
capability_key = 'phantomjs.page.customHeaders.{}'.format(key)
|
||||
webdriver.DesiredCapabilities.PHANTOMJS[capability_key] = value
|
||||
|
||||
service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
|
||||
driver = webdriver.PhantomJS()
|
||||
try:
|
||||
driver.implicitly_wait(45)
|
||||
driver.set_page_load_timeout(45)
|
||||
driver.get(uri)
|
||||
data = driver.page_source
|
||||
|
||||
except: data = ''
|
||||
finally:driver.quit()
|
||||
try: resp = requests.get(uri, timeout=45, headers=headers, verify=False, proxies=proxies)
|
||||
except: return ''
|
||||
data = resp.text
|
||||
|
||||
for retry_message in retry_messages:
|
||||
if retry_message in data: return ''
|
||||
return data
|
||||
|
||||
def update_proxy_sources(sqlite, proxies, uri):
    """Record *uri* as a source for every proxy in *proxies*.

    Each proxy gets its own table, named after the MD5 hex digest of the
    proxy string and holding a single ``uri`` column; *uri* is inserted
    into that table unless it is already present.

    sqlite  -- database handle exposing ``execute()`` and ``commit()``.
    proxies -- iterable of proxy strings.
    uri     -- URL the proxies were found at.

    Returns None. The previous trailing ``return data`` referred to a name
    this function never binds.
    NOTE(review): confirm no caller relied on a module-level ``data``.
    """
    for proxy in proxies:
        # hashlib needs bytes; encode text so this also runs on Python 3
        # (a Python 2 ``str`` is already bytes and is hashed unchanged).
        raw = proxy if isinstance(proxy, bytes) else proxy.encode('utf-8')
        table = hashlib.md5(raw).hexdigest()

        # The table name is a hex digest, so interpolating it is safe;
        # the uri itself is always passed as a bound parameter.
        sqlite.execute('CREATE TABLE IF NOT EXISTS "%s" (uri TEXT)' % table)
        sqlite.commit()

        known = sqlite.execute('SELECT uri FROM "%s" WHERE uri=?' % table, (uri,)).fetchall()
        if not known:
            sqlite.execute('INSERT INTO "%s" (uri) VALUES(?)' % table, (uri,))
            sqlite.commit()
|
||||
|
||||
def insert_proxies(proxies, uri, sqlite):
|
||||
time_now = time.time()
|
||||
added = 0
|
||||
|
||||
## very wasteful
|
||||
#update_proxy_sources(sqlite, proxies, uri)
|
||||
|
||||
query = [ 'proxy=?' for p in proxies ]
|
||||
known = [ i[0] for i in sqlite.execute('SELECT proxy FROM proxylist WHERE %s' % ' OR '.join(query), proxies).fetchall() ]
|
||||
new = [ (time_now,i,3,0) for i in proxies if not i in known ]
|
||||
@@ -161,7 +122,7 @@ def proxyleech(sqlite, rows):
|
||||
#print('entering proxyleech...')
|
||||
|
||||
for row in rows:
|
||||
try: content = fetch_contents(row[0], None)
|
||||
try: content = fetch_contents(row[0])
|
||||
except: content = ''
|
||||
|
||||
uniques = []
|
||||
|
||||
41
striphtml.py
41
striphtml.py
@@ -1,41 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import requests
|
||||
import re
|
||||
from selenium.webdriver.common.proxy import *
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
phantomjs_path = '/home/mickael/bin/phantomjs'
|
||||
# Any "<...>" run. DOTALL lets a tag whose attributes wrap onto the next
# line be removed as well.
_TAG_RE = re.compile(r'<.*?>', re.DOTALL)
# Two or more consecutive colons.
_COLON_RUN_RE = re.compile(r'::+')


def cleanhtml(raw_html):
    """Replace every HTML tag in *raw_html* with ``':'`` and collapse colon runs.

    Adjacent table cells such as ``<td>1.2.3.4</td><td>80</td>`` therefore
    come out as ``:1.2.3.4:80:``.

    raw_html -- markup to clean.

    Returns the cleaned text.
    """
    return _COLON_RUN_RE.sub(':', _TAG_RE.sub(':', raw_html))
|
||||
|
||||
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text data."""

    def __init__(self):
        # Run the base initialiser instead of calling reset() directly, so
        # any parser state it sets up is present. The explicit base-class
        # call (rather than super()) is deliberate: it also works if
        # HTMLParser is an old-style class, as in the Python 2 module this
        # file imports it from.
        HTMLParser.__init__(self)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
|
||||
|
||||
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    html -- markup to strip.
    """
    s = MLStripper()
    s.feed(html)
    # close() makes the parser process whatever it is still buffering
    # (e.g. text after a trailing '&'), which would otherwise be lost.
    s.close()
    return s.get_data()
|
||||
|
||||
import sys  # used by the failure path below; the file's import block omits it

# Route PhantomJS through the SOCKS5 proxy listening on 127.0.0.1:9050.
service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
driver = webdriver.PhantomJS(phantomjs_path, service_args=service_args)
try:
    try:
        driver.get('http://www.proxz.com/proxy_list_fr_1_ext.html')
    except Exception:
        # Best effort: a page that cannot be loaded ends the script quietly.
        sys.exit(0)
    html = driver.page_source
finally:
    # Always shut the browser down, including on the early-exit path, so no
    # PhantomJS process is left running.
    driver.quit()

text = cleanhtml(html)
# Collect every "a.b.c.d:port" occurrence in the cleaned text.
proxies = sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', text))

print(text)
print(proxies)
|
||||
Reference in New Issue
Block a user