Files
ppf/striphtml.py
2019-01-03 16:36:31 +00:00

42 lines
1.1 KiB
Python

#!/usr/bin/env python
import re
import sys

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

import requests
from selenium.webdriver.common.proxy import *
from selenium import webdriver
from selenium.webdriver.common.by import By
# Filesystem path of the PhantomJS executable passed to webdriver.PhantomJS.
phantomjs_path = '/home/mickael/bin/phantomjs'
def cleanhtml(raw_html):
    """Replace every HTML tag in *raw_html* with a single ':' separator.

    Each ``<...>`` tag is substituted by ':' and any run of two or more
    consecutive colons is then collapsed into one, so pieces of text that
    sat in adjacent elements end up separated by exactly one ':'.

    Args:
        raw_html: Markup to clean, as a string.

    Returns:
        The input with tags replaced by ':' and colon runs collapsed.
    """
    # '[^>]*' instead of '.*?': '.' does not match a newline by default, so
    # a tag whose attributes were wrapped over several lines was left in
    # the output. For single-line tags both patterns match the same text.
    without_tags = re.sub(r'<[^>]*>', ':', raw_html)
    return re.sub(r'::+', ':', without_tags)
class MLStripper(HTMLParser):
    """HTML parser that keeps only the character data between tags.

    Feed markup with ``feed()``; every text chunk the parser reports is
    appended to ``self.fed`` and ``get_data()`` returns them joined.
    """

    def __init__(self):
        # Run the full base initialiser rather than only reset(): on
        # Python 3 it also sets attributes (e.g. convert_charrefs) that
        # feed() reads, and on Python 2 it simply calls reset().
        # Invoked on the class directly so it does not depend on super().
        HTMLParser.__init__(self)
        self.fed = []  # text chunks in the order the parser reported them

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, concatenated in order."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text of *html* with the markup removed.

    The markup is fed through a fresh MLStripper and the text it
    collected is returned as one string.
    """
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
# Send PhantomJS traffic through the SOCKS5 proxy listening on
# 127.0.0.1:9050 (presumably a local Tor client -- confirm).
service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
driver = webdriver.PhantomJS(phantomjs_path, service_args=service_args)
try:
    try:
        driver.get('http://www.proxz.com/proxy_list_fr_1_ext.html')
    except Exception:
        # Best effort: a page that cannot be loaded ends the run quietly
        # with status 0. Catching Exception (not a bare except) lets
        # Ctrl-C and other interpreter-level exits propagate.
        sys.exit(0)
    html = driver.page_source
finally:
    # Always shut the browser down, including on the early-exit path, so
    # no PhantomJS process is left running.
    driver.quit()

# Turn tags into ':' separators, then pick out every "a.b.c.d:port" string.
text = cleanhtml(html)
proxies = sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', text))
print(text)
print(proxies)