#!/usr/bin/env python
"""Fetch a proxy list page with PhantomJS and print the proxies found on it.

The page is loaded through the SOCKS5 proxy configured in ``service_args``.
Its HTML tags are then replaced by ':' separators, and every
``a.b.c.d:port`` match in the resulting text is collected, sorted and
printed (after the flattened text itself).
"""
import re
import sys

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2

import requests
from selenium.webdriver.common.proxy import *
from selenium import webdriver
from selenium.webdriver.common.by import By

phantomjs_path = '/home/mickael/bin/phantomjs'

# Page that is scraped for proxies.
PROXY_LIST_URL = 'http://www.proxz.com/proxy_list_fr_1_ext.html'

# Compiled once at import time instead of on every call.
_TAG_RE = re.compile('<.*?>')
_COLON_RUN_RE = re.compile('::+')
_PROXY_RE = re.compile(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+')


def cleanhtml(raw_html):
    """Return *raw_html* with every tag replaced by a single ':'.

    Each ``<...>`` match becomes ':' and runs of two or more colons are then
    collapsed to one.  Presumably this is what lets an address and a port
    that sit in adjacent elements end up as ``address:port`` -- verify
    against the page markup.

    Note that '.' does not match newlines here, so a tag that spans several
    lines is left untouched.
    """
    cleantext = _TAG_RE.sub(':', raw_html)
    return _COLON_RUN_RE.sub(':', cleantext)


class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags."""

    def __init__(self):
        # Call the base initialiser explicitly: Python 3's HTMLParser sets
        # state there that feed() relies on, and Python 2's HTMLParser is an
        # old-style class, so super() cannot be used.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text data reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks concatenated in document order."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of *html* with all markup removed."""
    s = MLStripper()
    s.feed(html)
    return s.get_data()


# Route PhantomJS traffic through a SOCKS5 proxy listening on localhost.
service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']

# NOTE(review): webdriver.PhantomJS is not available in recent selenium
# releases -- confirm the selenium version this script is deployed with.
driver = webdriver.PhantomJS(phantomjs_path, service_args=service_args)
try:
    driver.get(PROXY_LIST_URL)
except Exception as exc:
    # Best effort: a page that cannot be loaded ends the script quietly with
    # status 0 (as originally intended), but the reason is reported on stderr.
    sys.stderr.write('Could not load %s: %s\n' % (PROXY_LIST_URL, exc))
    sys.exit(0)
else:
    html = driver.page_source
finally:
    # Always shut the browser down, including on the early-exit path above,
    # so that no PhantomJS process is left running.
    driver.quit()

text = cleanhtml(html)
proxies = sorted(_PROXY_RE.findall(text))
print(text)
print(proxies)