initial commit
This commit is contained in:
41
striphtml.py
Normal file
41
striphtml.py
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import re
import sys

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import *
|
||||
# Hard-coded, machine-specific location of the PhantomJS executable; it is
# passed as the first argument to webdriver.PhantomJS further down.
phantomjs_path = '/home/mickael/bin/phantomjs'
|
||||
def cleanhtml(raw_html):
    """Replace every HTML tag in *raw_html* with a ``':'`` separator.

    Each ``<...>`` tag becomes a single colon, and any run of two or more
    consecutive colons is then collapsed to one.  Text that sat in adjacent
    elements (for example two table cells) therefore ends up joined by
    exactly one ``':'``.

    Args:
        raw_html: The markup to process, as a string.

    Returns:
        The input with tags replaced by ``':'`` and colon runs collapsed.
    """
    # DOTALL lets '.' cross line breaks, so a tag whose attributes wrap onto
    # the next line is still recognised instead of being left in the output.
    tag_pattern = re.compile(r'<.*?>', re.DOTALL)
    separated = tag_pattern.sub(':', raw_html)
    # Neighbouring tags produce '::', ':::' ... - squash each run to one ':'.
    return re.sub(r'::+', ':', separated)
|
||||
|
||||
class MLStripper(HTMLParser):
    """HTML parser that collects only the text passed to ``handle_data``.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer instead of calling reset() directly:
        # Python 3's HTMLParser sets attributes there (convert_charrefs)
        # that feed() needs, and the base initializer calls reset() itself.
        # The explicit-class form is used rather than super() because the
        # Python 2 HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks received so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks concatenated into one string."""
        return ''.join(self.fed)
|
||||
|
||||
def strip_tags(html):
    """Return the text that a fresh ``MLStripper`` collects from *html*.

    Args:
        html: Markup string to feed to the parser.

    Returns:
        Whatever ``MLStripper.get_data()`` yields after feeding *html*.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only
|
||||
|
||||
# Route PhantomJS through a local SOCKS5 proxy on port 9050.
service_args = ['--proxy=127.0.0.1:9050', '--proxy-type=socks5']
driver = webdriver.PhantomJS(phantomjs_path, service_args=service_args)

try:
    driver.get('http://www.proxz.com/proxy_list_fr_1_ext.html')
except Exception:
    # Page could not be loaded: exit quietly with status 0, as the script
    # originally intended.  Catching Exception (not a bare except) lets
    # Ctrl-C / SystemExit propagate normally.
    sys.exit(0)
else:
    html = driver.page_source
finally:
    # Always shut the browser process down, including on the failure path,
    # so no PhantomJS instance is left running.
    driver.quit()

# Tags become ':' separators, so text from adjacent elements is joined by a
# single colon before the address pattern is applied.
text = cleanhtml(html)
# Dotted-quad followed by ':port'; sorted() orders the matches as strings.
proxies = sorted(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', text))

print(text)
print(proxies)
|
||||
Reference in New Issue
Block a user