ppf: improve cleanhtml() and cache compiled re's
It now transforms e.g. '<td>118.114.116.36</td>\n<td>1080</td>' correctly (previously the newline between the two cells prevented this from working).
This commit is contained in:
15
ppf.py
15
ppf.py
@@ -21,11 +21,18 @@ base_header = {
|
|||||||
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
|
searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
|
||||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||||
|
|
||||||
|
|
||||||
|
cleanhtml_re = [
|
||||||
|
re.compile('<.*?>'),
|
||||||
|
re.compile('\s+'),
|
||||||
|
re.compile('::+'),
|
||||||
|
]
|
||||||
def cleanhtml(raw_html):
|
def cleanhtml(raw_html):
|
||||||
cleanr = re.compile('<.*?>')
|
html = raw_html.replace('&nbsp;', ' ')
|
||||||
cleantext = re.sub(cleanr, ':', raw_html)
|
html = re.sub(cleanhtml_re[0], ':', html)
|
||||||
cleantext = re.sub('::+',':', cleantext)
|
html = re.sub(cleanhtml_re[1], ':', html)
|
||||||
return cleantext
|
html = re.sub(cleanhtml_re[2], ':', html)
|
||||||
|
return html
|
||||||
|
|
||||||
def import_from_file(fn, sqlite):
|
def import_from_file(fn, sqlite):
|
||||||
with open(fn, 'r') as f:
|
with open(fn, 'r') as f:
|
||||||
|
|||||||
Reference in New Issue
Block a user