From 087559637ee02a74c6b9da4e60e0bf76582fa478 Mon Sep 17 00:00:00 2001 From: rofl0r Date: Wed, 9 Jan 2019 22:48:03 +0000 Subject: [PATCH] ppf: improve cleanhtml() and cache compiled re's now it transforms e.g. '118.114.116.36\n1080' correctly. (the newline was formerly preventing success) --- ppf.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ppf.py b/ppf.py index dca39c6..cb6bcf8 100755 --- a/ppf.py +++ b/ppf.py @@ -21,11 +21,18 @@ base_header = { searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' ) retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded') + +cleanhtml_re = [ + re.compile('<.*?>'), + re.compile('\s+'), + re.compile('::+'), +] def cleanhtml(raw_html): - cleanr = re.compile('<.*?>') - cleantext = re.sub(cleanr, ':', raw_html) - cleantext = re.sub('::+',':', cleantext) - return cleantext + html = raw_html.replace('&nbsp;', ' ') + html = re.sub(cleanhtml_re[0], ':', html) + html = re.sub(cleanhtml_re[1], ':', html) + html = re.sub(cleanhtml_re[2], ':', html) + return html def import_from_file(fn, sqlite): with open(fn, 'r') as f: