From 087559637ee02a74c6b9da4e60e0bf76582fa478 Mon Sep 17 00:00:00 2001
From: rofl0r <retnyg@gmx.net>
Date: Wed, 9 Jan 2019 22:48:03 +0000
Subject: [PATCH] ppf: improve cleanhtml() and cache compiled re's

cleanhtml() now correctly transforms input such as
'<td>118.114.116.36</td>\n<td>1080</td>'.
(Previously, the newline between the tags prevented this from working.)
---
 ppf.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/ppf.py b/ppf.py
index dca39c6..cb6bcf8 100755
--- a/ppf.py
+++ b/ppf.py
@@ -21,11 +21,18 @@ base_header = {
 searx_instances = ('https://searx.me', 'https://searx.xyz', 'https://searx.site', 'https://searx.win', 'https://searx.ru', 'https://stemy.me/searx', 'https://searx.at', 'https://listi.me', 'https://searx.dk', 'https://searx.laquadrature.net' )
 retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
 
+
+cleanhtml_re = [
+	re.compile('<.*?>'),
+	re.compile('\s+'),
+	re.compile('::+'),
+]
 def cleanhtml(raw_html):
-	cleanr = re.compile('<.*?>')
-	cleantext = re.sub(cleanr, ':', raw_html)
-	cleantext = re.sub('::+',':', cleantext)
-	return cleantext
+	html = raw_html.replace('&nbsp;', ' ')
+	html = re.sub(cleanhtml_re[0], ':', html)
+	html = re.sub(cleanhtml_re[1], ':', html)
+	html = re.sub(cleanhtml_re[2], ':', html)
+	return html
 
 def import_from_file(fn, sqlite):
 	with open(fn, 'r') as f: