app: replace regex html stripping with stdlib parser
Handles malformed tags, nested markup, and CDATA that the naive regex missed.
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|||||||
import html
|
import html
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
from textual import events, on, work
|
from textual import events, on, work
|
||||||
from textual.app import App, ComposeResult
|
from textual.app import App, ComposeResult
|
||||||
@@ -991,9 +992,22 @@ class TuimbleApp(App):
|
|||||||
self.exit()
|
self.exit()
|
||||||
|
|
||||||
|
|
||||||
|
class _HTMLStripper(HTMLParser):
|
||||||
|
"""Extract text content from HTML, discarding all tags."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self._parts: list[str] = []
|
||||||
|
|
||||||
|
def handle_data(self, data: str):
|
||||||
|
self._parts.append(data)
|
||||||
|
|
||||||
|
def get_text(self) -> str:
|
||||||
|
return "".join(self._parts)
|
||||||
|
|
||||||
|
|
||||||
def _strip_html(text: str) -> str:
    """Remove HTML tags and unescape entities from Mumble messages.

    The message is run through ``_HTMLStripper``, which keeps only the text
    content reported by the stdlib HTML parser.

    Args:
        text: Message body that may contain HTML markup and character
            references.

    Returns:
        The text content with tags removed and character references
        decoded exactly once.
    """
    stripper = _HTMLStripper()
    stripper.feed(text)
    # feed() holds back trailing input it considers possibly incomplete
    # (e.g. text ending in "AT&T", or a dangling "<") in case more data
    # follows. close() forces that buffered remainder through handle_data
    # so the end of the message is not silently dropped.
    stripper.close()
    # _HTMLStripper builds HTMLParser with its defaults, and the default
    # convert_charrefs=True means handle_data already receives decoded text.
    # Running html.unescape() on top would decode twice and turn a literal
    # "&amp;lt;" into "<" instead of "&lt;".
    return stripper.get_text()
|
|
||||||
|
|||||||
Reference in New Issue
Block a user