app: replace regex html stripping with stdlib parser
Handles malformed tags, nested markup, and CDATA that the naive regex missed.
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import html
|
||||
import logging
|
||||
import time
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from textual import events, on, work
|
||||
from textual.app import App, ComposeResult
|
||||
@@ -991,9 +992,22 @@ class TuimbleApp(App):
|
||||
self.exit()
|
||||
|
||||
|
||||
class _HTMLStripper(HTMLParser):
    """Collect the text content of an HTML fragment, dropping all tags.

    Character references (``&amp;``, ``&#39;`` ...) are decoded by the
    parser itself because ``convert_charrefs`` is enabled, so the string
    returned by :meth:`get_text` must not be unescaped a second time.

    Call :meth:`close` after the last :meth:`feed` so that text the parser
    is still buffering is delivered before reading the result.

    Note: ``HTMLParser`` also reports the content of ``<script>`` and
    ``<style>`` elements through ``handle_data``, so that content is kept
    as text.
    """

    def __init__(self) -> None:
        # The base constructor calls reset(), which creates the text buffer.
        # convert_charrefs is spelled out so entity decoding does not depend
        # on the library default.
        super().__init__(convert_charrefs=True)

    def reset(self) -> None:
        """Reset parser state and discard any text collected so far."""
        super().reset()
        # Clearing here (instead of only in __init__) keeps a reused parser
        # from leaking text of a previously parsed document.
        self._parts: list[str] = []

    def handle_data(self, data: str) -> None:
        """Record one run of text found between tags."""
        self._parts.append(data)

    def get_text(self) -> str:
        """Return all text collected so far, concatenated in document order."""
        return "".join(self._parts)
|
||||
|
||||
|
||||
def _strip_html(text: str) -> str:
    """Remove HTML tags and unescape entities from Mumble messages.

    The markup is run through the stdlib ``HTMLParser`` based
    ``_HTMLStripper`` rather than a regex.

    Args:
        text: Message body that may contain HTML markup and character
            references.

    Returns:
        The text content with tags removed and character references
        decoded exactly once.
    """
    stripper = _HTMLStripper()
    stripper.feed(text)
    # feed() may hold back trailing input that could still turn into a
    # character reference or a tag (e.g. the "&T" in "AT&T"); close()
    # forces that buffered remainder through so it is not silently lost.
    stripper.close()
    # The parser already decodes character references, so the collected
    # text is returned as-is; unescaping again would turn "&amp;lt;"
    # into "<" instead of the literal "&lt;" the sender typed.
    return stripper.get_text()
|
||||
|
||||
Reference in New Issue
Block a user