feat: check og:title/og:description for keyword match in alerts

When a search result's title and URL don't contain the keyword, fetch the first 64 KB of the page and parse its og:title and og:description meta tags. If the keyword appears in either tag, the result is announced. The og:title replaces the search result title as the display title when it is longer.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:28:48 +01:00
parent 0d5855dda3
commit e36ec350f5
1 changed files with 68 additions and 5 deletions
--- a/plugins/alert.py
+++ b/plugins/alert.py
@@ -4,13 +4,17 @@ from __future__ import annotations

 import asyncio
 import json
+import logging
 import re
 import urllib.request
 from datetime import datetime, timezone
+from html.parser import HTMLParser

 from derp.http import urlopen as _urlopen
 from derp.plugin import command, event

+_log = logging.getLogger(__name__)
+
 # -- Constants ---------------------------------------------------------------

 _NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
@@ -54,6 +58,50 @@ def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str:
    return text[: max_len - 3].rstrip() + "..."


+class _OGParser(HTMLParser):
+    """Extract og:title and og:description from <meta> tags.
+
+    After ``feed()``, ``og_title`` and ``og_description`` hold the first
+    non-empty value seen for each tag, or "" if the tag never appeared.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.og_title = ""
+        self.og_description = ""
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Record the content of og:title / og:description <meta> tags."""
+        if tag != "meta":
+            return
+        attr_map = {k.lower(): (v or "") for k, v in attrs}
+        # Pages in the wild use name= instead of property= and vary the
+        # case/whitespace of the value, so normalise before comparing.
+        raw_prop = attr_map.get("property") or attr_map.get("name", "")
+        prop = raw_prop.strip().lower()
+        content = attr_map.get("content", "").strip()
+        if not content:
+            # An empty tag carries no information and must not mask a
+            # usable duplicate elsewhere in the document.
+            return
+        # First occurrence wins; later duplicates do not clobber it.
+        if prop == "og:title" and not self.og_title:
+            self.og_title = content
+        elif prop == "og:description" and not self.og_description:
+            self.og_description = content
+
+
+# Timeout handed to _urlopen for each OG fetch (presumably seconds, as with
+# urllib -- TODO confirm against derp.http.urlopen).
+_OG_TIMEOUT = 10
+_OG_MAX_BYTES = 64 * 1024  # Only read first 64 KB (OG tags are in <head>)
+
+
+def _fetch_og(url: str) -> tuple[str, str]:
+    """Fetch og:title and og:description from a URL. Blocking.
+
+    Only http(s) URLs are fetched. At most _OG_MAX_BYTES of the body is
+    read and decoded as UTF-8 (undecodable bytes are replaced).
+
+    Returns (og_title, og_description). Empty strings on failure.
+    """
+    # Search results are untrusted input: never hand file:, ftp:, etc.
+    # to urllib.
+    if not url.lower().startswith(("http://", "https://")):
+        return "", ""
+    try:
+        req = urllib.request.Request(url, method="GET")
+        req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
+        resp = _urlopen(req, timeout=_OG_TIMEOUT)
+        try:
+            raw = resp.read(_OG_MAX_BYTES)
+        finally:
+            # Close even when the read raises (e.g. a timeout mid-body).
+            resp.close()
+        parser = _OGParser()
+        parser.feed(raw.decode("utf-8", errors="replace"))
+        return parser.og_title, parser.og_description
+    except Exception as exc:
+        # Best-effort lookup: any failure simply means "no OG data".
+        _log.debug("og fetch failed for %s: %s", url, exc)
+        return "", ""
+
+
 # -- YouTube InnerTube search (blocking) ------------------------------------

 def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
@@ -283,12 +331,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
        new_items = [item for item in items if item["id"] not in seen_set]

        # Filter: only announce results that actually contain the keyword
+        # Check title/URL first, then fall back to og:title/og:description
        kw_lower = keyword.lower()
-        matched = [
-            item for item in new_items
-            if kw_lower in item.get("title", "").lower()
-            or kw_lower in item.get("url", "").lower()
-        ]
+        matched = []
+        for item in new_items:
+            title_l = item.get("title", "").lower()
+            url_l = item.get("url", "").lower()
+            if kw_lower in title_l or kw_lower in url_l:
+                matched.append(item)
+                continue
+            # Fetch OG tags for items that didn't match on title/URL
+            item_url = item.get("url", "")
+            if item_url:
+                og_title, og_desc = await loop.run_in_executor(
+                    None, _fetch_og, item_url,
+                )
+                if (kw_lower in og_title.lower()
+                        or kw_lower in og_desc.lower()):
+                    # Use og:title as display title if richer
+                    if og_title and len(og_title) > len(item.get("title", "")):
+                        item["title"] = og_title
+                    matched.append(item)

        if announce and matched:
            channel = data["channel"]