From e36ec350f5f054078e8a9b8b137d19bad9a5c2f6 Mon Sep 17 00:00:00 2001 From: user Date: Sun, 15 Feb 2026 21:28:48 +0100 Subject: [PATCH] feat: check og:title/og:description for keyword match in alerts When a search result's title/URL doesn't contain the keyword, fetch the page's first 64 KB and parse og:title and og:description meta tags. If the keyword appears there, the result is announced. Prefers og:title as display title when it's richer than the search result title. Co-Authored-By: Claude Opus 4.6 --- plugins/alert.py | 73 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/plugins/alert.py b/plugins/alert.py index 01a0a72..fad1425 100644 --- a/plugins/alert.py +++ b/plugins/alert.py @@ -4,13 +4,17 @@ from __future__ import annotations import asyncio import json +import logging import re import urllib.request from datetime import datetime, timezone +from html.parser import HTMLParser from derp.http import urlopen as _urlopen from derp.plugin import command, event +_log = logging.getLogger(__name__) + # -- Constants --------------------------------------------------------------- _NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$") @@ -54,6 +58,50 @@ def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str: return text[: max_len - 3].rstrip() + "..." 
+class _OGParser(HTMLParser): + """Extract og:title and og:description from <meta> tags.""" + + def __init__(self): + super().__init__() + self.og_title = "" + self.og_description = "" + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "meta": + return + attr_map = {k.lower(): (v or "") for k, v in attrs} + prop = attr_map.get("property", "") + content = attr_map.get("content", "") + if prop == "og:title": + self.og_title = content + elif prop == "og:description": + self.og_description = content + + +_OG_TIMEOUT = 10 +_OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>) + + +def _fetch_og(url: str) -> tuple[str, str]: + """Fetch og:title and og:description from a URL. Blocking. + + Returns (og_title, og_description). Empty strings on failure. + """ + try: + req = urllib.request.Request(url, method="GET") + req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)") + resp = _urlopen(req, timeout=_OG_TIMEOUT) + raw = resp.read(_OG_MAX_BYTES) + resp.close() + html = raw.decode("utf-8", errors="replace") + parser = _OGParser() + parser.feed(html) + return parser.og_title, parser.og_description + except Exception as exc: + _log.debug("og fetch failed for %s: %s", url, exc) + return "", "" + + # -- YouTube InnerTube search (blocking) ------------------------------------ def _extract_videos(obj: object, depth: int = 0) -> list[dict]: @@ -283,12 +331,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None: new_items = [item for item in items if item["id"] not in seen_set] # Filter: only announce results that actually contain the keyword + # Check title/URL first, then fall back to og:title/og:description kw_lower = keyword.lower() - matched = [ - item for item in new_items - if kw_lower in item.get("title", "").lower() - or kw_lower in item.get("url", "").lower() - ] + matched = [] + for item in new_items: + title_l = item.get("title", "").lower() + url_l = item.get("url", "").lower() + if 
kw_lower in title_l or kw_lower in url_l: + matched.append(item) + continue + # Fetch OG tags for items that didn't match on title/URL + item_url = item.get("url", "") + if item_url: + og_title, og_desc = await loop.run_in_executor( + None, _fetch_og, item_url, + ) + if (kw_lower in og_title.lower() + or kw_lower in og_desc.lower()): + # Use og:title as display title if richer + if og_title and len(og_title) > len(item.get("title", "")): + item["title"] = og_title + matched.append(item) if announce and matched: channel = data["channel"]