feat: check og:title/og:description for keyword match in alerts

When a search result's title/URL doesn't contain the keyword, fetch
the page's first 64 KB and parse og:title and og:description meta
tags. If the keyword appears in either tag, the result is announced. The
og:title is used as the display title when it is longer than the search
result title.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-15 21:28:48 +01:00
parent 0d5855dda3
commit e36ec350f5

View File

@@ -4,13 +4,17 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import logging
import re import re
import urllib.request import urllib.request
from datetime import datetime, timezone from datetime import datetime, timezone
from html.parser import HTMLParser
from derp.http import urlopen as _urlopen from derp.http import urlopen as _urlopen
from derp.plugin import command, event from derp.plugin import command, event
_log = logging.getLogger(__name__)
# -- Constants --------------------------------------------------------------- # -- Constants ---------------------------------------------------------------
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$") _NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
@@ -54,6 +58,50 @@ def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str:
return text[: max_len - 3].rstrip() + "..." return text[: max_len - 3].rstrip() + "..."
class _OGParser(HTMLParser):
"""Extract og:title and og:description from <meta> tags."""
def __init__(self):
super().__init__()
self.og_title = ""
self.og_description = ""
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "meta":
return
attr_map = {k.lower(): (v or "") for k, v in attrs}
prop = attr_map.get("property", "")
content = attr_map.get("content", "")
if prop == "og:title":
self.og_title = content
elif prop == "og:description":
self.og_description = content
_OG_TIMEOUT = 10
_OG_MAX_BYTES = 64 * 1024  # Only read first 64 KB (OG tags are in <head>)


def _fetch_og(url: str) -> tuple[str, str]:
    """Fetch og:title and og:description from a URL. Blocking.

    Only ``http://`` and ``https://`` URLs are fetched; at most
    ``_OG_MAX_BYTES`` of the response body are read and parsed.

    Returns (og_title, og_description). Empty strings on failure or when
    the URL is not a web URL.
    """
    try:
        # URLs come from search results (untrusted): refuse file://, ftp://
        # and other non-web schemes instead of handing them to the opener.
        if not url.lower().startswith(("http://", "https://")):
            return "", ""
        req = urllib.request.Request(url, method="GET")
        req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
        resp = _urlopen(req, timeout=_OG_TIMEOUT)
        try:
            raw = resp.read(_OG_MAX_BYTES)
        finally:
            # Always release the connection, even if read() raises.
            resp.close()
        html = raw.decode("utf-8", errors="replace")
        parser = _OGParser()
        parser.feed(html)
        return parser.og_title, parser.og_description
    except Exception as exc:
        # Best-effort lookup: any failure just means "no OG data".
        _log.debug("og fetch failed for %s: %s", url, exc)
        return "", ""
# -- YouTube InnerTube search (blocking) ------------------------------------ # -- YouTube InnerTube search (blocking) ------------------------------------
def _extract_videos(obj: object, depth: int = 0) -> list[dict]: def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
@@ -283,12 +331,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
new_items = [item for item in items if item["id"] not in seen_set] new_items = [item for item in items if item["id"] not in seen_set]
# Filter: only announce results that actually contain the keyword # Filter: only announce results that actually contain the keyword
# Check title/URL first, then fall back to og:title/og:description
kw_lower = keyword.lower() kw_lower = keyword.lower()
matched = [ matched = []
item for item in new_items for item in new_items:
if kw_lower in item.get("title", "").lower() title_l = item.get("title", "").lower()
or kw_lower in item.get("url", "").lower() url_l = item.get("url", "").lower()
] if kw_lower in title_l or kw_lower in url_l:
matched.append(item)
continue
# Fetch OG tags for items that didn't match on title/URL
item_url = item.get("url", "")
if item_url:
og_title, og_desc = await loop.run_in_executor(
None, _fetch_og, item_url,
)
if (kw_lower in og_title.lower()
or kw_lower in og_desc.lower()):
# Use og:title as display title if richer
if og_title and len(og_title) > len(item.get("title", "")):
item["title"] = og_title
matched.append(item)
if announce and matched: if announce and matched:
channel = data["channel"] channel = data["channel"]