feat: check og:title/og:description for keyword match in alerts
When a search result's title/URL doesn't contain the keyword, fetch the page's first 64 KB and parse og:title and og:description meta tags. If the keyword appears there, the result is announced. Prefers og:title as display title when it's richer than the search result title. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,13 +4,17 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
from derp.http import urlopen as _urlopen
|
from derp.http import urlopen as _urlopen
|
||||||
from derp.plugin import command, event
|
from derp.plugin import command, event
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
# -- Constants ---------------------------------------------------------------
|
# -- Constants ---------------------------------------------------------------
|
||||||
|
|
||||||
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
|
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
|
||||||
@@ -54,6 +58,50 @@ def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str:
|
|||||||
return text[: max_len - 3].rstrip() + "..."
|
return text[: max_len - 3].rstrip() + "..."
|
||||||
|
|
||||||
|
|
||||||
|
class _OGParser(HTMLParser):
|
||||||
|
"""Extract og:title and og:description from <meta> tags."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.og_title = ""
|
||||||
|
self.og_description = ""
|
||||||
|
|
||||||
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||||
|
if tag != "meta":
|
||||||
|
return
|
||||||
|
attr_map = {k.lower(): (v or "") for k, v in attrs}
|
||||||
|
prop = attr_map.get("property", "")
|
||||||
|
content = attr_map.get("content", "")
|
||||||
|
if prop == "og:title":
|
||||||
|
self.og_title = content
|
||||||
|
elif prop == "og:description":
|
||||||
|
self.og_description = content
|
||||||
|
|
||||||
|
|
||||||
|
_OG_TIMEOUT = 10
|
||||||
|
_OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_og(url: str) -> tuple[str, str]:
|
||||||
|
"""Fetch og:title and og:description from a URL. Blocking.
|
||||||
|
|
||||||
|
Returns (og_title, og_description). Empty strings on failure.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
req = urllib.request.Request(url, method="GET")
|
||||||
|
req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
|
||||||
|
resp = _urlopen(req, timeout=_OG_TIMEOUT)
|
||||||
|
raw = resp.read(_OG_MAX_BYTES)
|
||||||
|
resp.close()
|
||||||
|
html = raw.decode("utf-8", errors="replace")
|
||||||
|
parser = _OGParser()
|
||||||
|
parser.feed(html)
|
||||||
|
return parser.og_title, parser.og_description
|
||||||
|
except Exception as exc:
|
||||||
|
_log.debug("og fetch failed for %s: %s", url, exc)
|
||||||
|
return "", ""
|
||||||
|
|
||||||
|
|
||||||
# -- YouTube InnerTube search (blocking) ------------------------------------
|
# -- YouTube InnerTube search (blocking) ------------------------------------
|
||||||
|
|
||||||
def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
|
def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
|
||||||
@@ -283,12 +331,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
|
|||||||
new_items = [item for item in items if item["id"] not in seen_set]
|
new_items = [item for item in items if item["id"] not in seen_set]
|
||||||
|
|
||||||
# Filter: only announce results that actually contain the keyword
|
# Filter: only announce results that actually contain the keyword
|
||||||
|
# Check title/URL first, then fall back to og:title/og:description
|
||||||
kw_lower = keyword.lower()
|
kw_lower = keyword.lower()
|
||||||
matched = [
|
matched = []
|
||||||
item for item in new_items
|
for item in new_items:
|
||||||
if kw_lower in item.get("title", "").lower()
|
title_l = item.get("title", "").lower()
|
||||||
or kw_lower in item.get("url", "").lower()
|
url_l = item.get("url", "").lower()
|
||||||
]
|
if kw_lower in title_l or kw_lower in url_l:
|
||||||
|
matched.append(item)
|
||||||
|
continue
|
||||||
|
# Fetch OG tags for items that didn't match on title/URL
|
||||||
|
item_url = item.get("url", "")
|
||||||
|
if item_url:
|
||||||
|
og_title, og_desc = await loop.run_in_executor(
|
||||||
|
None, _fetch_og, item_url,
|
||||||
|
)
|
||||||
|
if (kw_lower in og_title.lower()
|
||||||
|
or kw_lower in og_desc.lower()):
|
||||||
|
# Use og:title as display title if richer
|
||||||
|
if og_title and len(og_title) > len(item.get("title", "")):
|
||||||
|
item["title"] = og_title
|
||||||
|
matched.append(item)
|
||||||
|
|
||||||
if announce and matched:
|
if announce and matched:
|
||||||
channel = data["channel"]
|
channel = data["channel"]
|
||||||
|
|||||||
Reference in New Issue
Block a user