Files
derp/plugins/alert.py
user 604a0a5830 feat: display published date in alert announcements
Extract dates from multiple sources:
- SearXNG: publishedDate field from search results
- YouTube: publishedTimeText from InnerTube response
- OG fallback: article:published_time, og:updated_time, date,
  dc.date, dcterms.date, sailthru.date meta tags

Date is shown as (YYYY-MM-DD) or relative time after the tag prefix.
OG tags are fetched for date even when title/URL already matched.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:30:48 +01:00

654 lines
21 KiB
Python

"""Plugin: keyword alert subscriptions across multiple platforms."""
from __future__ import annotations
import asyncio
import json
import logging
import re
import urllib.request
from datetime import datetime, timezone
from html.parser import HTMLParser
from derp.http import urlopen as _urlopen
from derp.plugin import command, event
_log = logging.getLogger(__name__)
# -- Constants ---------------------------------------------------------------
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
_MAX_KEYWORD_LEN = 100
_MAX_SEEN = 200
_MAX_ANNOUNCE = 5
_DEFAULT_INTERVAL = 300
_MAX_INTERVAL = 3600
_FETCH_TIMEOUT = 15
_MAX_TITLE_LEN = 80
_MAX_SUBS = 20
_YT_SEARCH_URL = "https://www.youtube.com/youtubei/v1/search"
_YT_CLIENT_VERSION = "2.20250101.00.00"
_GQL_URL = "https://gql.twitch.tv/gql"
_GQL_CLIENT_ID = "kimne78kx3ncx6brgo4mv6wki5h1ko"
_SEARX_URL = "https://searx.mymx.me/search"
# -- Module-level tracking ---------------------------------------------------
_pollers: dict[str, asyncio.Task] = {}
_subscriptions: dict[str, dict] = {}
_errors: dict[str, int] = {}
# -- Pure helpers ------------------------------------------------------------
def _state_key(channel: str, name: str) -> str:
"""Build composite state key."""
return f"{channel}:{name}"
def _validate_name(name: str) -> bool:
"""Check name against allowed pattern."""
return bool(_NAME_RE.match(name))
def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str:
"""Truncate text with ellipsis if needed."""
if len(text) <= max_len:
return text
return text[: max_len - 3].rstrip() + "..."
_DATE_PROPS = {
"article:published_time", "og:article:published_time",
"og:updated_time", "date", "dc.date", "dcterms.date",
"sailthru.date",
}
class _OGParser(HTMLParser):
"""Extract og:title, og:description, and published date from <meta> tags."""
def __init__(self):
super().__init__()
self.og_title = ""
self.og_description = ""
self.published = ""
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "meta":
return
attr_map = {k.lower(): (v or "") for k, v in attrs}
prop = attr_map.get("property", "").lower()
name = attr_map.get("name", "").lower()
content = attr_map.get("content", "")
if prop == "og:title":
self.og_title = content
elif prop == "og:description":
self.og_description = content
if not self.published and content:
if prop in _DATE_PROPS or name in _DATE_PROPS:
self.published = content
_OG_TIMEOUT = 10
_OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>)
def _parse_date(raw: str) -> str:
"""Try to extract a YYYY-MM-DD date from a raw date string."""
m = re.search(r"\d{4}-\d{2}-\d{2}", raw)
return m.group(0) if m else ""
def _fetch_og(url: str) -> tuple[str, str, str]:
"""Fetch og:title, og:description, and published date from a URL.
Returns (og_title, og_description, date). Empty strings on failure.
"""
try:
req = urllib.request.Request(url, method="GET")
req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
resp = _urlopen(req, timeout=_OG_TIMEOUT)
raw = resp.read(_OG_MAX_BYTES)
resp.close()
html = raw.decode("utf-8", errors="replace")
parser = _OGParser()
parser.feed(html)
date = _parse_date(parser.published)
return parser.og_title, parser.og_description, date
except Exception as exc:
_log.debug("og fetch failed for %s: %s", url, exc)
return "", "", ""
# -- YouTube InnerTube search (blocking) ------------------------------------
def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
"""Recursively walk YouTube JSON to find video results.
Finds all objects containing both 'videoId' and 'title' keys.
Resilient to YouTube rearranging wrapper layers.
"""
if depth > 20:
return []
results = []
if isinstance(obj, dict):
video_id = obj.get("videoId")
title_obj = obj.get("title")
if isinstance(video_id, str) and video_id and title_obj is not None:
if isinstance(title_obj, dict):
runs = title_obj.get("runs", [])
title = "".join(r.get("text", "") for r in runs if isinstance(r, dict))
elif isinstance(title_obj, str):
title = title_obj
else:
title = ""
if title:
# Extract relative publish time (e.g. "2 days ago")
pub_obj = obj.get("publishedTimeText")
date = ""
if isinstance(pub_obj, dict):
date = pub_obj.get("simpleText", "")
elif isinstance(pub_obj, str):
date = pub_obj
results.append({
"id": video_id,
"title": title,
"url": f"https://www.youtube.com/watch?v={video_id}",
"date": date,
"extra": "",
})
for val in obj.values():
results.extend(_extract_videos(val, depth + 1))
elif isinstance(obj, list):
for item in obj:
results.extend(_extract_videos(item, depth + 1))
return results
def _search_youtube(keyword: str) -> list[dict]:
"""Search YouTube via InnerTube API. Blocking."""
payload = json.dumps({
"context": {
"client": {
"clientName": "WEB",
"clientVersion": _YT_CLIENT_VERSION,
},
},
"query": keyword,
}).encode()
req = urllib.request.Request(_YT_SEARCH_URL, data=payload, method="POST")
req.add_header("Content-Type", "application/json")
resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
raw = resp.read()
resp.close()
data = json.loads(raw)
videos = _extract_videos(data)
# Deduplicate by videoId (same video can appear in multiple sections)
seen_ids: set[str] = set()
unique: list[dict] = []
for v in videos:
if v["id"] not in seen_ids:
seen_ids.add(v["id"])
unique.append(v)
return unique
# -- Twitch GQL search (blocking) ------------------------------------------
def _search_twitch(keyword: str) -> list[dict]:
"""Search Twitch via public GQL. Blocking."""
query = (
'query{searchFor(userQuery:"'
+ keyword.replace("\\", "\\\\").replace('"', '\\"')
+ '",options:{targets:[{index:STREAM},{index:VOD}]})'
"{streams{items{id broadcaster{login displayName}title game{name}"
"viewersCount}}videos{items{id owner{login displayName}title"
" game{name}viewCount}}}}"
)
body = json.dumps({"query": query}).encode()
req = urllib.request.Request(_GQL_URL, data=body, method="POST")
req.add_header("Client-Id", _GQL_CLIENT_ID)
req.add_header("Content-Type", "application/json")
resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
raw = resp.read()
resp.close()
data = json.loads(raw)
results: list[dict] = []
try:
search = data["data"]["searchFor"]
except (KeyError, TypeError):
return results
if not search:
return results
# Live streams
streams = search.get("streams") or {}
for item in streams.get("items") or []:
stream_id = str(item.get("id", ""))
if not stream_id:
continue
broadcaster = item.get("broadcaster") or {}
login = broadcaster.get("login", "")
display = broadcaster.get("displayName", login)
title = item.get("title", "")
game = (item.get("game") or {}).get("name", "")
line = f"{display} is live: {title}"
if game:
line += f" ({game})"
results.append({
"id": f"stream:{stream_id}",
"title": line,
"url": f"https://twitch.tv/{login}",
"date": "",
"extra": "",
})
# VODs
videos = search.get("videos") or {}
for item in videos.get("items") or []:
vod_id = str(item.get("id", ""))
if not vod_id:
continue
title = item.get("title", "")
results.append({
"id": f"vod:{vod_id}",
"title": title,
"url": f"https://twitch.tv/videos/{vod_id}",
"date": "",
"extra": "",
})
return results
# -- SearXNG search (blocking) ----------------------------------------------
def _search_searx(keyword: str) -> list[dict]:
"""Search SearXNG. Blocking."""
import urllib.parse
params = urllib.parse.urlencode({"q": keyword, "format": "json"})
url = f"{_SEARX_URL}?{params}"
req = urllib.request.Request(url, method="GET")
resp = urllib.request.urlopen(req, timeout=_FETCH_TIMEOUT)
raw = resp.read()
resp.close()
data = json.loads(raw)
results: list[dict] = []
for item in data.get("results", []):
item_url = item.get("url", "")
title = item.get("title", "")
date = _parse_date(item.get("publishedDate", ""))
results.append({
"id": item_url,
"title": title,
"url": item_url,
"date": date,
"extra": "",
})
return results
# -- Backend registry -------------------------------------------------------
_BACKENDS: dict[str, callable] = {
"yt": _search_youtube,
"tw": _search_twitch,
"sx": _search_searx,
}
# -- State helpers -----------------------------------------------------------
def _save(bot, key: str, data: dict) -> None:
"""Persist subscription data to bot.state."""
bot.state.set("alert", key, json.dumps(data))
def _load(bot, key: str) -> dict | None:
"""Load subscription data from bot.state."""
raw = bot.state.get("alert", key)
if raw is None:
return None
try:
return json.loads(raw)
except json.JSONDecodeError:
return None
def _delete(bot, key: str) -> None:
"""Remove subscription data from bot.state."""
bot.state.delete("alert", key)
# -- Polling -----------------------------------------------------------------
async def _poll_once(bot, key: str, announce: bool = True) -> None:
"""Single poll cycle for one alert subscription (all backends)."""
data = _subscriptions.get(key)
if data is None:
data = _load(bot, key)
if data is None:
return
_subscriptions[key] = data
keyword = data["keyword"]
now = datetime.now(timezone.utc).isoformat()
data["last_poll"] = now
had_error = False
loop = asyncio.get_running_loop()
for tag, backend in _BACKENDS.items():
try:
items = await loop.run_in_executor(None, backend, keyword)
except Exception as exc:
data["last_error"] = f"{tag}: {exc}"
had_error = True
continue
seen_set = set(data.get("seen", {}).get(tag, []))
seen_list = list(data.get("seen", {}).get(tag, []))
new_items = [item for item in items if item["id"] not in seen_set]
# Filter: only announce results that actually contain the keyword
# Check title/URL first, then fall back to og:title/og:description
kw_lower = keyword.lower()
matched = []
for item in new_items:
title_l = item.get("title", "").lower()
url_l = item.get("url", "").lower()
if kw_lower in title_l or kw_lower in url_l:
# Fetch OG tags for date if backend didn't provide one
if not item.get("date") and item.get("url"):
_, _, og_date = await loop.run_in_executor(
None, _fetch_og, item["url"],
)
if og_date:
item["date"] = og_date
matched.append(item)
continue
# Fetch OG tags for items that didn't match on title/URL
item_url = item.get("url", "")
if item_url:
og_title, og_desc, og_date = await loop.run_in_executor(
None, _fetch_og, item_url,
)
if (kw_lower in og_title.lower()
or kw_lower in og_desc.lower()):
if og_title and len(og_title) > len(item.get("title", "")):
item["title"] = og_title
if og_date and not item.get("date"):
item["date"] = og_date
matched.append(item)
if announce and matched:
channel = data["channel"]
name = data["name"]
shown = matched[:_MAX_ANNOUNCE]
for item in shown:
title = _truncate(item["title"]) if item["title"] else "(no title)"
url = item["url"]
date = item.get("date", "")
line = f"[{name}/{tag}]"
if date:
line += f" ({date})"
line += f" {title}"
if url:
line += f" -- {url}"
await bot.send(channel, line)
remaining = len(matched) - len(shown)
if remaining > 0:
await bot.send(channel, f"[{name}/{tag}] ... and {remaining} more")
for item in new_items:
seen_list.append(item["id"])
if len(seen_list) > _MAX_SEEN:
seen_list = seen_list[-_MAX_SEEN:]
data.setdefault("seen", {})[tag] = seen_list
if had_error:
_errors[key] = _errors.get(key, 0) + 1
else:
data["last_error"] = ""
_errors[key] = 0
_subscriptions[key] = data
_save(bot, key, data)
async def _poll_loop(bot, key: str) -> None:
"""Infinite poll loop for one alert subscription."""
try:
while True:
data = _subscriptions.get(key) or _load(bot, key)
if data is None:
return
interval = data.get("interval", _DEFAULT_INTERVAL)
errs = _errors.get(key, 0)
if errs >= 5:
interval = min(interval * 2, _MAX_INTERVAL)
await asyncio.sleep(interval)
await _poll_once(bot, key, announce=True)
except asyncio.CancelledError:
pass
def _start_poller(bot, key: str) -> None:
"""Create and track a poller task."""
existing = _pollers.get(key)
if existing and not existing.done():
return
task = asyncio.create_task(_poll_loop(bot, key))
_pollers[key] = task
def _stop_poller(key: str) -> None:
"""Cancel and remove a poller task."""
task = _pollers.pop(key, None)
if task and not task.done():
task.cancel()
_subscriptions.pop(key, None)
_errors.pop(key, 0)
# -- Restore on connect -----------------------------------------------------
def _restore(bot) -> None:
"""Rebuild pollers from persisted state."""
for key in bot.state.keys("alert"):
existing = _pollers.get(key)
if existing and not existing.done():
continue
data = _load(bot, key)
if data is None:
continue
_subscriptions[key] = data
_start_poller(bot, key)
@event("001")
async def on_connect(bot, message):
"""Restore alert subscription pollers on connect."""
_restore(bot)
# -- Command handler ---------------------------------------------------------
@command("alert", help="Alert: !alert add|del|list|check")
async def cmd_alert(bot, message):
"""Per-channel keyword alert subscriptions across platforms.
Usage:
!alert add <name> <keyword...> Add keyword alert (admin)
!alert del <name> Remove alert (admin)
!alert list List alerts
!alert check <name> Force-poll now
"""
parts = message.text.split(None, 3)
if len(parts) < 2:
await bot.reply(message, "Usage: !alert <add|del|list|check> [args]")
return
sub = parts[1].lower()
# -- list (any user, channel only) ----------------------------------------
if sub == "list":
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
channel = message.target
prefix = f"{channel}:"
subs = []
for key in bot.state.keys("alert"):
if key.startswith(prefix):
data = _load(bot, key)
if data:
name = data["name"]
err = data.get("last_error", "")
if err:
subs.append(f"{name} (error)")
else:
subs.append(name)
if not subs:
await bot.reply(message, "No alerts in this channel")
return
await bot.reply(message, f"Alerts: {', '.join(subs)}")
return
# -- check (any user, channel only) ---------------------------------------
if sub == "check":
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 3:
await bot.reply(message, "Usage: !alert check <name>")
return
name = parts[2].lower()
channel = message.target
key = _state_key(channel, name)
data = _load(bot, key)
if data is None:
await bot.reply(message, f"No alert '{name}' in this channel")
return
_subscriptions[key] = data
await _poll_once(bot, key, announce=True)
data = _subscriptions.get(key, data)
if data.get("last_error"):
await bot.reply(message, f"{name}: error -- {data['last_error']}")
else:
await bot.reply(message, f"{name}: checked")
return
# -- add (admin, channel only) -------------------------------------------
if sub == "add":
if not bot._is_admin(message):
await bot.reply(message, "Permission denied: add requires admin")
return
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 4:
await bot.reply(message, "Usage: !alert add <name> <keyword...>")
return
name = parts[2].lower()
keyword = parts[3]
if not _validate_name(name):
await bot.reply(
message,
"Invalid name (lowercase alphanumeric + hyphens, 1-20 chars)",
)
return
if len(keyword) > _MAX_KEYWORD_LEN:
await bot.reply(message, f"Keyword too long (max {_MAX_KEYWORD_LEN} chars)")
return
irc_channel = message.target
key = _state_key(irc_channel, name)
if _load(bot, key) is not None:
await bot.reply(message, f"Alert '{name}' already exists in this channel")
return
ch_prefix = f"{irc_channel}:"
count = sum(1 for k in bot.state.keys("alert") if k.startswith(ch_prefix))
if count >= _MAX_SUBS:
await bot.reply(message, f"Alert limit reached ({_MAX_SUBS})")
return
# Seed: query all backends to populate seen lists (prevents flood)
loop = asyncio.get_running_loop()
seen: dict[str, list[str]] = {}
for tag, backend in _BACKENDS.items():
try:
items = await loop.run_in_executor(None, backend, keyword)
ids = [item["id"] for item in items]
if len(ids) > _MAX_SEEN:
ids = ids[-_MAX_SEEN:]
seen[tag] = ids
except Exception:
seen[tag] = []
now = datetime.now(timezone.utc).isoformat()
data = {
"keyword": keyword,
"name": name,
"channel": irc_channel,
"interval": _DEFAULT_INTERVAL,
"added_by": message.nick,
"added_at": now,
"last_poll": now,
"last_error": "",
"seen": seen,
}
_save(bot, key, data)
_subscriptions[key] = data
_start_poller(bot, key)
counts = ", ".join(f"{len(seen.get(t, []))} {t}" for t in _BACKENDS)
await bot.reply(
message,
f"Alert '{name}' added for: {keyword} ({counts} existing)",
)
return
# -- del (admin, channel only) -------------------------------------------
if sub == "del":
if not bot._is_admin(message):
await bot.reply(message, "Permission denied: del requires admin")
return
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 3:
await bot.reply(message, "Usage: !alert del <name>")
return
name = parts[2].lower()
channel = message.target
key = _state_key(channel, name)
if _load(bot, key) is None:
await bot.reply(message, f"No alert '{name}' in this channel")
return
_stop_poller(key)
_delete(bot, key)
await bot.reply(message, f"Removed '{name}'")
return
await bot.reply(message, "Usage: !alert <add|del|list|check> [args]")