"""Plugin: keyword alert subscriptions across multiple platforms.""" from __future__ import annotations import asyncio import json import logging import re import urllib.request from datetime import datetime, timezone from html.parser import HTMLParser from derp.http import urlopen as _urlopen from derp.plugin import command, event _log = logging.getLogger(__name__) # -- Constants --------------------------------------------------------------- _NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$") _MAX_KEYWORD_LEN = 100 _MAX_SEEN = 200 _MAX_ANNOUNCE = 5 _DEFAULT_INTERVAL = 300 _MAX_INTERVAL = 3600 _FETCH_TIMEOUT = 15 _MAX_TITLE_LEN = 80 _MAX_SUBS = 20 _YT_SEARCH_URL = "https://www.youtube.com/youtubei/v1/search" _YT_CLIENT_VERSION = "2.20250101.00.00" _GQL_URL = "https://gql.twitch.tv/gql" _GQL_CLIENT_ID = "kimne78kx3ncx6brgo4mv6wki5h1ko" _SEARX_URL = "https://searx.mymx.me/search" # -- Module-level tracking --------------------------------------------------- _pollers: dict[str, asyncio.Task] = {} _subscriptions: dict[str, dict] = {} _errors: dict[str, int] = {} # -- Pure helpers ------------------------------------------------------------ def _state_key(channel: str, name: str) -> str: """Build composite state key.""" return f"{channel}:{name}" def _validate_name(name: str) -> bool: """Check name against allowed pattern.""" return bool(_NAME_RE.match(name)) def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str: """Truncate text with ellipsis if needed.""" if len(text) <= max_len: return text return text[: max_len - 3].rstrip() + "..." 
# <meta> property/name values that may carry a publication date.
_DATE_PROPS = {
    "article:published_time",
    "og:article:published_time",
    "og:updated_time",
    "date",
    "dc.date",
    "dcterms.date",
    "sailthru.date",
}


class _OGParser(HTMLParser):
    """Extract og:title, og:description, and published date from <meta> tags."""

    def __init__(self):
        super().__init__()
        self.og_title = ""
        self.og_description = ""
        self.published = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record OpenGraph title/description and the first date-like meta."""
        if tag != "meta":
            return
        attr_map = {k.lower(): (v or "") for k, v in attrs}
        prop = attr_map.get("property", "").lower()
        name = attr_map.get("name", "").lower()
        content = attr_map.get("content", "")
        if prop == "og:title":
            self.og_title = content
        elif prop == "og:description":
            self.og_description = content
        # Only the first non-empty date candidate is kept.
        if not self.published and content:
            if prop in _DATE_PROPS or name in _DATE_PROPS:
                self.published = content


_OG_TIMEOUT = 10
_OG_MAX_BYTES = 64 * 1024  # Only read first 64 KB (OG tags are in <head>)


def _read_and_close(resp, max_bytes: int | None = None) -> bytes:
    """Read a response body (optionally capped) and always close the response."""
    try:
        return resp.read() if max_bytes is None else resp.read(max_bytes)
    finally:
        # Close even when read() raises, so the connection is not leaked.
        resp.close()


def _parse_date(raw: str) -> str:
    """Return the first YYYY-MM-DD substring of *raw*, or "" if there is none."""
    m = re.search(r"\d{4}-\d{2}-\d{2}", raw)
    return m.group(0) if m else ""


def _fetch_og(url: str) -> tuple[str, str, str]:
    """Fetch og:title, og:description, and published date from a URL.

    Returns (og_title, og_description, date). Empty strings on failure.
    Blocking; callers in this module run it in an executor.
    """
    # Result URLs come from third-party search backends: only fetch plain
    # web URLs so e.g. a file:// result can never be opened.
    if not url.lower().startswith(("http://", "https://")):
        return "", "", ""
    try:
        req = urllib.request.Request(url, method="GET")
        req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
        resp = _urlopen(req, timeout=_OG_TIMEOUT)
        raw = _read_and_close(resp, _OG_MAX_BYTES)
        html = raw.decode("utf-8", errors="replace")
        parser = _OGParser()
        parser.feed(html)
        date = _parse_date(parser.published)
        return parser.og_title, parser.og_description, date
    except Exception as exc:
        # Best effort: OG data only enriches/filters results.
        _log.debug("og fetch failed for %s: %s", url, exc)
        return "", "", ""


# -- YouTube InnerTube search (blocking) ------------------------------------

def _yt_text(node: object) -> str:
    """Flatten a YouTube text node to a plain string.

    Accepts a plain ``str``, a ``{"runs": [{"text": ...}, ...]}`` dict or a
    ``{"simpleText": ...}`` dict; anything else yields "".
    """
    if isinstance(node, str):
        return node
    if isinstance(node, dict):
        runs = node.get("runs")
        if isinstance(runs, list):
            text = "".join(
                r["text"]
                for r in runs
                if isinstance(r, dict) and isinstance(r.get("text"), str)
            )
            if text:
                return text
        simple = node.get("simpleText")
        if isinstance(simple, str):
            return simple
    return ""


def _extract_videos(obj: object, depth: int = 0) -> list[dict]:
    """Recursively walk YouTube JSON to find video results.

    Finds all objects containing both 'videoId' and 'title' keys.
    Resilient to YouTube rearranging wrapper layers. Recursion stops
    below depth 20. Each result is a dict with the keys
    ``id``, ``title``, ``url``, ``date`` and ``extra``.
    """
    if depth > 20:
        return []

    results: list[dict] = []
    if isinstance(obj, dict):
        video_id = obj.get("videoId")
        if isinstance(video_id, str) and video_id:
            title = _yt_text(obj.get("title"))
            if title:
                results.append({
                    "id": video_id,
                    "title": title,
                    "url": f"https://www.youtube.com/watch?v={video_id}",
                    # Relative publish time (e.g. "2 days ago"), if present.
                    "date": _yt_text(obj.get("publishedTimeText")),
                    "extra": "",
                })
        for val in obj.values():
            results.extend(_extract_videos(val, depth + 1))
    elif isinstance(obj, list):
        for item in obj:
            results.extend(_extract_videos(item, depth + 1))
    return results


def _search_youtube(keyword: str) -> list[dict]:
    """Search YouTube via InnerTube API. Blocking."""
    payload = json.dumps({
        "context": {
            "client": {
                "clientName": "WEB",
                "clientVersion": _YT_CLIENT_VERSION,
            },
        },
        "query": keyword,
    }).encode()

    req = urllib.request.Request(_YT_SEARCH_URL, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    data = json.loads(_read_and_close(resp))

    # Deduplicate by videoId (same video can appear in multiple sections)
    seen_ids: set[str] = set()
    unique: list[dict] = []
    for video in _extract_videos(data):
        if video["id"] not in seen_ids:
            seen_ids.add(video["id"])
            unique.append(video)
    return unique


# -- Twitch GQL search (blocking) ------------------------------------------

def _search_twitch(keyword: str) -> list[dict]:
    """Search Twitch via public GQL. Blocking.

    Returns live streams (ids prefixed ``stream:``) followed by VODs (ids
    prefixed ``vod:``). JSON ``null`` fields are treated as empty strings.
    """
    query = (
        'query{searchFor(userQuery:"'
        + keyword.replace("\\", "\\\\").replace('"', '\\"')
        + '",options:{targets:[{index:STREAM},{index:VOD}]})'
        "{streams{items{id broadcaster{login displayName}title game{name}"
        "viewersCount}}videos{items{id owner{login displayName}title"
        " game{name}viewCount}}}}"
    )
    body = json.dumps({"query": query}).encode()

    req = urllib.request.Request(_GQL_URL, data=body, method="POST")
    req.add_header("Client-Id", _GQL_CLIENT_ID)
    req.add_header("Content-Type", "application/json")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    data = json.loads(_read_and_close(resp))

    results: list[dict] = []
    try:
        search = data["data"]["searchFor"]
    except (KeyError, TypeError):
        return results
    if not search:
        return results

    # Live streams
    streams = search.get("streams") or {}
    for item in streams.get("items") or []:
        if not isinstance(item, dict):
            continue
        # "or ''" also covers an explicit null, which str() would turn
        # into the truthy string "None".
        stream_id = str(item.get("id") or "")
        if not stream_id:
            continue
        broadcaster = item.get("broadcaster") or {}
        login = broadcaster.get("login") or ""
        display = broadcaster.get("displayName") or login
        title = item.get("title") or ""
        game = (item.get("game") or {}).get("name") or ""
        line = f"{display} is live: {title}"
        if game:
            line += f" ({game})"
        results.append({
            "id": f"stream:{stream_id}",
            "title": line,
            "url": f"https://twitch.tv/{login}",
            "date": "",
            "extra": "",
        })

    # VODs
    videos = search.get("videos") or {}
    for item in videos.get("items") or []:
        if not isinstance(item, dict):
            continue
        vod_id = str(item.get("id") or "")
        if not vod_id:
            continue
        results.append({
            "id": f"vod:{vod_id}",
            "title": item.get("title") or "",
            "url": f"https://twitch.tv/videos/{vod_id}",
            "date": "",
            "extra": "",
        })

    return results


# -- SearXNG search (blocking) ----------------------------------------------

def _search_searx(keyword: str) -> list[dict]:
    """Search SearXNG. Blocking.

    The result URL doubles as the result id.
    """
    import urllib.parse

    params = urllib.parse.urlencode({"q": keyword, "format": "json"})
    url = f"{_SEARX_URL}?{params}"

    req = urllib.request.Request(url, method="GET")
    # NOTE(review): this backend calls urllib.request.urlopen directly while
    # the other backends go through the _urlopen wrapper -- confirm that
    # bypassing the wrapper is intentional.
    resp = urllib.request.urlopen(req, timeout=_FETCH_TIMEOUT)
    data = json.loads(_read_and_close(resp))

    results: list[dict] = []
    for item in data.get("results") or []:
        item_url = item.get("url") or ""
        results.append({
            "id": item_url,
            "title": item.get("title") or "",
            "url": item_url,
            "date": _parse_date(item.get("publishedDate") or ""),
            "extra": "",
        })
    return results


# -- Backend registry -------------------------------------------------------

# tag -> blocking search function taking the keyword and returning a list of
# result dicts with the keys id/title/url/date/extra.
_BACKENDS: dict[str, callable] = {
    "yt": _search_youtube,
    "tw": _search_twitch,
    "sx": _search_searx,
}


# -- State helpers -----------------------------------------------------------

def _save(bot, key: str, data: dict) -> None:
    """Persist subscription data to bot.state."""
    bot.state.set("alert", key, json.dumps(data))


def _load(bot, key: str) -> dict | None:
    """Load subscription data from bot.state.

    Returns None when the key is missing or the stored value is not valid
    JSON.
    """
    raw = bot.state.get("alert", key)
    if raw is None:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None


def _delete(bot, key: str) -> None:
    """Remove subscription data from bot.state."""
    bot.state.delete("alert", key)


# -- Polling -----------------------------------------------------------------

async def _match_item(loop, item: dict, kw_lower: str) -> bool:
    """Decide whether *item* actually mentions the keyword.

    Checks title/URL first and falls back to the page's og:title and
    og:description. May update ``item["title"]`` and ``item["date"]`` in
    place with data from the OG tags.
    """
    title = item.get("title") or ""
    url = item.get("url") or ""

    if kw_lower in title.lower() or kw_lower in url.lower():
        # Fetch OG tags for the date if the backend didn't provide one.
        if url and not item.get("date"):
            _, _, og_date = await loop.run_in_executor(None, _fetch_og, url)
            if og_date:
                item["date"] = og_date
        return True

    if not url:
        return False

    og_title, og_desc, og_date = await loop.run_in_executor(None, _fetch_og, url)
    if kw_lower not in og_title.lower() and kw_lower not in og_desc.lower():
        return False
    # Prefer the OG title when it is more descriptive than the backend's.
    if og_title and len(og_title) > len(title):
        item["title"] = og_title
    if og_date and not item.get("date"):
        item["date"] = og_date
    return True


async def _announce(bot, data: dict, tag: str, matched: list[dict]) -> None:
    """Send up to _MAX_ANNOUNCE result lines, plus an overflow summary."""
    channel = data["channel"]
    name = data["name"]
    shown = matched[:_MAX_ANNOUNCE]
    for item in shown:
        title = _truncate(item["title"]) if item["title"] else "(no title)"
        url = item["url"]
        date = item.get("date", "")
        line = f"[{name}/{tag}]"
        if date:
            line += f" ({date})"
        line += f" {title}"
        if url:
            line += f" -- {url}"
        await bot.send(channel, line)
    remaining = len(matched) - len(shown)
    if remaining > 0:
        await bot.send(channel, f"[{name}/{tag}] ... and {remaining} more")


async def _poll_once(bot, key: str, announce: bool = True) -> None:
    """Single poll cycle for one alert subscription (all backends).

    Queries every backend in an executor, announces unseen results that
    mention the keyword (when *announce* is true), records all unseen ids
    as seen, updates the error bookkeeping and persists the subscription.
    Does nothing if the subscription cannot be found.
    """
    data = _subscriptions.get(key)
    if data is None:
        data = _load(bot, key)
        if data is None:
            return
        _subscriptions[key] = data

    keyword = data["keyword"]
    kw_lower = keyword.lower()
    data["last_poll"] = datetime.now(timezone.utc).isoformat()

    had_error = False
    loop = asyncio.get_running_loop()

    for tag, backend in _BACKENDS.items():
        try:
            items = await loop.run_in_executor(None, backend, keyword)
        except Exception as exc:
            data["last_error"] = f"{tag}: {exc}"
            had_error = True
            continue

        seen_list = list(data.get("seen", {}).get(tag, []))
        seen_set = set(seen_list)

        # Unseen items, with ids repeated inside this result set collapsed
        # so nothing is announced or stored twice.
        new_items = []
        for item in items:
            if item["id"] in seen_set:
                continue
            seen_set.add(item["id"])
            new_items.append(item)

        # Filter: only announce results that actually contain the keyword.
        # Matching can trigger page fetches, so skip it when not announcing.
        if announce:
            matched = []
            for item in new_items:
                if await _match_item(loop, item, kw_lower):
                    matched.append(item)
            if matched:
                await _announce(bot, data, tag, matched)

        # Every unseen item is remembered, matched or not.
        seen_list.extend(item["id"] for item in new_items)
        if len(seen_list) > _MAX_SEEN:
            seen_list = seen_list[-_MAX_SEEN:]
        data.setdefault("seen", {})[tag] = seen_list

    if had_error:
        _errors[key] = _errors.get(key, 0) + 1
    else:
        data["last_error"] = ""
        _errors[key] = 0

    _subscriptions[key] = data
    _save(bot, key, data)


async def _poll_loop(bot, key: str) -> None:
    """Infinite poll loop for one alert subscription.

    Sleeps for the subscription's interval (doubled, capped at
    _MAX_INTERVAL, once the error counter reaches 5) and then polls. Ends
    when the subscription disappears or the task is cancelled.
    """
    try:
        while True:
            data = _subscriptions.get(key) or _load(bot, key)
            if data is None:
                return
            interval = data.get("interval", _DEFAULT_INTERVAL)
            if _errors.get(key, 0) >= 5:
                interval = min(interval * 2, _MAX_INTERVAL)
            await asyncio.sleep(interval)
            try:
                await _poll_once(bot, key, announce=True)
            except asyncio.CancelledError:
                # Must not be mistaken for a poll failure on interpreters
                # where CancelledError derives from Exception.
                raise
            except Exception:
                # One failed cycle must not silently kill the poller.
                _log.exception("alert poll failed for %s", key)
                _errors[key] = _errors.get(key, 0) + 1
    except asyncio.CancelledError:
        pass


def _start_poller(bot, key: str) -> None:
    """Create and track a poller task unless a live one already exists."""
    existing = _pollers.get(key)
    if existing and not existing.done():
        return
    _pollers[key] = asyncio.create_task(_poll_loop(bot, key))


def _stop_poller(key: str) -> None:
    """Cancel and remove a poller task and drop its cached state."""
    task = _pollers.pop(key, None)
    if task and not task.done():
        task.cancel()
    _subscriptions.pop(key, None)
    _errors.pop(key, 0)


# -- Restore on connect -----------------------------------------------------

def _restore(bot) -> None:
    """Rebuild pollers from persisted state."""
    for key in bot.state.keys("alert"):
        existing = _pollers.get(key)
        if existing and not existing.done():
            continue
        data = _load(bot, key)
        if data is None:
            continue
        _subscriptions[key] = data
        _start_poller(bot, key)


@event("001")
async def on_connect(bot, message):
    """Restore alert subscription pollers on connect."""
    _restore(bot)


# -- Command handler ---------------------------------------------------------

_USAGE = "Usage: !alert <add|del|list|check> [args]"


async def _alert_list(bot, message, parts: list[str]) -> None:
    """Handle ``!alert list``: list this channel's alerts (any user)."""
    if not message.is_channel:
        await bot.reply(message, "Use this command in a channel")
        return
    prefix = f"{message.target}:"
    subs = []
    for key in bot.state.keys("alert"):
        if not key.startswith(prefix):
            continue
        data = _load(bot, key)
        if not data:
            continue
        name = data["name"]
        subs.append(f"{name} (error)" if data.get("last_error") else name)
    if not subs:
        await bot.reply(message, "No alerts in this channel")
        return
    await bot.reply(message, f"Alerts: {', '.join(subs)}")


async def _alert_check(bot, message, parts: list[str]) -> None:
    """Handle ``!alert check <name>``: force a poll now (any user)."""
    if not message.is_channel:
        await bot.reply(message, "Use this command in a channel")
        return
    if len(parts) < 3:
        await bot.reply(message, "Usage: !alert check <name>")
        return
    name = parts[2].lower()
    key = _state_key(message.target, name)
    data = _load(bot, key)
    if data is None:
        await bot.reply(message, f"No alert '{name}' in this channel")
        return
    _subscriptions[key] = data
    await _poll_once(bot, key, announce=True)
    data = _subscriptions.get(key, data)
    if data.get("last_error"):
        await bot.reply(message, f"{name}: error -- {data['last_error']}")
    else:
        await bot.reply(message, f"{name}: checked")


async def _alert_add(bot, message, parts: list[str]) -> None:
    """Handle ``!alert add <name> <keyword>``: create an alert (admin)."""
    if not bot._is_admin(message):
        await bot.reply(message, "Permission denied: add requires admin")
        return
    if not message.is_channel:
        await bot.reply(message, "Use this command in a channel")
        return
    if len(parts) < 4:
        await bot.reply(message, "Usage: !alert add <name> <keyword>")
        return

    name = parts[2].lower()
    keyword = parts[3]

    if not _validate_name(name):
        await bot.reply(
            message,
            "Invalid name (lowercase alphanumeric + hyphens, 1-20 chars)",
        )
        return
    if len(keyword) > _MAX_KEYWORD_LEN:
        await bot.reply(message, f"Keyword too long (max {_MAX_KEYWORD_LEN} chars)")
        return

    irc_channel = message.target
    key = _state_key(irc_channel, name)
    if _load(bot, key) is not None:
        await bot.reply(message, f"Alert '{name}' already exists in this channel")
        return

    ch_prefix = f"{irc_channel}:"
    count = sum(1 for k in bot.state.keys("alert") if k.startswith(ch_prefix))
    if count >= _MAX_SUBS:
        await bot.reply(message, f"Alert limit reached ({_MAX_SUBS})")
        return

    # Seed: query all backends to populate seen lists (prevents flood)
    loop = asyncio.get_running_loop()
    seen: dict[str, list[str]] = {}
    for tag, backend in _BACKENDS.items():
        try:
            items = await loop.run_in_executor(None, backend, keyword)
        except Exception as exc:
            # Best effort: an unseeded backend just starts with an empty list.
            _log.debug("alert seed failed for %s/%s: %s", name, tag, exc)
            seen[tag] = []
            continue
        ids = [item["id"] for item in items]
        seen[tag] = ids[-_MAX_SEEN:] if len(ids) > _MAX_SEEN else ids

    now = datetime.now(timezone.utc).isoformat()
    data = {
        "keyword": keyword,
        "name": name,
        "channel": irc_channel,
        "interval": _DEFAULT_INTERVAL,
        "added_by": message.nick,
        "added_at": now,
        "last_poll": now,
        "last_error": "",
        "seen": seen,
    }
    _save(bot, key, data)
    _subscriptions[key] = data
    _start_poller(bot, key)

    counts = ", ".join(f"{len(seen.get(t, []))} {t}" for t in _BACKENDS)
    await bot.reply(
        message,
        f"Alert '{name}' added for: {keyword} ({counts} existing)",
    )


async def _alert_del(bot, message, parts: list[str]) -> None:
    """Handle ``!alert del <name>``: remove an alert (admin)."""
    if not bot._is_admin(message):
        await bot.reply(message, "Permission denied: del requires admin")
        return
    if not message.is_channel:
        await bot.reply(message, "Use this command in a channel")
        return
    if len(parts) < 3:
        await bot.reply(message, "Usage: !alert del <name>")
        return
    name = parts[2].lower()
    key = _state_key(message.target, name)
    if _load(bot, key) is None:
        await bot.reply(message, f"No alert '{name}' in this channel")
        return
    _stop_poller(key)
    _delete(bot, key)
    await bot.reply(message, f"Removed '{name}'")


# Subcommand name -> handler(bot, message, parts).
_ALERT_SUBCOMMANDS = {
    "list": _alert_list,
    "check": _alert_check,
    "add": _alert_add,
    "del": _alert_del,
}


@command("alert", help="Alert: !alert add|del|list|check")
async def cmd_alert(bot, message):
    """Per-channel keyword alert subscriptions across platforms.

    Usage:
        !alert add <name> <keyword>    Add keyword alert (admin)
        !alert del <name>              Remove alert (admin)
        !alert list                    List alerts
        !alert check <name>            Force-poll now
    """
    # At most 4 parts so the keyword keeps its internal whitespace.
    parts = message.text.split(None, 3)
    if len(parts) < 2:
        await bot.reply(message, _USAGE)
        return

    handler = _ALERT_SUBCOMMANDS.get(parts[1].lower())
    if handler is None:
        await bot.reply(message, _USAGE)
        return
    await handler(bot, message, parts)