"""Plugin: paste site keyword monitor for Pastebin and GitHub Gists.""" from __future__ import annotations import asyncio import json import logging import re import urllib.request from datetime import datetime, timezone from html.parser import HTMLParser from derp.http import urlopen as _urlopen from derp.plugin import command, event _log = logging.getLogger(__name__) # -- Constants --------------------------------------------------------------- _NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$") _MAX_SEEN = 200 _MAX_ANNOUNCE = 5 _DEFAULT_INTERVAL = 300 _MAX_INTERVAL = 3600 _FETCH_TIMEOUT = 15 _USER_AGENT = "derp-bot/1.0 (IRC paste monitor)" _MAX_MONITORS = 20 _MAX_SNIPPET_LEN = 80 _MAX_TITLE_LEN = 60 # -- Per-bot runtime state --------------------------------------------------- def _ps(bot): """Per-bot plugin runtime state.""" return bot._pstate.setdefault("pastemoni", { "pollers": {}, "monitors": {}, "errors": {}, }) # -- Pure helpers ------------------------------------------------------------ def _state_key(channel: str, name: str) -> str: """Build composite state key.""" return f"{channel}:{name}" def _validate_name(name: str) -> bool: """Check name against allowed pattern.""" return bool(_NAME_RE.match(name)) def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str: """Truncate text with ellipsis if needed.""" if len(text) <= max_len: return text return text[: max_len - 3].rstrip() + "..." def _snippet_around(text: str, keyword: str, max_len: int = _MAX_SNIPPET_LEN) -> str: """Extract snippet centered around keyword match.""" if not text: return "" text = " ".join(text.split()) # collapse whitespace if len(text) <= max_len: return text idx = text.lower().find(keyword.lower()) if idx < 0: return text[: max_len - 3] + "..." start = max(0, idx - max_len // 3) end = min(len(text), start + max_len) snippet = text[start:end] if start > 0: snippet = "..." + snippet if end < len(text): snippet = snippet + "..." return snippet # -- State helpers ----------------------------------------------------------- def _save(bot, key: str, data: dict) -> None: """Persist monitor data to bot.state.""" bot.state.set("pastemoni", key, json.dumps(data)) def _load(bot, key: str) -> dict | None: """Load monitor data from bot.state.""" raw = bot.state.get("pastemoni", key) if raw is None: return None try: return json.loads(raw) except json.JSONDecodeError: return None def _delete(bot, key: str) -> None: """Remove monitor data from bot.state.""" bot.state.delete("pastemoni", key) # -- Pastebin archive parser ------------------------------------------------ class _ArchiveParser(HTMLParser): """Extract paste links from Pastebin archive HTML.""" def __init__(self): super().__init__() self.links: list[tuple[str, str]] = [] # (paste_id, title) self._in_link = False self._href = "" self._title_parts: list[str] = [] def handle_starttag(self, tag, attrs): if tag != "a": return attr_map = {k: (v or "") for k, v in attrs} href = attr_map.get("href", "") if re.match(r"^/[a-zA-Z0-9]{8}$", href): self._in_link = True self._href = href[1:] # strip leading / self._title_parts = [] def handle_data(self, data): if self._in_link: self._title_parts.append(data) def handle_endtag(self, tag): if tag == "a" and self._in_link: self._in_link = False title = "".join(self._title_parts).strip() if self._href: self.links.append((self._href, title)) # -- Pastebin backend -------------------------------------------------------- def _fetch_pastebin(keyword: str) -> list[dict]: """Scrape Pastebin archive and filter by keyword. Blocking.""" req = urllib.request.Request("https://pastebin.com/archive", method="GET") req.add_header("User-Agent", _USER_AGENT) resp = _urlopen(req, timeout=_FETCH_TIMEOUT) raw = resp.read() resp.close() html = raw.decode("utf-8", errors="replace") parser = _ArchiveParser() parser.feed(html) kw_lower = keyword.lower() results: list[dict] = [] for paste_id, title in parser.links[:30]: # Check title first (avoids raw fetch) if kw_lower in title.lower(): results.append({ "id": paste_id, "title": _truncate(title, _MAX_TITLE_LEN), "url": f"https://pastebin.com/{paste_id}", "snippet": "", }) continue # Fetch raw content and check try: raw_req = urllib.request.Request( f"https://pastebin.com/raw/{paste_id}", method="GET", ) raw_req.add_header("User-Agent", _USER_AGENT) raw_resp = _urlopen(raw_req, timeout=_FETCH_TIMEOUT) content = raw_resp.read().decode("utf-8", errors="replace") raw_resp.close() except Exception: continue if kw_lower in content.lower(): results.append({ "id": paste_id, "title": _truncate(title or "(untitled)", _MAX_TITLE_LEN), "url": f"https://pastebin.com/{paste_id}", "snippet": _snippet_around(content, keyword), }) return results # -- GitHub Gists backend ---------------------------------------------------- def _fetch_gists(keyword: str) -> list[dict]: """Query GitHub public gists and filter by keyword. Blocking.""" req = urllib.request.Request( "https://api.github.com/gists/public?per_page=30", method="GET", ) req.add_header("User-Agent", _USER_AGENT) req.add_header("Accept", "application/vnd.github+json") resp = _urlopen(req, timeout=_FETCH_TIMEOUT) raw = resp.read() resp.close() gists = json.loads(raw) kw_lower = keyword.lower() results: list[dict] = [] for gist in gists if isinstance(gists, list) else []: gist_id = gist.get("id", "") if not gist_id: continue description = gist.get("description") or "" html_url = gist.get("html_url", "") files = gist.get("files") or {} filenames = " ".join(files.keys()) searchable = f"{description} {filenames}" if kw_lower not in searchable.lower(): continue source = description or filenames title = _truncate(source or "(no description)", _MAX_TITLE_LEN) snippet = _snippet_around(source, keyword) if len(source) > _MAX_TITLE_LEN else "" results.append({ "id": gist_id, "title": title, "url": html_url, "snippet": snippet, }) return results # -- Backend registry ------------------------------------------------------- _BACKENDS: dict[str, callable] = { "pb": _fetch_pastebin, "gh": _fetch_gists, } # -- Polling ----------------------------------------------------------------- async def _poll_once(bot, key: str, announce: bool = True) -> None: """Single poll cycle for one monitor (all backends).""" ps = _ps(bot) data = ps["monitors"].get(key) if data is None: data = _load(bot, key) if data is None: return ps["monitors"][key] = data keyword = data["keyword"] now = datetime.now(timezone.utc).isoformat() data["last_poll"] = now loop = asyncio.get_running_loop() had_success = False for tag, backend in _BACKENDS.items(): try: items = await loop.run_in_executor(None, backend, keyword) except Exception as exc: _log.debug("pastemoni %s/%s error: %s", key, tag, exc) data.setdefault("last_errors", {})[tag] = str(exc) continue had_success = True data.setdefault("last_errors", {}).pop(tag, None) seen_set = set(data.get("seen", {}).get(tag, [])) seen_list = list(data.get("seen", {}).get(tag, [])) new_items = [item for item in items if item["id"] not in seen_set] if announce and new_items: channel = data["channel"] shown = new_items[:_MAX_ANNOUNCE] for item in shown: title = item.get("title") or "(untitled)" snippet = item.get("snippet", "") url = item.get("url", "") if url: url = await bot.shorten_url(url) parts = [f"[{tag}] {title}"] if snippet: parts.append(snippet) if url: parts.append(url) await bot.send(channel, " -- ".join(parts)) remaining = len(new_items) - len(shown) if remaining > 0: await bot.send(channel, f"[{tag}] ... and {remaining} more") for item in new_items: seen_list.append(item["id"]) if len(seen_list) > _MAX_SEEN: seen_list = seen_list[-_MAX_SEEN:] data.setdefault("seen", {})[tag] = seen_list if had_success: ps["errors"][key] = 0 else: ps["errors"][key] = ps["errors"].get(key, 0) + 1 ps["monitors"][key] = data _save(bot, key, data) async def _poll_loop(bot, key: str) -> None: """Infinite poll loop for one monitor.""" try: while True: ps = _ps(bot) data = ps["monitors"].get(key) or _load(bot, key) if data is None: return interval = data.get("interval", _DEFAULT_INTERVAL) errs = ps["errors"].get(key, 0) if errs >= 5: interval = min(interval * 2, _MAX_INTERVAL) await asyncio.sleep(interval) await _poll_once(bot, key, announce=True) except asyncio.CancelledError: pass def _start_poller(bot, key: str) -> None: """Create and track a poller task.""" ps = _ps(bot) existing = ps["pollers"].get(key) if existing and not existing.done(): return task = asyncio.create_task(_poll_loop(bot, key)) ps["pollers"][key] = task def _stop_poller(bot, key: str) -> None: """Cancel and remove a poller task.""" ps = _ps(bot) task = ps["pollers"].pop(key, None) if task and not task.done(): task.cancel() ps["monitors"].pop(key, None) ps["errors"].pop(key, 0) # -- Restore on connect ----------------------------------------------------- def _restore(bot) -> None: """Rebuild pollers from persisted state.""" ps = _ps(bot) for key in bot.state.keys("pastemoni"): existing = ps["pollers"].get(key) if existing and not existing.done(): continue data = _load(bot, key) if data is None: continue ps["monitors"][key] = data _start_poller(bot, key) @event("001") async def on_connect(bot, message): """Restore paste monitor pollers on connect.""" _restore(bot) # -- Command handler --------------------------------------------------------- @command("pastemoni", help="Paste monitor: !pastemoni add|del|list|check") async def cmd_pastemoni(bot, message): """Per-channel paste site keyword monitoring. Usage: !pastemoni add Add monitor (admin) !pastemoni del Remove monitor (admin) !pastemoni list List monitors !pastemoni check Force-poll now """ parts = message.text.split(None, 3) if len(parts) < 2: await bot.reply(message, "Usage: !pastemoni [args]") return sub = parts[1].lower() # -- list ---------------------------------------------------------------- if sub == "list": if not message.is_channel: await bot.reply(message, "Use this command in a channel") return channel = message.target prefix = f"{channel}:" monitors = [] for key in bot.state.keys("pastemoni"): if key.startswith(prefix): data = _load(bot, key) if data: name = data["name"] keyword = data.get("keyword", "") errs = data.get("last_errors", {}) entry = f"{name} ({keyword})" if errs: entry += f" [{len(errs)} errors]" monitors.append(entry) if not monitors: await bot.reply(message, "No monitors in this channel") return await bot.reply(message, f"Monitors: {', '.join(monitors)}") return # -- check --------------------------------------------------------------- if sub == "check": if not message.is_channel: await bot.reply(message, "Use this command in a channel") return if len(parts) < 3: await bot.reply(message, "Usage: !pastemoni check ") return name = parts[2].lower() channel = message.target key = _state_key(channel, name) data = _load(bot, key) if data is None: await bot.reply(message, f"No monitor '{name}' in this channel") return _ps(bot)["monitors"][key] = data await _poll_once(bot, key, announce=True) data = _ps(bot)["monitors"].get(key, data) errs = data.get("last_errors", {}) if errs: tags = ", ".join(sorted(errs)) await bot.reply(message, f"{name}: errors on {tags}") else: await bot.reply(message, f"{name}: checked") return # -- add (admin) --------------------------------------------------------- if sub == "add": if not bot._is_admin(message): await bot.reply(message, "Permission denied: add requires admin") return if not message.is_channel: await bot.reply(message, "Use this command in a channel") return if len(parts) < 4: await bot.reply(message, "Usage: !pastemoni add ") return name = parts[2].lower() keyword = parts[3] if not _validate_name(name): await bot.reply( message, "Invalid name (lowercase alphanumeric + hyphens, 1-20 chars)", ) return channel = message.target key = _state_key(channel, name) if _load(bot, key) is not None: await bot.reply( message, f"Monitor '{name}' already exists in this channel", ) return ch_prefix = f"{channel}:" count = sum( 1 for k in bot.state.keys("pastemoni") if k.startswith(ch_prefix) ) if count >= _MAX_MONITORS: await bot.reply(message, f"Monitor limit reached ({_MAX_MONITORS})") return now = datetime.now(timezone.utc).isoformat() data = { "keyword": keyword, "name": name, "channel": channel, "interval": _DEFAULT_INTERVAL, "added_by": message.nick, "added_at": now, "last_poll": now, "last_errors": {}, "seen": {}, } _save(bot, key, data) _ps(bot)["monitors"][key] = data async def _seed(): await _poll_once(bot, key, announce=False) _start_poller(bot, key) asyncio.create_task(_seed()) await bot.reply( message, f"Monitor '{name}' added for: {keyword} (seeding in background)", ) return # -- del (admin) --------------------------------------------------------- if sub == "del": if not bot._is_admin(message): await bot.reply(message, "Permission denied: del requires admin") return if not message.is_channel: await bot.reply(message, "Use this command in a channel") return if len(parts) < 3: await bot.reply(message, "Usage: !pastemoni del ") return name = parts[2].lower() channel = message.target key = _state_key(channel, name) if _load(bot, key) is None: await bot.reply(message, f"No monitor '{name}' in this channel") return _stop_poller(bot, key) _delete(bot, key) await bot.reply(message, f"Removed '{name}'") return await bot.reply(message, "Usage: !pastemoni [args]")