Files
derp/plugins/rss.py
user 125a4c5d4d feat: add per-channel RSS feed subscription plugin
Subscribe RSS/Atom feeds to IRC channels with periodic polling,
new-item announcements, deduplication, and persistence across restarts.
Supports conditional HTTP requests (ETag/Last-Modified), automatic
backoff on errors, and per-channel feed limits.
2026-02-15 13:36:23 +01:00

505 lines
16 KiB
Python

"""Plugin: per-channel RSS/Atom feed subscriptions with periodic polling."""
from __future__ import annotations
import asyncio
import json
import re
import ssl
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from urllib.parse import urlparse
from derp.plugin import command, event
# -- Constants ---------------------------------------------------------------
_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,19}$")
_MAX_SEEN = 200
_MAX_ANNOUNCE = 5
_DEFAULT_INTERVAL = 600
_MAX_INTERVAL = 3600
_FETCH_TIMEOUT = 15
_USER_AGENT = "derp/1.0"
_MAX_TITLE_LEN = 80
_MAX_FEEDS = 20
_ATOM_NS = "{http://www.w3.org/2005/Atom}"
_DC_NS = "{http://purl.org/dc/elements/1.1/}"
# -- Module-level tracking ---------------------------------------------------
_pollers: dict[str, asyncio.Task] = {}
_feeds: dict[str, dict] = {}
_errors: dict[str, int] = {}
# -- Pure helpers ------------------------------------------------------------
def _state_key(channel: str, name: str) -> str:
"""Build composite state key."""
return f"{channel}:{name}"
def _validate_name(name: str) -> bool:
"""Check name against allowed pattern."""
return bool(_NAME_RE.match(name))
def _derive_name(url: str) -> str:
"""Derive a short feed name from URL hostname."""
try:
hostname = urlparse(url).hostname or ""
except Exception:
hostname = ""
# Strip www. prefix, take first label, lowercase
hostname = hostname.lower().removeprefix("www.")
name = hostname.split(".")[0] if hostname else "feed"
# Sanitize to allowed chars
name = re.sub(r"[^a-z0-9-]", "", name)
if not name or not name[0].isalnum():
name = "feed"
return name[:20]
def _truncate(text: str, max_len: int = _MAX_TITLE_LEN) -> str:
"""Truncate text with ellipsis if needed."""
if len(text) <= max_len:
return text
return text[: max_len - 3].rstrip() + "..."
# -- State helpers -----------------------------------------------------------
def _save(bot, key: str, data: dict) -> None:
"""Persist feed data to bot.state."""
bot.state.set("rss", key, json.dumps(data))
def _load(bot, key: str) -> dict | None:
"""Load feed data from bot.state."""
raw = bot.state.get("rss", key)
if raw is None:
return None
try:
return json.loads(raw)
except json.JSONDecodeError:
return None
def _delete(bot, key: str) -> None:
"""Remove feed data from bot.state."""
bot.state.delete("rss", key)
# -- Feed fetching (blocking, for executor) ----------------------------------
def _fetch_feed(url: str, etag: str = "", last_modified: str = "") -> dict:
"""Blocking HTTP GET for feed content. Run via executor."""
result: dict = {
"status": 0,
"body": b"",
"etag": "",
"last_modified": "",
"error": "",
}
req = urllib.request.Request(url, method="GET")
req.add_header("User-Agent", _USER_AGENT)
if etag:
req.add_header("If-None-Match", etag)
if last_modified:
req.add_header("If-Modified-Since", last_modified)
ctx = ssl.create_default_context()
try:
resp = urllib.request.urlopen(req, timeout=_FETCH_TIMEOUT, context=ctx)
result["status"] = resp.status
result["body"] = resp.read()
result["etag"] = resp.headers.get("ETag", "")
result["last_modified"] = resp.headers.get("Last-Modified", "")
resp.close()
except urllib.error.HTTPError as exc:
result["status"] = exc.code
if exc.code == 304:
result["etag"] = etag
result["last_modified"] = last_modified
else:
result["error"] = f"HTTP {exc.code}"
except urllib.error.URLError as exc:
result["error"] = str(exc.reason)
except Exception as exc:
result["error"] = str(exc)
return result
# -- Feed parsing ------------------------------------------------------------
def _parse_rss(root: ET.Element) -> tuple[str, list[dict]]:
"""Parse RSS 2.0 feed."""
channel = root.find("channel")
if channel is None:
return ("", [])
title = (channel.findtext("title") or "").strip()
items = []
for item in channel.findall("item"):
item_id = item.findtext("guid") or item.findtext("link") or ""
item_title = (item.findtext("title") or "").strip()
item_link = (item.findtext("link") or "").strip()
if item_id:
items.append({"id": item_id, "title": item_title, "link": item_link})
return (title, items)
def _parse_atom(root: ET.Element) -> tuple[str, list[dict]]:
"""Parse Atom feed."""
title = (root.findtext(f"{_ATOM_NS}title") or "").strip()
items = []
for entry in root.findall(f"{_ATOM_NS}entry"):
entry_id = (entry.findtext(f"{_ATOM_NS}id") or "").strip()
link_el = entry.find(f"{_ATOM_NS}link")
entry_link = (link_el.get("href", "") if link_el is not None else "").strip()
if not entry_id:
entry_id = entry_link
entry_title = (entry.findtext(f"{_ATOM_NS}title") or "").strip()
if entry_id:
items.append({"id": entry_id, "title": entry_title, "link": entry_link})
return (title, items)
def _parse_feed(body: bytes) -> tuple[str, list[dict]]:
"""Auto-detect RSS/Atom and parse. Returns (feed_title, items)."""
root = ET.fromstring(body)
tag = root.tag
local = tag.split("}")[-1].lower() if "}" in tag else tag.lower()
if local == "rss":
return _parse_rss(root)
if local == "feed":
return _parse_atom(root)
raise ValueError(f"Unknown feed format: {root.tag}")
# -- Polling -----------------------------------------------------------------
async def _poll_once(bot, key: str, announce: bool = True) -> None:
"""Single poll cycle for one feed."""
data = _feeds.get(key)
if data is None:
data = _load(bot, key)
if data is None:
return
_feeds[key] = data
url = data["url"]
etag = data.get("etag", "")
last_modified = data.get("last_modified", "")
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(
None, _fetch_feed, url, etag, last_modified,
)
now = datetime.now(timezone.utc).isoformat()
data["last_poll"] = now
if result["error"]:
data["last_error"] = result["error"]
_errors[key] = _errors.get(key, 0) + 1
_feeds[key] = data
_save(bot, key, data)
return
# HTTP 304 -- not modified
if result["status"] == 304:
data["last_error"] = ""
_errors[key] = 0
_feeds[key] = data
_save(bot, key, data)
return
# Update conditional headers
data["etag"] = result["etag"]
data["last_modified"] = result["last_modified"]
data["last_error"] = ""
_errors[key] = 0
try:
feed_title, items = _parse_feed(result["body"])
except Exception as exc:
data["last_error"] = f"Parse error: {exc}"
_errors[key] = _errors.get(key, 0) + 1
_feeds[key] = data
_save(bot, key, data)
return
if feed_title and not data.get("title"):
data["title"] = feed_title
seen = set(data.get("seen", []))
seen_list = list(data.get("seen", []))
new_items = [item for item in items if item["id"] not in seen]
if announce and new_items:
channel = data["channel"]
name = data["name"]
shown = new_items[:_MAX_ANNOUNCE]
for item in shown:
title = _truncate(item["title"]) if item["title"] else "(no title)"
link = item["link"]
line = f"[{name}] {title}"
if link:
line += f" -- {link}"
await bot.send(channel, line)
remaining = len(new_items) - len(shown)
if remaining > 0:
await bot.send(channel, f"[{name}] ... and {remaining} more")
# Update seen list
for item in new_items:
seen_list.append(item["id"])
if len(seen_list) > _MAX_SEEN:
seen_list = seen_list[-_MAX_SEEN:]
data["seen"] = seen_list
_feeds[key] = data
_save(bot, key, data)
async def _poll_loop(bot, key: str) -> None:
"""Infinite poll loop for one feed."""
try:
while True:
data = _feeds.get(key) or _load(bot, key)
if data is None:
return
interval = data.get("interval", _DEFAULT_INTERVAL)
# Back off on consecutive errors
errs = _errors.get(key, 0)
if errs >= 5:
interval = min(interval * 2, _MAX_INTERVAL)
await asyncio.sleep(interval)
await _poll_once(bot, key, announce=True)
except asyncio.CancelledError:
pass
def _start_poller(bot, key: str) -> None:
"""Create and track a poller task."""
existing = _pollers.get(key)
if existing and not existing.done():
return
task = asyncio.create_task(_poll_loop(bot, key))
_pollers[key] = task
def _stop_poller(key: str) -> None:
"""Cancel and remove a poller task."""
task = _pollers.pop(key, None)
if task and not task.done():
task.cancel()
_feeds.pop(key, None)
_errors.pop(key, 0)
# -- Restore on connect -----------------------------------------------------
def _restore(bot) -> None:
"""Rebuild pollers from persisted state."""
for key in bot.state.keys("rss"):
existing = _pollers.get(key)
if existing and not existing.done():
continue
data = _load(bot, key)
if data is None:
continue
_feeds[key] = data
_start_poller(bot, key)
@event("001")
async def on_connect(bot, message):
"""Restore RSS feed pollers on connect."""
_restore(bot)
# -- Command handler ---------------------------------------------------------
@command("rss", help="RSS: !rss add|del|list|check")
async def cmd_rss(bot, message):
"""Per-channel RSS/Atom feed subscriptions.
Usage:
!rss add <url> [name] Subscribe a feed (admin)
!rss del <name> Unsubscribe a feed (admin)
!rss list List feeds in this channel
!rss check <name> Force-poll a feed now
"""
parts = message.text.split(None, 3)
if len(parts) < 2:
await bot.reply(message, "Usage: !rss <add|del|list|check> [args]")
return
sub = parts[1].lower()
# -- list (any user, any context) ----------------------------------------
if sub == "list":
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
channel = message.target
prefix = f"{channel}:"
feeds = []
for key in bot.state.keys("rss"):
if key.startswith(prefix):
data = _load(bot, key)
if data:
name = data["name"]
err = data.get("last_error", "")
if err:
feeds.append(f"{name} (error)")
else:
feeds.append(name)
if not feeds:
await bot.reply(message, "No feeds in this channel")
return
await bot.reply(message, f"Feeds: {', '.join(feeds)}")
return
# -- check (any user, channel only) --------------------------------------
if sub == "check":
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 3:
await bot.reply(message, "Usage: !rss check <name>")
return
name = parts[2].lower()
channel = message.target
key = _state_key(channel, name)
data = _load(bot, key)
if data is None:
await bot.reply(message, f"No feed '{name}' in this channel")
return
_feeds[key] = data
await _poll_once(bot, key, announce=True)
data = _feeds.get(key, data)
if data.get("last_error"):
await bot.reply(message, f"{name}: error -- {data['last_error']}")
else:
await bot.reply(message, f"{name}: checked")
return
# -- add (admin, channel only) -------------------------------------------
if sub == "add":
if not bot._is_admin(message):
await bot.reply(message, "Permission denied: add requires admin")
return
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 3:
await bot.reply(message, "Usage: !rss add <url> [name]")
return
url = parts[2]
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
name = parts[3].lower() if len(parts) > 3 else _derive_name(url)
if not _validate_name(name):
await bot.reply(
message,
"Invalid name (lowercase alphanumeric + hyphens, 1-20 chars)",
)
return
channel = message.target
key = _state_key(channel, name)
# Check for duplicate
if _load(bot, key) is not None:
await bot.reply(message, f"Feed '{name}' already exists in this channel")
return
# Check per-channel limit
prefix = f"{channel}:"
count = sum(1 for k in bot.state.keys("rss") if k.startswith(prefix))
if count >= _MAX_FEEDS:
await bot.reply(message, f"Channel feed limit reached ({_MAX_FEEDS})")
return
# Test-fetch to validate URL and seed seen list
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, _fetch_feed, url, "", "")
if result["error"]:
await bot.reply(message, f"Fetch failed: {result['error']}")
return
feed_title = ""
seen = []
try:
feed_title, items = _parse_feed(result["body"])
seen = [item["id"] for item in items]
if len(seen) > _MAX_SEEN:
seen = seen[-_MAX_SEEN:]
except Exception as exc:
await bot.reply(message, f"Parse failed: {exc}")
return
now = datetime.now(timezone.utc).isoformat()
data = {
"url": url,
"name": name,
"channel": channel,
"interval": _DEFAULT_INTERVAL,
"added_by": message.nick,
"added_at": now,
"seen": seen,
"last_poll": now,
"last_error": "",
"etag": result["etag"],
"last_modified": result["last_modified"],
"title": feed_title,
}
_save(bot, key, data)
_feeds[key] = data
_start_poller(bot, key)
display = feed_title or name
item_count = len(seen)
await bot.reply(
message,
f"Subscribed '{name}' ({display}, {item_count} existing items)",
)
return
# -- del (admin, channel only) -------------------------------------------
if sub == "del":
if not bot._is_admin(message):
await bot.reply(message, "Permission denied: del requires admin")
return
if not message.is_channel:
await bot.reply(message, "Use this command in a channel")
return
if len(parts) < 3:
await bot.reply(message, "Usage: !rss del <name>")
return
name = parts[2].lower()
channel = message.target
key = _state_key(channel, name)
if _load(bot, key) is None:
await bot.reply(message, f"No feed '{name}' in this channel")
return
_stop_poller(key)
_delete(bot, key)
await bot.reply(message, f"Unsubscribed '{name}'")
return
await bot.reply(message, "Usage: !rss <add|del|list|check> [args]")