feat: add Bluesky, Lemmy, Odysee, and Archive.org alert backends

Bluesky (bs) searches the public post API and constructs bsky.app
URLs from at:// URIs. Lemmy (ly) queries 4 instances (lemmy.ml,
lemmy.world, programming.dev, infosec.pub) with cross-instance
dedup. Odysee (od) uses LBRY JSON-RPC claim_search for video,
audio, and documents, converting lbry:// URIs to odysee.com URLs.
Archive.org (ia) searches via the advanced search API, sorted by
date. All requests are routed through the SOCKS5 proxy via _urlopen.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: user
Date: 2026-02-15 23:07:09 +01:00
Parent: 52c49609b3
Commit: f0b198d98a
3 changed files with 231 additions and 7 deletions


@@ -345,12 +345,13 @@ No API credentials needed (uses public GQL endpoint).
!alert history <name> [n] # Show recent results (default 5)
```
Searches keywords across 14 backends: YouTube (yt), Twitch (tw), SearXNG (sx),
Reddit (rd), Mastodon (ft), DuckDuckGo (dg), Google News (gn), Kick (kk),
Dailymotion (dm), PeerTube (pt), Bluesky (bs), Lemmy (ly), Odysee (od),
Archive.org (ia). Names: lowercase alphanumeric + hyphens, 1-20 chars. Keywords:
1-100 chars. Max 20 alerts/channel. Polls every 5min. Format: `[name/yt] Title -- URL`,
etc. No API credentials needed. Persists across restarts. History stored in
`data/alert_history.db`.
## SearX


@@ -692,13 +692,17 @@ Platforms searched:
- **Kick** (`kk`) -- Public search API: channels and livestreams (no auth required)
- **Dailymotion** (`dm`) -- Public video API, sorted by recent (no auth required)
- **PeerTube** (`pt`) -- Federated video search across 4 instances (no auth required)
- **Bluesky** (`bs`) -- Public post search API via SOCKS5 proxy (no auth required)
- **Lemmy** (`ly`) -- Federated post search across 4 instances (no auth required)
- **Odysee** (`od`) -- LBRY JSON-RPC claim search: video, audio, documents (no auth required)
- **Archive.org** (`ia`) -- Internet Archive advanced search, sorted by date (no auth required)

Polling and announcements:
- Alerts are polled every 5 minutes by default
- On `add`, existing results are recorded without announcing (prevents flood)
- New results announced as `[name/<tag>] Title -- URL` where tag is one of:
  `yt`, `tw`, `sx`, `rd`, `ft`, `dg`, `gn`, `kk`, `dm`, `pt`, `bs`, `ly`, `od`, `ia`
- Titles are truncated to 80 characters
- Each platform maintains its own seen list (capped at 200 per platform)
- After 5 consecutive errors, the poll interval doubles (max 1 hour)
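The backoff rule above can be sketched as follows. This is a hypothetical illustration: the function name and the exact doubling schedule (once per block of 5 consecutive errors) are assumptions, not the bot's actual code.

```python
def next_poll_interval(base: int, consecutive_errors: int,
                       max_interval: int = 3600) -> int:
    # Double the base poll interval for each block of 5 consecutive
    # errors, capping the result at max_interval (1 hour).
    doublings = consecutive_errors // 5
    return min(base * (2 ** doublings), max_interval)
```

With a 5-minute base, 5 straight failures would push the interval to 10 minutes, and sustained failures saturate at the 1-hour cap.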


@@ -51,6 +51,16 @@ _PEERTUBE_INSTANCES = [
    "diode.zone",
]
_PEERTUBE_TIMEOUT = 4
_BLUESKY_SEARCH_URL = "https://public.api.bsky.app/xrpc/app.bsky.feed.searchPosts"
_LEMMY_INSTANCES = [
    "lemmy.ml",
    "lemmy.world",
    "programming.dev",
    "infosec.pub",
]
_LEMMY_TIMEOUT = 4
_ODYSEE_API = "https://api.na-backend.odysee.com/api/v1/proxy"
_ARCHIVE_SEARCH_URL = "https://archive.org/advancedsearch.php"

# -- Module-level tracking ---------------------------------------------------
@@ -787,6 +797,211 @@ def _search_peertube(keyword: str) -> list[dict]:
    return results

# -- Bluesky search (blocking) ----------------------------------------------

def _search_bluesky(keyword: str) -> list[dict]:
    """Search Bluesky via public search API. Blocking."""
    import urllib.parse
    params = urllib.parse.urlencode({"q": keyword, "limit": "25", "sort": "latest"})
    url = f"{_BLUESKY_SEARCH_URL}?{params}"
    req = urllib.request.Request(url, method="GET")
    req.add_header("Accept", "application/json")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    data = json.loads(raw)
    results: list[dict] = []
    for post in data.get("posts") or []:
        uri = post.get("uri", "")
        if not uri:
            continue
        # Extract rkey from at:// URI for web URL
        # URI format: at://did:plc:xxx/app.bsky.feed.post/rkey
        rkey = uri.rsplit("/", 1)[-1] if "/" in uri else ""
        author = post.get("author") or {}
        handle = author.get("handle", "")
        display = author.get("displayName") or handle
        record = post.get("record") or {}
        text = record.get("text", "")
        title = f"@{display}: {_truncate(text, 60)}"
        date = _parse_date(record.get("createdAt", ""))
        post_url = f"https://bsky.app/profile/{handle}/post/{rkey}" if handle else ""
        results.append({
            "id": uri,
            "title": title,
            "url": post_url,
            "date": date,
            "extra": "",
        })
    return results
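The at:// to bsky.app URL mapping can be exercised in isolation. The helper below is illustrative (its name is not part of the module); it mirrors the rkey extraction done inside the loop above.

```python
def bsky_web_url(at_uri: str, handle: str) -> str:
    # at://did:plc:xxx/app.bsky.feed.post/<rkey> -> bsky.app permalink.
    # Returns "" when the handle or rkey cannot be determined.
    rkey = at_uri.rsplit("/", 1)[-1] if "/" in at_uri else ""
    if not (handle and rkey):
        return ""
    return f"https://bsky.app/profile/{handle}/post/{rkey}"
```

Note the seen-list id stays the at:// URI, so a handle change does not cause a repost announcement.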
# -- Lemmy search (blocking) ------------------------------------------------

def _search_lemmy(keyword: str) -> list[dict]:
    """Search Lemmy instances via public API. Blocking."""
    import urllib.parse
    results: list[dict] = []
    seen_ids: set[str] = set()
    for instance in _LEMMY_INSTANCES:
        params = urllib.parse.urlencode({
            "q": keyword, "type_": "Posts", "sort": "New", "limit": "25",
        })
        api_url = f"https://{instance}/api/v3/search?{params}"
        req = urllib.request.Request(api_url, method="GET")
        req.add_header("Accept", "application/json")
        req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
        try:
            resp = _urlopen(req, timeout=_LEMMY_TIMEOUT)
            raw = resp.read()
            resp.close()
        except Exception as exc:
            _log.debug("lemmy %s failed: %s", instance, exc)
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        for entry in data.get("posts") or []:
            post = entry.get("post") or {}
            ap_id = post.get("ap_id", "")
            if not ap_id or ap_id in seen_ids:
                continue
            seen_ids.add(ap_id)
            name = post.get("name", "")
            community = (entry.get("community") or {}).get("name", "")
            title = f"{community}: {name}" if community else name
            date = _parse_date(post.get("published", ""))
            # Use linked URL if present, otherwise the post's ap_id
            post_url = post.get("url") or ap_id
            results.append({
                "id": ap_id,
                "title": title,
                "url": post_url,
                "date": date,
                "extra": "",
            })
    return results
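The cross-instance dedup keys on `ap_id`, the canonical ActivityPub URL of a post, so the same federated post returned by two instances collapses to one result. A minimal standalone sketch of that merge (function name illustrative):

```python
def dedup_posts(batches: list[list[dict]]) -> list[dict]:
    # Merge per-instance result batches, keeping only the first
    # occurrence of each ActivityPub id (ap_id).
    seen: set[str] = set()
    merged: list[dict] = []
    for batch in batches:
        for post in batch:
            ap_id = post.get("ap_id", "")
            if not ap_id or ap_id in seen:
                continue
            seen.add(ap_id)
            merged.append(post)
    return merged
```

Because `ap_id` points at the post's home instance, the dedup also works when a post is announced from an instance other than its origin.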
# -- Odysee/LBRY search (blocking) ------------------------------------------

def _lbry_to_odysee_url(lbry_url: str) -> str:
    """Convert lbry:// URI to https://odysee.com/ web URL."""
    if not lbry_url.startswith("lbry://"):
        return lbry_url
    return "https://odysee.com/" + lbry_url[7:].replace("#", ":")
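The separator swap can be checked in isolation; LBRY canonical URIs use `#` before channel and claim ids where odysee.com paths use `:`. Below is a standalone copy of the helper with a worked example (the channel/video names are made up):

```python
def lbry_to_odysee_url(lbry_url: str) -> str:
    # Strip the lbry:// scheme and swap LBRY's claim-id separator
    # '#' for the ':' used in odysee.com paths; pass through
    # anything that is not an lbry:// URI.
    if not lbry_url.startswith("lbry://"):
        return lbry_url
    return "https://odysee.com/" + lbry_url[7:].replace("#", ":")

print(lbry_to_odysee_url("lbry://@somechannel#a1/some-video#b2"))
# -> https://odysee.com/@somechannel:a1/some-video:b2
```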
def _search_odysee(keyword: str) -> list[dict]:
    """Search Odysee/LBRY via JSON-RPC claim_search. Blocking."""
    payload = json.dumps({
        "jsonrpc": "2.0",
        "method": "claim_search",
        "params": {
            "text": keyword,
            "order_by": ["release_time"],
            "page_size": 25,
            "stream_types": ["video", "audio", "document"],
        },
        "id": 1,
    }).encode()
    req = urllib.request.Request(
        f"{_ODYSEE_API}?m=claim_search", data=payload, method="POST",
    )
    req.add_header("Content-Type", "application/json")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    data = json.loads(raw)
    results: list[dict] = []
    for item in (data.get("result") or {}).get("items") or []:
        claim_id = item.get("claim_id", "")
        if not claim_id:
            continue
        value = item.get("value") or {}
        title = value.get("title", "")
        canonical = item.get("canonical_url", "")
        web_url = _lbry_to_odysee_url(canonical)
        # Use block timestamp for date (release_time can be bogus)
        timestamp = item.get("timestamp")
        date = ""
        if timestamp and isinstance(timestamp, int) and timestamp < 2000000000:
            try:
                date = datetime.fromtimestamp(
                    timestamp, tz=timezone.utc,
                ).strftime("%Y-%m-%d")
            except (ValueError, OSError):
                pass
        results.append({
            "id": claim_id,
            "title": title,
            "url": web_url,
            "date": date,
            "extra": "",
        })
    return results
# -- Archive.org search (blocking) ------------------------------------------

def _search_archive(keyword: str) -> list[dict]:
    """Search Archive.org via advanced search API. Blocking."""
    import urllib.parse
    params = urllib.parse.urlencode({
        "q": keyword,
        "output": "json",
        "rows": "25",
        "sort[]": "date desc",
        "fl[]": "identifier,title,date,mediatype",
    })
    url = f"{_ARCHIVE_SEARCH_URL}?{params}"
    req = urllib.request.Request(url, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    data = json.loads(raw)
    results: list[dict] = []
    for doc in (data.get("response") or {}).get("docs") or []:
        identifier = doc.get("identifier", "")
        if not identifier:
            continue
        title = doc.get("title", "")
        mediatype = doc.get("mediatype", "")
        if mediatype:
            title = f"[{mediatype}] {title}"
        date = _parse_date(doc.get("date", ""))
        results.append({
            "id": identifier,
            "title": title,
            "url": f"https://archive.org/details/{identifier}",
            "date": date,
            "extra": "",
        })
    return results
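One detail worth noting in the Archive.org query: `urlencode` percent-encodes the bracketed `sort[]` and `fl[]` keys, which advancedsearch.php accepts. A quick standalone check (the query string "mainframe" is just a placeholder):

```python
import urllib.parse

# Bracketed keys come out as sort%5B%5D / fl%5B%5D, spaces as '+',
# and the comma-separated field list as %2C-joined values.
params = urllib.parse.urlencode({
    "q": "mainframe",
    "output": "json",
    "rows": "25",
    "sort[]": "date desc",
    "fl[]": "identifier,title,date,mediatype",
})
print(params)
```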
# -- Backend registry -------------------------------------------------------

_BACKENDS: dict[str, callable] = {
@@ -800,6 +1015,10 @@ _BACKENDS: dict[str, callable] = {
    "kk": _search_kick,
    "dm": _search_dailymotion,
    "pt": _search_peertube,
    "bs": _search_bluesky,
    "ly": _search_lemmy,
    "od": _search_odysee,
    "ia": _search_archive,
}
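The registry maps each two-letter tag to its blocking search function, so the poller can dispatch by tag. A hypothetical caller (`run_search` is illustrative, not part of the module):

```python
from typing import Callable

def run_search(tag: str, keyword: str,
               backends: dict[str, Callable]) -> list[dict]:
    # Look up the backend by its two-letter tag; unknown tags raise
    # KeyError instead of silently returning nothing.
    fn = backends.get(tag)
    if fn is None:
        raise KeyError(f"unknown backend tag: {tag}")
    return fn(keyword)

# Usage with a stubbed registry:
stub = {"yt": lambda kw: [{"id": kw, "title": kw, "url": ""}]}
print(run_search("yt", "demo", stub))
```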