feat: add Bluesky, Lemmy, Odysee, and Archive.org alert backends

Bluesky (bs) searches public post API, constructs bsky.app URLs
from at:// URIs. Lemmy (ly) queries 4 instances (lemmy.ml,
lemmy.world, programming.dev, infosec.pub) with cross-instance
dedup. Odysee (od) uses LBRY JSON-RPC claim_search for video,
audio, and documents with lbry:// to odysee.com URL conversion.
Archive.org (ia) searches via advanced search API sorted by date.
All routed through SOCKS5 proxy via _urlopen.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-15 23:07:09 +01:00
parent 52c49609b3
commit f0b198d98a
3 changed files with 231 additions and 7 deletions

View File

@@ -345,12 +345,13 @@ No API credentials needed (uses public GQL endpoint).
!alert history <name> [n] # Show recent results (default 5)
```
Searches keywords across 10 backends: YouTube (yt), Twitch (tw), SearXNG (sx),
Searches keywords across 14 backends: YouTube (yt), Twitch (tw), SearXNG (sx),
Reddit (rd), Mastodon (ft), DuckDuckGo (dg), Google News (gn), Kick (kk),
Dailymotion (dm), PeerTube (pt). Names: lowercase alphanumeric + hyphens, 1-20
chars. Keywords: 1-100 chars. Max 20 alerts/channel. Polls every 5min.
Format: `[name/yt] Title -- URL`, etc. No API credentials needed. Persists across
restarts. History stored in `data/alert_history.db`.
Dailymotion (dm), PeerTube (pt), Bluesky (bs), Lemmy (ly), Odysee (od),
Archive.org (ia). Names: lowercase alphanumeric + hyphens, 1-20 chars. Keywords:
1-100 chars. Max 20 alerts/channel. Polls every 5min. Format: `[name/yt] Title -- URL`,
etc. No API credentials needed. Persists across restarts. History stored in
`data/alert_history.db`.
## SearX

View File

@@ -692,13 +692,17 @@ Platforms searched:
- **Kick** (`kk`) -- Public search API: channels and livestreams (no auth required)
- **Dailymotion** (`dm`) -- Public video API, sorted by recent (no auth required)
- **PeerTube** (`pt`) -- Federated video search across 4 instances (no auth required)
- **Bluesky** (`bs`) -- Public post search API via SOCKS5 proxy (no auth required)
- **Lemmy** (`ly`) -- Federated post search across 4 instances (no auth required)
- **Odysee** (`od`) -- LBRY JSON-RPC claim search: video, audio, documents (no auth required)
- **Archive.org** (`ia`) -- Internet Archive advanced search, sorted by date (no auth required)
Polling and announcements:
- Alerts are polled every 5 minutes by default
- On `add`, existing results are recorded without announcing (prevents flood)
- New results announced as `[name/<tag>] Title -- URL` where tag is `yt`, `tw`,
`sx`, `rd`, `ft`, `dg`, `gn`, `kk`, `dm`, or `pt`
- New results announced as `[name/<tag>] Title -- URL` where tag is one of:
`yt`, `tw`, `sx`, `rd`, `ft`, `dg`, `gn`, `kk`, `dm`, `pt`, `bs`, `ly`, `od`, `ia`
- Titles are truncated to 80 characters
- Each platform maintains its own seen list (capped at 200 per platform)
- 5 consecutive errors doubles the poll interval (max 1 hour)

View File

@@ -51,6 +51,16 @@ _PEERTUBE_INSTANCES = [
"diode.zone",
]
_PEERTUBE_TIMEOUT = 4
# Bluesky public AppView search endpoint (no auth required).
_BLUESKY_SEARCH_URL = "https://public.api.bsky.app/xrpc/app.bsky.feed.searchPosts"
# Lemmy instances queried in turn; results are deduped across instances
# by ActivityPub id in _search_lemmy.
_LEMMY_INSTANCES = [
    "lemmy.ml",
    "lemmy.world",
    "programming.dev",
    "infosec.pub",
]
# Per-instance request timeout (seconds) for Lemmy searches.
_LEMMY_TIMEOUT = 4
# Odysee/LBRY JSON-RPC proxy endpoint used for claim_search.
_ODYSEE_API = "https://api.na-backend.odysee.com/api/v1/proxy"
# Archive.org advanced-search endpoint (JSON output requested via params).
_ARCHIVE_SEARCH_URL = "https://archive.org/advancedsearch.php"
# -- Module-level tracking ---------------------------------------------------
@@ -787,6 +797,211 @@ def _search_peertube(keyword: str) -> list[dict]:
return results
# -- Bluesky search (blocking) ----------------------------------------------
def _search_bluesky(keyword: str) -> list[dict]:
    """Query Bluesky's public post-search endpoint. Blocking.

    Returns result dicts with id/title/url/date/extra keys; the id is
    the post's at:// URI and the url is the bsky.app web equivalent.
    """
    import urllib.parse
    query = urllib.parse.urlencode(
        {"q": keyword, "limit": "25", "sort": "latest"},
    )
    request = urllib.request.Request(
        f"{_BLUESKY_SEARCH_URL}?{query}", method="GET",
    )
    request.add_header("Accept", "application/json")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    found: list[dict] = []
    for post in payload.get("posts") or []:
        at_uri = post.get("uri", "")
        if not at_uri:
            continue
        # at://did:plc:xxx/app.bsky.feed.post/<rkey> -- take the trailing rkey
        # (empty when the URI has no slash, which also yields an empty web URL).
        rkey = at_uri.rsplit("/", 1)[-1] if "/" in at_uri else ""
        profile = post.get("author") or {}
        handle = profile.get("handle", "")
        shown_name = profile.get("displayName") or handle
        post_record = post.get("record") or {}
        snippet = post_record.get("text", "")
        title = f"@{shown_name}: {_truncate(snippet, 60)}"
        when = _parse_date(post_record.get("createdAt", ""))
        web_url = f"https://bsky.app/profile/{handle}/post/{rkey}" if handle else ""
        found.append({
            "id": at_uri,
            "title": title,
            "url": web_url,
            "date": when,
            "extra": "",
        })
    return found
# -- Lemmy search (blocking) ------------------------------------------------
def _search_lemmy(keyword: str) -> list[dict]:
    """Search Lemmy instances via public API. Blocking.

    Queries each instance in _LEMMY_INSTANCES and merges the results,
    deduplicating by ActivityPub id (ap_id) so a post federated to
    several instances appears only once.  Per-instance failures are
    logged at debug level and skipped (best-effort fan-out).
    """
    import urllib.parse
    # The query string is identical for every instance; build it once
    # instead of re-encoding it on each loop iteration.
    params = urllib.parse.urlencode({
        "q": keyword, "type_": "Posts", "sort": "New", "limit": "25",
    })
    results: list[dict] = []
    seen_ids: set[str] = set()
    for instance in _LEMMY_INSTANCES:
        api_url = f"https://{instance}/api/v3/search?{params}"
        req = urllib.request.Request(api_url, method="GET")
        req.add_header("Accept", "application/json")
        req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
        try:
            resp = _urlopen(req, timeout=_LEMMY_TIMEOUT)
            raw = resp.read()
            resp.close()
        except Exception as exc:
            # One unreachable instance must not abort the whole search.
            _log.debug("lemmy %s failed: %s", instance, exc)
            continue
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            continue
        for entry in data.get("posts") or []:
            post = entry.get("post") or {}
            ap_id = post.get("ap_id", "")
            if not ap_id or ap_id in seen_ids:
                continue
            seen_ids.add(ap_id)
            name = post.get("name", "")
            community = (entry.get("community") or {}).get("name", "")
            title = f"{community}: {name}" if community else name
            date = _parse_date(post.get("published", ""))
            # Prefer the externally linked URL; fall back to the post's ap_id.
            post_url = post.get("url") or ap_id
            results.append({
                "id": ap_id,
                "title": title,
                "url": post_url,
                "date": date,
                "extra": "",
            })
    return results
# -- Odysee/LBRY search (blocking) ------------------------------------------
def _lbry_to_odysee_url(lbry_url: str) -> str:
"""Convert lbry:// URI to https://odysee.com/ web URL."""
if not lbry_url.startswith("lbry://"):
return lbry_url
return "https://odysee.com/" + lbry_url[7:].replace("#", ":")
def _search_odysee(keyword: str) -> list[dict]:
    """Search Odysee/LBRY via JSON-RPC claim_search. Blocking.

    Covers video, audio and document streams; lbry:// canonical URLs
    are rewritten to odysee.com web URLs for the announcement.
    """
    rpc_body = {
        "jsonrpc": "2.0",
        "method": "claim_search",
        "params": {
            "text": keyword,
            "order_by": ["release_time"],
            "page_size": 25,
            "stream_types": ["video", "audio", "document"],
        },
        "id": 1,
    }
    request = urllib.request.Request(
        f"{_ODYSEE_API}?m=claim_search",
        data=json.dumps(rpc_body).encode(),
        method="POST",
    )
    request.add_header("Content-Type", "application/json")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    raw = response.read()
    response.close()
    parsed = json.loads(raw)
    claims = (parsed.get("result") or {}).get("items") or []
    found: list[dict] = []
    for claim in claims:
        claim_id = claim.get("claim_id", "")
        if not claim_id:
            continue
        meta = claim.get("value") or {}
        web_url = _lbry_to_odysee_url(claim.get("canonical_url", ""))
        # Date comes from the block timestamp (release_time can be bogus);
        # absurd values (>= 2e9 epoch secs, ~2033) are treated as unset.
        ts = claim.get("timestamp")
        when = ""
        if ts and isinstance(ts, int) and ts < 2000000000:
            try:
                when = datetime.fromtimestamp(
                    ts, tz=timezone.utc,
                ).strftime("%Y-%m-%d")
            except (ValueError, OSError):
                pass
        found.append({
            "id": claim_id,
            "title": meta.get("title", ""),
            "url": web_url,
            "date": when,
            "extra": "",
        })
    return found
# -- Archive.org search (blocking) ------------------------------------------
def _search_archive(keyword: str) -> list[dict]:
    """Search Archive.org via the advanced search API. Blocking.

    Results are requested sorted by date descending; titles are
    prefixed with the item's media type when one is present.
    """
    import urllib.parse
    query = urllib.parse.urlencode({
        "q": keyword,
        "output": "json",
        "rows": "25",
        "sort[]": "date desc",
        "fl[]": "identifier,title,date,mediatype",
    })
    request = urllib.request.Request(
        f"{_ARCHIVE_SEARCH_URL}?{query}", method="GET",
    )
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    raw = response.read()
    response.close()
    parsed = json.loads(raw)
    docs = (parsed.get("response") or {}).get("docs") or []
    found: list[dict] = []
    for doc in docs:
        item_id = doc.get("identifier", "")
        if not item_id:
            continue
        label = doc.get("title", "")
        media = doc.get("mediatype", "")
        # e.g. "[movies] Some Film" -- makes the platform mix readable.
        if media:
            label = f"[{media}] {label}"
        found.append({
            "id": item_id,
            "title": label,
            "url": f"https://archive.org/details/{item_id}",
            "date": _parse_date(doc.get("date", "")),
            "extra": "",
        })
    return found
# -- Backend registry -------------------------------------------------------
_BACKENDS: dict[str, callable] = {
@@ -800,6 +1015,10 @@ _BACKENDS: dict[str, callable] = {
"kk": _search_kick,
"dm": _search_dailymotion,
"pt": _search_peertube,
"bs": _search_bluesky,
"ly": _search_lemmy,
"od": _search_odysee,
"ia": _search_archive,
}