# derp/plugins/_musicbrainz.py

"""MusicBrainz API helper for music discovery fallback.

Private module (underscore prefix) -- plugin loader skips it.
All functions are blocking; callers should run them in an executor.
"""

from __future__ import annotations

import json
import logging
import time
from urllib.request import Request

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Root URL of the MusicBrainz web service; request paths are appended to it.
_BASE = "https://musicbrainz.org/ws/2"
# Sent as the User-Agent header on every request made by this module.
_UA = "derp-bot/2.0.0 (https://git.mymx.me/username/derp)"

# Rate limit: MusicBrainz requires max 1 request/second.
# We use 1.1s between calls to stay well within limits.
_RATE_INTERVAL = 1.1
# time.monotonic() reading stamped by _mb_request on its most recent request
# attempt (successful or not); stays 0.0 until the first request is made.
_last_request: float = 0.0


def _mb_request(path: str, params: dict | None = None) -> dict:
    """Perform one throttled, blocking GET against the MusicBrainz API.

    Sleeps first when fewer than ``_RATE_INTERVAL`` seconds have elapsed
    since the previous request was stamped, then fetches ``_BASE/path``
    with ``fmt=json`` and decodes the body.

    Args:
        path: Resource path relative to ``_BASE``, e.g. ``"artist"``.
        params: Extra query-string pairs. Keys and values are inserted
            verbatim, so callers must percent-encode them beforehand.

    Returns:
        The parsed JSON body, or ``{}`` when fetching or parsing raises
        an ``Exception`` (the failure is logged as a warning).
    """
    global _last_request
    from derp.http import urlopen

    # Throttle: wait out whatever is left of the minimum interval.
    since_last = time.monotonic() - _last_request
    if since_last < _RATE_INTERVAL:
        time.sleep(_RATE_INTERVAL - since_last)

    # "fmt=json" always leads; caller-supplied pairs follow in dict order.
    pairs = ["fmt=json"]
    pairs.extend(f"{key}={value}" for key, value in (params or {}).items())
    url = f"{_BASE}/{path}?" + "&".join(pairs)
    request = Request(url, headers={"User-Agent": _UA})

    try:
        response = urlopen(request, timeout=10, proxy=False)
        # Stamp as soon as the server has answered, before reading the body.
        _last_request = time.monotonic()
        body = response.read().decode()
        return json.loads(body)
    except Exception:
        # A failed attempt still counts against the rate limit.
        _last_request = time.monotonic()
        log.warning("musicbrainz: request failed: %s", path, exc_info=True)
        return {}


def mb_search_artist(name: str) -> str | None:
    """Search for an artist by name, return MBID or None.

    Args:
        name: Free-text artist name. It is treated as literal text, not
            as a Lucene expression.

    Returns:
        The MBID of the best match, or ``None`` when the search returns
        no artists or the best match scores below 50.
    """
    import re
    from urllib.parse import quote

    # The search endpoint parses ``query`` as Lucene syntax. Backslash-escape
    # every Lucene special character so names such as "!!!", "AC/DC" or
    # "Sunn O)))" are searched as text instead of being mis-parsed or
    # rejected (which would surface here as "no match").
    literal = re.sub(r'([+\-&|!(){}\[\]^"~*?:\\/])', r"\\\1", name)

    data = _mb_request("artist", {"query": quote(literal), "limit": "1"})
    artists = data.get("artists", [])
    if not artists:
        return None

    best = artists[0]
    # Require a reasonable score to avoid false matches
    if best.get("score", 0) < 50:
        return None
    return best.get("id")


def mb_artist_tags(mbid: str) -> list[str]:
    """Return the names of an artist's most-used tags.

    Looks up the artist identified by *mbid* with its tags included,
    orders the tags by their ``count`` (highest first, missing counts
    treated as 0) and keeps the first five. Entries among those five
    that lack a name are dropped, so fewer than five names may come back.
    Returns an empty list when the lookup yields no tags.
    """
    response = _mb_request(f"artist/{mbid}", {"inc": "tags"})
    raw_tags = response.get("tags", [])
    if not raw_tags:
        return []

    def _usage(tag):
        return tag.get("count", 0)

    ranked = sorted(raw_tags, key=_usage, reverse=True)

    names = []
    for tag in ranked[:5]:
        label = tag.get("name")
        if label:
            names.append(label)
    return names


def mb_find_similar_recordings(artist: str, tags: list[str],
                               limit: int = 10) -> list[dict]:
    """Find recordings by other artists sharing top tags.

    Searches MusicBrainz for recordings tagged with the top 2 tags,
    excluding the original artist.

    Args:
        artist: Name of the artist whose own recordings are excluded.
        tags: Tag names, most relevant first; only the first two are used.
        limit: Number of recordings requested from MusicBrainz. The
            returned list can be shorter once entries are filtered out.

    Returns:
        ``[{"artist": str, "title": str}, ...]`` in the order the search
        returned them, without duplicates (compared case-insensitively on
        artist and title). Empty when *tags* is empty or nothing is found.
    """
    from urllib.parse import quote

    if not tags:
        return []

    def _phrase(text: str) -> str:
        # Inside a quoted Lucene phrase a bare '"' ends the phrase and '\'
        # starts an escape, so both must be backslash-escaped; otherwise a
        # name like 'Weird "Al" Yankovic' yields a malformed query.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    # Use top 2 tags for the query
    tag_query = " AND ".join(f"tag:{_phrase(t)}" for t in tags[:2])
    query = f"({tag_query}) AND NOT artist:{_phrase(artist)}"

    data = _mb_request("recording", {
        "query": quote(query),
        "limit": str(limit),
    })
    recordings = data.get("recordings", [])
    if not recordings:
        return []

    artist_lower = artist.lower()
    seen = set()
    results = []
    for rec in recordings:
        title = rec.get("title", "")
        credits = rec.get("artist-credit", [])
        if not credits or not title:
            continue
        # Only the first credited artist is considered.
        rec_artist = credits[0].get("name", "")
        if not rec_artist:
            continue
        # Skip the original artist (case-insensitive); the NOT clause in the
        # query matches on the index, so this is a second line of defence.
        if rec_artist.lower() == artist_lower:
            continue
        # Deduplicate by artist+title
        key = f"{rec_artist.lower()}:{title.lower()}"
        if key in seen:
            continue
        seen.add(key)
        results.append({"artist": rec_artist, "title": title})

    return results