fix: resolve YouTube channel ID via InnerTube for video URLs

Video URLs (watch, shorts, embed, youtu.be) now resolve the channel
ID through the InnerTube player API -- a small JSON POST instead of
fetching the full ~1 MB watch page. This makes resolution much more
resilient to transient proxy failures. Page scraping remains as a
fallback for handle URLs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-16 18:39:32 +01:00
parent daa3370433
commit 34d5dd6f8d

View File

@@ -20,8 +20,11 @@ _CHANNEL_ID_RE = re.compile(r"UC[A-Za-z0-9_-]{22}")
# Canonical channel URL path, e.g. /channel/UC... -> ID in group 1.
_CHANNEL_URL_RE = re.compile(r"/channel/(UC[A-Za-z0-9_-]{22})")
# Byte-level patterns for scraping a channel ID out of raw page HTML.
_PAGE_BROWSE_RE = re.compile(rb'"browseId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
_PAGE_CHANNEL_RE = re.compile(rb'"channelId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
# 11-char video ID from watch (?v=), youtu.be, /embed/, and /shorts/ URLs.
_VIDEO_ID_RE = re.compile(r"(?:v=|youtu\.be/|/embed/|/shorts/)([A-Za-z0-9_-]{11})")
# Hostnames recognized as YouTube links.
_YT_DOMAINS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be"}
# Atom feed of a channel's uploads; formatted with the UC... channel ID.
_YT_FEED_URL = "https://www.youtube.com/feeds/videos.xml?channel_id={}"
# InnerTube player endpoint used to resolve a video ID to its channel ID.
_YT_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player"
# Web-client version sent in the InnerTube request context.
_YT_CLIENT_VERSION = "2.20250101.00.00"
# XML namespace prefixes used when parsing the channel Atom feed.
_ATOM_NS = "{http://www.w3.org/2005/Atom}"
_YT_NS = "{http://www.youtube.com/xml/schemas/2015}"
# NOTE(review): presumably caps the remembered (already-seen) video IDs
# used for dedupe -- confirm against the consumer of this constant.
_MAX_SEEN = 200
@@ -86,14 +89,48 @@ def _extract_channel_id(url: str) -> str | None:
return m.group(1) if m else None
def _extract_video_id(url: str) -> str | None:
    """Return the 11-character video ID found in *url*, or None."""
    match = _VIDEO_ID_RE.search(url)
    if match is None:
        return None
    return match.group(1)
# -- Blocking helpers (for executor) -----------------------------------------
def _resolve_via_innertube(video_id: str) -> str | None:
    """Resolve a video ID to its channel ID via the InnerTube player API.

    Blocking -- run in an executor. A small JSON request/response, much
    more resilient to transient proxy issues than fetching the full ~1 MB
    watch page.

    Args:
        video_id: 11-character YouTube video ID.

    Returns:
        The UC... channel ID, or None on any failure (network error,
        malformed JSON, missing or invalid channelId).
    """
    payload = json.dumps({
        "context": {
            "client": {
                "clientName": "WEB",
                "clientVersion": _YT_CLIENT_VERSION,
            },
        },
        "videoId": video_id,
    }).encode()
    req = urllib.request.Request(_YT_PLAYER_URL, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    try:
        resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
        try:
            raw = resp.read()
        finally:
            # Close even if read() raises -- the previous version only
            # closed on the success path and leaked the connection.
            resp.close()
        data = json.loads(raw)
        channel_id = (data.get("videoDetails") or {}).get("channelId", "")
        # Validate the shape so we never hand a junk string to the feed URL.
        if channel_id and _CHANNEL_ID_RE.fullmatch(channel_id):
            return channel_id
    except Exception:
        # Deliberate best-effort: on any failure the caller falls back to
        # page scraping (see _resolve_channel).
        pass
    return None
def _resolve_channel(url: str) -> str | None:
"""Fetch YouTube page HTML and extract channel ID. Blocking.
Tries browseId first (reliable on both channel and video pages),
then falls back to channelId (correct on video pages but may match
recommended channels on channel pages).
Fallback for handle/non-video URLs. Tries browseId first, then channelId.
"""
req = urllib.request.Request(url, method="GET")
req.add_header("User-Agent", _BROWSER_UA)
@@ -434,6 +471,12 @@ async def cmd_yt(bot, message):
# Resolve channel ID
loop = asyncio.get_running_loop()
channel_id = _extract_channel_id(url)
if not channel_id:
video_id = _extract_video_id(url)
if video_id:
channel_id = await loop.run_in_executor(
None, _resolve_via_innertube, video_id,
)
if not channel_id:
channel_id = await loop.run_in_executor(None, _resolve_channel, url)
if not channel_id: