From 34d5dd6f8d271640614b2e2e6e4033327f337fca Mon Sep 17 00:00:00 2001
From: user
Date: Mon, 16 Feb 2026 18:39:32 +0100
Subject: [PATCH] fix: resolve YouTube channel ID via InnerTube for video URLs

Video URLs (watch, shorts, embed, youtu.be) now resolve the channel ID
through the InnerTube player API -- a small JSON POST instead of fetching
the full 1MB watch page. Much more resilient to transient proxy failures.
Page scraping remains as fallback for handle URLs.

Co-Authored-By: Claude Opus 4.6
---
Review notes (ignored by git am, not part of the commit message):
- _resolve_via_innertube closes the response in a finally block so the
  connection is released even when read() raises, and checks the JSON
  shape explicitly instead of relying on the blanket except.
- Line breaks and indentation were reconstructed from a copy whose
  whitespace had been collapsed. Hunk counts were re-derived, but the
  nesting depth of the cmd_yt hunk is an assumption -- TODO confirm
  against the tree before applying.

 plugins/youtube.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/plugins/youtube.py b/plugins/youtube.py
index 4e2054d..261aebc 100644
--- a/plugins/youtube.py
+++ b/plugins/youtube.py
@@ -20,8 +20,11 @@ _CHANNEL_ID_RE = re.compile(r"UC[A-Za-z0-9_-]{22}")
 _CHANNEL_URL_RE = re.compile(r"/channel/(UC[A-Za-z0-9_-]{22})")
 _PAGE_BROWSE_RE = re.compile(rb'"browseId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
 _PAGE_CHANNEL_RE = re.compile(rb'"channelId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
+_VIDEO_ID_RE = re.compile(r"(?:v=|youtu\.be/|/embed/|/shorts/)([A-Za-z0-9_-]{11})")
 _YT_DOMAINS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be"}
 _YT_FEED_URL = "https://www.youtube.com/feeds/videos.xml?channel_id={}"
+_YT_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player"
+_YT_CLIENT_VERSION = "2.20250101.00.00"
 _ATOM_NS = "{http://www.w3.org/2005/Atom}"
 _YT_NS = "{http://www.youtube.com/xml/schemas/2015}"
 _MAX_SEEN = 200
@@ -86,14 +89,56 @@ def _extract_channel_id(url: str) -> str | None:
     return m.group(1) if m else None
 
 
+def _extract_video_id(url: str) -> str | None:
+    """Try to extract video ID from a YouTube URL."""
+    m = _VIDEO_ID_RE.search(url)
+    return m.group(1) if m else None
+
+
 # -- Blocking helpers (for executor) -----------------------------------------
 
+def _resolve_via_innertube(video_id: str) -> str | None:
+    """Resolve video ID to channel ID via InnerTube player API. Blocking.
+
+    Small JSON request/response -- much more resilient to transient proxy
+    issues than fetching the full 1MB watch page.
+
+    Returns the channel ID, or None on any request, decoding or schema
+    failure so the caller can fall back to page scraping.
+    """
+    payload = json.dumps({
+        "context": {
+            "client": {
+                "clientName": "WEB",
+                "clientVersion": _YT_CLIENT_VERSION,
+            },
+        },
+        "videoId": video_id,
+    }).encode()
+    req = urllib.request.Request(_YT_PLAYER_URL, data=payload, method="POST")
+    req.add_header("Content-Type", "application/json")
+    try:
+        resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
+        try:
+            raw = resp.read()
+        finally:
+            # Release the connection even when read() fails part-way.
+            resp.close()
+        data = json.loads(raw)
+    except Exception:
+        # Best-effort lookup: a failure here is not fatal.
+        return None
+    details = data.get("videoDetails") if isinstance(data, dict) else None
+    channel_id = details.get("channelId") if isinstance(details, dict) else None
+    if isinstance(channel_id, str) and _CHANNEL_ID_RE.fullmatch(channel_id):
+        return channel_id
+    return None
+
+
 def _resolve_channel(url: str) -> str | None:
     """Fetch YouTube page HTML and extract channel ID. Blocking.
 
-    Tries browseId first (reliable on both channel and video pages),
-    then falls back to channelId (correct on video pages but may match
-    recommended channels on channel pages).
+    Fallback for handle/non-video URLs. Tries browseId first, then channelId.
     """
     req = urllib.request.Request(url, method="GET")
     req.add_header("User-Agent", _BROWSER_UA)
@@ -434,6 +479,12 @@ async def cmd_yt(bot, message):
         # Resolve channel ID
         loop = asyncio.get_running_loop()
         channel_id = _extract_channel_id(url)
+        if not channel_id:
+            video_id = _extract_video_id(url)
+            if video_id:
+                channel_id = await loop.run_in_executor(
+                    None, _resolve_via_innertube, video_id,
+                )
         if not channel_id:
             channel_id = await loop.run_in_executor(None, _resolve_channel, url)
         if not channel_id: