fix: resolve YouTube channel ID via InnerTube for video URLs

Video URLs (watch, shorts, embed, youtu.be) now resolve the channel
ID through the InnerTube player API -- a small JSON POST instead of
fetching the full ~1 MB watch page. This makes resolution much more
resilient to transient proxy failures. Page scraping remains as a
fallback for handle URLs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-16 18:39:32 +01:00
parent daa3370433
commit 34d5dd6f8d

View File

@@ -20,8 +20,11 @@ _CHANNEL_ID_RE = re.compile(r"UC[A-Za-z0-9_-]{22}")
# Canonical channel URL path, e.g. /channel/UC... -> ID in group 1.
_CHANNEL_URL_RE = re.compile(r"/channel/(UC[A-Za-z0-9_-]{22})")
# Byte-level patterns for scraping a channel ID out of raw page HTML.
_PAGE_BROWSE_RE = re.compile(rb'"browseId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
_PAGE_CHANNEL_RE = re.compile(rb'"channelId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
# 11-char video ID from watch (?v=), youtu.be, /embed/, and /shorts/ URLs.
_VIDEO_ID_RE = re.compile(r"(?:v=|youtu\.be/|/embed/|/shorts/)([A-Za-z0-9_-]{11})")
# Hostnames recognized as YouTube links.
_YT_DOMAINS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be"}
# Atom feed of a channel's uploads; formatted with the UC... channel ID.
_YT_FEED_URL = "https://www.youtube.com/feeds/videos.xml?channel_id={}"
# InnerTube player endpoint used to resolve a video ID to its channel ID.
_YT_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player"
# Web-client version sent in the InnerTube request context.
_YT_CLIENT_VERSION = "2.20250101.00.00"
# XML namespace prefixes used when parsing the channel Atom feed.
_ATOM_NS = "{http://www.w3.org/2005/Atom}"
_YT_NS = "{http://www.youtube.com/xml/schemas/2015}"
# NOTE(review): presumably caps the remembered (already-seen) video IDs
# used for dedupe -- confirm against the consumer of this constant.
_MAX_SEEN = 200
@@ -86,14 +89,48 @@ def _extract_channel_id(url: str) -> str | None:
return m.group(1) if m else None
def _extract_video_id(url: str) -> str | None:
    """Return the 11-character video ID found in *url*, or None."""
    match = _VIDEO_ID_RE.search(url)
    if match is None:
        return None
    return match.group(1)
# -- Blocking helpers (for executor) -----------------------------------------
def _resolve_via_innertube(video_id: str) -> str | None:
    """Resolve a video ID to its channel ID via the InnerTube player API.

    Blocking -- run in an executor. A small JSON request/response, much
    more resilient to transient proxy issues than fetching the full ~1 MB
    watch page.

    Args:
        video_id: 11-character YouTube video ID.

    Returns:
        The UC... channel ID, or None on any failure (network error,
        malformed JSON, missing or invalid channelId).
    """
    payload = json.dumps({
        "context": {
            "client": {
                "clientName": "WEB",
                "clientVersion": _YT_CLIENT_VERSION,
            },
        },
        "videoId": video_id,
    }).encode()
    req = urllib.request.Request(_YT_PLAYER_URL, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    try:
        resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
        try:
            raw = resp.read()
        finally:
            # Close even if read() raises -- the previous version only
            # closed on the success path and leaked the connection.
            resp.close()
        data = json.loads(raw)
        channel_id = (data.get("videoDetails") or {}).get("channelId", "")
        # Validate the shape so we never hand a junk string to the feed URL.
        if channel_id and _CHANNEL_ID_RE.fullmatch(channel_id):
            return channel_id
    except Exception:
        # Deliberate best-effort: on any failure the caller falls back to
        # page scraping (see _resolve_channel).
        pass
    return None
def _resolve_channel(url: str) -> str | None:
"""Fetch YouTube page HTML and extract channel ID. Blocking.
Tries browseId first (reliable on both channel and video pages),
then falls back to channelId (correct on video pages but may match
recommended channels on channel pages).
Fallback for handle/non-video URLs. Tries browseId first, then channelId.
"""
req = urllib.request.Request(url, method="GET")
req.add_header("User-Agent", _BROWSER_UA)
@@ -434,6 +471,12 @@ async def cmd_yt(bot, message):
# Resolve channel ID
loop = asyncio.get_running_loop()
channel_id = _extract_channel_id(url)
if not channel_id:
video_id = _extract_video_id(url)
if video_id:
channel_id = await loop.run_in_executor(
None, _resolve_via_innertube, video_id,
)
if not channel_id:
channel_id = await loop.run_in_executor(None, _resolve_channel, url)
if not channel_id: