fix: resolve YouTube channel ID via InnerTube for video URLs
Video URLs (watch, shorts, embed, youtu.be) now resolve the channel ID through the InnerTube player API -- a small JSON POST instead of fetching the full 1MB watch page. This is much more resilient to transient proxy failures. Page scraping remains as the fallback for handle URLs, and for video URLs when the InnerTube lookup fails. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,8 +20,11 @@ _CHANNEL_ID_RE = re.compile(r"UC[A-Za-z0-9_-]{22}")
|
||||
_CHANNEL_URL_RE = re.compile(r"/channel/(UC[A-Za-z0-9_-]{22})")
|
||||
_PAGE_BROWSE_RE = re.compile(rb'"browseId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
|
||||
_PAGE_CHANNEL_RE = re.compile(rb'"channelId"\s*:\s*"(UC[A-Za-z0-9_-]{22})"')
|
||||
_VIDEO_ID_RE = re.compile(r"(?:v=|youtu\.be/|/embed/|/shorts/)([A-Za-z0-9_-]{11})")
|
||||
_YT_DOMAINS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be"}
|
||||
_YT_FEED_URL = "https://www.youtube.com/feeds/videos.xml?channel_id={}"
|
||||
_YT_PLAYER_URL = "https://www.youtube.com/youtubei/v1/player"
|
||||
_YT_CLIENT_VERSION = "2.20250101.00.00"
|
||||
_ATOM_NS = "{http://www.w3.org/2005/Atom}"
|
||||
_YT_NS = "{http://www.youtube.com/xml/schemas/2015}"
|
||||
_MAX_SEEN = 200
|
||||
@@ -86,14 +89,48 @@ def _extract_channel_id(url: str) -> str | None:
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def _extract_video_id(url: str) -> str | None:
    """Return the video ID that _VIDEO_ID_RE finds in *url*, or None.

    The URL is searched (not fully matched), so the ID may appear anywhere
    in the string; the first capture group of the first match is returned.
    """
    found = _VIDEO_ID_RE.search(url)
    if found is None:
        return None
    return found.group(1)
|
||||
|
||||
|
||||
# -- Blocking helpers (for executor) -----------------------------------------
|
||||
|
||||
def _resolve_via_innertube(video_id: str) -> str | None:
    """Resolve video ID to channel ID via InnerTube player API. Blocking.

    Small JSON request/response -- much more resilient to transient proxy
    issues than fetching the full 1MB watch page.

    Args:
        video_id: YouTube video ID to look up.

    Returns:
        The value of ``videoDetails.channelId`` from the response when it is
        a string that fully matches ``_CHANNEL_ID_RE``; otherwise ``None``.
        Any failure while sending the request, reading the response, or
        decoding the JSON also yields ``None`` rather than raising.
    """
    payload = json.dumps({
        "context": {
            "client": {
                "clientName": "WEB",
                "clientVersion": _YT_CLIENT_VERSION,
            },
        },
        "videoId": video_id,
    }).encode()
    req = urllib.request.Request(_YT_PLAYER_URL, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    try:
        resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
        try:
            raw = resp.read()
        finally:
            # Release the connection even when the read fails midway.
            resp.close()
        data = json.loads(raw)
    except Exception:
        # Deliberately best-effort: a failed lookup is reported as None so
        # the caller can try another resolution strategy.
        return None

    # The response shape is not under our control; validate each level
    # instead of relying on exceptions from unexpected types.
    details = data.get("videoDetails") if isinstance(data, dict) else None
    channel_id = details.get("channelId") if isinstance(details, dict) else None
    if isinstance(channel_id, str) and _CHANNEL_ID_RE.fullmatch(channel_id):
        return channel_id
    return None
|
||||
|
||||
|
||||
def _resolve_channel(url: str) -> str | None:
|
||||
"""Fetch YouTube page HTML and extract channel ID. Blocking.
|
||||
|
||||
Tries browseId first (reliable on both channel and video pages),
|
||||
then falls back to channelId (correct on video pages but may match
|
||||
recommended channels on channel pages).
|
||||
Fallback for handle/non-video URLs. Tries browseId first, then channelId.
|
||||
"""
|
||||
req = urllib.request.Request(url, method="GET")
|
||||
req.add_header("User-Agent", _BROWSER_UA)
|
||||
@@ -434,6 +471,12 @@ async def cmd_yt(bot, message):
|
||||
# Resolve channel ID
|
||||
loop = asyncio.get_running_loop()
|
||||
channel_id = _extract_channel_id(url)
|
||||
if not channel_id:
|
||||
video_id = _extract_video_id(url)
|
||||
if video_id:
|
||||
channel_id = await loop.run_in_executor(
|
||||
None, _resolve_via_innertube, video_id,
|
||||
)
|
||||
if not channel_id:
|
||||
channel_id = await loop.run_in_executor(None, _resolve_channel, url)
|
||||
if not channel_id:
|
||||
|
||||
Reference in New Issue
Block a user