Files
transcript/app/downloader.py
Julien Calixte cd23581870 fix: use tv_embedded player client to bypass n-challenge requirement
tv_embedded doesn't need n-challenge solving, falling back to web.
2026-03-23 22:47:42 +01:00

83 lines
2.6 KiB
Python

import asyncio
import uuid
import os
from pathlib import Path
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
from app.config import settings
# Scratch directory that receives the audio files written by yt-dlp.
AUDIO_TMP_DIR = "/tmp/apoena-audio"

# Query-string keys that only carry share/tracking metadata. They do not
# identify the video, so URL normalization drops them.
_STRIP_PARAMS = {
    "si",
    "is",
    "feature",
    "pp",
    "ab_channel",
}
def normalize_youtube_url(url: str) -> str:
    """Normalize YouTube URLs to canonical form for yt-dlp compatibility.

    Converts youtu.be short URLs to ``youtube.com/watch?v=`` and strips
    tracking/share parameters (``si=``, ``is=``, ``feature=``, etc.) from
    youtube.com URLs.

    Args:
        url: The URL to normalize.

    Returns:
        For a youtu.be short link with a video id:
        ``https://www.youtube.com/watch?v=<id>`` (any query string on the
        short link is dropped). For a youtube.com / www.youtube.com /
        m.youtube.com link: the same URL with the host rewritten to
        ``www.youtube.com`` and the tracking parameters removed. Any other
        URL is returned unchanged.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()

    if netloc in ("youtu.be", "www.youtu.be"):
        # Keep only the first path segment so a trailing slash or extra
        # segments never end up inside the video id.
        video_id = parsed.path.strip("/").split("/", 1)[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"
        # A short link without an id falls through and is returned as-is.

    if netloc in ("youtube.com", "www.youtube.com", "m.youtube.com"):
        qs = parse_qs(parsed.query, keep_blank_values=True)
        # parse_qs maps each key to a list; only the first value of a
        # repeated parameter is kept.
        new_query = urlencode(
            {k: v[0] for k, v in qs.items() if k not in _STRIP_PARAMS}
        )
        return urlunparse(
            parsed._replace(netloc="www.youtube.com", query=new_query)
        )

    return url
async def extract_audio(url: str) -> Path:
    """Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file.

    The URL is first passed through ``normalize_youtube_url``. yt-dlp is then
    run as a subprocess, writing to ``AUDIO_TMP_DIR`` under a fresh UUID
    file name. Anything yt-dlp prints on stderr is logged as a warning.

    Args:
        url: The media URL to download.

    Returns:
        Path to the downloaded audio file. This is normally
        ``<AUDIO_TMP_DIR>/<uuid>.mp3``; if that file does not exist, the
        first file found with the same UUID prefix is returned instead.

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status (the message is
            its stderr output), or if it exits successfully but no output
            file can be found.
    """
    import logging

    url = normalize_youtube_url(url)
    job_id = str(uuid.uuid4())
    outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    expected = Path(f"{AUDIO_TMP_DIR}/{job_id}.mp3")

    cmd = [
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best/b",
        "--extractor-args", "youtube:player_client=tv_embedded,web",
        "--output", outtmpl,
    ]
    if settings.yt_dlp_cookies_file:
        cmd += ["--cookies", settings.yt_dlp_cookies_file]
    # "--" ends option parsing, so a URL that starts with "-" can never be
    # interpreted as a yt-dlp command-line option.
    cmd += ["--", url]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()
    # errors="replace": undecodable bytes in stderr must not mask the real
    # outcome with a UnicodeDecodeError.
    stderr_text = stderr.decode(errors="replace").strip()
    if stderr_text:
        logging.getLogger(__name__).warning("yt-dlp stderr: %s", stderr_text)

    if proc.returncode != 0:
        # Best-effort removal of partial files left behind by the failed run;
        # a cleanup problem must not hide the yt-dlp error raised below.
        try:
            for leftover in Path(AUDIO_TMP_DIR).glob(f"{job_id}.*"):
                leftover.unlink()
        except OSError:
            pass
        raise RuntimeError(stderr_text or "yt-dlp failed with no output")

    if expected.exists():
        return expected

    # yt-dlp sometimes keeps the original extension even with --audio-format mp3
    # Find whatever file was created with this job_id prefix
    fallback = next(iter(Path(AUDIO_TMP_DIR).glob(f"{job_id}.*")), None)
    if fallback is not None:
        return fallback

    raise RuntimeError("yt-dlp produced no output file")