import asyncio
import logging
import os
import uuid
from pathlib import Path
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

from app.config import settings

logger = logging.getLogger(__name__)

AUDIO_TMP_DIR = "/tmp/apoena-audio"

# Query parameters that are tracking/share metadata and not part of the video identity
_STRIP_PARAMS = {"si", "is", "feature", "pp", "ab_channel"}

# Hosts serving short share links of the form https://youtu.be/<video_id>
_SHORT_LINK_HOSTS = frozenset({"youtu.be", "www.youtu.be"})

# Hosts whose URLs are rewritten onto www.youtube.com with cleaned query strings
_YOUTUBE_HOSTS = frozenset({"youtube.com", "www.youtube.com", "m.youtube.com"})


def normalize_youtube_url(url: str) -> str:
    """Normalize YouTube URLs to canonical form for yt-dlp compatibility.

    Converts youtu.be short URLs to youtube.com/watch?v= and strips
    tracking/share parameters (si=, is=, feature=, etc.).

    Args:
        url: Any URL. URLs whose host is not a recognized YouTube host are
            returned unchanged.

    Returns:
        The normalized URL, or ``url`` itself when no rule applies.
    """
    parsed = urlparse(url)
    host = parsed.netloc.lower()

    if host in _SHORT_LINK_HOSTS:
        # Only the first path segment is the video id; taking just that
        # segment keeps a trailing slash or extra path out of the v= value.
        video_id = parsed.path.strip("/").split("/", 1)[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"

    if host in _YOUTUBE_HOSTS:
        params = parse_qs(parsed.query, keep_blank_values=True)
        # Keep the first value of every parameter that is not share metadata.
        kept = {
            key: values[0]
            for key, values in params.items()
            if key not in _STRIP_PARAMS
        }
        return urlunparse(
            parsed._replace(netloc="www.youtube.com", query=urlencode(kept))
        )

    return url


def _remove_job_files(job_id: str) -> None:
    """Delete every file in AUDIO_TMP_DIR whose name starts with ``job_id.``."""
    # Materialize the matches first so the directory is not mutated mid-scan.
    for leftover in list(Path(AUDIO_TMP_DIR).glob(f"{job_id}.*")):
        try:
            leftover.unlink()
        except OSError:
            # Best-effort cleanup: the caller is already raising the real error.
            logger.warning("could not remove temporary file %s", leftover)


async def extract_audio(url: str) -> Path:
    """Download audio-only from a URL using yt-dlp.

    The URL is normalized with ``normalize_youtube_url`` and handed to a
    ``yt-dlp`` subprocess that writes into ``AUDIO_TMP_DIR`` under a fresh
    UUID-based file name.

    Args:
        url: URL of the media to download.

    Returns:
        Path to the downloaded audio file. This is the ``.mp3`` file when it
        exists; otherwise whichever file yt-dlp wrote for this job.

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status, or exits
            successfully without leaving any output file.
    """
    url = normalize_youtube_url(url)

    job_id = str(uuid.uuid4())
    outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    expected = Path(f"{AUDIO_TMP_DIR}/{job_id}.mp3")

    cmd = [
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best/b",
        "--extractor-args", "youtube:player_client=ios,web",
        "--output", outtmpl,
    ]
    if settings.yt_dlp_cookies_file:
        cmd += ["--cookies", settings.yt_dlp_cookies_file]
    # "--" ends option parsing, so a caller-supplied value that begins with
    # "-" is always treated as the URL and never as a yt-dlp option.
    cmd += ["--", url]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    # Undecodable bytes must not mask the real yt-dlp error with a
    # UnicodeDecodeError, so they are replaced instead of raising.
    stderr_text = stderr.decode(errors="replace").strip()
    if stderr_text:
        logger.warning("yt-dlp stderr: %s", stderr_text)

    if proc.returncode != 0:
        # Do not leave partial downloads behind for a job that failed.
        _remove_job_files(job_id)
        raise RuntimeError(stderr_text or "yt-dlp failed with no output")

    if expected.exists():
        return expected

    # yt-dlp sometimes keeps the original extension even with --audio-format mp3
    # Find whatever file was created with this job_id prefix
    produced = next(Path(AUDIO_TMP_DIR).glob(f"{job_id}.*"), None)
    if produced is not None:
        return produced

    raise RuntimeError("yt-dlp produced no output file")