DASH/HLS formats require n-challenge solving, which needs a JavaScript runtime. Skipping them makes yt-dlp fall back to progressive streams, which already carry audio and do not need challenge solving.
83 lines
2.6 KiB
Python
83 lines
2.6 KiB
Python
import asyncio
|
|
import uuid
|
|
import os
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
|
from app.config import settings
|
|
|
|
# Directory that receives yt-dlp output; extract_audio() names each file after a fresh UUID.
AUDIO_TMP_DIR = "/tmp/apoena-audio"
|
|
|
|
# Query parameters that are tracking/share metadata and not part of the video identity
_STRIP_PARAMS = {"si", "is", "feature", "pp", "ab_channel"}


def normalize_youtube_url(url: str) -> str:
    """Normalize a YouTube URL to a canonical form for yt-dlp compatibility.

    ``youtu.be`` short links become ``https://www.youtube.com/watch?v=<id>``;
    any query string on the short link is dropped. URLs on ``youtube.com``,
    ``www.youtube.com`` or ``m.youtube.com`` keep their scheme, path and
    fragment, get the host rewritten to ``www.youtube.com`` and lose the
    tracking/share parameters listed in ``_STRIP_PARAMS``. When a parameter
    is repeated, only its first value is kept.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL, or ``url`` unchanged when the host is not one of
        the recognised YouTube hosts or a short link carries no video id.
    """
    parsed = urlparse(url)
    host = parsed.netloc.lower()

    if host in ("youtu.be", "www.youtu.be"):
        # Only the first path segment is the video id; a trailing slash or
        # extra segments must not leak into the ``v=`` value.
        video_id = parsed.path.strip("/").split("/", 1)[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"

    if host in ("youtube.com", "www.youtube.com", "m.youtube.com"):
        params = parse_qs(parsed.query, keep_blank_values=True)
        kept = {
            key: values[0]
            for key, values in params.items()
            if key not in _STRIP_PARAMS
        }
        return urlunparse(
            parsed._replace(netloc="www.youtube.com", query=urlencode(kept))
        )

    return url
|
|
|
|
|
|
async def extract_audio(url: str) -> Path:
    """Download the audio track of *url* with yt-dlp and return the file path.

    The URL is first passed through ``normalize_youtube_url``. yt-dlp is then
    asked to extract the audio as 128K mp3 into ``AUDIO_TMP_DIR`` under a
    fresh UUID file name.

    Args:
        url: Address of the media to download. It is treated as untrusted
            text and is never interpreted as a yt-dlp option.

    Returns:
        ``<AUDIO_TMP_DIR>/<uuid>.mp3`` when that file exists, otherwise
        another finished file yt-dlp left under the same UUID prefix.

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status (the message is
            its stderr output) or exits cleanly without leaving an output
            file.
    """
    import logging

    logger = logging.getLogger(__name__)

    url = normalize_youtube_url(url)
    job_id = str(uuid.uuid4())
    tmp_dir = Path(AUDIO_TMP_DIR)
    outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    expected = tmp_dir / f"{job_id}.mp3"

    cmd = [
        "yt-dlp",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best/b",
        # DASH/HLS formats require n-challenge solving (needs a JS runtime);
        # skipping them falls back to progressive streams, which carry audio.
        "--extractor-args", "youtube:skip=dash,hls",
        # The output template names exactly one file, so a watch URL that
        # also carries a playlist id must not expand into the whole playlist.
        "--no-playlist",
        "--output", outtmpl,
    ]

    if settings.yt_dlp_cookies_file:
        cmd += ["--cookies", settings.yt_dlp_cookies_file]

    # "--" ends option parsing, so a URL beginning with "-" cannot be
    # smuggled in as a yt-dlp flag (e.g. --exec).
    cmd += ["--", url]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        # stdout is never read; discard it instead of buffering it in memory.
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()
    # Tolerate non-UTF-8 bytes so a decoding error cannot mask the real failure.
    stderr_text = stderr.decode(errors="replace").strip()

    if stderr_text:
        logger.warning("yt-dlp stderr: %s", stderr_text)

    if proc.returncode != 0:
        # Best-effort removal of partial downloads left by the failed run.
        for leftover in tmp_dir.glob(f"{job_id}.*"):
            try:
                leftover.unlink()
            except OSError:
                logger.debug("could not remove %s", leftover, exc_info=True)
        raise RuntimeError(stderr_text or "yt-dlp failed with no output")

    if expected.exists():
        return expected

    # yt-dlp sometimes keeps the original extension even with --audio-format mp3.
    # Pick a file with this job_id prefix, ignoring yt-dlp's in-progress
    # artefacts; sorting makes the choice deterministic.
    candidates = sorted(
        path
        for path in tmp_dir.glob(f"{job_id}.*")
        if path.suffix not in (".part", ".ytdl")
    )
    if candidates:
        return candidates[0]

    raise RuntimeError("yt-dlp produced no output file")
|