Converts youtu.be short URLs to youtube.com/watch?v= format and strips tracking/share parameters (si=, is=, feature=, etc.) that can confuse yt-dlp.
79 lines
2.5 KiB
Python
79 lines
2.5 KiB
Python
import asyncio
|
|
import uuid
|
|
import os
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
|
from app.config import settings
|
|
|
|
# Directory that extract_audio uses for its yt-dlp output template and
# where it looks for the resulting audio file.
AUDIO_TMP_DIR = "/tmp/apoena-audio"
|
|
|
|
# Query parameters that are tracking/share metadata and not part of the video identity
|
|
# Consulted by normalize_youtube_url: any query key listed here is dropped
# when the query string of a youtube.com URL is rebuilt.
_STRIP_PARAMS = {"si", "is", "feature", "pp", "ab_channel"}
|
|
|
|
|
|
def normalize_youtube_url(url: str) -> str:
    """Normalize YouTube URLs to canonical form for yt-dlp compatibility.

    Converts youtu.be short URLs to youtube.com/watch?v= and strips
    tracking/share parameters (si=, is=, feature=, etc.).

    Behavior by host (matched case-insensitively on the URL's netloc):

    * ``youtu.be`` / ``www.youtu.be``: the first path segment is taken as
      the video id and ``https://www.youtube.com/watch?v=<id>`` is returned.
      The original query string is discarded entirely. If the path carries
      no id, the URL falls through and is returned unchanged.
    * ``youtube.com`` / ``www.youtube.com`` / ``m.youtube.com``: the host is
      rewritten to ``www.youtube.com`` and every query key listed in
      ``_STRIP_PARAMS`` is removed. When a key is repeated only its first
      value is kept.
    * Any other host: the URL is returned unchanged.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL, or ``url`` itself when no rule applies.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()

    if netloc in ("youtu.be", "www.youtu.be"):
        # Keep only the first path segment: a trailing slash or extra
        # segments ("/<id>/") must not leak into the v= parameter.
        video_id = parsed.path.strip("/").split("/", 1)[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"

    if netloc in ("youtube.com", "www.youtube.com", "m.youtube.com"):
        qs = parse_qs(parsed.query, keep_blank_values=True)
        # parse_qs maps each key to a list of values; keep the first value
        # of every key that is not tracking/share metadata.
        kept = {k: v[0] for k, v in qs.items() if k not in _STRIP_PARAMS}
        return urlunparse(
            parsed._replace(netloc="www.youtube.com", query=urlencode(kept))
        )

    return url
|
|
|
|
|
|
async def extract_audio(url: str) -> Path:
    """Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file.

    The URL is first passed through normalize_youtube_url. yt-dlp is then
    run as a subprocess writing to ``AUDIO_TMP_DIR/<uuid4>.<ext>``; if
    ``settings.yt_dlp_cookies_file`` is set it is forwarded via ``--cookies``.

    Args:
        url: The media URL to download audio from.

    Returns:
        Path to ``<uuid4>.mp3`` when it exists, otherwise the first file in
        AUDIO_TMP_DIR whose name starts with the job's uuid (so the
        extension may not be ``.mp3``).

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status (the message is
            its stderr output), or if it succeeds but no output file is found.
    """
    url = normalize_youtube_url(url)
    job_id = str(uuid.uuid4())
    tmp_dir = Path(AUDIO_TMP_DIR)
    outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    expected = Path(f"{AUDIO_TMP_DIR}/{job_id}.mp3")

    cmd = [
        "yt-dlp",
        "--no-warnings",
        "--quiet",
        # A watch URL that also carries list= would otherwise download the
        # whole playlist into the single output template.
        "--no-playlist",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best",
        "--output", outtmpl,
    ]

    if settings.yt_dlp_cookies_file:
        cmd += ["--cookies", settings.yt_dlp_cookies_file]

    # "--" ends option parsing, so a URL/ID that starts with "-" is treated
    # as a positional argument rather than as a yt-dlp option.
    cmd += ["--", url]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        # Remove any partial download so failed jobs do not pile up on disk.
        for partial in tmp_dir.glob(f"{job_id}.*"):
            try:
                partial.unlink()
            except OSError:
                # Best-effort cleanup; the yt-dlp failure raised below is
                # the error the caller needs to see.
                pass
        # errors="replace": undecodable bytes in stderr must not mask the
        # real failure with a UnicodeDecodeError.
        message = stderr.decode(errors="replace").strip()
        raise RuntimeError(message or "yt-dlp failed with no output")

    if expected.exists():
        return expected

    # yt-dlp sometimes keeps the original extension even with --audio-format mp3
    # Find whatever file was created with this job_id prefix
    fallback = next(tmp_dir.glob(f"{job_id}.*"), None)
    if fallback is not None:
        return fallback

    raise RuntimeError("yt-dlp produced no output file")
|