from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Query parameters that are tracking/share metadata and not part of the video identity
_STRIP_PARAMS = frozenset({"si", "is", "feature", "pp", "ab_channel"})

# Hosts serving short links of the form https://youtu.be/<video_id>
_SHORT_HOSTS = frozenset({"youtu.be", "www.youtu.be"})

# Hosts that are rewritten to the canonical www.youtube.com host
_WATCH_HOSTS = frozenset({"youtube.com", "www.youtube.com", "m.youtube.com"})


def normalize_youtube_url(url: str) -> str:
    """Normalize YouTube URLs to canonical form for yt-dlp compatibility.

    Converts youtu.be short URLs to youtube.com/watch?v= and strips
    tracking/share parameters (si=, is=, feature=, etc.) from
    youtube.com URLs.

    Args:
        url: The URL to normalize. It does not have to be a YouTube URL.

    Returns:
        The normalized URL. URLs whose host is not a recognized YouTube
        host, short links without a video id, and URLs that cannot be
        parsed are returned unchanged.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # Malformed input (e.g. an unbalanced IPv6 bracket): leave it
        # untouched instead of failing inside the normalizer.
        return url

    # ``hostname`` is lower-cased and excludes any port or userinfo, so
    # "YouTube.com:443" is matched the same way as "youtube.com".
    host = parsed.hostname or ""

    if host in _SHORT_HOSTS:
        # Only the first path segment is the video id; this keeps a
        # trailing slash or extra segment out of the ``v`` parameter.
        # Any query string on the short link is discarded.
        video_id = parsed.path.strip("/").partition("/")[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"
        return url

    if host in _WATCH_HOSTS:
        params = parse_qs(parsed.query, keep_blank_values=True)
        kept = {
            key: values
            for key, values in params.items()
            if key not in _STRIP_PARAMS
        }
        # doseq=True emits every value of a repeated parameter rather
        # than silently keeping only the first one.
        new_query = urlencode(kept, doseq=True)
        return urlunparse(
            parsed._replace(netloc="www.youtube.com", query=new_query)
        )

    return url