fix: normalize YouTube URLs before passing to yt-dlp
Converts youtu.be short URLs to youtube.com/watch?v= format and strips tracking/share parameters (si=, is=, feature=, etc.) that can confuse yt-dlp.
This commit is contained in:
@@ -2,13 +2,41 @@ import asyncio
|
|||||||
import uuid
|
import uuid
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
|
|
||||||
AUDIO_TMP_DIR = "/tmp/apoena-audio"
|
AUDIO_TMP_DIR = "/tmp/apoena-audio"
|
||||||
|
|
||||||
|
# Query parameters that are tracking/share metadata and not part of the video identity
_STRIP_PARAMS = frozenset({"si", "is", "feature", "pp", "ab_channel"})


def normalize_youtube_url(url: str) -> str:
    """Normalize YouTube URLs to canonical form for yt-dlp compatibility.

    Converts youtu.be short URLs to youtube.com/watch?v= (discarding the
    short link's query string) and strips tracking/share parameters
    (si=, is=, feature=, etc.) from youtube.com URLs, rewriting the host
    to www.youtube.com. When a query key is repeated, only its first value
    is kept.

    Surrounding whitespace is ignored and a scheme-less input such as
    ``youtu.be/<id>`` is treated as ``https://``. Any URL that is not a
    recognized YouTube URL, or that cannot be parsed, is returned unchanged.
    """
    candidate = url.strip()
    try:
        parsed = urlparse(candidate)
        if candidate and not parsed.scheme and not parsed.netloc:
            # Pasted links often omit the scheme ("youtu.be/<id>"); without
            # one urlparse puts the host into .path and nothing would match.
            parsed = urlparse(f"https://{candidate}")
        # .hostname is lowercased and excludes any port or userinfo, so
        # "YouTube.com:443" is still recognized.
        host = parsed.hostname or ""
    except ValueError:
        # Best-effort normalizer: leave malformed input for yt-dlp to report.
        return url

    if host in ("youtu.be", "www.youtu.be"):
        # Only the first path segment is the video id; a trailing slash or
        # extra segments must not leak into the v= parameter.
        video_id = parsed.path.strip("/").split("/", 1)[0]
        if video_id:
            return f"https://www.youtube.com/watch?v={video_id}"
        return url

    if host in ("youtube.com", "www.youtube.com", "m.youtube.com"):
        params = parse_qs(parsed.query, keep_blank_values=True)
        kept = {
            key: values[0]
            for key, values in params.items()
            if key not in _STRIP_PARAMS
        }
        return urlunparse(
            parsed._replace(
                # Protocol-relative input ("//youtube.com/...") has no scheme.
                scheme=parsed.scheme or "https",
                netloc="www.youtube.com",
                query=urlencode(kept),
            )
        )

    return url
|
||||||
|
|
||||||
|
|
||||||
async def extract_audio(url: str) -> Path:
|
async def extract_audio(url: str) -> Path:
|
||||||
"""Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file."""
|
"""Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file."""
|
||||||
|
url = normalize_youtube_url(url)
|
||||||
job_id = str(uuid.uuid4())
|
job_id = str(uuid.uuid4())
|
||||||
outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
|
outtmpl = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
|
||||||
expected = Path(f"{AUDIO_TMP_DIR}/{job_id}.mp3")
|
expected = Path(f"{AUDIO_TMP_DIR}/{job_id}.mp3")
|
||||||
|
|||||||
Reference in New Issue
Block a user