This commit is contained in:
Julien Calixte
2026-03-23 18:54:10 +01:00
commit 8e137ace86
10 changed files with 829 additions and 0 deletions

8
.dockerignore Normal file
View File

@@ -0,0 +1,8 @@
.git
.claude
__pycache__
*.pyc
*.pyo
.env
.venv
venv

20
Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# Slim Debian-based Python image so apt packages (ffmpeg) are available.
FROM python:3.11-slim
# ffmpeg: required by yt-dlp to extract/transcode audio to mp3.
# curl: presumably for healthchecks — TODO confirm (no HEALTHCHECK directive here).
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app/ ./app/
# Scratch dir for downloaded audio (also created by the app's lifespan hook at startup).
RUN mkdir -p /tmp/apoena-audio
ENV MAX_UPLOAD_SIZE_MB=500
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

0
app/__init__.py Normal file
View File

12
app/config.py Normal file
View File

@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application configuration, populated from environment variables."""

    # Hard cap on accepted upload size, in megabytes (env: MAX_UPLOAD_SIZE_MB).
    max_upload_size_mb: int = 500

    @property
    def max_upload_size_bytes(self) -> int:
        """The configured upload limit expressed in bytes."""
        return self.max_upload_size_mb * 1024 ** 2


settings = Settings()

45
app/downloader.py Normal file
View File

@@ -0,0 +1,45 @@
import asyncio
import uuid
import os
from pathlib import Path
AUDIO_TMP_DIR = "/tmp/apoena-audio"
async def extract_audio(url: str) -> Path:
    """Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file.

    Raises RuntimeError when yt-dlp exits non-zero or produces no output file.
    """
    job_id = str(uuid.uuid4())
    out_template = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    mp3_path = Path(AUDIO_TMP_DIR) / f"{job_id}.mp3"
    args = [
        "yt-dlp",
        "--no-warnings",
        "--quiet",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best",
        "--output", out_template,
        url,
    ]
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip() or "yt-dlp failed with no output")
    if mp3_path.exists():
        return mp3_path
    # yt-dlp sometimes keeps the original extension even with --audio-format mp3;
    # fall back to whatever file was created under this job's prefix.
    fallback = next(iter(Path(AUDIO_TMP_DIR).glob(f"{job_id}.*")), None)
    if fallback is not None:
        return fallback
    raise RuntimeError("yt-dlp produced no output file")

59
app/main.py Normal file
View File

@@ -0,0 +1,59 @@
import os
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse
from pydantic import BaseModel
from app import downloader
STATIC_DIR = Path(__file__).parent / "static"
AUDIO_TMP_DIR = Path("/tmp/apoena-audio")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan hook: ensure the scratch dir for downloaded audio exists."""
    AUDIO_TMP_DIR.mkdir(parents=True, exist_ok=True)
    yield


app = FastAPI(title="apoena-transcript", lifespan=lifespan)
class ExtractAudioRequest(BaseModel):
    """Request body for POST /extract-audio."""

    # Page/video URL handed to yt-dlp (YouTube, TikTok, etc.).
    url: str
@app.get("/health")
async def health():
    """Liveness probe; returns a static OK payload."""
    return {"status": "ok"}
@app.get("/worker.js")
async def worker_js():
    # Served from an explicit route so the worker script gets the correct JS
    # media type; the front-end loads it as a module worker ('/worker.js?v=3').
    return FileResponse(STATIC_DIR / "worker.js", media_type="application/javascript")
@app.post("/extract-audio")
async def extract_audio(body: ExtractAudioRequest, background_tasks: BackgroundTasks):
    """Download audio from a URL via yt-dlp and stream it back as mp3.

    Returns 422 when yt-dlp cannot extract audio from the given URL. The
    temp file is deleted by a background task after the response is sent.
    """
    try:
        audio_path = await downloader.extract_audio(body.url)
    except RuntimeError as e:
        # Chain the cause so the underlying yt-dlp failure stays visible in logs.
        raise HTTPException(status_code=422, detail=str(e)) from e
    # Runs after the response body has been streamed to the client.
    background_tasks.add_task(_delete_file, audio_path)
    return FileResponse(audio_path, media_type="audio/mpeg", filename="audio.mp3")
def _delete_file(path):
    """Best-effort removal of a temp audio file; missing/locked files are ignored."""
    try:
        Path(path).unlink()
    except OSError:
        pass
@app.get("/")
async def index():
    """Serve the single-page UI."""
    return FileResponse(STATIC_DIR / "index.html")

611
app/static/index.html Normal file
View File

@@ -0,0 +1,611 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Apoena Transcript</title>
<style>
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: #0f0f13;
color: #e2e2e8;
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem 1rem;
}
.container { width: 100%; max-width: 680px; }
header {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 2rem;
}
h1 { font-size: 1.4rem; font-weight: 600; letter-spacing: -0.02em; }
h1 span { color: #7c6af7; }
.badge {
font-size: 0.72rem;
font-weight: 600;
padding: 0.25rem 0.6rem;
border-radius: 999px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.badge-gpu { background: #1a2f1a; color: #4ade80; border: 1px solid #166534; }
.badge-wasm { background: #2a2318; color: #fbbf24; border: 1px solid #92400e; }
.badge-loading { background: #1e1e28; color: #94a3b8; border: 1px solid #334155; }
/* Model status bar */
.model-status {
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 12px;
padding: 1rem 1.2rem;
margin-bottom: 1.5rem;
font-size: 0.85rem;
color: #94a3b8;
}
.model-status.ready { color: #4ade80; }
.model-status.error { color: #f87171; }
.progress-bar-wrap {
height: 4px;
background: #2a2a38;
border-radius: 2px;
margin-top: 0.6rem;
overflow: hidden;
}
.progress-bar {
height: 100%;
background: #7c6af7;
border-radius: 2px;
transition: width 0.2s ease;
}
/* Input cards */
.card {
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
}
.card h2 {
font-size: 0.9rem;
font-weight: 600;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.06em;
margin-bottom: 1rem;
}
/* Drop zone */
.drop-zone {
border: 2px dashed #2a2a38;
border-radius: 10px;
padding: 2rem;
text-align: center;
cursor: pointer;
transition: border-color 0.15s, background 0.15s;
color: #64748b;
font-size: 0.9rem;
}
.drop-zone:hover, .drop-zone.drag-over {
border-color: #7c6af7;
background: #1e1a30;
color: #a89cf8;
}
.drop-zone input { display: none; }
.drop-zone .icon { font-size: 2rem; margin-bottom: 0.5rem; display: block; }
/* URL input row */
.url-row {
display: flex;
gap: 0.6rem;
}
.url-row input {
flex: 1;
background: #0f0f13;
border: 1px solid #2a2a38;
border-radius: 8px;
padding: 0.65rem 0.9rem;
color: #e2e2e8;
font-size: 0.9rem;
outline: none;
transition: border-color 0.15s;
}
.url-row input:focus { border-color: #7c6af7; }
.url-row input::placeholder { color: #44444f; }
button {
background: #7c6af7;
color: #fff;
border: none;
border-radius: 8px;
padding: 0.65rem 1.2rem;
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: background 0.15s, opacity 0.15s;
white-space: nowrap;
}
button:hover:not(:disabled) { background: #6b58e8; }
button:disabled { opacity: 0.4; cursor: not-allowed; }
/* Language selector */
.option-row {
display: flex;
align-items: center;
gap: 0.6rem;
margin-top: 0.8rem;
font-size: 0.82rem;
color: #64748b;
}
.option-row select {
background: #0f0f13;
border: 1px solid #2a2a38;
border-radius: 6px;
padding: 0.3rem 0.6rem;
color: #94a3b8;
font-size: 0.82rem;
outline: none;
cursor: pointer;
}
/* Transcribing state */
.transcribing-panel {
display: none;
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
text-align: center;
}
.transcribing-panel.active { display: block; }
.spinner {
width: 36px;
height: 36px;
border: 3px solid #2a2a38;
border-top-color: #7c6af7;
border-radius: 50%;
animation: spin 0.8s linear infinite;
margin: 0 auto 0.8rem;
}
@keyframes spin { to { transform: rotate(360deg); } }
.transcribing-label { font-size: 0.95rem; color: #94a3b8; }
.elapsed { font-size: 0.8rem; color: #44444f; margin-top: 0.3rem; }
/* Result */
.result-panel {
display: none;
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
}
.result-panel.active { display: block; }
.result-header {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 1rem;
}
.result-header h2 {
font-size: 0.9rem;
font-weight: 600;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.06em;
}
.result-actions { display: flex; gap: 0.5rem; }
.result-actions button {
background: #2a2a38;
color: #94a3b8;
font-size: 0.78rem;
padding: 0.4rem 0.8rem;
}
.result-actions button:hover { background: #353547; color: #e2e2e8; }
.result-actions .copy-btn.copied { background: #166534; color: #4ade80; }
.result-text {
font-size: 0.9rem;
line-height: 1.7;
color: #cbd5e1;
white-space: pre-wrap;
word-break: break-word;
max-height: 400px;
overflow-y: auto;
padding-right: 0.3rem;
}
.result-text::-webkit-scrollbar { width: 4px; }
.result-text::-webkit-scrollbar-track { background: transparent; }
.result-text::-webkit-scrollbar-thumb { background: #2a2a38; border-radius: 2px; }
/* Error toast */
.error-panel {
display: none;
background: #2a1010;
border: 1px solid #7f1d1d;
border-radius: 12px;
padding: 1rem 1.2rem;
margin-bottom: 1rem;
font-size: 0.85rem;
color: #fca5a5;
}
.error-panel.active { display: block; }
</style>
</head>
<body>
<div class="container">
<header>
<h1><span>apoena</span> transcript</h1>
<span id="device-badge" class="badge badge-loading">Loading...</span>
</header>
<!-- Model status -->
<div id="model-status" class="model-status">
Loading model — first visit downloads ~100 MB, then it's cached locally.
<div class="progress-bar-wrap" id="progress-wrap" style="display:none">
<div class="progress-bar" id="progress-bar" style="width:0%"></div>
</div>
</div>
<!-- Error panel -->
<div id="error-panel" class="error-panel"></div>
<!-- File upload card -->
<div class="card">
<h2>Upload file</h2>
<div class="drop-zone" id="drop-zone">
<span class="icon">📂</span>
Drop an audio or video file here, or click to browse
<input type="file" id="file-input" accept="audio/*,video/*,.mp4,.mkv,.webm,.mov,.m4a,.mp3,.wav,.ogg,.flac" />
</div>
<div class="option-row">
Language:
<select id="lang-file">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="fr">French</option>
<option value="es">Spanish</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
<option value="ar">Arabic</option>
<option value="ko">Korean</option>
</select>
</div>
</div>
<!-- URL card -->
<div class="card">
<h2>YouTube / TikTok / URL</h2>
<div class="url-row">
<input type="url" id="url-input" placeholder="https://www.youtube.com/watch?v=..." />
<button id="url-btn" disabled>Transcribe</button>
</div>
<div class="option-row">
Language:
<select id="lang-url">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="fr">French</option>
<option value="es">Spanish</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
<option value="ar">Arabic</option>
<option value="ko">Korean</option>
</select>
</div>
</div>
<!-- Transcribing panel -->
<div id="transcribing-panel" class="transcribing-panel">
<div class="spinner"></div>
<div class="transcribing-label" id="transcribing-label">Transcribing…</div>
<div class="elapsed" id="elapsed"></div>
</div>
<!-- Result panel -->
<div id="result-panel" class="result-panel">
<div class="result-header">
<h2>Transcript</h2>
<div class="result-actions">
<button class="copy-btn" id="copy-btn">Copy</button>
<button id="srt-btn">Download SRT</button>
</div>
</div>
<div class="result-text" id="result-text"></div>
</div>
</div>
<script type="module">
// ── State ──────────────────────────────────────────────────────────────────
let modelReady = false;
let busy = false;
let lastChunks = [];
let elapsedInterval = null;
// ── DOM refs ───────────────────────────────────────────────────────────────
const deviceBadge = document.getElementById('device-badge');
const modelStatus = document.getElementById('model-status');
const progressWrap = document.getElementById('progress-wrap');
const progressBar = document.getElementById('progress-bar');
const dropZone = document.getElementById('drop-zone');
const fileInput = document.getElementById('file-input');
const urlInput = document.getElementById('url-input');
const urlBtn = document.getElementById('url-btn');
const langFile = document.getElementById('lang-file');
const langUrl = document.getElementById('lang-url');
const transcribingP = document.getElementById('transcribing-panel');
const transcribingL = document.getElementById('transcribing-label');
const elapsedEl = document.getElementById('elapsed');
const resultPanel = document.getElementById('result-panel');
const resultText = document.getElementById('result-text');
const copyBtn = document.getElementById('copy-btn');
const srtBtn = document.getElementById('srt-btn');
const errorPanel = document.getElementById('error-panel');
// ── WebGPU detection ───────────────────────────────────────────────────────
// Probe for a usable WebGPU adapter; report 'wasm' (CPU fallback) otherwise.
async function detectDevice() {
  if (!navigator.gpu) {
    return 'wasm';
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (adapter) {
      return 'webgpu';
    }
  } catch {
    // requestAdapter can throw on some browsers — treat as "no GPU".
  }
  return 'wasm';
}
// ── Web Worker ─────────────────────────────────────────────────────────────
const worker = new Worker('/worker.js?v=3', { type: 'module' });
// Route messages from the transcription worker to the UI.
// Fix: removed dead code in the 'downloading' branch — an orphan <div> was
// created but never appended, and the progressWrap/progressBar updates were
// duplicated verbatim.
worker.onmessage = (e) => {
  const { type, progress, text, chunks, message } = e.data;
  if (type === 'model-progress') {
    if (progress?.status === 'downloading') {
      // Progress may be absent on the first event; default to 0%.
      const pct = progress.progress ? Math.round(progress.progress) : 0;
      progressWrap.style.display = 'block';
      progressBar.style.width = pct + '%';
      modelStatus.textContent = `Downloading model… ${pct}%`;
    } else if (progress?.status === 'loading') {
      modelStatus.textContent = 'Loading model into memory…';
    } else if (progress?.status === 'initiate') {
      modelStatus.textContent = `Fetching ${progress.file || 'model files'}`;
    }
  }
  if (type === 'model-ready') {
    modelReady = true;
    progressWrap.style.display = 'none';
    modelStatus.className = 'model-status ready';
    modelStatus.textContent = '✓ Model ready — transcription runs locally on your device';
    urlBtn.disabled = false;
  }
  if (type === 'result') {
    lastChunks = chunks;
    showResult(text, chunks);
    setBusy(false);
  }
  if (type === 'error') {
    showError(message);
    setBusy(false);
  }
};
worker.onerror = (e) => {
showError('Worker error: ' + e.message);
setBusy(false);
};
// ── Initialise ─────────────────────────────────────────────────────────────
(async () => {
const device = await detectDevice();
if (device === 'webgpu') {
deviceBadge.textContent = 'Local · GPU';
deviceBadge.className = 'badge badge-gpu';
} else {
deviceBadge.textContent = 'Local · CPU';
deviceBadge.className = 'badge badge-wasm';
modelStatus.textContent += ' (WebGPU not available — using CPU, transcription will be slower)';
}
worker.postMessage({ type: 'load', modelId: 'Xenova/whisper-small' });
})();
// ── Audio decoding (main thread — AudioContext not available in workers) ────
// Decode any browser-supported audio/video container to mono 16kHz Float32 PCM,
// the input format Whisper expects.
async function decodeAudioToFloat32(arrayBuffer) {
  // Decode at native sample rate first, then resample to 16kHz for Whisper.
  const audioCtx = new AudioContext();
  const decoded = await audioCtx.decodeAudioData(arrayBuffer);
  await audioCtx.close();
  // Fast path: already mono 16kHz — return the raw channel data directly.
  if (decoded.sampleRate === 16000 && decoded.numberOfChannels === 1) {
    return decoded.getChannelData(0);
  }
  // Resample + downmix to mono 16kHz via OfflineAudioContext (single-channel
  // destination performs the downmix).
  const targetRate = 16000;
  const offlineCtx = new OfflineAudioContext(
    1,
    Math.ceil(decoded.duration * targetRate),
    targetRate,
  );
  const source = offlineCtx.createBufferSource();
  source.buffer = decoded;
  source.connect(offlineCtx.destination);
  source.start();
  const resampled = await offlineCtx.startRendering();
  // Explicit copy — getChannelData returns a view into AudioBuffer memory
  // which may not be transferable; owning the buffer avoids postMessage issues.
  return new Float32Array(resampled.getChannelData(0));
}
// ── File upload ────────────────────────────────────────────────────────────
dropZone.addEventListener('click', () => fileInput.click());
dropZone.addEventListener('dragover', (e) => { e.preventDefault(); dropZone.classList.add('drag-over'); });
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('drag-over'));
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
const file = e.dataTransfer.files[0];
if (file) handleFile(file);
});
fileInput.addEventListener('change', () => {
if (fileInput.files[0]) handleFile(fileInput.files[0]);
});
// Decode an uploaded file on the main thread, then hand the PCM to the worker.
async function handleFile(file) {
  if (!modelReady) return;
  if (busy) return;
  setBusy(true);
  transcribingL.textContent = `Decoding "${file.name}"…`;
  try {
    const buf = await file.arrayBuffer();
    const pcm = await decodeAudioToFloat32(buf);
    transcribingL.textContent = `Transcribing "${file.name}"…`;
    worker.postMessage({ type: 'transcribe', audioData: pcm, language: langFile.value || null });
  } catch (err) {
    showError('Failed to decode audio: ' + err.message);
    setBusy(false);
  }
}
// ── URL input ──────────────────────────────────────────────────────────────
urlInput.addEventListener('keydown', (e) => {
if (e.key === 'Enter' && !urlBtn.disabled) urlBtn.click();
});
urlBtn.addEventListener('click', async () => {
const url = urlInput.value.trim();
if (!url || !modelReady || busy) return;
setBusy(true);
transcribingL.textContent = 'Downloading audio from URL…';
try {
const res = await fetch('/extract-audio', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url }),
});
if (!res.ok) {
const err = await res.json().catch(() => ({ detail: res.statusText }));
throw new Error(err.detail || 'Server error');
}
transcribingL.textContent = 'Decoding audio…';
const arrayBuffer = await res.arrayBuffer();
const audioData = await decodeAudioToFloat32(arrayBuffer);
transcribingL.textContent = 'Transcribing…';
worker.postMessage({ type: 'transcribe', audioData, language: langUrl.value || null });
} catch (err) {
showError('Failed to extract audio: ' + err.message);
setBusy(false);
}
});
// ── UI helpers ─────────────────────────────────────────────────────────────
// Toggle the global busy state: disables the URL button, swaps the visible
// panels, and drives the elapsed-time ticker under the spinner.
function setBusy(state) {
  busy = state;
  urlBtn.disabled = state || !modelReady;
  if (!state) {
    clearInterval(elapsedInterval);
    transcribingP.className = 'transcribing-panel';
    elapsedEl.textContent = '';
    return;
  }
  errorPanel.className = 'error-panel';
  resultPanel.className = 'result-panel';
  transcribingP.className = 'transcribing-panel active';
  const startedAt = Date.now();
  clearInterval(elapsedInterval);
  elapsedInterval = setInterval(() => {
    const totalSec = Math.floor((Date.now() - startedAt) / 1000);
    const mins = Math.floor(totalSec / 60);
    elapsedEl.textContent = mins > 0
      ? `${mins}m ${totalSec % 60}s elapsed`
      : `${totalSec}s elapsed`;
  }, 1000);
}
// Display the transcript text. `chunks` is accepted for symmetry with the
// worker's result message but unused here — SRT export reads lastChunks.
function showResult(text, chunks) {
  resultText.textContent = text.trim();
  resultPanel.className = 'result-panel active';
}
// Show an error message in the dedicated error panel.
function showError(msg) {
  errorPanel.textContent = msg;
  errorPanel.className = 'error-panel active';
}
// ── Copy ───────────────────────────────────────────────────────────────────
copyBtn.addEventListener('click', () => {
navigator.clipboard.writeText(resultText.textContent).then(() => {
copyBtn.textContent = 'Copied!';
copyBtn.classList.add('copied');
setTimeout(() => {
copyBtn.textContent = 'Copy';
copyBtn.classList.remove('copied');
}, 2000);
});
});
// ── SRT download ───────────────────────────────────────────────────────────
srtBtn.addEventListener('click', () => {
const srt = chunksToSRT(lastChunks);
const blob = new Blob([srt], { type: 'text/plain' });
const a = document.createElement('a');
a.href = URL.createObjectURL(blob);
a.download = 'transcript.srt';
a.click();
});
// Build an SRT document from Whisper chunks; falls back to a single one-second
// cue holding the whole visible transcript when no timestamped chunks exist.
function chunksToSRT(chunks) {
  if (!chunks || chunks.length === 0) {
    return `1\n00:00:00,000 --> 00:00:01,000\n${resultText.textContent.trim()}\n`;
  }
  const cues = chunks.map((chunk, i) => {
    const [start, end] = chunk.timestamp || [0, 1];
    const from = toSRTTime(start);
    const to = toSRTTime(end || start + 1);
    return `${i + 1}\n${from} --> ${to}\n${chunk.text.trim()}\n`;
  });
  return cues.join('\n');
}
// Format a number of seconds as an SRT timestamp: HH:MM:SS,mmm (clamped at 0).
function toSRTTime(seconds) {
  const total = Math.max(0, seconds);
  const h = Math.floor(total / 3600);
  const m = Math.floor((total % 3600) / 60);
  const sec = Math.floor(total % 60);
  const ms = Math.floor((total % 1) * 1000);
  return `${pad(h)}:${pad(m)}:${pad(sec)},${pad(ms, 3)}`;
}
// Zero-pad the integer part of n to `len` digits.
function pad(n, len = 2) {
  return String(Math.floor(n)).padStart(len, '0');
}
</script>
</body>
</html>

58
app/static/worker.js Normal file
View File

@@ -0,0 +1,58 @@
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3/dist/transformers.min.js';
// Disable local model file check — always fetch from HuggingFace Hub
env.allowLocalModels = false;
let transcriber = null;
// Create the ASR pipeline and notify the page when it is ready. Download /
// load progress events are forwarded to the main thread for the progress bar.
// NOTE(review): device is always 'webgpu' even though the page detects a CPU
// fallback — confirm the library degrades to wasm on its own when WebGPU is
// unavailable.
async function loadModel(modelId) {
  transcriber = await pipeline(
    'automatic-speech-recognition',
    modelId,
    {
      device: 'webgpu',
      dtype: 'q4', // 4-bit quantized weights to shrink the download
      progress_callback: (progress) => {
        self.postMessage({ type: 'model-progress', progress });
      },
    }
  );
  self.postMessage({ type: 'model-ready' });
}
// audioData is a Float32Array of 16kHz mono PCM — decoded in the main thread
// to avoid the missing AudioContext issue in Web Workers. Results (or errors)
// are posted back to the page as {type: 'result'|'error'} messages.
async function transcribe(audioData, language) {
  if (!transcriber) {
    self.postMessage({ type: 'error', message: 'Model not loaded' });
    return;
  }
  try {
    // 30s windows with 5s stride let Whisper process arbitrarily long audio.
    const result = await transcriber(audioData, {
      return_timestamps: true,
      chunk_length_s: 30,
      stride_length_s: 5,
      language: language || null, // null lets the model auto-detect
      task: 'transcribe',
    });
    self.postMessage({ type: 'result', text: result.text, chunks: result.chunks || [] });
  } catch (err) {
    self.postMessage({ type: 'error', message: err.message });
  }
}
// Worker message protocol:
//   { type: 'load', modelId? }                       → load the pipeline
//   { type: 'transcribe', audioData, language }      → run transcription
self.onmessage = async (event) => {
  const { type, modelId, audioData, language } = event.data;
  if (type === 'load') {
    try {
      await loadModel(modelId || 'Xenova/whisper-small');
    } catch (err) {
      self.postMessage({ type: 'error', message: `Failed to load model: ${err.message}` });
    }
  } else if (type === 'transcribe') {
    await transcribe(audioData, language);
  }
};

9
docker-compose.yml Normal file
View File

@@ -0,0 +1,9 @@
services:
  app:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MAX_UPLOAD_SIZE_MB=500
    volumes:
      # Binds the host's /tmp/apoena-audio into the container so downloaded
      # audio is visible on the host — NOTE(review): confirm this bind mount
      # is intended rather than an anonymous/named volume.
      - /tmp/apoena-audio:/tmp/apoena-audio

7
requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
# Web framework + ASGI server
fastapi>=0.111
uvicorn[standard]>=0.29
# Form/multipart parsing — NOTE(review): no multipart endpoint is visible in
# app/main.py (uploads are decoded client-side); verify this is still needed.
python-multipart>=0.0.9
# Audio extraction from YouTube/TikTok/other URLs
yt-dlp>=2024.1
# Request models + env-driven settings (app/config.py)
pydantic>=2.0
pydantic-settings>=2.0
aiofiles>=23.0