This commit is contained in:
Julien Calixte
2026-03-23 18:54:10 +01:00
commit 8e137ace86
10 changed files with 829 additions and 0 deletions

8
.dockerignore Normal file
View File

@@ -0,0 +1,8 @@
.git
.claude
__pycache__
*.pyc
*.pyo
.env
.venv
venv

20
Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# Slim Debian-based Python image so apt packages (ffmpeg) are available.
FROM python:3.11-slim
# ffmpeg: required by yt-dlp to extract/transcode audio to mp3.
# curl: presumably for healthchecks — TODO confirm (no HEALTHCHECK directive here).
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app/ ./app/
# Scratch dir for downloaded audio (also created by the app's lifespan hook at startup).
RUN mkdir -p /tmp/apoena-audio
ENV MAX_UPLOAD_SIZE_MB=500
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

0
app/__init__.py Normal file
View File

12
app/config.py Normal file
View File

@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application configuration, populated from environment variables."""

    # Hard cap on accepted upload size, in megabytes (env: MAX_UPLOAD_SIZE_MB).
    max_upload_size_mb: int = 500

    @property
    def max_upload_size_bytes(self) -> int:
        """The configured upload limit expressed in bytes."""
        return self.max_upload_size_mb * 1024 ** 2


settings = Settings()

45
app/downloader.py Normal file
View File

@@ -0,0 +1,45 @@
import asyncio
import uuid
import os
from pathlib import Path
AUDIO_TMP_DIR = "/tmp/apoena-audio"
async def extract_audio(url: str) -> Path:
    """Download audio-only from a URL using yt-dlp. Returns path to an mp3 temp file.

    Raises RuntimeError when yt-dlp exits non-zero or produces no output file.
    """
    job_id = str(uuid.uuid4())
    out_template = f"{AUDIO_TMP_DIR}/{job_id}.%(ext)s"
    mp3_path = Path(AUDIO_TMP_DIR) / f"{job_id}.mp3"
    args = [
        "yt-dlp",
        "--no-warnings",
        "--quiet",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best",
        "--output", out_template,
        url,
    ]
    proc = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip() or "yt-dlp failed with no output")
    if mp3_path.exists():
        return mp3_path
    # yt-dlp sometimes keeps the original extension even with --audio-format mp3;
    # fall back to whatever file was created under this job's prefix.
    fallback = next(iter(Path(AUDIO_TMP_DIR).glob(f"{job_id}.*")), None)
    if fallback is not None:
        return fallback
    raise RuntimeError("yt-dlp produced no output file")

59
app/main.py Normal file
View File

@@ -0,0 +1,59 @@
import os
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse
from pydantic import BaseModel
from app import downloader
STATIC_DIR = Path(__file__).parent / "static"
AUDIO_TMP_DIR = Path("/tmp/apoena-audio")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan hook: ensure the scratch dir for downloaded audio exists."""
    AUDIO_TMP_DIR.mkdir(parents=True, exist_ok=True)
    yield


app = FastAPI(title="apoena-transcript", lifespan=lifespan)
class ExtractAudioRequest(BaseModel):
    """Request body for POST /extract-audio."""

    # Page/video URL handed to yt-dlp (YouTube, TikTok, etc.).
    url: str
@app.get("/health")
async def health():
    """Liveness probe; returns a static OK payload."""
    return {"status": "ok"}
@app.get("/worker.js")
async def worker_js():
    # Served from an explicit route so the worker script gets the correct JS
    # media type; the front-end loads it as a module worker ('/worker.js?v=3').
    return FileResponse(STATIC_DIR / "worker.js", media_type="application/javascript")
@app.post("/extract-audio")
async def extract_audio(body: ExtractAudioRequest, background_tasks: BackgroundTasks):
    """Download audio from a URL via yt-dlp and stream it back as mp3.

    Returns 422 when yt-dlp cannot extract audio from the given URL. The
    temp file is deleted by a background task after the response is sent.
    """
    try:
        audio_path = await downloader.extract_audio(body.url)
    except RuntimeError as e:
        # Chain the cause so the underlying yt-dlp failure stays visible in logs.
        raise HTTPException(status_code=422, detail=str(e)) from e
    # Runs after the response body has been streamed to the client.
    background_tasks.add_task(_delete_file, audio_path)
    return FileResponse(audio_path, media_type="audio/mpeg", filename="audio.mp3")
def _delete_file(path):
    """Best-effort removal of a temp audio file; missing/locked files are ignored."""
    try:
        Path(path).unlink()
    except OSError:
        pass
@app.get("/")
async def index():
    """Serve the single-page UI."""
    return FileResponse(STATIC_DIR / "index.html")

611
app/static/index.html Normal file
View File

@@ -0,0 +1,611 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Apoena Transcript</title>
<style>
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
background: #0f0f13;
color: #e2e2e8;
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem 1rem;
}
.container { width: 100%; max-width: 680px; }
header {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 2rem;
}
h1 { font-size: 1.4rem; font-weight: 600; letter-spacing: -0.02em; }
h1 span { color: #7c6af7; }
.badge {
font-size: 0.72rem;
font-weight: 600;
padding: 0.25rem 0.6rem;
border-radius: 999px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.badge-gpu { background: #1a2f1a; color: #4ade80; border: 1px solid #166534; }
.badge-wasm { background: #2a2318; color: #fbbf24; border: 1px solid #92400e; }
.badge-loading { background: #1e1e28; color: #94a3b8; border: 1px solid #334155; }
/* Model status bar */
.model-status {
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 12px;
padding: 1rem 1.2rem;
margin-bottom: 1.5rem;
font-size: 0.85rem;
color: #94a3b8;
}
.model-status.ready { color: #4ade80; }
.model-status.error { color: #f87171; }
.progress-bar-wrap {
height: 4px;
background: #2a2a38;
border-radius: 2px;
margin-top: 0.6rem;
overflow: hidden;
}
.progress-bar {
height: 100%;
background: #7c6af7;
border-radius: 2px;
transition: width 0.2s ease;
}
/* Input cards */
.card {
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
}
.card h2 {
font-size: 0.9rem;
font-weight: 600;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.06em;
margin-bottom: 1rem;
}
/* Drop zone */
.drop-zone {
border: 2px dashed #2a2a38;
border-radius: 10px;
padding: 2rem;
text-align: center;
cursor: pointer;
transition: border-color 0.15s, background 0.15s;
color: #64748b;
font-size: 0.9rem;
}
.drop-zone:hover, .drop-zone.drag-over {
border-color: #7c6af7;
background: #1e1a30;
color: #a89cf8;
}
.drop-zone input { display: none; }
.drop-zone .icon { font-size: 2rem; margin-bottom: 0.5rem; display: block; }
/* URL input row */
.url-row {
display: flex;
gap: 0.6rem;
}
.url-row input {
flex: 1;
background: #0f0f13;
border: 1px solid #2a2a38;
border-radius: 8px;
padding: 0.65rem 0.9rem;
color: #e2e2e8;
font-size: 0.9rem;
outline: none;
transition: border-color 0.15s;
}
.url-row input:focus { border-color: #7c6af7; }
.url-row input::placeholder { color: #44444f; }
button {
background: #7c6af7;
color: #fff;
border: none;
border-radius: 8px;
padding: 0.65rem 1.2rem;
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: background 0.15s, opacity 0.15s;
white-space: nowrap;
}
button:hover:not(:disabled) { background: #6b58e8; }
button:disabled { opacity: 0.4; cursor: not-allowed; }
/* Language selector */
.option-row {
display: flex;
align-items: center;
gap: 0.6rem;
margin-top: 0.8rem;
font-size: 0.82rem;
color: #64748b;
}
.option-row select {
background: #0f0f13;
border: 1px solid #2a2a38;
border-radius: 6px;
padding: 0.3rem 0.6rem;
color: #94a3b8;
font-size: 0.82rem;
outline: none;
cursor: pointer;
}
/* Transcribing state */
.transcribing-panel {
display: none;
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
text-align: center;
}
.transcribing-panel.active { display: block; }
.spinner {
width: 36px;
height: 36px;
border: 3px solid #2a2a38;
border-top-color: #7c6af7;
border-radius: 50%;
animation: spin 0.8s linear infinite;
margin: 0 auto 0.8rem;
}
@keyframes spin { to { transform: rotate(360deg); } }
.transcribing-label { font-size: 0.95rem; color: #94a3b8; }
.elapsed { font-size: 0.8rem; color: #44444f; margin-top: 0.3rem; }
/* Result */
.result-panel {
display: none;
background: #1a1a24;
border: 1px solid #2a2a38;
border-radius: 16px;
padding: 1.4rem;
margin-bottom: 1rem;
}
.result-panel.active { display: block; }
.result-header {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 1rem;
}
.result-header h2 {
font-size: 0.9rem;
font-weight: 600;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.06em;
}
.result-actions { display: flex; gap: 0.5rem; }
.result-actions button {
background: #2a2a38;
color: #94a3b8;
font-size: 0.78rem;
padding: 0.4rem 0.8rem;
}
.result-actions button:hover { background: #353547; color: #e2e2e8; }
.result-actions .copy-btn.copied { background: #166534; color: #4ade80; }
.result-text {
font-size: 0.9rem;
line-height: 1.7;
color: #cbd5e1;
white-space: pre-wrap;
word-break: break-word;
max-height: 400px;
overflow-y: auto;
padding-right: 0.3rem;
}
.result-text::-webkit-scrollbar { width: 4px; }
.result-text::-webkit-scrollbar-track { background: transparent; }
.result-text::-webkit-scrollbar-thumb { background: #2a2a38; border-radius: 2px; }
/* Error toast */
.error-panel {
display: none;
background: #2a1010;
border: 1px solid #7f1d1d;
border-radius: 12px;
padding: 1rem 1.2rem;
margin-bottom: 1rem;
font-size: 0.85rem;
color: #fca5a5;
}
.error-panel.active { display: block; }
</style>
</head>
<body>
<div class="container">
<header>
<h1><span>apoena</span> transcript</h1>
<span id="device-badge" class="badge badge-loading">Loading...</span>
</header>
<!-- Model status -->
<div id="model-status" class="model-status">
Loading model — first visit downloads ~100 MB, then it's cached locally.
<div class="progress-bar-wrap" id="progress-wrap" style="display:none">
<div class="progress-bar" id="progress-bar" style="width:0%"></div>
</div>
</div>
<!-- Error panel -->
<div id="error-panel" class="error-panel"></div>
<!-- File upload card -->
<div class="card">
<h2>Upload file</h2>
<div class="drop-zone" id="drop-zone">
<span class="icon">📂</span>
Drop an audio or video file here, or click to browse
<input type="file" id="file-input" accept="audio/*,video/*,.mp4,.mkv,.webm,.mov,.m4a,.mp3,.wav,.ogg,.flac" />
</div>
<div class="option-row">
Language:
<select id="lang-file">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="fr">French</option>
<option value="es">Spanish</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
<option value="ar">Arabic</option>
<option value="ko">Korean</option>
</select>
</div>
</div>
<!-- URL card -->
<div class="card">
<h2>YouTube / TikTok / URL</h2>
<div class="url-row">
<input type="url" id="url-input" placeholder="https://www.youtube.com/watch?v=..." />
<button id="url-btn" disabled>Transcribe</button>
</div>
<div class="option-row">
Language:
<select id="lang-url">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="fr">French</option>
<option value="es">Spanish</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
<option value="ja">Japanese</option>
<option value="zh">Chinese</option>
<option value="ar">Arabic</option>
<option value="ko">Korean</option>
</select>
</div>
</div>
<!-- Transcribing panel -->
<div id="transcribing-panel" class="transcribing-panel">
<div class="spinner"></div>
<div class="transcribing-label" id="transcribing-label">Transcribing…</div>
<div class="elapsed" id="elapsed"></div>
</div>
<!-- Result panel -->
<div id="result-panel" class="result-panel">
<div class="result-header">
<h2>Transcript</h2>
<div class="result-actions">
<button class="copy-btn" id="copy-btn">Copy</button>
<button id="srt-btn">Download SRT</button>
</div>
</div>
<div class="result-text" id="result-text"></div>
</div>
</div>
<script type="module">
// ── State ──────────────────────────────────────────────────────────────────
let modelReady = false;
let busy = false;
let lastChunks = [];
let elapsedInterval = null;
// ── DOM refs ───────────────────────────────────────────────────────────────
const deviceBadge = document.getElementById('device-badge');
const modelStatus = document.getElementById('model-status');
const progressWrap = document.getElementById('progress-wrap');
const progressBar = document.getElementById('progress-bar');
const dropZone = document.getElementById('drop-zone');
const fileInput = document.getElementById('file-input');
const urlInput = document.getElementById('url-input');
const urlBtn = document.getElementById('url-btn');
const langFile = document.getElementById('lang-file');
const langUrl = document.getElementById('lang-url');
const transcribingP = document.getElementById('transcribing-panel');
const transcribingL = document.getElementById('transcribing-label');
const elapsedEl = document.getElementById('elapsed');
const resultPanel = document.getElementById('result-panel');
const resultText = document.getElementById('result-text');
const copyBtn = document.getElementById('copy-btn');
const srtBtn = document.getElementById('srt-btn');
const errorPanel = document.getElementById('error-panel');
// ── WebGPU detection ───────────────────────────────────────────────────────
// Probe for a usable WebGPU adapter; report 'wasm' (CPU fallback) otherwise.
async function detectDevice() {
  if (!navigator.gpu) {
    return 'wasm';
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (adapter) {
      return 'webgpu';
    }
  } catch {
    // requestAdapter can throw on some browsers — treat as "no GPU".
  }
  return 'wasm';
}
// ── Web Worker ─────────────────────────────────────────────────────────────
const worker = new Worker('/worker.js?v=3', { type: 'module' });
// Route messages from the transcription worker to the UI.
// Fix: removed dead code in the 'downloading' branch — an orphan <div> was
// created but never appended, and the progressWrap/progressBar updates were
// duplicated verbatim.
worker.onmessage = (e) => {
  const { type, progress, text, chunks, message } = e.data;
  if (type === 'model-progress') {
    if (progress?.status === 'downloading') {
      // Progress may be absent on the first event; default to 0%.
      const pct = progress.progress ? Math.round(progress.progress) : 0;
      progressWrap.style.display = 'block';
      progressBar.style.width = pct + '%';
      modelStatus.textContent = `Downloading model… ${pct}%`;
    } else if (progress?.status === 'loading') {
      modelStatus.textContent = 'Loading model into memory…';
    } else if (progress?.status === 'initiate') {
      modelStatus.textContent = `Fetching ${progress.file || 'model files'}`;
    }
  }
  if (type === 'model-ready') {
    modelReady = true;
    progressWrap.style.display = 'none';
    modelStatus.className = 'model-status ready';
    modelStatus.textContent = '✓ Model ready — transcription runs locally on your device';
    urlBtn.disabled = false;
  }
  if (type === 'result') {
    lastChunks = chunks;
    showResult(text, chunks);
    setBusy(false);
  }
  if (type === 'error') {
    showError(message);
    setBusy(false);
  }
};
worker.onerror = (e) => {
showError('Worker error: ' + e.message);
setBusy(false);
};
// ── Initialise ─────────────────────────────────────────────────────────────
(async () => {
const device = await detectDevice();
if (device === 'webgpu') {
deviceBadge.textContent = 'Local · GPU';
deviceBadge.className = 'badge badge-gpu';
} else {
deviceBadge.textContent = 'Local · CPU';
deviceBadge.className = 'badge badge-wasm';
modelStatus.textContent += ' (WebGPU not available — using CPU, transcription will be slower)';
}
worker.postMessage({ type: 'load', modelId: 'Xenova/whisper-small' });
})();
// ── Audio decoding (main thread — AudioContext not available in workers) ────
// Decode any browser-supported audio/video container to mono 16kHz Float32 PCM,
// the input format Whisper expects.
async function decodeAudioToFloat32(arrayBuffer) {
  // Decode at native sample rate first, then resample to 16kHz for Whisper.
  const audioCtx = new AudioContext();
  const decoded = await audioCtx.decodeAudioData(arrayBuffer);
  await audioCtx.close();
  // Fast path: already mono 16kHz — return the raw channel data directly.
  if (decoded.sampleRate === 16000 && decoded.numberOfChannels === 1) {
    return decoded.getChannelData(0);
  }
  // Resample + downmix to mono 16kHz via OfflineAudioContext (single-channel
  // destination performs the downmix).
  const targetRate = 16000;
  const offlineCtx = new OfflineAudioContext(
    1,
    Math.ceil(decoded.duration * targetRate),
    targetRate,
  );
  const source = offlineCtx.createBufferSource();
  source.buffer = decoded;
  source.connect(offlineCtx.destination);
  source.start();
  const resampled = await offlineCtx.startRendering();
  // Explicit copy — getChannelData returns a view into AudioBuffer memory
  // which may not be transferable; owning the buffer avoids postMessage issues.
  return new Float32Array(resampled.getChannelData(0));
}
// ── File upload ────────────────────────────────────────────────────────────
dropZone.addEventListener('click', () => fileInput.click());
dropZone.addEventListener('dragover', (e) => { e.preventDefault(); dropZone.classList.add('drag-over'); });
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('drag-over'));
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.classList.remove('drag-over');
const file = e.dataTransfer.files[0];
if (file) handleFile(file);
});
fileInput.addEventListener('change', () => {
if (fileInput.files[0]) handleFile(fileInput.files[0]);
});
// Decode an uploaded file on the main thread, then hand the PCM to the worker.
async function handleFile(file) {
  if (!modelReady) return;
  if (busy) return;
  setBusy(true);
  transcribingL.textContent = `Decoding "${file.name}"…`;
  try {
    const buf = await file.arrayBuffer();
    const pcm = await decodeAudioToFloat32(buf);
    transcribingL.textContent = `Transcribing "${file.name}"…`;
    worker.postMessage({ type: 'transcribe', audioData: pcm, language: langFile.value || null });
  } catch (err) {
    showError('Failed to decode audio: ' + err.message);
    setBusy(false);
  }
}
// ── URL input ──────────────────────────────────────────────────────────────
urlInput.addEventListener('keydown', (e) => {
if (e.key === 'Enter' && !urlBtn.disabled) urlBtn.click();
});
urlBtn.addEventListener('click', async () => {
const url = urlInput.value.trim();
if (!url || !modelReady || busy) return;
setBusy(true);
transcribingL.textContent = 'Downloading audio from URL…';
try {
const res = await fetch('/extract-audio', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url }),
});
if (!res.ok) {
const err = await res.json().catch(() => ({ detail: res.statusText }));
throw new Error(err.detail || 'Server error');
}
transcribingL.textContent = 'Decoding audio…';
const arrayBuffer = await res.arrayBuffer();
const audioData = await decodeAudioToFloat32(arrayBuffer);
transcribingL.textContent = 'Transcribing…';
worker.postMessage({ type: 'transcribe', audioData, language: langUrl.value || null });
} catch (err) {
showError('Failed to extract audio: ' + err.message);
setBusy(false);
}
});
// ── UI helpers ─────────────────────────────────────────────────────────────
// Toggle the global busy state: disables the URL button, swaps the visible
// panels, and drives the elapsed-time ticker under the spinner.
function setBusy(state) {
  busy = state;
  urlBtn.disabled = state || !modelReady;
  if (!state) {
    clearInterval(elapsedInterval);
    transcribingP.className = 'transcribing-panel';
    elapsedEl.textContent = '';
    return;
  }
  errorPanel.className = 'error-panel';
  resultPanel.className = 'result-panel';
  transcribingP.className = 'transcribing-panel active';
  const startedAt = Date.now();
  clearInterval(elapsedInterval);
  elapsedInterval = setInterval(() => {
    const totalSec = Math.floor((Date.now() - startedAt) / 1000);
    const mins = Math.floor(totalSec / 60);
    elapsedEl.textContent = mins > 0
      ? `${mins}m ${totalSec % 60}s elapsed`
      : `${totalSec}s elapsed`;
  }, 1000);
}
// Display the transcript text. `chunks` is accepted for symmetry with the
// worker's result message but unused here — SRT export reads lastChunks.
function showResult(text, chunks) {
  resultText.textContent = text.trim();
  resultPanel.className = 'result-panel active';
}
// Show an error message in the dedicated error panel.
function showError(msg) {
  errorPanel.textContent = msg;
  errorPanel.className = 'error-panel active';
}
// ── Copy ───────────────────────────────────────────────────────────────────
copyBtn.addEventListener('click', () => {
navigator.clipboard.writeText(resultText.textContent).then(() => {
copyBtn.textContent = 'Copied!';
copyBtn.classList.add('copied');
setTimeout(() => {
copyBtn.textContent = 'Copy';
copyBtn.classList.remove('copied');
}, 2000);
});
});
// ── SRT download ───────────────────────────────────────────────────────────
srtBtn.addEventListener('click', () => {
const srt = chunksToSRT(lastChunks);
const blob = new Blob([srt], { type: 'text/plain' });
const a = document.createElement('a');
a.href = URL.createObjectURL(blob);
a.download = 'transcript.srt';
a.click();
});
// Build an SRT document from Whisper chunks; falls back to a single one-second
// cue holding the whole visible transcript when no timestamped chunks exist.
function chunksToSRT(chunks) {
  if (!chunks || chunks.length === 0) {
    return `1\n00:00:00,000 --> 00:00:01,000\n${resultText.textContent.trim()}\n`;
  }
  const cues = chunks.map((chunk, i) => {
    const [start, end] = chunk.timestamp || [0, 1];
    const from = toSRTTime(start);
    const to = toSRTTime(end || start + 1);
    return `${i + 1}\n${from} --> ${to}\n${chunk.text.trim()}\n`;
  });
  return cues.join('\n');
}
// Format a number of seconds as an SRT timestamp: HH:MM:SS,mmm (clamped at 0).
function toSRTTime(seconds) {
  const total = Math.max(0, seconds);
  const h = Math.floor(total / 3600);
  const m = Math.floor((total % 3600) / 60);
  const sec = Math.floor(total % 60);
  const ms = Math.floor((total % 1) * 1000);
  return `${pad(h)}:${pad(m)}:${pad(sec)},${pad(ms, 3)}`;
}
// Zero-pad the integer part of n to `len` digits.
function pad(n, len = 2) {
  return String(Math.floor(n)).padStart(len, '0');
}
</script>
</body>
</html>

58
app/static/worker.js Normal file
View File

@@ -0,0 +1,58 @@
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3/dist/transformers.min.js';
// Disable local model file check — always fetch from HuggingFace Hub
env.allowLocalModels = false;
let transcriber = null;
// Create the ASR pipeline and notify the page when it is ready. Download /
// load progress events are forwarded to the main thread for the progress bar.
// NOTE(review): device is always 'webgpu' even though the page detects a CPU
// fallback — confirm the library degrades to wasm on its own when WebGPU is
// unavailable.
async function loadModel(modelId) {
  transcriber = await pipeline(
    'automatic-speech-recognition',
    modelId,
    {
      device: 'webgpu',
      dtype: 'q4', // 4-bit quantized weights to shrink the download
      progress_callback: (progress) => {
        self.postMessage({ type: 'model-progress', progress });
      },
    }
  );
  self.postMessage({ type: 'model-ready' });
}
// audioData is a Float32Array of 16kHz mono PCM — decoded in the main thread
// to avoid the missing AudioContext issue in Web Workers. Results (or errors)
// are posted back to the page as {type: 'result'|'error'} messages.
async function transcribe(audioData, language) {
  if (!transcriber) {
    self.postMessage({ type: 'error', message: 'Model not loaded' });
    return;
  }
  try {
    // 30s windows with 5s stride let Whisper process arbitrarily long audio.
    const result = await transcriber(audioData, {
      return_timestamps: true,
      chunk_length_s: 30,
      stride_length_s: 5,
      language: language || null, // null lets the model auto-detect
      task: 'transcribe',
    });
    self.postMessage({ type: 'result', text: result.text, chunks: result.chunks || [] });
  } catch (err) {
    self.postMessage({ type: 'error', message: err.message });
  }
}
// Worker message protocol:
//   { type: 'load', modelId? }                       → load the pipeline
//   { type: 'transcribe', audioData, language }      → run transcription
self.onmessage = async (event) => {
  const { type, modelId, audioData, language } = event.data;
  if (type === 'load') {
    try {
      await loadModel(modelId || 'Xenova/whisper-small');
    } catch (err) {
      self.postMessage({ type: 'error', message: `Failed to load model: ${err.message}` });
    }
  } else if (type === 'transcribe') {
    await transcribe(audioData, language);
  }
};

9
docker-compose.yml Normal file
View File

@@ -0,0 +1,9 @@
services:
  app:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MAX_UPLOAD_SIZE_MB=500
    volumes:
      # Binds the host's /tmp/apoena-audio into the container so downloaded
      # audio is visible on the host — NOTE(review): confirm this bind mount
      # is intended rather than an anonymous/named volume.
      - /tmp/apoena-audio:/tmp/apoena-audio

7
requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
# Web framework + ASGI server
fastapi>=0.111
uvicorn[standard]>=0.29
# Form/multipart parsing — NOTE(review): no multipart endpoint is visible in
# app/main.py (uploads are decoded client-side); verify this is still needed.
python-multipart>=0.0.9
# Audio extraction from YouTube/TikTok/other URLs
yt-dlp>=2024.1
# Request models + env-driven settings (app/config.py)
pydantic>=2.0
pydantic-settings>=2.0
aiofiles>=23.0