This commit is contained in:
Julien Calixte
2026-03-23 18:54:10 +01:00
commit 8e137ace86
10 changed files with 829 additions and 0 deletions

611
app/static/index.html Normal file
View File

@@ -0,0 +1,611 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Apoena Transcript</title>
<style>
/* Global reset: border-box sizing and no default margins/padding anywhere. */
*,
*::before,
*::after {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Dark page background; content is stacked in a centred column. */
body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
  background: #0f0f13;
  color: #e2e2e8;
  min-height: 100vh;
  display: flex;
  flex-direction: column;
  align-items: center;
  padding: 2rem 1rem;
}

/* Main column: fluid up to 680px. */
.container {
  width: 100%;
  max-width: 680px;
}

/* Title on the left, device badge on the right. */
header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  margin-bottom: 2rem;
}

h1 {
  font-size: 1.4rem;
  font-weight: 600;
  letter-spacing: -0.02em;
}

/* Accent-coloured part of the title. */
h1 span {
  color: #7c6af7;
}

/* Pill-shaped badge; the colour variants below are swapped by the script. */
.badge {
  font-size: 0.72rem;
  font-weight: 600;
  padding: 0.25rem 0.6rem;
  border-radius: 999px;
  text-transform: uppercase;
  letter-spacing: 0.05em;
}

.badge-gpu {
  background: #1a2f1a;
  color: #4ade80;
  border: 1px solid #166534;
}

.badge-wasm {
  background: #2a2318;
  color: #fbbf24;
  border: 1px solid #92400e;
}

.badge-loading {
  background: #1e1e28;
  color: #94a3b8;
  border: 1px solid #334155;
}
/* Model status bar: neutral while loading, green when ready, red on error. */
.model-status {
  background: #1a1a24;
  border: 1px solid #2a2a38;
  border-radius: 12px;
  padding: 1rem 1.2rem;
  margin-bottom: 1.5rem;
  font-size: 0.85rem;
  color: #94a3b8;
}

.model-status.ready {
  color: #4ade80;
}

.model-status.error {
  color: #f87171;
}

/* Track of the download progress bar; clips the fill to rounded corners. */
.progress-bar-wrap {
  height: 4px;
  background: #2a2a38;
  border-radius: 2px;
  margin-top: 0.6rem;
  overflow: hidden;
}

/* Fill of the progress bar; its width is set inline and animates on change. */
.progress-bar {
  height: 100%;
  background: #7c6af7;
  border-radius: 2px;
  transition: width 0.2s ease;
}
/* Input cards: shared container for the upload and URL sections. */
.card {
  background: #1a1a24;
  border: 1px solid #2a2a38;
  border-radius: 16px;
  padding: 1.4rem;
  margin-bottom: 1rem;
}

/* Small uppercase section heading inside a card. */
.card h2 {
  font-size: 0.9rem;
  font-weight: 600;
  color: #94a3b8;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  margin-bottom: 1rem;
}
/* Drop zone: dashed target area for file selection and drag-and-drop. */
.drop-zone {
  border: 2px dashed #2a2a38;
  border-radius: 10px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
  transition: border-color 0.15s, background 0.15s;
  color: #64748b;
  font-size: 0.9rem;
}

/* Highlight on pointer hover and while a drag is over the zone
   (the .drag-over class is toggled by the script). */
.drop-zone:hover,
.drop-zone.drag-over {
  border-color: #7c6af7;
  background: #1e1a30;
  color: #a89cf8;
}

/* The native file input is hidden; the zone itself acts as the trigger. */
.drop-zone input {
  display: none;
}

.drop-zone .icon {
  font-size: 2rem;
  margin-bottom: 0.5rem;
  display: block;
}
/* URL input row: text field and submit button side by side. */
.url-row {
  display: flex;
  gap: 0.6rem;
}

/* The text field takes all remaining width. */
.url-row input {
  flex: 1;
  background: #0f0f13;
  border: 1px solid #2a2a38;
  border-radius: 8px;
  padding: 0.65rem 0.9rem;
  color: #e2e2e8;
  font-size: 0.9rem;
  outline: none;
  transition: border-color 0.15s;
}

/* Accent border stands in for the removed outline while focused. */
.url-row input:focus {
  border-color: #7c6af7;
}

.url-row input::placeholder {
  color: #44444f;
}

/* Default button look, used by every <button> on the page. */
button {
  background: #7c6af7;
  color: #fff;
  border: none;
  border-radius: 8px;
  padding: 0.65rem 1.2rem;
  font-size: 0.9rem;
  font-weight: 600;
  cursor: pointer;
  transition: background 0.15s, opacity 0.15s;
  white-space: nowrap;
}

/* Hover darkening applies to enabled buttons only. */
button:hover:not(:disabled) {
  background: #6b58e8;
}

button:disabled {
  opacity: 0.4;
  cursor: not-allowed;
}
/* Language selector: caption and <select> on one line under each input. */
.option-row {
  display: flex;
  align-items: center;
  gap: 0.6rem;
  margin-top: 0.8rem;
  font-size: 0.82rem;
  color: #64748b;
}

.option-row select {
  background: #0f0f13;
  border: 1px solid #2a2a38;
  border-radius: 6px;
  padding: 0.3rem 0.6rem;
  color: #94a3b8;
  font-size: 0.82rem;
  outline: none;
  cursor: pointer;
}

/* The default outline is removed above, so keyboard focus needs a visible
   replacement; use the same accent border as the URL input. */
.option-row select:focus-visible {
  border-color: #7c6af7;
}
/* Transcribing state: hidden until the script adds the .active class. */
.transcribing-panel {
  display: none;
  background: #1a1a24;
  border: 1px solid #2a2a38;
  border-radius: 16px;
  padding: 1.4rem;
  margin-bottom: 1rem;
  text-align: center;
}

.transcribing-panel.active {
  display: block;
}

/* Ring spinner: one accent-coloured edge on a continuously rotating circle. */
.spinner {
  width: 36px;
  height: 36px;
  border: 3px solid #2a2a38;
  border-top-color: #7c6af7;
  border-radius: 50%;
  animation: spin 0.8s linear infinite;
  margin: 0 auto 0.8rem;
}

@keyframes spin {
  to {
    transform: rotate(360deg);
  }
}

.transcribing-label {
  font-size: 0.95rem;
  color: #94a3b8;
}

/* Elapsed-time line shown under the label. */
.elapsed {
  font-size: 0.8rem;
  color: #44444f;
  margin-top: 0.3rem;
}
/* Result: hidden until the script adds the .active class. */
.result-panel {
  display: none;
  background: #1a1a24;
  border: 1px solid #2a2a38;
  border-radius: 16px;
  padding: 1.4rem;
  margin-bottom: 1rem;
}

.result-panel.active {
  display: block;
}

/* Heading on the left, action buttons on the right. */
.result-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  margin-bottom: 1rem;
}

.result-header h2 {
  font-size: 0.9rem;
  font-weight: 600;
  color: #94a3b8;
  text-transform: uppercase;
  letter-spacing: 0.06em;
}

.result-actions {
  display: flex;
  gap: 0.5rem;
}

/* Secondary (muted) buttons override the default accent button style. */
.result-actions button {
  background: #2a2a38;
  color: #94a3b8;
  font-size: 0.78rem;
  padding: 0.4rem 0.8rem;
}

.result-actions button:hover {
  background: #353547;
  color: #e2e2e8;
}

/* Green confirmation state while the script has the .copied class set. */
.result-actions .copy-btn.copied {
  background: #166534;
  color: #4ade80;
}

/* Transcript body: keeps line breaks, wraps long words, scrolls past 400px. */
.result-text {
  font-size: 0.9rem;
  line-height: 1.7;
  color: #cbd5e1;
  white-space: pre-wrap;
  word-break: break-word;
  max-height: 400px;
  overflow-y: auto;
  padding-right: 0.3rem;
}

/* Thin scrollbar for WebKit-based browsers. */
.result-text::-webkit-scrollbar {
  width: 4px;
}

.result-text::-webkit-scrollbar-track {
  background: transparent;
}

.result-text::-webkit-scrollbar-thumb {
  background: #2a2a38;
  border-radius: 2px;
}
/* Error toast: hidden until the script adds the .active class. */
.error-panel {
  display: none;
  background: #2a1010;
  border: 1px solid #7f1d1d;
  border-radius: 12px;
  padding: 1rem 1.2rem;
  margin-bottom: 1rem;
  font-size: 0.85rem;
  color: #fca5a5;
}

.error-panel.active {
  display: block;
}
</style>
</head>
<body>
<div class="container">
<header>
  <h1><span>apoena</span> transcript</h1>
  <!-- Text and colour class are set by the script once the device is known -->
  <span id="device-badge" class="badge badge-loading">Loading...</span>
</header>
<!-- Model status: text is rewritten by the script as the model loads -->
<div id="model-status" class="model-status">
  Loading model — first visit downloads ~100 MB, then it's cached locally.
  <div class="progress-bar-wrap" id="progress-wrap" style="display:none">
    <div class="progress-bar" id="progress-bar" style="width:0%"></div>
  </div>
</div>
<!-- Error panel: filled and revealed by the script; role="alert" makes
     assistive technology announce the message when it appears -->
<div id="error-panel" class="error-panel" role="alert"></div>
<!-- File upload card: click or drag-and-drop target plus language choice -->
<div class="card">
  <h2>Upload file</h2>
  <div class="drop-zone" id="drop-zone">
    <!-- Decorative icon, hidden from assistive technology -->
    <span class="icon" aria-hidden="true">📂</span>
    Drop an audio or video file here, or click to browse
    <input type="file" id="file-input" accept="audio/*,video/*,.mp4,.mkv,.webm,.mov,.m4a,.mp3,.wav,.ogg,.flac" />
  </div>
  <div class="option-row">
    <!-- A real label gives the select an accessible name -->
    <label for="lang-file">Language:</label>
    <select id="lang-file">
      <option value="">Auto-detect</option>
      <option value="en">English</option>
      <option value="fr">French</option>
      <option value="es">Spanish</option>
      <option value="de">German</option>
      <option value="it">Italian</option>
      <option value="pt">Portuguese</option>
      <option value="ja">Japanese</option>
      <option value="zh">Chinese</option>
      <option value="ar">Arabic</option>
      <option value="ko">Korean</option>
    </select>
  </div>
</div>
<!-- URL card: remote media URL plus language choice -->
<div class="card">
  <h2>YouTube / TikTok / URL</h2>
  <div class="url-row">
    <!-- The placeholder is only a hint; aria-label supplies the accessible name -->
    <input type="url" id="url-input" placeholder="https://www.youtube.com/watch?v=..." aria-label="Media URL" />
    <!-- Starts disabled; the script enables it once the model is ready -->
    <button type="button" id="url-btn" disabled>Transcribe</button>
  </div>
  <div class="option-row">
    <!-- A real label gives the select an accessible name -->
    <label for="lang-url">Language:</label>
    <select id="lang-url">
      <option value="">Auto-detect</option>
      <option value="en">English</option>
      <option value="fr">French</option>
      <option value="es">Spanish</option>
      <option value="de">German</option>
      <option value="it">Italian</option>
      <option value="pt">Portuguese</option>
      <option value="ja">Japanese</option>
      <option value="zh">Chinese</option>
      <option value="ar">Arabic</option>
      <option value="ko">Korean</option>
    </select>
  </div>
</div>
<!-- Transcribing panel: spinner, status label and elapsed time; shown by the
     script while a job is running -->
<div id="transcribing-panel" class="transcribing-panel">
  <div class="spinner"></div>
  <div class="transcribing-label" id="transcribing-label">Transcribing…</div>
  <div class="elapsed" id="elapsed"></div>
</div>
<!-- Result panel: transcript text with copy / SRT download actions; shown by
     the script when a result arrives -->
<div id="result-panel" class="result-panel">
  <div class="result-header">
    <h2>Transcript</h2>
    <div class="result-actions">
      <!-- Explicit type="button": these run script actions and never submit -->
      <button type="button" class="copy-btn" id="copy-btn">Copy</button>
      <button type="button" id="srt-btn">Download SRT</button>
    </div>
  </div>
  <div class="result-text" id="result-text"></div>
</div>
</div>
<script type="module">
// ── State ──────────────────────────────────────────────────────────────────
let modelReady = false;      // true once the worker posts 'model-ready'
let busy = false;            // true while a decode/transcribe job is running
let lastChunks = [];         // chunks of the latest result, used for SRT export
let elapsedInterval = null;  // timer handle for the elapsed-time display

// ── DOM refs ───────────────────────────────────────────────────────────────
// Small lookup helper so each reference below is a single short call.
const byId = (id) => document.getElementById(id);

// Header / model status
const deviceBadge = byId('device-badge');
const modelStatus = byId('model-status');
const progressWrap = byId('progress-wrap');
const progressBar = byId('progress-bar');

// Inputs
const dropZone = byId('drop-zone');
const fileInput = byId('file-input');
const urlInput = byId('url-input');
const urlBtn = byId('url-btn');
const langFile = byId('lang-file');
const langUrl = byId('lang-url');

// Progress / output panels
const transcribingP = byId('transcribing-panel');
const transcribingL = byId('transcribing-label');
const elapsedEl = byId('elapsed');
const resultPanel = byId('result-panel');
const resultText = byId('result-text');
const copyBtn = byId('copy-btn');
const srtBtn = byId('srt-btn');
const errorPanel = byId('error-panel');
// ── WebGPU detection ───────────────────────────────────────────────────────
/**
 * Report which backend label applies to this browser.
 *
 * @returns {Promise<'webgpu'|'wasm'>} 'webgpu' when `navigator.gpu` exists and
 *   yields an adapter; 'wasm' when the API is missing, no adapter is returned,
 *   or the adapter request throws.
 */
async function detectDevice() {
  const gpu = navigator.gpu;
  if (!gpu) return 'wasm';
  try {
    const adapter = await gpu.requestAdapter();
    if (adapter) return 'webgpu';
  } catch {
    // A failed adapter request is treated the same as "no WebGPU".
  }
  return 'wasm';
}
// ── Web Worker ─────────────────────────────────────────────────────────────
// Module worker loaded from /worker.js; the handlers below react to the
// message types it posts (shape as destructured here — the worker source is
// not part of this file).
const worker = new Worker('/worker.js?v=3', { type: 'module' });

/**
 * Replace the model status text while keeping the progress bar attached.
 *
 * Assigning `textContent` removes every child of #model-status, including
 * #progress-wrap, so the bar is re-appended after each text change.
 *
 * @param {string} message Text to show in the status bar.
 */
function setModelStatus(message) {
  modelStatus.textContent = message;
  modelStatus.appendChild(progressWrap);
}

worker.onmessage = (e) => {
  const { type, progress, text, chunks, message } = e.data;

  switch (type) {
    case 'model-progress':
      if (progress?.status === 'downloading') {
        // A missing/zero progress value is shown as 0%.
        const pct = progress.progress ? Math.round(progress.progress) : 0;
        setModelStatus(`Downloading model… ${pct}%`);
        progressWrap.style.display = 'block';
        progressBar.style.width = pct + '%';
      } else if (progress?.status === 'loading') {
        setModelStatus('Loading model into memory…');
      } else if (progress?.status === 'initiate') {
        setModelStatus(`Fetching ${progress.file || 'model files'}`);
      }
      break;

    case 'model-ready':
      modelReady = true;
      progressWrap.style.display = 'none';
      modelStatus.className = 'model-status ready';
      setModelStatus('✓ Model ready — transcription runs locally on your device');
      urlBtn.disabled = false;
      break;

    case 'result':
      // Keep the chunks so the SRT download can use them later.
      lastChunks = chunks;
      showResult(text, chunks);
      setBusy(false);
      break;

    case 'error':
      showError(message);
      setBusy(false);
      break;
  }
};

// Errors raised by the worker itself (not posted as 'error' messages).
worker.onerror = (e) => {
  showError('Worker error: ' + e.message);
  setBusy(false);
};
// ── Initialise ─────────────────────────────────────────────────────────────
// Detect the device, update the badge, then ask the worker to load the model.
(async () => {
  const device = await detectDevice();
  if (device === 'webgpu') {
    deviceBadge.textContent = 'Local · GPU';
    deviceBadge.className = 'badge badge-gpu';
  } else {
    deviceBadge.textContent = 'Local · CPU';
    deviceBadge.className = 'badge badge-wasm';
    // Insert the note as a text node instead of `textContent +=`: assigning
    // textContent would remove the progress bar elements nested inside
    // #model-status. The note goes before the bar when it is still attached,
    // otherwise at the end.
    const note = document.createTextNode(
      ' (WebGPU not available — using CPU, transcription will be slower)',
    );
    const anchor = progressWrap.parentNode === modelStatus ? progressWrap : null;
    modelStatus.insertBefore(note, anchor);
  }
  worker.postMessage({ type: 'load', modelId: 'Xenova/whisper-small' });
})();
// ── Audio decoding (main thread — AudioContext not available in workers) ────
/**
 * Decode encoded audio and return mono samples at 16 kHz.
 *
 * @param {ArrayBuffer} arrayBuffer Encoded media bytes.
 * @returns {Promise<Float32Array>} Channel 0 of the decoded audio when it is
 *   already mono 16 kHz (a view returned by `getChannelData`), otherwise a
 *   copy of the resampled, downmixed signal.
 * @throws Rejects with whatever `decodeAudioData` or the offline rendering
 *   rejects with; callers surface it to the user.
 */
async function decodeAudioToFloat32(arrayBuffer) {
  // Decode at native sample rate first, then resample to 16kHz for Whisper
  const audioCtx = new AudioContext();
  let decoded;
  try {
    decoded = await audioCtx.decodeAudioData(arrayBuffer);
  } finally {
    // Always release the context, including when decoding fails; otherwise
    // every failed attempt leaves an open AudioContext behind.
    await audioCtx.close();
  }

  if (decoded.sampleRate === 16000 && decoded.numberOfChannels === 1) {
    return decoded.getChannelData(0);
  }

  // Resample + downmix to mono 16kHz via OfflineAudioContext
  const targetRate = 16000;
  const offlineCtx = new OfflineAudioContext(
    1,
    Math.ceil(decoded.duration * targetRate),
    targetRate,
  );
  const source = offlineCtx.createBufferSource();
  source.buffer = decoded;
  source.connect(offlineCtx.destination);
  source.start();
  const resampled = await offlineCtx.startRendering();

  // Explicit copy — getChannelData returns a view into AudioBuffer memory
  // which may not be transferable; own buffer avoids postMessage issues.
  return new Float32Array(resampled.getChannelData(0));
}
// ── File upload ────────────────────────────────────────────────────────────
// Clicking anywhere in the drop zone opens the hidden file input.
dropZone.addEventListener('click', () => fileInput.click());

// preventDefault on dragover is what allows the drop event to fire.
dropZone.addEventListener('dragover', (e) => {
  e.preventDefault();
  dropZone.classList.add('drag-over');
});
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('drag-over'));
dropZone.addEventListener('drop', (e) => {
  e.preventDefault();
  dropZone.classList.remove('drag-over');
  // Only the first dropped file is used.
  const file = e.dataTransfer?.files[0];
  if (file) handleFile(file);
});

fileInput.addEventListener('change', () => {
  const file = fileInput.files[0];
  // Clear the selection so picking the same file again fires `change` again
  // (e.g. to retry after an error, or after a pick that was ignored because
  // the model was not ready yet). The File object itself stays usable.
  fileInput.value = '';
  if (file) handleFile(file);
});
/**
 * Decode a user-supplied file and hand the samples to the worker.
 *
 * Does nothing when the model is not ready or another job is running.
 * Decoding failures are reported through the error panel.
 *
 * @param {File} file Audio or video file chosen or dropped by the user.
 */
async function handleFile(file) {
  const canStart = modelReady && !busy;
  if (!canStart) return;

  setBusy(true);
  transcribingL.textContent = `Decoding "${file.name}"…`;

  try {
    const audioData = await decodeAudioToFloat32(await file.arrayBuffer());
    transcribingL.textContent = `Transcribing "${file.name}"…`;
    // An empty select value means auto-detect, sent as null.
    const language = langFile.value || null;
    worker.postMessage({ type: 'transcribe', audioData, language });
  } catch (err) {
    showError('Failed to decode audio: ' + err.message);
    setBusy(false);
  }
}
// ── URL input ──────────────────────────────────────────────────────────────
// Pressing Enter in the URL field acts like clicking the button, but only
// while the button is enabled.
urlInput.addEventListener('keydown', (event) => {
  if (event.key !== 'Enter' || urlBtn.disabled) return;
  urlBtn.click();
});
// Fetch the audio for the entered URL from the server, decode it and hand the
// samples to the worker. Any failure along the way ends up in the error panel.
urlBtn.addEventListener('click', async () => {
  const url = urlInput.value.trim();
  if (url === '' || !modelReady || busy) return;

  setBusy(true);
  transcribingL.textContent = 'Downloading audio from URL…';

  try {
    const response = await fetch('/extract-audio', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url }),
    });

    if (!response.ok) {
      // Prefer the `detail` field of a JSON error body; fall back to the
      // HTTP status text when the body is not JSON.
      let payload;
      try {
        payload = await response.json();
      } catch {
        payload = { detail: response.statusText };
      }
      throw new Error(payload.detail || 'Server error');
    }

    transcribingL.textContent = 'Decoding audio…';
    const audioData = await decodeAudioToFloat32(await response.arrayBuffer());

    transcribingL.textContent = 'Transcribing…';
    // An empty select value means auto-detect, sent as null.
    const language = langUrl.value || null;
    worker.postMessage({ type: 'transcribe', audioData, language });
  } catch (err) {
    showError('Failed to extract audio: ' + err.message);
    setBusy(false);
  }
});
// ── UI helpers ─────────────────────────────────────────────────────────────
/**
 * Switch the UI between idle and busy.
 *
 * Busy: hides the error and result panels, shows the transcribing panel and
 * starts a once-per-second elapsed-time display. Idle: stops the timer, hides
 * the transcribing panel and clears the elapsed text. The URL button is
 * disabled while busy or while the model is not ready.
 *
 * @param {boolean} state true to enter the busy state, false to leave it.
 */
function setBusy(state) {
  busy = state;
  urlBtn.disabled = state || !modelReady;

  if (!state) {
    clearInterval(elapsedInterval);
    transcribingP.className = 'transcribing-panel';
    elapsedEl.textContent = '';
    return;
  }

  errorPanel.className = 'error-panel';
  resultPanel.className = 'result-panel';
  transcribingP.className = 'transcribing-panel active';

  const startedAt = Date.now();
  // Drop any timer left over from a previous job before starting a new one.
  clearInterval(elapsedInterval);
  elapsedInterval = setInterval(() => {
    const totalSeconds = Math.floor((Date.now() - startedAt) / 1000);
    const minutes = Math.floor(totalSeconds / 60);
    if (minutes > 0) {
      elapsedEl.textContent = `${minutes}m ${totalSeconds % 60}s elapsed`;
    } else {
      elapsedEl.textContent = `${totalSeconds}s elapsed`;
    }
  }, 1000);
}
/**
 * Show the transcript in the result panel.
 *
 * @param {string} text Full transcript; surrounding whitespace is trimmed.
 * @param {Array} chunks Accepted for callers but not used here.
 */
function showResult(text, chunks) {
  const transcript = text.trim();
  resultText.textContent = transcript;
  resultPanel.className = 'result-panel active';
}
/**
 * Put a message in the error panel and make the panel visible.
 *
 * @param {string} msg Text to display.
 */
function showError(msg) {
  errorPanel.className = 'error-panel active';
  errorPanel.textContent = msg;
}
// ── Copy ───────────────────────────────────────────────────────────────────
// Copy the transcript to the clipboard and flash a "Copied!" confirmation on
// the button for two seconds.
copyBtn.addEventListener('click', async () => {
  try {
    // `navigator.clipboard` is undefined outside secure contexts and
    // writeText can reject (e.g. permission denied); both land in the catch
    // so the user gets feedback instead of a silent unhandled rejection.
    await navigator.clipboard.writeText(resultText.textContent);
  } catch (err) {
    showError('Failed to copy transcript: ' + err.message);
    return;
  }

  copyBtn.textContent = 'Copied!';
  copyBtn.classList.add('copied');
  setTimeout(() => {
    copyBtn.textContent = 'Copy';
    copyBtn.classList.remove('copied');
  }, 2000);
});
// ── SRT download ───────────────────────────────────────────────────────────
// Build an SRT file from the last result and trigger a download through a
// temporary anchor element.
srtBtn.addEventListener('click', () => {
  const srt = chunksToSRT(lastChunks);
  const blob = new Blob([srt], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.href = url;
  a.download = 'transcript.srt';
  a.click();

  // Release the blob URL once the click has been dispatched; without this
  // every download keeps its blob alive until the page is closed.
  setTimeout(() => URL.revokeObjectURL(url), 0);
});
/**
 * Render transcript chunks as SRT text.
 *
 * @param {Array<{timestamp?: Array, text: string}>} chunks Chunks as read
 *   here: an optional [start, end] pair in seconds and the cue text.
 * @returns {string} Numbered SRT cues separated by blank lines. With no
 *   chunks, a single one-second cue holding the displayed transcript.
 */
function chunksToSRT(chunks) {
  const hasChunks = chunks && chunks.length !== 0;
  if (!hasChunks) {
    return `1\n00:00:00,000 --> 00:00:01,000\n${resultText.textContent.trim()}\n`;
  }

  const cues = chunks.map((chunk, index) => {
    // Missing timestamps default to 0–1 s; a missing end becomes start + 1 s.
    const [from, to] = chunk.timestamp || [0, 1];
    const timing = `${toSRTTime(from)} --> ${toSRTTime(to || from + 1)}`;
    return `${index + 1}\n${timing}\n${chunk.text.trim()}\n`;
  });

  return cues.join('\n');
}
/**
 * Format a time offset as an SRT timestamp (HH:MM:SS,mmm).
 *
 * The value is first rounded to whole milliseconds and every field is derived
 * from that integer. Taking the fractional part of a float directly loses a
 * millisecond on values such as 1.001 or 2.3.
 *
 * @param {number} seconds Offset in seconds; negative or non-finite values
 *   (null, undefined, NaN, Infinity) are treated as 0.
 * @returns {string} Timestamp such as "01:01:01,500".
 */
function toSRTTime(seconds) {
  const totalMs = Number.isFinite(seconds)
    ? Math.max(0, Math.round(seconds * 1000))
    : 0;
  const h = Math.floor(totalMs / 3600000);
  const m = Math.floor((totalMs % 3600000) / 60000);
  const sec = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;
  return `${pad(h)}:${pad(m)}:${pad(sec)},${pad(ms, 3)}`;
}

/**
 * Left-pad the integer part of a number with zeros.
 *
 * @param {number} n Value to format; floored before padding.
 * @param {number} [len=2] Minimum width of the result.
 * @returns {string} Zero-padded decimal string.
 */
function pad(n, len = 2) { return String(Math.floor(n)).padStart(len, '0'); }
</script>
</body>
</html>