init
This commit is contained in:
8
.dockerignore
Normal file
8
.dockerignore
Normal file
@@ -0,0 +1,8 @@
|
||||
.git
|
||||
.claude
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
.env
|
||||
.venv
|
||||
venv
|
||||
20
Dockerfile
Normal file
20
Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app/ ./app/
|
||||
|
||||
RUN mkdir -p /tmp/apoena-audio
|
||||
|
||||
ENV MAX_UPLOAD_SIZE_MB=500
|
||||
|
||||
EXPOSE 8000
|
||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
12
app/config.py
Normal file
12
app/config.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Runtime configuration, populated from environment variables by pydantic.

    ``MAX_UPLOAD_SIZE_MB`` in the environment overrides the default cap.
    """

    max_upload_size_mb: int = 500

    @property
    def max_upload_size_bytes(self) -> int:
        """The upload cap converted from megabytes to bytes."""
        return self.max_upload_size_mb * 1024 * 1024


# Module-level singleton shared by the whole app.
settings = Settings()
|
||||
45
app/downloader.py
Normal file
45
app/downloader.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import asyncio
|
||||
import uuid
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
AUDIO_TMP_DIR = "/tmp/apoena-audio"
|
||||
|
||||
|
||||
async def extract_audio(url: str) -> Path:
    """Download audio-only from *url* using yt-dlp; return the temp file path.

    yt-dlp runs as a subprocess so the event loop stays free during the
    download.  The output is normally an mp3, but yt-dlp may keep the source
    extension, so we fall back to globbing for the job's file.

    Raises:
        RuntimeError: if yt-dlp exits non-zero or produces no output file.
    """
    job_id = str(uuid.uuid4())
    tmp_dir = Path(AUDIO_TMP_DIR)
    # Be robust if the app's lifespan hook (which normally creates this
    # directory) hasn't run — e.g. when this module is used standalone.
    tmp_dir.mkdir(parents=True, exist_ok=True)
    outtmpl = str(tmp_dir / f"{job_id}.%(ext)s")
    expected = tmp_dir / f"{job_id}.mp3"

    cmd = [
        "yt-dlp",
        "--no-warnings",
        "--quiet",
        "--extract-audio",
        "--audio-format", "mp3",
        "--audio-quality", "128K",
        "--format", "bestaudio/best",
        "--output", outtmpl,
        url,
    ]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip() or "yt-dlp failed with no output")

    if expected.exists():
        return expected

    # yt-dlp sometimes keeps the original extension even with --audio-format mp3.
    # Find whatever file was created with this job_id prefix.
    fallback = next(iter(tmp_dir.glob(f"{job_id}.*")), None)
    if fallback is not None:
        return fallback

    raise RuntimeError("yt-dlp produced no output file")
|
||||
59
app/main.py
Normal file
59
app/main.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app import downloader
|
||||
|
||||
|
||||
STATIC_DIR = Path(__file__).parent / "static"
|
||||
AUDIO_TMP_DIR = Path("/tmp/apoena-audio")
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan hook: ensure the shared audio temp directory exists
    before the server starts accepting requests."""
    AUDIO_TMP_DIR.mkdir(parents=True, exist_ok=True)
    yield


app = FastAPI(title="apoena-transcript", lifespan=lifespan)
|
||||
|
||||
|
||||
class ExtractAudioRequest(BaseModel):
    """Request body for POST /extract-audio."""

    # Media page or direct URL handed to yt-dlp (YouTube, TikTok, etc.).
    url: str
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.get("/worker.js")
|
||||
async def worker_js():
|
||||
return FileResponse(STATIC_DIR / "worker.js", media_type="application/javascript")
|
||||
|
||||
|
||||
@app.post("/extract-audio")
|
||||
async def extract_audio(body: ExtractAudioRequest, background_tasks: BackgroundTasks):
|
||||
try:
|
||||
audio_path = await downloader.extract_audio(body.url)
|
||||
except RuntimeError as e:
|
||||
raise HTTPException(status_code=422, detail=str(e))
|
||||
|
||||
background_tasks.add_task(_delete_file, audio_path)
|
||||
return FileResponse(audio_path, media_type="audio/mpeg", filename="audio.mp3")
|
||||
|
||||
|
||||
def _delete_file(path):
|
||||
try:
|
||||
os.unlink(path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def index():
|
||||
return FileResponse(STATIC_DIR / "index.html")
|
||||
611
app/static/index.html
Normal file
611
app/static/index.html
Normal file
@@ -0,0 +1,611 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Apoena Transcript</title>
|
||||
<style>
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
||||
background: #0f0f13;
|
||||
color: #e2e2e8;
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
padding: 2rem 1rem;
|
||||
}
|
||||
|
||||
.container { width: 100%; max-width: 680px; }
|
||||
|
||||
header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
h1 { font-size: 1.4rem; font-weight: 600; letter-spacing: -0.02em; }
|
||||
h1 span { color: #7c6af7; }
|
||||
|
||||
.badge {
|
||||
font-size: 0.72rem;
|
||||
font-weight: 600;
|
||||
padding: 0.25rem 0.6rem;
|
||||
border-radius: 999px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
}
|
||||
.badge-gpu { background: #1a2f1a; color: #4ade80; border: 1px solid #166534; }
|
||||
.badge-wasm { background: #2a2318; color: #fbbf24; border: 1px solid #92400e; }
|
||||
.badge-loading { background: #1e1e28; color: #94a3b8; border: 1px solid #334155; }
|
||||
|
||||
/* Model status bar */
|
||||
.model-status {
|
||||
background: #1a1a24;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 12px;
|
||||
padding: 1rem 1.2rem;
|
||||
margin-bottom: 1.5rem;
|
||||
font-size: 0.85rem;
|
||||
color: #94a3b8;
|
||||
}
|
||||
.model-status.ready { color: #4ade80; }
|
||||
.model-status.error { color: #f87171; }
|
||||
|
||||
.progress-bar-wrap {
|
||||
height: 4px;
|
||||
background: #2a2a38;
|
||||
border-radius: 2px;
|
||||
margin-top: 0.6rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
.progress-bar {
|
||||
height: 100%;
|
||||
background: #7c6af7;
|
||||
border-radius: 2px;
|
||||
transition: width 0.2s ease;
|
||||
}
|
||||
|
||||
/* Input cards */
|
||||
.card {
|
||||
background: #1a1a24;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 16px;
|
||||
padding: 1.4rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
.card h2 {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
color: #94a3b8;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.06em;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
/* Drop zone */
|
||||
.drop-zone {
|
||||
border: 2px dashed #2a2a38;
|
||||
border-radius: 10px;
|
||||
padding: 2rem;
|
||||
text-align: center;
|
||||
cursor: pointer;
|
||||
transition: border-color 0.15s, background 0.15s;
|
||||
color: #64748b;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
.drop-zone:hover, .drop-zone.drag-over {
|
||||
border-color: #7c6af7;
|
||||
background: #1e1a30;
|
||||
color: #a89cf8;
|
||||
}
|
||||
.drop-zone input { display: none; }
|
||||
.drop-zone .icon { font-size: 2rem; margin-bottom: 0.5rem; display: block; }
|
||||
|
||||
/* URL input row */
|
||||
.url-row {
|
||||
display: flex;
|
||||
gap: 0.6rem;
|
||||
}
|
||||
.url-row input {
|
||||
flex: 1;
|
||||
background: #0f0f13;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 8px;
|
||||
padding: 0.65rem 0.9rem;
|
||||
color: #e2e2e8;
|
||||
font-size: 0.9rem;
|
||||
outline: none;
|
||||
transition: border-color 0.15s;
|
||||
}
|
||||
.url-row input:focus { border-color: #7c6af7; }
|
||||
.url-row input::placeholder { color: #44444f; }
|
||||
|
||||
button {
|
||||
background: #7c6af7;
|
||||
color: #fff;
|
||||
border: none;
|
||||
border-radius: 8px;
|
||||
padding: 0.65rem 1.2rem;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s, opacity 0.15s;
|
||||
white-space: nowrap;
|
||||
}
|
||||
button:hover:not(:disabled) { background: #6b58e8; }
|
||||
button:disabled { opacity: 0.4; cursor: not-allowed; }
|
||||
|
||||
/* Language selector */
|
||||
.option-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.6rem;
|
||||
margin-top: 0.8rem;
|
||||
font-size: 0.82rem;
|
||||
color: #64748b;
|
||||
}
|
||||
.option-row select {
|
||||
background: #0f0f13;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 6px;
|
||||
padding: 0.3rem 0.6rem;
|
||||
color: #94a3b8;
|
||||
font-size: 0.82rem;
|
||||
outline: none;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
/* Transcribing state */
|
||||
.transcribing-panel {
|
||||
display: none;
|
||||
background: #1a1a24;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 16px;
|
||||
padding: 1.4rem;
|
||||
margin-bottom: 1rem;
|
||||
text-align: center;
|
||||
}
|
||||
.transcribing-panel.active { display: block; }
|
||||
|
||||
.spinner {
|
||||
width: 36px;
|
||||
height: 36px;
|
||||
border: 3px solid #2a2a38;
|
||||
border-top-color: #7c6af7;
|
||||
border-radius: 50%;
|
||||
animation: spin 0.8s linear infinite;
|
||||
margin: 0 auto 0.8rem;
|
||||
}
|
||||
@keyframes spin { to { transform: rotate(360deg); } }
|
||||
|
||||
.transcribing-label { font-size: 0.95rem; color: #94a3b8; }
|
||||
.elapsed { font-size: 0.8rem; color: #44444f; margin-top: 0.3rem; }
|
||||
|
||||
/* Result */
|
||||
.result-panel {
|
||||
display: none;
|
||||
background: #1a1a24;
|
||||
border: 1px solid #2a2a38;
|
||||
border-radius: 16px;
|
||||
padding: 1.4rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
.result-panel.active { display: block; }
|
||||
|
||||
.result-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
.result-header h2 {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
color: #94a3b8;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.06em;
|
||||
}
|
||||
.result-actions { display: flex; gap: 0.5rem; }
|
||||
.result-actions button {
|
||||
background: #2a2a38;
|
||||
color: #94a3b8;
|
||||
font-size: 0.78rem;
|
||||
padding: 0.4rem 0.8rem;
|
||||
}
|
||||
.result-actions button:hover { background: #353547; color: #e2e2e8; }
|
||||
.result-actions .copy-btn.copied { background: #166534; color: #4ade80; }
|
||||
|
||||
.result-text {
|
||||
font-size: 0.9rem;
|
||||
line-height: 1.7;
|
||||
color: #cbd5e1;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
padding-right: 0.3rem;
|
||||
}
|
||||
.result-text::-webkit-scrollbar { width: 4px; }
|
||||
.result-text::-webkit-scrollbar-track { background: transparent; }
|
||||
.result-text::-webkit-scrollbar-thumb { background: #2a2a38; border-radius: 2px; }
|
||||
|
||||
/* Error toast */
|
||||
.error-panel {
|
||||
display: none;
|
||||
background: #2a1010;
|
||||
border: 1px solid #7f1d1d;
|
||||
border-radius: 12px;
|
||||
padding: 1rem 1.2rem;
|
||||
margin-bottom: 1rem;
|
||||
font-size: 0.85rem;
|
||||
color: #fca5a5;
|
||||
}
|
||||
.error-panel.active { display: block; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1><span>apoena</span> transcript</h1>
|
||||
<span id="device-badge" class="badge badge-loading">Loading...</span>
|
||||
</header>
|
||||
|
||||
<!-- Model status -->
|
||||
<div id="model-status" class="model-status">
|
||||
Loading model — first visit downloads ~100 MB, then it's cached locally.
|
||||
<div class="progress-bar-wrap" id="progress-wrap" style="display:none">
|
||||
<div class="progress-bar" id="progress-bar" style="width:0%"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Error panel -->
|
||||
<div id="error-panel" class="error-panel"></div>
|
||||
|
||||
<!-- File upload card -->
|
||||
<div class="card">
|
||||
<h2>Upload file</h2>
|
||||
<div class="drop-zone" id="drop-zone">
|
||||
<span class="icon">📂</span>
|
||||
Drop an audio or video file here, or click to browse
|
||||
<input type="file" id="file-input" accept="audio/*,video/*,.mp4,.mkv,.webm,.mov,.m4a,.mp3,.wav,.ogg,.flac" />
|
||||
</div>
|
||||
<div class="option-row">
|
||||
Language:
|
||||
<select id="lang-file">
|
||||
<option value="">Auto-detect</option>
|
||||
<option value="en">English</option>
|
||||
<option value="fr">French</option>
|
||||
<option value="es">Spanish</option>
|
||||
<option value="de">German</option>
|
||||
<option value="it">Italian</option>
|
||||
<option value="pt">Portuguese</option>
|
||||
<option value="ja">Japanese</option>
|
||||
<option value="zh">Chinese</option>
|
||||
<option value="ar">Arabic</option>
|
||||
<option value="ko">Korean</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- URL card -->
|
||||
<div class="card">
|
||||
<h2>YouTube / TikTok / URL</h2>
|
||||
<div class="url-row">
|
||||
<input type="url" id="url-input" placeholder="https://www.youtube.com/watch?v=..." />
|
||||
<button id="url-btn" disabled>Transcribe</button>
|
||||
</div>
|
||||
<div class="option-row">
|
||||
Language:
|
||||
<select id="lang-url">
|
||||
<option value="">Auto-detect</option>
|
||||
<option value="en">English</option>
|
||||
<option value="fr">French</option>
|
||||
<option value="es">Spanish</option>
|
||||
<option value="de">German</option>
|
||||
<option value="it">Italian</option>
|
||||
<option value="pt">Portuguese</option>
|
||||
<option value="ja">Japanese</option>
|
||||
<option value="zh">Chinese</option>
|
||||
<option value="ar">Arabic</option>
|
||||
<option value="ko">Korean</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Transcribing panel -->
|
||||
<div id="transcribing-panel" class="transcribing-panel">
|
||||
<div class="spinner"></div>
|
||||
<div class="transcribing-label" id="transcribing-label">Transcribing…</div>
|
||||
<div class="elapsed" id="elapsed"></div>
|
||||
</div>
|
||||
|
||||
<!-- Result panel -->
|
||||
<div id="result-panel" class="result-panel">
|
||||
<div class="result-header">
|
||||
<h2>Transcript</h2>
|
||||
<div class="result-actions">
|
||||
<button class="copy-btn" id="copy-btn">Copy</button>
|
||||
<button id="srt-btn">Download SRT</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="result-text" id="result-text"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="module">
|
||||
// ── State ──────────────────────────────────────────────────────────────────
|
||||
let modelReady = false;
|
||||
let busy = false;
|
||||
let lastChunks = [];
|
||||
let elapsedInterval = null;
|
||||
|
||||
// ── DOM refs ───────────────────────────────────────────────────────────────
|
||||
const deviceBadge = document.getElementById('device-badge');
|
||||
const modelStatus = document.getElementById('model-status');
|
||||
const progressWrap = document.getElementById('progress-wrap');
|
||||
const progressBar = document.getElementById('progress-bar');
|
||||
const dropZone = document.getElementById('drop-zone');
|
||||
const fileInput = document.getElementById('file-input');
|
||||
const urlInput = document.getElementById('url-input');
|
||||
const urlBtn = document.getElementById('url-btn');
|
||||
const langFile = document.getElementById('lang-file');
|
||||
const langUrl = document.getElementById('lang-url');
|
||||
const transcribingP = document.getElementById('transcribing-panel');
|
||||
const transcribingL = document.getElementById('transcribing-label');
|
||||
const elapsedEl = document.getElementById('elapsed');
|
||||
const resultPanel = document.getElementById('result-panel');
|
||||
const resultText = document.getElementById('result-text');
|
||||
const copyBtn = document.getElementById('copy-btn');
|
||||
const srtBtn = document.getElementById('srt-btn');
|
||||
const errorPanel = document.getElementById('error-panel');
|
||||
|
||||
// ── WebGPU detection ───────────────────────────────────────────────────────
|
||||
// Report 'webgpu' when a GPU adapter can be acquired, otherwise 'wasm'.
async function detectDevice() {
  if (!navigator.gpu) return 'wasm';
  try {
    return (await navigator.gpu.requestAdapter()) ? 'webgpu' : 'wasm';
  } catch {
    return 'wasm';
  }
}
|
||||
|
||||
// ── Web Worker ─────────────────────────────────────────────────────────────
|
||||
const worker = new Worker('/worker.js?v=3', { type: 'module' });

// Route worker messages: model download progress, readiness, results, errors.
worker.onmessage = (e) => {
  const { type, progress, text, chunks, message } = e.data;

  if (type === 'model-progress') {
    if (progress?.status === 'downloading') {
      const pct = progress.progress ? Math.round(progress.progress) : 0;
      progressWrap.style.display = 'block';
      progressBar.style.width = pct + '%';
      // NOTE(review): setting textContent replaces modelStatus's children,
      // detaching #progress-wrap from the DOM — the bar stops rendering after
      // the first update; consider a dedicated label element. (Removed a dead
      // `div` creation and duplicated style assignments that did nothing.)
      modelStatus.textContent = `Downloading model… ${pct}%`;
    } else if (progress?.status === 'loading') {
      modelStatus.textContent = 'Loading model into memory…';
    } else if (progress?.status === 'initiate') {
      modelStatus.textContent = `Fetching ${progress.file || 'model files'}…`;
    }
  }

  if (type === 'model-ready') {
    modelReady = true;
    progressWrap.style.display = 'none';
    modelStatus.className = 'model-status ready';
    modelStatus.textContent = '✓ Model ready — transcription runs locally on your device';
    urlBtn.disabled = false;
  }

  if (type === 'result') {
    lastChunks = chunks;
    showResult(text, chunks);
    setBusy(false);
  }

  if (type === 'error') {
    showError(message);
    setBusy(false);
  }
};

worker.onerror = (e) => {
  showError('Worker error: ' + e.message);
  setBusy(false);
};
|
||||
|
||||
// ── Initialise ─────────────────────────────────────────────────────────────
|
||||
// Kick off device detection, badge rendering, and model loading on page load.
(async () => {
  const device = await detectDevice();

  if (device === 'webgpu') {
    deviceBadge.textContent = 'Local · GPU';
    deviceBadge.className = 'badge badge-gpu';
  } else {
    deviceBadge.textContent = 'Local · CPU';
    deviceBadge.className = 'badge badge-wasm';
    modelStatus.textContent += ' (WebGPU not available — using CPU, transcription will be slower)';
  }

  // NOTE(review): the worker always requests the 'webgpu' device regardless of
  // this detection — confirm transformers.js falls back to WASM on its own.
  worker.postMessage({ type: 'load', modelId: 'Xenova/whisper-small' });
})();
|
||||
|
||||
// ── Audio decoding (main thread — AudioContext not available in workers) ────
|
||||
// Decode an encoded audio/video buffer into mono 16 kHz Float32 PCM,
// the input format the Whisper worker expects.
async function decodeAudioToFloat32(arrayBuffer) {
  // Decode at native sample rate first, then resample to 16kHz for Whisper
  const audioCtx = new AudioContext();
  const decoded = await audioCtx.decodeAudioData(arrayBuffer);
  await audioCtx.close();

  // Fast path: already mono 16 kHz — return the raw channel data directly.
  if (decoded.sampleRate === 16000 && decoded.numberOfChannels === 1) {
    return decoded.getChannelData(0);
  }

  // Resample + downmix to mono 16kHz via OfflineAudioContext
  const targetRate = 16000;
  const offlineCtx = new OfflineAudioContext(
    1,
    Math.ceil(decoded.duration * targetRate),
    targetRate,
  );
  const source = offlineCtx.createBufferSource();
  source.buffer = decoded;
  source.connect(offlineCtx.destination);
  source.start();
  const resampled = await offlineCtx.startRendering();
  // Explicit copy — getChannelData returns a view into AudioBuffer memory
  // which may not be transferable; own buffer avoids postMessage issues.
  return new Float32Array(resampled.getChannelData(0));
}
|
||||
|
||||
// ── File upload ────────────────────────────────────────────────────────────
|
||||
// Wire up the drag-and-drop zone and the hidden file input it proxies.
dropZone.addEventListener('click', () => fileInput.click());
dropZone.addEventListener('dragover', (e) => { e.preventDefault(); dropZone.classList.add('drag-over'); });
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('drag-over'));
dropZone.addEventListener('drop', (e) => {
  e.preventDefault();
  dropZone.classList.remove('drag-over');
  // Only the first dropped file is handled.
  const file = e.dataTransfer.files[0];
  if (file) handleFile(file);
});
fileInput.addEventListener('change', () => {
  if (fileInput.files[0]) handleFile(fileInput.files[0]);
});
|
||||
|
||||
// Decode a dropped/selected media file and hand the PCM off to the worker.
async function handleFile(file) {
  if (!modelReady || busy) return;
  setBusy(true);
  transcribingL.textContent = `Decoding "${file.name}"…`;
  try {
    const buffer = await file.arrayBuffer();
    const pcm = await decodeAudioToFloat32(buffer);
    transcribingL.textContent = `Transcribing "${file.name}"…`;
    worker.postMessage({ type: 'transcribe', audioData: pcm, language: langFile.value || null });
  } catch (err) {
    showError('Failed to decode audio: ' + err.message);
    setBusy(false);
  }
}
|
||||
|
||||
// ── URL input ──────────────────────────────────────────────────────────────
|
||||
// Submit the URL on Enter as well as on button click.
urlInput.addEventListener('keydown', (e) => {
  if (e.key === 'Enter' && !urlBtn.disabled) urlBtn.click();
});

// Ask the backend to extract audio from the URL, then transcribe locally.
urlBtn.addEventListener('click', async () => {
  const url = urlInput.value.trim();
  if (!url || !modelReady || busy) return;

  setBusy(true);
  transcribingL.textContent = 'Downloading audio from URL…';

  try {
    const res = await fetch('/extract-audio', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url }),
    });

    if (!res.ok) {
      // FastAPI errors arrive as {detail: ...}; fall back to the status text.
      const err = await res.json().catch(() => ({ detail: res.statusText }));
      throw new Error(err.detail || 'Server error');
    }

    transcribingL.textContent = 'Decoding audio…';
    const arrayBuffer = await res.arrayBuffer();
    const audioData = await decodeAudioToFloat32(arrayBuffer);
    transcribingL.textContent = 'Transcribing…';
    worker.postMessage({ type: 'transcribe', audioData, language: langUrl.value || null });
  } catch (err) {
    showError('Failed to extract audio: ' + err.message);
    setBusy(false);
  }
});
|
||||
|
||||
// ── UI helpers ─────────────────────────────────────────────────────────────
|
||||
// Toggle the global busy state: panels, the URL button, and the elapsed ticker.
function setBusy(state) {
  busy = state;
  urlBtn.disabled = state || !modelReady;

  if (!state) {
    clearInterval(elapsedInterval);
    transcribingP.className = 'transcribing-panel';
    elapsedEl.textContent = '';
    return;
  }

  errorPanel.className = 'error-panel';
  resultPanel.className = 'result-panel';
  transcribingP.className = 'transcribing-panel active';

  const startedAt = Date.now();
  clearInterval(elapsedInterval);
  elapsedInterval = setInterval(() => {
    const totalSec = Math.floor((Date.now() - startedAt) / 1000);
    const mins = Math.floor(totalSec / 60);
    elapsedEl.textContent = mins > 0
      ? `${mins}m ${totalSec % 60}s elapsed`
      : `${totalSec}s elapsed`;
  }, 1000);
}
|
||||
|
||||
// Reveal the transcript panel with the trimmed transcript text.
function showResult(text, chunks) {
  resultText.textContent = text.trim();
  resultPanel.className = 'result-panel active';
}

// Reveal the error panel with the given message.
function showError(msg) {
  errorPanel.textContent = msg;
  errorPanel.className = 'error-panel active';
}
|
||||
|
||||
// ── Copy ───────────────────────────────────────────────────────────────────
|
||||
// Copy the transcript to the clipboard with a 2-second "Copied!" confirmation.
copyBtn.addEventListener('click', () => {
  navigator.clipboard.writeText(resultText.textContent).then(() => {
    copyBtn.textContent = 'Copied!';
    copyBtn.classList.add('copied');
    setTimeout(() => {
      copyBtn.textContent = 'Copy';
      copyBtn.classList.remove('copied');
    }, 2000);
  });
});
|
||||
|
||||
// ── SRT download ───────────────────────────────────────────────────────────
|
||||
// Build an SRT file from the last transcription and trigger a download.
srtBtn.addEventListener('click', () => {
  const srt = chunksToSRT(lastChunks);
  const blob = new Blob([srt], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = 'transcript.srt';
  a.click();
  // Release the blob URL once the download has been handed to the browser;
  // otherwise every click leaks an object URL for the lifetime of the page.
  setTimeout(() => URL.revokeObjectURL(url), 1000);
});
|
||||
|
||||
// Build an SRT document from Whisper timestamped chunks.
// With no chunks available, the whole transcript becomes a single
// one-second cue so the download still yields a valid SRT file.
function chunksToSRT(chunks) {
  if (!chunks || chunks.length === 0) {
    return `1\n00:00:00,000 --> 00:00:01,000\n${resultText.textContent.trim()}\n`;
  }
  return chunks.map((chunk, i) => {
    // A missing end timestamp (open-ended final chunk) defaults to start+1s.
    const [start, end] = chunk.timestamp || [0, 1];
    return `${i + 1}\n${toSRTTime(start)} --> ${toSRTTime(end || start + 1)}\n${chunk.text.trim()}\n`;
  }).join('\n');
}
|
||||
|
||||
// Format seconds as an SRT timestamp (HH:MM:SS,mmm); negatives clamp to zero.
function toSRTTime(seconds) {
  const total = Math.max(0, seconds);
  const whole = Math.floor(total);
  const h = Math.floor(whole / 3600);
  const m = Math.floor(whole / 60) % 60;
  const sec = whole % 60;
  const ms = Math.floor((total - whole) * 1000);
  return `${pad(h)}:${pad(m)}:${pad(sec)},${pad(ms, 3)}`;
}

// Zero-pad an integer to `len` digits.
function pad(n, len = 2) {
  return String(Math.floor(n)).padStart(len, '0');
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
58
app/static/worker.js
Normal file
58
app/static/worker.js
Normal file
@@ -0,0 +1,58 @@
|
||||
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3/dist/transformers.min.js';
|
||||
|
||||
// Disable local model file check — always fetch from HuggingFace Hub
|
||||
env.allowLocalModels = false;
|
||||
|
||||
let transcriber = null;
|
||||
|
||||
// Create the ASR pipeline, streaming download progress back to the page.
async function loadModel(modelId) {
  transcriber = await pipeline(
    'automatic-speech-recognition',
    modelId,
    {
      // NOTE(review): device is always 'webgpu' even when the page detected
      // WASM — confirm transformers.js falls back gracefully without WebGPU.
      device: 'webgpu',
      dtype: 'q4',
      progress_callback: (progress) => {
        self.postMessage({ type: 'model-progress', progress });
      },
    }
  );
  self.postMessage({ type: 'model-ready' });
}
|
||||
|
||||
// audioData is a Float32Array of 16 kHz mono PCM, decoded on the main thread
// because Web Workers have no AudioContext.
async function transcribe(audioData, language) {
  if (!transcriber) {
    self.postMessage({ type: 'error', message: 'Model not loaded' });
    return;
  }

  try {
    const options = {
      return_timestamps: true,
      chunk_length_s: 30,
      stride_length_s: 5,
      language: language || null,
      task: 'transcribe',
    };
    const output = await transcriber(audioData, options);
    self.postMessage({ type: 'result', text: output.text, chunks: output.chunks || [] });
  } catch (err) {
    self.postMessage({ type: 'error', message: err.message });
  }
}
|
||||
|
||||
// Message router: 'load' initialises the model, 'transcribe' runs inference.
self.onmessage = async (event) => {
  const { type, modelId, audioData, language } = event.data;

  switch (type) {
    case 'load':
      try {
        await loadModel(modelId || 'Xenova/whisper-small');
      } catch (err) {
        self.postMessage({ type: 'error', message: `Failed to load model: ${err.message}` });
      }
      break;
    case 'transcribe':
      await transcribe(audioData, language);
      break;
  }
};
|
||||
9
docker-compose.yml
Normal file
9
docker-compose.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
services:
|
||||
app:
|
||||
build: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
- MAX_UPLOAD_SIZE_MB=500
|
||||
volumes:
|
||||
- /tmp/apoena-audio:/tmp/apoena-audio
|
||||
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
fastapi>=0.111
|
||||
uvicorn[standard]>=0.29
|
||||
python-multipart>=0.0.9
|
||||
yt-dlp>=2024.1
|
||||
pydantic>=2.0
|
||||
pydantic-settings>=2.0
|
||||
aiofiles>=23.0
|
||||
Reference in New Issue
Block a user