Commit ad758123 authored by Stefy Lanza (nextime / spora)

merge: integrate whisper-server local model workflow

parents b17e45a5 da83cc25
...@@ -20,3 +20,5 @@ debug.log ...@@ -20,3 +20,5 @@ debug.log
# Test files # Test files
test_*.py test_*.py
!tests/
!tests/test_whisper_server_local_models.py
...@@ -1172,6 +1172,17 @@ async def api_model_load(request: Request, username: str = Depends(require_admin ...@@ -1172,6 +1172,17 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
raise RuntimeError("Model failed to load") raise RuntimeError("Model failed to load")
multi_model_manager.models[result["model_key"] or path] = mm multi_model_manager.models[result["model_key"] or path] = mm
multi_model_manager.active_in_vram = result["model_key"] or path multi_model_manager.active_in_vram = result["model_key"] or path
elif model_type == "audio":
wsm = multi_model_manager.whisper_servers.get(path)
if wsm is not None:
started = wsm.start(getattr(wsm, "_model_path", None), gpu_device=getattr(wsm, "_gpu_device", 0))
if not wsm.is_running():
raise RuntimeError("whisper-server failed to start")
model_key = f"audio:{path}"
multi_model_manager.models[model_key] = wsm
multi_model_manager.active_in_vram = model_key
multi_model_manager.models_in_vram.add(model_key)
return {"success": True, "already_loaded": False, "started_model": started}
elif model_type == "image": elif model_type == "image":
from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model
from codai.api.state import get_global_args from codai.api.state import get_global_args
...@@ -1243,6 +1254,38 @@ async def api_model_configure(request: Request, username: str = Depends(require_ ...@@ -1243,6 +1254,38 @@ async def api_model_configure(request: Request, username: str = Depends(require_
if config_manager is None: if config_manager is None:
raise HTTPException(status_code=503, detail="Config manager not initialized") raise HTTPException(status_code=503, detail="Config manager not initialized")
data = await request.json() data = await request.json()
if data.get("backend") == "whisper-server":
model_id = (data.get("model_id") or "").strip()
if not model_id:
raise HTTPException(status_code=400, detail="model_id is required")
server_path = (data.get("server_path") or "").strip()
if not server_path:
raise HTTPException(status_code=400, detail="server_path is required")
port = int(data.get("port", 8744))
if port < 1 or port > 65535:
raise HTTPException(status_code=400, detail="port must be between 1 and 65535")
gpu_device = int(data.get("gpu_device", 0))
if gpu_device < 0:
raise HTTPException(status_code=400, detail="gpu_device must be >= 0")
for existing in config_manager.models_data.get("audio_models", []):
if isinstance(existing, dict) and existing.get("id") == model_id:
raise HTTPException(status_code=409, detail=f"whisper-server model '{model_id}' already exists")
entry = {
"id": model_id,
"backend": "whisper-server",
"server_path": server_path,
"model_path": (data.get("model_path") or "").strip() or None,
"port": port,
"gpu_device": gpu_device,
"load_mode": data.get("load_mode", "on-request"),
"model_type": "audio_models",
"model_types": ["audio_models"],
}
if data.get("used_vram_gb") is not None:
entry["used_vram_gb"] = data["used_vram_gb"]
config_manager.models_data.setdefault("audio_models", []).append(entry)
config_manager.save_models()
return {"success": True}
path = data.get("path") or data.get("model_id", "") path = data.get("path") or data.get("model_id", "")
valid = {"text_models", "image_models", "audio_models", "tts_models", "vision_models", "video_models", valid = {"text_models", "image_models", "audio_models", "tts_models", "vision_models", "video_models",
"audio_gen_models", "embedding_models"} "audio_gen_models", "embedding_models"}
...@@ -1375,10 +1418,6 @@ async def api_get_settings(username: str = Depends(require_admin)): ...@@ -1375,10 +1418,6 @@ async def api_get_settings(username: str = Depends(require_admin)):
"device_id": c.vulkan.device_id, "device_id": c.vulkan.device_id,
"single_gpu": c.vulkan.single_gpu, "single_gpu": c.vulkan.single_gpu,
}, },
"whisper": {
"server_path": c.whisper.server_path,
"server_port": c.whisper.server_port,
},
"system_prompt": c.system_prompt, "system_prompt": c.system_prompt,
"tools_closer_prompt": c.tools_closer_prompt, "tools_closer_prompt": c.tools_closer_prompt,
"grammar_guided": c.grammar_guided, "grammar_guided": c.grammar_guided,
...@@ -1442,11 +1481,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad ...@@ -1442,11 +1481,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
c.vulkan.device_id = int(vk.get("device_id", c.vulkan.device_id)) c.vulkan.device_id = int(vk.get("device_id", c.vulkan.device_id))
c.vulkan.single_gpu = bool(vk.get("single_gpu", c.vulkan.single_gpu)) c.vulkan.single_gpu = bool(vk.get("single_gpu", c.vulkan.single_gpu))
if "whisper" in data:
wh = data["whisper"]
c.whisper.server_path = wh.get("server_path") or None
c.whisper.server_port = int(wh.get("server_port", c.whisper.server_port))
if "system_prompt" in data: if "system_prompt" in data:
c.system_prompt = data["system_prompt"] or None c.system_prompt = data["system_prompt"] or None
if "tools_closer_prompt" in data: if "tools_closer_prompt" in data:
...@@ -1458,83 +1492,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad ...@@ -1458,83 +1492,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
config_manager.save_config() config_manager.save_config()
return {"success": True} return {"success": True}
# --- Whisper-server management ---
@router.get("/admin/api/whisper-server/status")
async def api_whisper_server_status(username: str = Depends(require_admin)):
    """Report the status of every known whisper-server instance.

    Returns a dict mapping each registered model id to the value of that
    instance's ``get_status()``. When no instances are registered but the
    legacy single ``whisper_server`` attribute is set, its status is
    returned under the key ``"whisper-server"``. With neither present the
    result is an empty dict.
    """
    from codai.models.manager import multi_model_manager

    registered = multi_model_manager.whisper_servers
    if registered:
        statuses = {}
        for server_id, server in registered.items():
            statuses[server_id] = server.get_status()
        return statuses

    # Legacy single-instance fallback.
    legacy = multi_model_manager.whisper_server
    if not legacy:
        return {}
    return {"whisper-server": legacy.get_status()}
@router.post("/admin/api/whisper-server/start")
async def api_whisper_server_start(request: Request, username: str = Depends(require_admin)):
    """Start a whisper-server instance by model_id, registering it if needed.

    JSON body fields:
        model_id: instance name (default ``"whisper-server"``).
        server_path: path of the whisper-server binary (required).
        model_path: optional model file path.
        port: TCP port, 1-65535 (default 8744).
        gpu_device: non-negative GPU index (default 0).

    An unknown ``model_id`` is registered through
    ``register_whisper_server``; a known one has its settings overwritten
    with the submitted values before ``start`` is called.

    Returns:
        ``{"success", "running", "started_model"}`` where the first two
        reflect ``is_running()`` after the start attempt and the last is
        whatever ``start`` returned.

    Raises:
        HTTPException: 400 when the body is not a JSON object, when
            ``server_path`` is missing, or when ``port`` / ``gpu_device``
            are not valid integers in range.
    """
    from codai.models.manager import multi_model_manager

    data = await request.json()
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="JSON object required")

    model_id = data.get("model_id", "whisper-server")
    server_path = data.get("server_path", "")
    model_path = data.get("model_path") or None
    if not server_path:
        raise HTTPException(status_code=400, detail="server_path required")

    # Malformed numbers are a client error, not a server crash.
    try:
        port = int(data.get("port", 8744))
        gpu_device = int(data.get("gpu_device", 0))
    except (TypeError, ValueError):
        raise HTTPException(
            status_code=400, detail="port and gpu_device must be integers"
        ) from None
    # Same bounds the model-configure endpoint applies to whisper-server entries.
    if port < 1 or port > 65535:
        raise HTTPException(status_code=400, detail="port must be between 1 and 65535")
    if gpu_device < 0:
        raise HTTPException(status_code=400, detail="gpu_device must be >= 0")

    wsm = multi_model_manager.whisper_servers.get(model_id)
    if wsm is None:
        wsm = multi_model_manager.register_whisper_server(
            model_id=model_id, server_path=server_path,
            model_path=model_path, port=port, gpu_device=gpu_device,
        )
    else:
        # Existing instance: overwrite its settings with the submitted ones.
        wsm.server_path = server_path
        wsm.port = port
        wsm.base_url = f"http://127.0.0.1:{port}"
        wsm._model_path = model_path
        wsm._gpu_device = gpu_device

    result = wsm.start(model_path, gpu_device=gpu_device)
    running = wsm.is_running()
    if running:
        # Record the instance in the manager's bookkeeping under "audio:<model_id>".
        ws_key = f"audio:{model_id}"
        multi_model_manager.models[ws_key] = wsm
        multi_model_manager.active_in_vram = ws_key
        multi_model_manager.models_in_vram.add(ws_key)
    return {"success": running, "running": running, "started_model": result}
@router.post("/admin/api/whisper-server/stop")
async def api_whisper_server_stop(request: Request, username: str = Depends(require_admin)):
"""Stop a whisper-server instance by model_id.

Reads an optional JSON body with ``model_id`` (default
``"whisper-server"``), calls ``stop()`` on the matching instance if one is
found, removes the ``"audio:<model_id>"`` entry from the manager's
bookkeeping, and always returns ``{"success": True, "running": False}``.
"""
from codai.models.manager import multi_model_manager
# Only parse the body when the client declared a JSON content type;
# otherwise behave as if an empty object was sent.
data = await request.json() if request.headers.get("content-type", "").startswith("application/json") else {}
model_id = data.get("model_id", "whisper-server")
# Prefer the instance registered under model_id; fall back to the single
# legacy whisper_server attribute when there is no such entry.
wsm = multi_model_manager.whisper_servers.get(model_id) or multi_model_manager.whisper_server
if wsm:
wsm.stop()
# NOTE(review): indentation is not visible in this view, so it cannot be
# confirmed whether the cleanup below is nested under `if wsm:` — verify.
ws_key = f"audio:{model_id}"
multi_model_manager.models.pop(ws_key, None)
multi_model_manager.models_in_vram.discard(ws_key)
# Clear the active marker only if it pointed at this instance.
if multi_model_manager.active_in_vram == ws_key:
multi_model_manager.active_in_vram = None
return {"success": True, "running": False}
# --- HuggingFace model search proxy --- # --- HuggingFace model search proxy ---
import re as _re import re as _re
...@@ -1773,4 +1730,4 @@ async def api_hf_model_info(model_id: str, username: str = Depends(require_admin ...@@ -1773,4 +1730,4 @@ async def api_hf_model_info(model_id: str, username: str = Depends(require_admin
"params_label": params_label, "params_label": params_label,
"gguf_files": gguf_files, "gguf_files": gguf_files,
"file_count": len(all_files), "file_count": len(all_files),
} }
\ No newline at end of file
...@@ -95,17 +95,26 @@ ...@@ -95,17 +95,26 @@
<div id="gguf-models-list"><span class="muted small">Loading…</span></div> <div id="gguf-models-list"><span class="muted small">Loading…</span></div>
</div> </div>
<!-- Whisper Server --> <div class="card mb-0" style="margin-top:1rem" id="ws-model-builder">
<div class="card mb-0" style="margin-top:1rem" id="ws-card"> <div class="card-title">Whisper-server simulated models</div>
<div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:.5rem"> <p class="muted small" style="margin-top:0">Create local audio models backed by dedicated whisper-server subprocess configurations.</p>
<div> <div style="display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:.75rem">
<div class="card-title" style="margin:0">whisper-server <span class="muted" style="font-size:11px;font-weight:400">— native subprocess (AMD/Vulkan)</span></div> <input id="ws-model-id" class="form-input" placeholder="whisper-vulkan-base">
<div id="ws-model-status" class="muted small" style="margin-top:.25rem"></div> <input id="ws-server-path" class="form-input" placeholder="/usr/local/bin/whisper-server">
</div> <input id="ws-model-path" class="form-input" placeholder="/models/ggml-base.bin">
<div style="display:flex;align-items:center;gap:.5rem"> <input id="ws-port" class="form-input" type="number" value="8744" min="1" max="65535">
<span id="ws-running-badge" style="font-size:12px;font-weight:500"></span> <input id="ws-gpu-device" class="form-input" type="number" value="0" min="0">
<a href="/admin/settings" class="btn btn-sm btn-ghost">Configure</a> <select id="ws-load-mode" class="form-input">
</div> <option value="on-request">On request</option>
<option value="load">Load</option>
</select>
</div>
<div style="display:grid;grid-template-columns:repeat(2,minmax(0,1fr));gap:.75rem;margin-top:.75rem">
<input id="ws-used-vram" class="form-input" type="number" min="0" step="0.1" placeholder="Used VRAM (optional)">
<div></div>
</div>
<div class="form-actions" style="margin-top:.75rem">
<button class="btn btn-primary" onclick="addWhisperServerModel()">Add model</button>
</div> </div>
</div> </div>
</div> </div>
...@@ -517,33 +526,6 @@ async function loadGlobalSettings(){ ...@@ -517,33 +526,6 @@ async function loadGlobalSettings(){
}catch{} }catch{}
} }
/**
 * Refresh the whisper-server card on the models page.
 *
 * Fetches the per-instance status map, hides the card when it is empty,
 * and otherwise updates the badge, card border and status line to show
 * either the running instances or the list of stopped ones. Any failure
 * (network, JSON, missing element) is silently ignored.
 */
async function loadWsStatus(){
  try{
    const response = await fetch('/admin/api/whisper-server/status');
    const statusById = await response.json();
    const cardEl = document.getElementById('ws-card');
    const badgeEl = document.getElementById('ws-running-badge');
    const detailEl = document.getElementById('ws-model-status');
    const all = Object.entries(statusById);

    // Nothing registered: hide the whole card.
    if(all.length === 0){
      cardEl.style.display = 'none';
      return;
    }
    cardEl.style.display = '';

    const active = all.filter(([, info]) => info.running);
    const anyActive = active.length > 0;

    badgeEl.textContent = anyActive
      ? `● ${active.length}/${all.length} running`
      : '○ stopped';
    badgeEl.style.color = anyActive ? 'var(--green, #4ade80)' : 'var(--text-2)';
    cardEl.style.borderColor = anyActive ? 'rgba(74,222,128,.3)' : '';
    detailEl.textContent = anyActive
      ? active.map(([id, info]) => `${id}: ${info.model||'?'} @ ${info.url}`).join(' | ')
      : all.map(([id]) => id).join(', ') + ' — not started';
  }catch{}
}
/* ── GGUF format toggle ──────────────────────────────── */ /* ── GGUF format toggle ──────────────────────────────── */
let _ggufMode = 'gguf'; let _ggufMode = 'gguf';
document.querySelectorAll('.tog-btn').forEach(btn=>{ document.querySelectorAll('.tog-btn').forEach(btn=>{
...@@ -982,6 +964,57 @@ async function loadCacheStats(){ ...@@ -982,6 +964,57 @@ async function loadCacheStats(){
let _localModels = []; let _localModels = [];
/**
 * Build the "Configured whisper-server models" card.
 *
 * Each model is appended to the shared `_localModels` array so the row
 * buttons (Load / Unload / Configure / Remove) can refer to it by index,
 * and one table row is rendered for it.
 *
 * The caller inserts the returned HTML next to the GGUF list on every
 * refresh. Any card rendered by an earlier call is therefore removed
 * first; otherwise each refresh would stack another copy whose buttons
 * point at stale `_localModels` indexes.
 *
 * @param {Array<Object>} models  whisper-server model entries (id, backend,
 *                                model_path, port, gpu_device, load_mode, capabilities).
 * @returns {string} card HTML, or '' when there are no models.
 */
function _renderWhisperServerRows(models){
  // Drop the card left by a previous refresh, even if the list is now empty.
  document.getElementById('ws-models-card')?.remove();
  if(!models.length) return '';

  const rows = models.map(m=>{
    const idx = _localModels.length;
    _localModels.push({
      label:m.id,
      path:m.id,
      cacheType:'whisper-server',
      size_gb:0,
      defaultType:'audio_models',
      settings:{
        backend:m.backend || 'whisper-server',
        load_mode:m.load_mode || 'on-request',
        model_type:'audio_models',
        model_path:m.model_path || '',
        port:m.port,
        gpu_device:m.gpu_device,
      },
      in_config:true,
      capabilities:m.capabilities || ['speech_to_text']
    });
    // Loaded models may be keyed either as "audio:<id>" or by the bare id.
    const loaded = _loadedKeys.has(`audio:${m.id}`) || _loadedKeys.has(m.id);
    // Escape every server-supplied value, including the numeric ones.
    const portText = esc(String(m.port ?? '—'));
    const gpuText = esc(String(m.gpu_device ?? 0));
    return `<tr style="border-top:1px solid var(--border)">
      <td style="padding:.4rem .25rem;font-family:monospace;font-size:12px">${esc(m.id)}</td>
      <td style="padding:.4rem .25rem"><span class="badge badge-ok">${esc(m.backend || 'whisper-server')}</span></td>
      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${esc(m.model_path || '—')}</td>
      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${portText} / GPU ${gpuText}</td>
      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${esc(m.load_mode || 'on-request')}</td>
      <td style="padding:.4rem .25rem;text-align:center">${loaded?'<span class="badge badge-ok">loaded</span>':'<span class="muted small">idle</span>'}</td>
      <td style="padding:.4rem .25rem;text-align:right;white-space:nowrap">
        ${loaded
          ?`<button class="btn btn-ghost btn-sm" onclick="unloadModel(${idx})">Unload</button>`
          :`<button class="btn btn-primary btn-sm" onclick="loadModel(${idx})">Load now</button>`}
        <button class="btn btn-secondary btn-sm" onclick="openCfgModal(${idx})">Configure</button>
        <button class="btn btn-ghost btn-sm" onclick="disableModel(${idx})">Remove</button>
      </td>
    </tr>`;
  });

  return '<div class="card" id="ws-models-card" style="margin-top:1rem">'+
    '<div class="card-title">Configured whisper-server models</div>'+
    '<table style="width:100%;border-collapse:collapse;font-size:13px">'+
    '<thead><tr style="color:var(--text-2);font-size:10px;text-transform:uppercase;letter-spacing:.05em">'+
    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Model</th>'+
    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Backend</th>'+
    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Model path</th>'+
    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Port / GPU</th>'+
    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Load mode</th>'+
    '<th style="text-align:center;padding:.3rem .25rem;font-weight:700">Status</th>'+
    '<th></th></tr></thead><tbody>'+rows.join('')+'</tbody></table></div>';
}
async function loadCachedModels(){ async function loadCachedModels(){
_localModels = []; _localModels = [];
const hfEl = document.getElementById('hf-models-list'); const hfEl = document.getElementById('hf-models-list');
...@@ -991,6 +1024,8 @@ async function loadCachedModels(){ ...@@ -991,6 +1024,8 @@ async function loadCachedModels(){
const r = await fetch('/admin/api/cached-models'); const r = await fetch('/admin/api/cached-models');
if(!r.ok) throw new Error((await r.json()).detail||r.statusText); if(!r.ok) throw new Error((await r.json()).detail||r.statusText);
const d = await r.json(); const d = await r.json();
const whisperModels = (await fetch('/admin/api/models').then(r=>r.ok?r.json():[]))
.filter(m => m.backend === 'whisper-server');
// HF models // HF models
const hf = d.hf||[]; const hf = d.hf||[];
...@@ -1067,6 +1102,7 @@ async function loadCachedModels(){ ...@@ -1067,6 +1102,7 @@ async function loadCachedModels(){
'<th style="text-align:center;padding:.3rem .25rem;font-weight:700">Config</th>'+ '<th style="text-align:center;padding:.3rem .25rem;font-weight:700">Config</th>'+
'<th></th></tr></thead><tbody>'+rows.join('')+'</tbody></table>'; '<th></th></tr></thead><tbody>'+rows.join('')+'</tbody></table>';
} }
ggufEl.insertAdjacentHTML('afterend', _renderWhisperServerRows(whisperModels));
}catch(e){ }catch(e){
hfEl.innerHTML = ggufEl.innerHTML = `<span class="muted small">Error: ${esc(e.message)}</span>`; hfEl.innerHTML = ggufEl.innerHTML = `<span class="muted small">Error: ${esc(e.message)}</span>`;
} }
...@@ -1089,8 +1125,6 @@ async function refreshLocal(){ ...@@ -1089,8 +1125,6 @@ async function refreshLocal(){
loadGlobalSettings(); loadGlobalSettings();
refreshLocal(); refreshLocal();
loadWsStatus();
setInterval(loadWsStatus, 5000);
async function clearCacheConfirm(type){ async function clearCacheConfirm(type){
const labels = {hf:'HuggingFace', gguf:'GGUF', all:'ALL'}; const labels = {hf:'HuggingFace', gguf:'GGUF', all:'ALL'};
...@@ -1232,6 +1266,9 @@ function openCfgModal(idx){ ...@@ -1232,6 +1266,9 @@ function openCfgModal(idx){
document.getElementById('cfg-parser').value = s.parser || 'auto'; document.getElementById('cfg-parser').value = s.parser || 'auto';
document.getElementById('cfg-tools').checked = !!s.tools_closer_prompt; document.getElementById('cfg-tools').checked = !!s.tools_closer_prompt;
document.getElementById('cfg-grammar').checked = !!s.grammar_guided; document.getElementById('cfg-grammar').checked = !!s.grammar_guided;
if (m.cacheType === 'whisper-server') {
document.getElementById('cfg-backend').value = 'cpu';
}
openModal('cfg-modal'); openModal('cfg-modal');
} }
...@@ -1282,6 +1319,31 @@ async function saveModelConfig(){ ...@@ -1282,6 +1319,31 @@ async function saveModelConfig(){
}catch(e){ alert('Error: '+e.message); } }catch(e){ alert('Error: '+e.message); }
} }
/**
 * Submit the "Add model" form for a whisper-server backed audio model.
 *
 * Collects the form fields into a payload, POSTs it to the
 * model-configure endpoint, and refreshes the local model list on
 * success. A failed request or a non-OK response is reported via alert().
 */
async function addWhisperServerModel(){
  const valueOf = id => document.getElementById(id).value;

  // Blank or non-numeric VRAM input is sent as null.
  const vramGb = parseFloat(valueOf('ws-used-vram'));
  const payload = {
    model_id: valueOf('ws-model-id').trim(),
    model_type: 'audio_models',
    backend: 'whisper-server',
    server_path: valueOf('ws-server-path').trim(),
    model_path: valueOf('ws-model-path').trim() || null,
    port: parseInt(valueOf('ws-port'), 10) || 8744,
    gpu_device: parseInt(valueOf('ws-gpu-device'), 10) || 0,
    load_mode: valueOf('ws-load-mode'),
    used_vram_gb: Number.isNaN(vramGb) ? null : vramGb,
  };

  try{
    const response = await fetch('/admin/api/model-configure', {
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body: JSON.stringify(payload)
    });
    const body = await response.json();
    if(!response.ok){
      throw new Error(body.detail || 'Failed to add whisper-server model');
    }
    refreshLocal();
  }catch(err){
    alert('Error: '+err.message);
  }
}
async function loadModel(idx){ async function loadModel(idx){
const m = _localModels[idx]; const m = _localModels[idx];
// Find the button and show loading state // Find the button and show loading state
......
...@@ -69,48 +69,6 @@ ...@@ -69,48 +69,6 @@
<span class="form-hint">Models will inherit this as default when configured</span> <span class="form-hint">Models will inherit this as default when configured</span>
</div> </div>
</div> </div>
<!-- Whisper Server -->
<div class="card mb-0" style="margin-top:1rem">
<div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:.5rem;margin-bottom:1rem">
<div class="card-title" style="margin:0">Whisper Server <span class="muted" style="font-size:11px;font-weight:400">(whisper.cpp native binary — recommended for AMD/Vulkan)</span></div>
<div style="display:flex;align-items:center;gap:.5rem">
<span id="ws-badge" class="muted small"></span>
<button class="btn btn-sm btn-secondary" onclick="wsStart()">Start</button>
<button class="btn btn-sm btn-danger" onclick="wsStop()">Stop</button>
</div>
</div>
<div style="display:grid;grid-template-columns:1fr 160px;gap:1rem;align-items:start">
<div class="form-row" style="margin:0">
<label class="form-label">Model ID <span class="muted">(used in API calls, e.g. whisper-base)</span></label>
<input type="text" id="ws-id" class="form-input" placeholder="whisper-server">
<span class="form-hint">The name clients use in the <code>model</code> field of transcription requests</span>
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Port</label>
<input type="number" id="ws-port" class="form-input" placeholder="8744" min="1024" max="65535">
</div>
</div>
<div style="display:grid;grid-template-columns:1fr 160px;gap:1rem;align-items:start;margin-top:1rem">
<div class="form-row" style="margin:0">
<label class="form-label">whisper-server binary path</label>
<input type="text" id="ws-path" class="form-input" placeholder="/usr/local/bin/whisper-server">
</div>
<div class="form-row" style="margin:0">
<label class="form-label">GPU device index</label>
<input type="number" id="ws-gpu" class="form-input" placeholder="0" min="0">
</div>
</div>
<div class="form-row" style="margin-top:1rem;margin-bottom:0">
<label class="form-label">Model path <span class="muted">(GGUF whisper model, e.g. ggml-base.bin)</span></label>
<input type="text" id="ws-model" class="form-input" placeholder="/path/to/ggml-base.bin">
<span class="form-hint">Configure multiple instances by adding entries to <code>models.json</code> with <code>"backend": "whisper-server"</code></span>
</div>
<p class="form-hint" style="margin-top:.75rem;margin-bottom:0">
When configured, the transcription endpoint uses this subprocess instead of the Python faster-whisper module.
Saves settings to <code>config.json</code> and takes effect immediately (no restart needed).
</p>
</div>
{% endblock %} {% endblock %}
{% block scripts %} {% block scripts %}
...@@ -140,65 +98,10 @@ async function loadSettings(){ ...@@ -140,65 +98,10 @@ async function loadSettings(){
document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? ''; document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? '';
document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? ''; document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload'; document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
document.getElementById('ws-path').value = d.whisper?.server_path ?? '';
document.getElementById('ws-port').value = d.whisper?.server_port ?? 8744;
toggleHttps(); toggleHttps();
}catch(e){ showAlert('error','Failed to load settings: '+e.message); } }catch(e){ showAlert('error','Failed to load settings: '+e.message); }
} }
/**
 * Refresh the whisper-server badge on the settings page.
 *
 * The status endpoint returns a map of {model_id: {running, model, url}}.
 * The badge shows "not configured" for an empty map, the number of running
 * instances when any are running, and "stopped" otherwise. Errors are
 * silently ignored.
 */
async function loadWsStatus(){
  try{
    const response = await fetch('/admin/api/whisper-server/status');
    const statusById = await response.json();
    const badgeEl = document.getElementById('ws-badge');
    const all = Object.entries(statusById);

    let label = '○ stopped';
    let colour = 'var(--text-2)';
    if(all.length === 0){
      label = '○ not configured';
    } else {
      const activeCount = all.filter(([, info]) => info.running).length;
      if(activeCount){
        label = `● ${activeCount} running`;
        colour = 'var(--green, #4ade80)';
      }
    }

    badgeEl.textContent = label;
    badgeEl.style.color = colour;
  }catch(e){}
}
/**
 * Start a whisper-server instance using the values in the settings form.
 *
 * Requires a binary path; the other fields fall back to defaults
 * (model id 'whisper-server', port 8744, GPU 0). Shows an alert with the
 * outcome and refreshes the status badge afterwards.
 */
async function wsStart(){
  const binaryPath = document.getElementById('ws-path').value.trim();
  if(!binaryPath){
    showAlert('error','Binary path required');
    return;
  }
  try{
    const payload = {
      model_id: document.getElementById('ws-id').value.trim() || 'whisper-server',
      server_path: binaryPath,
      model_path: document.getElementById('ws-model').value.trim() || null,
      port: parseInt(document.getElementById('ws-port').value) || 8744,
      gpu_device: parseInt(document.getElementById('ws-gpu').value) || 0,
    };
    const response = await fetch('/admin/api/whisper-server/start',{
      method:'POST',
      headers:{'Content-Type':'application/json'},
      body: JSON.stringify(payload)
    });
    const outcome = await response.json();
    if(outcome.success){
      showAlert('info','whisper-server started');
    } else {
      showAlert('error','Failed to start whisper-server');
    }
    loadWsStatus();
  }catch(err){
    showAlert('error','Error: '+err.message);
  }
}
/**
 * Stop the whisper-server instance named in the settings form
 * (default id 'whisper-server').
 *
 * Success is reported only when the server answers with an OK status.
 * HTTP errors and network failures are shown as an error alert instead of
 * being reported as "stopped" or left as an unhandled rejection. The
 * status badge is refreshed in either case.
 */
async function wsStop(){
  const modelId = document.getElementById('ws-id').value.trim() || 'whisper-server';
  try{
    const r = await fetch('/admin/api/whisper-server/stop',{
      method:'POST', headers:{'Content-Type':'application/json'},
      body: JSON.stringify({model_id: modelId})
    });
    if(!r.ok) throw new Error(`HTTP ${r.status}`);
    showAlert('info','whisper-server stopped');
  }catch(e){
    showAlert('error','Failed to stop whisper-server: '+e.message);
  }
  loadWsStatus();
}
async function saveSettings(){ async function saveSettings(){
const strOrNull = id => document.getElementById(id).value.trim() || null; const strOrNull = id => document.getElementById(id).value.trim() || null;
const data = { const data = {
...@@ -216,11 +119,7 @@ async function saveSettings(){ ...@@ -216,11 +119,7 @@ async function saveSettings(){
}, },
offload:{ offload:{
directory: document.getElementById('s-offload-dir').value.trim() || './offload', directory: document.getElementById('s-offload-dir').value.trim() || './offload',
}, }
whisper:{
server_path: document.getElementById('ws-path').value.trim() || null,
server_port: parseInt(document.getElementById('ws-port').value) || 8744,
},
}; };
try{ try{
const r = await fetch('/admin/api/settings',{ const r = await fetch('/admin/api/settings',{
...@@ -233,7 +132,5 @@ async function saveSettings(){ ...@@ -233,7 +132,5 @@ async function saveSettings(){
} }
loadSettings(); loadSettings();
loadWsStatus();
setInterval(loadWsStatus, 5000);
</script> </script>
{% endblock %} {% endblock %}
...@@ -134,33 +134,30 @@ async def create_transcription( ...@@ -134,33 +134,30 @@ async def create_transcription(
if len(file_content) > _MAX_AUDIO_BYTES: if len(file_content) > _MAX_AUDIO_BYTES:
raise HTTPException(status_code=413, detail="Audio file too large (max 100 MB)") raise HTTPException(status_code=413, detail="Audio file too large (max 100 MB)")
# Check if the requested model is a whisper-server instance # Check if the requested model maps to a configured whisper-server instance first
wsm = multi_model_manager.whisper_servers.get(model) whisper_server = multi_model_manager.whisper_servers.get(model)
if wsm is None and multi_model_manager.whisper_server is not None: if whisper_server is not None:
# Legacy single-instance fallback: use it if no specific match
if not multi_model_manager.whisper_servers:
wsm = multi_model_manager.whisper_server
if wsm is not None:
ws_key = f"audio:{model}" if model in multi_model_manager.whisper_servers else "audio:whisper-server"
# Let the VRAM manager evict other models if needed
multi_model_manager.request_model(requested_model=model, model_type="audio") multi_model_manager.request_model(requested_model=model, model_type="audio")
if not whisper_server.is_running():
# Start the subprocess if it isn't running (on-demand) whisper_server.start(
if not wsm.is_running(): getattr(whisper_server, "_model_path", None),
wsm.start(getattr(wsm, '_model_path', None), gpu_device=getattr(wsm, '_gpu_device', 0)) gpu_device=getattr(whisper_server, "_gpu_device", 0),
if wsm.is_running(): )
multi_model_manager.models[ws_key] = wsm if whisper_server.is_running():
ws_key = f"audio:{model}"
multi_model_manager.models[ws_key] = whisper_server
multi_model_manager.active_in_vram = ws_key multi_model_manager.active_in_vram = ws_key
multi_model_manager.models_in_vram.add(ws_key) multi_model_manager.models_in_vram.add(ws_key)
if not whisper_server.is_running():
if wsm.is_running(): raise HTTPException(status_code=500, detail="whisper-server failed to start")
result = wsm.transcribe(file_content, language=language, prompt=prompt) result = whisper_server.transcribe(
if "error" in result: file_content,
raise HTTPException(status_code=500, detail=result["error"]) language=language,
return _format_response(response_format, result.get("text", ""), []) prompt=prompt
# Fall through to Python backends if subprocess failed to start )
if "error" in result:
raise HTTPException(status_code=500, detail=result["error"])
return _format_response(response_format, result.get("text", ""), [])
# Use the manager to resolve the model and manage VRAM # Use the manager to resolve the model and manage VRAM
model_info = multi_model_manager.request_model( model_info = multi_model_manager.request_model(
...@@ -265,4 +262,4 @@ async def create_transcription( ...@@ -265,4 +262,4 @@ async def create_transcription(
try: try:
os.unlink(tmp_path) os.unlink(tmp_path)
except Exception: except Exception:
pass pass
\ No newline at end of file
...@@ -344,10 +344,6 @@ class ConfigManager: ...@@ -344,10 +344,6 @@ class ConfigManager:
"vae_tiling": self.config.image.vae_tiling, "vae_tiling": self.config.image.vae_tiling,
"clip_on_cpu": self.config.image.clip_on_cpu "clip_on_cpu": self.config.image.clip_on_cpu
}, },
"whisper": {
"server_path": self.config.whisper.server_path,
"server_port": self.config.whisper.server_port
},
"system_prompt": self.config.system_prompt, "system_prompt": self.config.system_prompt,
"tools_closer_prompt": self.config.tools_closer_prompt, "tools_closer_prompt": self.config.tools_closer_prompt,
"grammar_guided": self.config.grammar_guided, "grammar_guided": self.config.grammar_guided,
...@@ -377,4 +373,4 @@ class ConfigManager: ...@@ -377,4 +373,4 @@ class ConfigManager:
def reload(self): def reload(self):
"""Reload all configuration files.""" """Reload all configuration files."""
return self.load() return self.load()
\ No newline at end of file
...@@ -370,16 +370,21 @@ def main(): ...@@ -370,16 +370,21 @@ def main():
mid = _model_id(m) mid = _model_id(m)
if not mid: if not mid:
continue continue
backend = m.get("backend", "") if isinstance(m, dict) else "" if isinstance(m, dict) and m.get("backend") == "whisper-server":
if backend == "whisper-server":
# Register as a whisper-server instance
cfg = _model_cfg(m, "audio") cfg = _model_cfg(m, "audio")
cfg.update({
"backend": "whisper-server",
"server_path": m.get("server_path", ""),
"model_path": m.get("model_path") or None,
"port": int(m.get("port", 8744)),
"gpu_device": int(m.get("gpu_device", 0)),
})
multi_model_manager.register_whisper_server( multi_model_manager.register_whisper_server(
model_id=mid, model_id=mid,
server_path=m.get("server_path", config.whisper.server_path or ""), server_path=m.get("server_path", ""),
model_path=m.get("model_path") or None, model_path=m.get("model_path") or None,
port=int(m.get("port", config.whisper.server_port)), port=int(m.get("port", 8744)),
gpu_device=int(m.get("gpu_device", config.vulkan.device_id)), gpu_device=int(m.get("gpu_device", 0)),
config=cfg, config=cfg,
) )
else: else:
...@@ -680,4 +685,4 @@ def main(): ...@@ -680,4 +685,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()
\ No newline at end of file
...@@ -450,6 +450,7 @@ class MultiModelManager: ...@@ -450,6 +450,7 @@ class MultiModelManager:
self.whisper_server.stop() self.whisper_server.stop()
except Exception: except Exception:
pass pass
self.whisper_servers.clear()
# Clear all model lists # Clear all model lists
self.default_model = None self.default_model = None
...@@ -649,6 +650,10 @@ class MultiModelManager: ...@@ -649,6 +650,10 @@ class MultiModelManager:
self.audio_models.append(model_name) self.audio_models.append(model_name)
self.config[f"audio:{model_name}"] = config or {} self.config[f"audio:{model_name}"] = config or {}
if isinstance(config, dict) and config.get("backend") == "whisper-server":
print(f"Registered whisper-server audio model: {model_name}")
return
# Download/cache the model at startup if it's a URL or HF ID # Download/cache the model at startup if it's a URL or HF ID
resolved_model = self.load_model(model_name) resolved_model = self.load_model(model_name)
if resolved_model != model_name: if resolved_model != model_name:
...@@ -1803,16 +1808,22 @@ class MultiModelManager: ...@@ -1803,16 +1808,22 @@ class MultiModelManager:
"embedding_models": "embedding", "embedding_models": "embedding",
} }
def _add(model_id: str, model_type: str = None): def _add(model_id: str, model_type: str = None, meta: Dict[str, Any] = None):
if model_id in seen_ids: if model_id in seen_ids:
return return
seen_ids.add(model_id) seen_ids.add(model_id)
caps = detect_model_capabilities(model_id) caps = detect_model_capabilities(model_id)
resolved_type = model_type or (caps.to_list()[0].split("_")[0] if caps.to_list() else "text") resolved_type = model_type or (caps.to_list()[0].split("_")[0] if caps.to_list() else "text")
meta = meta or {}
models.append(ModelInfo( models.append(ModelInfo(
id=model_id, id=model_id,
type=resolved_type, type=resolved_type,
capabilities=caps.to_list(), capabilities=caps.to_list(),
backend=meta.get("backend"),
model_path=meta.get("model_path"),
port=meta.get("port"),
gpu_device=meta.get("gpu_device"),
load_mode=meta.get("load_mode"),
)) ))
# --- Models from config (the authoritative source) --- # --- Models from config (the authoritative source) ---
...@@ -1831,15 +1842,15 @@ class MultiModelManager: ...@@ -1831,15 +1842,15 @@ class MultiModelManager:
mid = m.get("alias") or m.get("path") or m.get("id") or "" mid = m.get("alias") or m.get("path") or m.get("id") or ""
raw = m.get("path") or m.get("id") or "" raw = m.get("path") or m.get("id") or ""
if raw and raw != mid: if raw and raw != mid:
_add(raw, mtype) _add(raw, mtype, m)
short = raw.split("/")[-1] if "/" in raw else raw short = raw.split("/")[-1] if "/" in raw else raw
if short != raw: if short != raw:
_add(short, mtype) _add(short, mtype, m)
if mid: if mid:
_add(mid, mtype) _add(mid, mtype, m if isinstance(m, dict) else None)
short = mid.split("/")[-1] if "/" in mid else mid short = mid.split("/")[-1] if "/" in mid else mid
if short != mid: if short != mid:
_add(short, mtype) _add(short, mtype, m if isinstance(m, dict) else None)
except Exception: except Exception:
pass pass
...@@ -1901,4 +1912,4 @@ class MultiModelManager: ...@@ -1901,4 +1912,4 @@ class MultiModelManager:
# Global singleton instances for convenience # Global singleton instances for convenience
model_manager = ModelManager() model_manager = ModelManager()
multi_model_manager = MultiModelManager() multi_model_manager = MultiModelManager()
\ No newline at end of file
...@@ -121,8 +121,13 @@ class ModelInfo(BaseModel): ...@@ -121,8 +121,13 @@ class ModelInfo(BaseModel):
owned_by: str = "huggingface" owned_by: str = "huggingface"
type: Optional[str] = None # e.g. "text", "image", "video", "audio", "tts", "vision", "embedding" type: Optional[str] = None # e.g. "text", "image", "video", "audio", "tts", "vision", "embedding"
capabilities: Optional[List[str]] = None # list of capability strings capabilities: Optional[List[str]] = None # list of capability strings
backend: Optional[str] = None
model_path: Optional[str] = None
port: Optional[int] = None
gpu_device: Optional[int] = None
load_mode: Optional[str] = None
class ModelList(BaseModel): class ModelList(BaseModel):
object: str = "list" object: str = "list"
data: List[ModelInfo] data: List[ModelInfo]
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment