merge: integrate whisper-server local model workflow

ad758123 · Stefy Lanza (nextime / spora ) · b17e45a5 · da83cc25 · ad758123 · ad758123
Commit ad758123 authored May 06, 2026 by Stefy Lanza (nextime / spora )
10 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,5 @@ debug.log

 # Test files
 test_*.py
+!tests/
+!tests/test_whisper_server_local_models.py
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -1172,6 +1172,17 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
                raise RuntimeError("Model failed to load")
            multi_model_manager.models[result["model_key"] or path] = mm
            multi_model_manager.active_in_vram = result["model_key"] or path
+        elif model_type == "audio" and multi_model_manager.whisper_servers.get(path) is not None:
+            # Only claim audio models registered as whisper-server instances; any other audio model continues down the elif chain.
+            wsm = multi_model_manager.whisper_servers[path]
+            started = wsm.start(getattr(wsm, "_model_path", None), gpu_device=getattr(wsm, "_gpu_device", 0))
+            if not wsm.is_running():
+                raise RuntimeError("whisper-server failed to start")
+            model_key = f"audio:{path}"
+            multi_model_manager.models[model_key] = wsm
+            multi_model_manager.active_in_vram = model_key
+            multi_model_manager.models_in_vram.add(model_key)
+            return {"success": True, "already_loaded": False, "started_model": started}
        elif model_type == "image":
            from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model
            from codai.api.state import get_global_args
@@ -1243,6 +1254,56 @@ async def api_model_configure(request: Request, username: str = Depends(require_
    if config_manager is None:
        raise HTTPException(status_code=503, detail="Config manager not initialized")
    data = await request.json()
+    if data.get("backend") == "whisper-server":
+        # Append a whisper-server backed entry to models_data["audio_models"] and persist it.
+        # NOTE(review): the entry is only written to the config here; api_model_load looks the model up in
+        # multi_model_manager.whisper_servers — confirm whether it must also be registered there without a restart.
+        model_id = str(data.get("model_id") or "").strip()
+        if not model_id:
+            raise HTTPException(status_code=400, detail="model_id is required")
+        server_path = str(data.get("server_path") or "").strip()
+        if not server_path:
+            raise HTTPException(status_code=400, detail="server_path is required")
+        # A JSON null falls back to the default; a non-numeric value is a client error (400), not a 500.
+        raw_port = data.get("port")
+        raw_gpu_device = data.get("gpu_device")
+        try:
+            port = int(8744 if raw_port is None else raw_port)
+            gpu_device = int(0 if raw_gpu_device is None else raw_gpu_device)
+        except (TypeError, ValueError, OverflowError) as exc:
+            raise HTTPException(status_code=400, detail="port and gpu_device must be integers") from exc
+        if port < 1 or port > 65535:
+            raise HTTPException(status_code=400, detail="port must be between 1 and 65535")
+        if gpu_device < 0:
+            raise HTTPException(status_code=400, detail="gpu_device must be >= 0")
+        used_vram_gb = data.get("used_vram_gb")
+        if used_vram_gb is not None:
+            try:
+                used_vram_gb = float(used_vram_gb)
+            except (TypeError, ValueError) as exc:
+                raise HTTPException(status_code=400, detail="used_vram_gb must be a number") from exc
+            # The chained comparison also rejects NaN and infinity.
+            if not 0 <= used_vram_gb < float("inf"):
+                raise HTTPException(status_code=400, detail="used_vram_gb must be a finite number >= 0")
+        for existing in config_manager.models_data.get("audio_models", []):
+            if isinstance(existing, dict) and existing.get("id") == model_id:
+                raise HTTPException(status_code=409, detail=f"whisper-server model '{model_id}' already exists")
+        entry = {
+            "id": model_id,
+            "backend": "whisper-server",
+            "server_path": server_path,
+            "model_path": str(data.get("model_path") or "").strip() or None,
+            "port": port,
+            "gpu_device": gpu_device,
+            "load_mode": data.get("load_mode") or "on-request",
+            "model_type": "audio_models",
+            "model_types": ["audio_models"],
+        }
+        if used_vram_gb is not None:
+            entry["used_vram_gb"] = used_vram_gb
+        config_manager.models_data.setdefault("audio_models", []).append(entry)
+        config_manager.save_models()
+        return {"success": True}
    path = data.get("path") or data.get("model_id", "")
    valid = {"text_models", "image_models", "audio_models", "tts_models", "vision_models", "video_models",
             "audio_gen_models", "embedding_models"}
@@ -1375,10 +1418,6 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "device_id": c.vulkan.device_id,
            "single_gpu": c.vulkan.single_gpu,
        },
-        "whisper": {
-            "server_path": c.whisper.server_path,
-            "server_port": c.whisper.server_port,
-        },
        "system_prompt": c.system_prompt,
        "tools_closer_prompt": c.tools_closer_prompt,
        "grammar_guided": c.grammar_guided,
@@ -1442,11 +1481,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
        c.vulkan.device_id = int(vk.get("device_id", c.vulkan.device_id))
        c.vulkan.single_gpu = bool(vk.get("single_gpu", c.vulkan.single_gpu))

-    if "whisper" in data:
-        wh = data["whisper"]
-        c.whisper.server_path = wh.get("server_path") or None
-        c.whisper.server_port = int(wh.get("server_port", c.whisper.server_port))
-
    if "system_prompt" in data:
        c.system_prompt = data["system_prompt"] or None
    if "tools_closer_prompt" in data:
@@ -1458,83 +1492,6 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad

    config_manager.save_config()
    return {"success": True}
-
-
-
-# --- Whisper-server management ---
-
-@router.get("/admin/api/whisper-server/status")
-async def api_whisper_server_status(username: str = Depends(require_admin)):
-    """Return status of all registered whisper-server instances."""
-    from codai.models.manager import multi_model_manager
-    if multi_model_manager.whisper_servers:
-        return {
-            mid: wsm.get_status()
-            for mid, wsm in multi_model_manager.whisper_servers.items()
-        }
-    # Legacy single-instance fallback
-    if multi_model_manager.whisper_server:
-        return {"whisper-server": multi_model_manager.whisper_server.get_status()}
-    return {}
-
-
-@router.post("/admin/api/whisper-server/start")
-async def api_whisper_server_start(request: Request, username: str = Depends(require_admin)):
-    """Start (or restart) a whisper-server instance by model_id."""
-    from codai.models.manager import multi_model_manager
-    data = await request.json()
-    model_id   = data.get("model_id", "whisper-server")
-    server_path = data.get("server_path", "")
-    model_path  = data.get("model_path") or None
-    port        = int(data.get("port", 8744))
-    gpu_device  = int(data.get("gpu_device", 0))
-
-    if not server_path:
-        raise HTTPException(status_code=400, detail="server_path required")
-
-    wsm = multi_model_manager.whisper_servers.get(model_id)
-    if wsm is None:
-        wsm = multi_model_manager.register_whisper_server(
-            model_id=model_id, server_path=server_path,
-            model_path=model_path, port=port, gpu_device=gpu_device,
-        )
-    else:
-        wsm.server_path = server_path
-        wsm.port = port
-        wsm.base_url = f"http://127.0.0.1:{port}"
-        wsm._model_path = model_path
-        wsm._gpu_device = gpu_device
-
-    result = wsm.start(model_path, gpu_device=gpu_device)
-    running = wsm.is_running()
-
-    if running:
-        ws_key = f"audio:{model_id}"
-        multi_model_manager.models[ws_key] = wsm
-        multi_model_manager.active_in_vram = ws_key
-        multi_model_manager.models_in_vram.add(ws_key)
-
-    return {"success": running, "running": running, "started_model": result}
-
-
-@router.post("/admin/api/whisper-server/stop")
-async def api_whisper_server_stop(request: Request, username: str = Depends(require_admin)):
-    """Stop a whisper-server instance by model_id."""
-    from codai.models.manager import multi_model_manager
-    data = await request.json() if request.headers.get("content-type", "").startswith("application/json") else {}
-    model_id = data.get("model_id", "whisper-server")
-
-    wsm = multi_model_manager.whisper_servers.get(model_id) or multi_model_manager.whisper_server
-    if wsm:
-        wsm.stop()
-        ws_key = f"audio:{model_id}"
-        multi_model_manager.models.pop(ws_key, None)
-        multi_model_manager.models_in_vram.discard(ws_key)
-        if multi_model_manager.active_in_vram == ws_key:
-            multi_model_manager.active_in_vram = None
-    return {"success": True, "running": False}
-
-
 # --- HuggingFace model search proxy ---

 import re as _re
@@ -1773,4 +1730,4 @@ async def api_hf_model_info(model_id: str, username: str = Depends(require_admin
        "params_label": params_label,
        "gguf_files": gguf_files,
        "file_count": len(all_files),
-    }
\ No newline at end of file
+    }
--- a/codai/admin/templates/models.html
+++ b/codai/admin/templates/models.html
@@ -95,17 +95,26 @@
    <div id="gguf-models-list"><span class="muted small">Loading…</span></div>
  </div>

-  <!-- Whisper Server -->
-  <div class="card mb-0" style="margin-top:1rem" id="ws-card">
-    <div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:.5rem">
-      <div>
-        <div class="card-title" style="margin:0">whisper-server <span class="muted" style="font-size:11px;font-weight:400">— native subprocess (AMD/Vulkan)</span></div>
-        <div id="ws-model-status" class="muted small" style="margin-top:.25rem">—</div>
-      </div>
-      <div style="display:flex;align-items:center;gap:.5rem">
-        <span id="ws-running-badge" style="font-size:12px;font-weight:500">—</span>
-        <a href="/admin/settings" class="btn btn-sm btn-ghost">Configure</a>
-      </div>
+  <div class="card mb-0" style="margin-top:1rem" id="ws-model-builder">
+    <div class="card-title">Whisper-server simulated models</div>
+    <p class="muted small" style="margin-top:0">Create local audio models backed by dedicated whisper-server subprocess configurations.</p>
+    <div style="display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:.75rem">
+      <input id="ws-model-id" class="form-input" placeholder="whisper-vulkan-base">
+      <input id="ws-server-path" class="form-input" placeholder="/usr/local/bin/whisper-server">
+      <input id="ws-model-path" class="form-input" placeholder="/models/ggml-base.bin">
+      <input id="ws-port" class="form-input" type="number" value="8744" min="1" max="65535">
+      <input id="ws-gpu-device" class="form-input" type="number" value="0" min="0">
+      <select id="ws-load-mode" class="form-input">
+        <option value="on-request">On request</option>
+        <option value="load">Load</option>
+      </select>
+    </div>
+    <div style="display:grid;grid-template-columns:repeat(2,minmax(0,1fr));gap:.75rem;margin-top:.75rem">
+      <input id="ws-used-vram" class="form-input" type="number" min="0" step="0.1" placeholder="Used VRAM (optional)">
+      <div></div>
+    </div>
+    <div class="form-actions" style="margin-top:.75rem">
+      <button class="btn btn-primary" onclick="addWhisperServerModel()">Add model</button>
    </div>
  </div>
 </div>
@@ -517,33 +526,6 @@ async function loadGlobalSettings(){
  }catch{}
 }

-async function loadWsStatus(){
-  try{
-    const s = await fetch('/admin/api/whisper-server/status').then(r=>r.json());
-    const card = document.getElementById('ws-card');
-    const badge = document.getElementById('ws-running-badge');
-    const modelEl = document.getElementById('ws-model-status');
-    const entries = Object.entries(s);
-    if(!entries.length){
-      card.style.display = 'none';
-      return;
-    }
-    card.style.display = '';
-    const running = entries.filter(([,v])=>v.running);
-    if(running.length){
-      badge.textContent = `● ${running.length}/${entries.length} running`;
-      badge.style.color = 'var(--green, #4ade80)';
-      card.style.borderColor = 'rgba(74,222,128,.3)';
-      modelEl.textContent = running.map(([id,v])=>`${id}: ${v.model||'?'} @ ${v.url}`).join(' | ');
-    } else {
-      badge.textContent = '○ stopped';
-      badge.style.color = 'var(--text-2)';
-      card.style.borderColor = '';
-      modelEl.textContent = entries.map(([id])=>id).join(', ') + ' — not started';
-    }
-  }catch{}
-}
-
 /* ── GGUF format toggle ──────────────────────────────── */
 let _ggufMode = 'gguf';
 document.querySelectorAll('.tog-btn').forEach(btn=>{
@@ -982,6 +964,57 @@ async function loadCacheStats(){

 let _localModels = [];

+function _renderWhisperServerRows(models){  // Build the "Configured whisper-server models" card HTML for the given model objects.
+  if(!models.length) return '';  // no models: return an empty string instead of an empty card
+  const rows = models.map(m=>{
+    const idx = _localModels.length;  // position this model is about to take in _localModels; the row buttons pass it to their handlers
+    _localModels.push({  // side effect: appends an entry for this model to the shared _localModels list
+      label:m.id,
+      path:m.id,
+      cacheType:'whisper-server',
+      size_gb:0,
+      defaultType:'audio_models',
+      settings:{
+        backend:m.backend || 'whisper-server',
+        load_mode:m.load_mode || 'on-request',
+        model_type:'audio_models',
+        model_path:m.model_path || '',
+        port:m.port,
+        gpu_device:m.gpu_device,
+      },
+      in_config:true,
+      capabilities:m.capabilities || ['speech_to_text']
+    });
+    const loaded = _loadedKeys.has(`audio:${m.id}`) || _loadedKeys.has(m.id);  // accepts either the "audio:<id>" key or the bare id
+    return `<tr style="border-top:1px solid var(--border)">
+      <td style="padding:.4rem .25rem;font-family:monospace;font-size:12px">${esc(m.id)}</td>
+      <td style="padding:.4rem .25rem"><span class="badge badge-ok">${esc(m.backend || 'whisper-server')}</span></td>
+      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${esc(m.model_path || '—')}</td>
+      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${m.port ?? '—'} / GPU ${m.gpu_device ?? 0}</td>
+      <td style="padding:.4rem .25rem;font-size:11px;color:var(--text-2)">${esc(m.load_mode || 'on-request')}</td>
+      <td style="padding:.4rem .25rem;text-align:center">${loaded?'<span class="badge badge-ok">loaded</span>':'<span class="muted small">idle</span>'}</td>
+      <td style="padding:.4rem .25rem;text-align:right;white-space:nowrap">
+        ${loaded
+          ?`<button class="btn btn-ghost btn-sm" onclick="unloadModel(${idx})">Unload</button>`
+          :`<button class="btn btn-primary btn-sm" onclick="loadModel(${idx})">Load now</button>`}
+        <button class="btn btn-secondary btn-sm" onclick="openCfgModal(${idx})">Configure</button>
+        <button class="btn btn-ghost btn-sm" onclick="disableModel(${idx})">Remove</button>
+      </td>
+    </tr>`;
+  });
+  return '<div class="card" style="margin-top:1rem">'+  // wrap the rows in a titled card with a seven-column table
+    '<div class="card-title">Configured whisper-server models</div>'+
+    '<table style="width:100%;border-collapse:collapse;font-size:13px">'+
+    '<thead><tr style="color:var(--text-2);font-size:10px;text-transform:uppercase;letter-spacing:.05em">'+
+    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Model</th>'+
+    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Backend</th>'+
+    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Model path</th>'+
+    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Port / GPU</th>'+
+    '<th style="text-align:left;padding:.3rem .25rem;font-weight:700">Load mode</th>'+
+    '<th style="text-align:center;padding:.3rem .25rem;font-weight:700">Status</th>'+
+    '<th></th></tr></thead><tbody>'+rows.join('')+'</tbody></table></div>';
+}
+
 async function loadCachedModels(){
  _localModels = [];
  const hfEl   = document.getElementById('hf-models-list');
@@ -991,6 +1024,8 @@ async function loadCachedModels(){
    const r = await fetch('/admin/api/cached-models');
    if(!r.ok) throw new Error((await r.json()).detail||r.statusText);
    const d = await r.json();
+    const whisperModels = (await fetch('/admin/api/models').then(r=>r.ok?r.json():[]))
+      .filter(m => m.backend === 'whisper-server');

    // HF models
    const hf = d.hf||[];
@@ -1067,6 +1102,9 @@ async function loadCachedModels(){
        '<th style="text-align:center;padding:.3rem .25rem;font-weight:700">Config</th>'+
        '<th></th></tr></thead><tbody>'+rows.join('')+'</tbody></table>';
    }
+    // insertAdjacentHTML adds a new sibling on every call; remove the card from the previous call first so repeated calls do not stack duplicates.
+    document.getElementById('ws-configured-models')?.remove();
+    ggufEl.insertAdjacentHTML('afterend', `<div id="ws-configured-models">${_renderWhisperServerRows(whisperModels)}</div>`);
  }catch(e){
    hfEl.innerHTML = ggufEl.innerHTML = `<span class="muted small">Error: ${esc(e.message)}</span>`;
  }
@@ -1089,8 +1125,6 @@ async function refreshLocal(){

 loadGlobalSettings();
 refreshLocal();
-loadWsStatus();
-setInterval(loadWsStatus, 5000);

 async function clearCacheConfirm(type){
  const labels = {hf:'HuggingFace', gguf:'GGUF', all:'ALL'};
@@ -1232,6 +1266,9 @@ function openCfgModal(idx){
  document.getElementById('cfg-parser').value = s.parser || 'auto';
  document.getElementById('cfg-tools').checked = !!s.tools_closer_prompt;
  document.getElementById('cfg-grammar').checked = !!s.grammar_guided;
+  if (m.cacheType === 'whisper-server') {
+    document.getElementById('cfg-backend').value = 'cpu';
+  }
  openModal('cfg-modal');
 }

@@ -1282,6 +1319,31 @@ async function saveModelConfig(){
  }catch(e){ alert('Error: '+e.message); }
 }

+async function addWhisperServerModel(){  // POST the "Add model" form to /admin/api/model-configure as a whisper-server audio model, then refresh the lists.
+  const usedVram = parseFloat(document.getElementById('ws-used-vram').value);
+  const payload = {
+    model_id: document.getElementById('ws-model-id').value.trim(),
+    model_type: 'audio_models',
+    backend: 'whisper-server',
+    server_path: document.getElementById('ws-server-path').value.trim(),
+    model_path: document.getElementById('ws-model-path').value.trim() || null,
+    port: parseInt(document.getElementById('ws-port').value, 10) || 8744,
+    gpu_device: parseInt(document.getElementById('ws-gpu-device').value, 10) || 0,
+    load_mode: document.getElementById('ws-load-mode').value,
+    used_vram_gb: Number.isNaN(usedVram) ? null : usedVram,  // empty or invalid field: send null so the server omits it
+  };
+  try{
+    const r = await fetch('/admin/api/model-configure', {
+      method:'POST',
+      headers:{'Content-Type':'application/json'},
+      body: JSON.stringify(payload)
+    });
+    const d = await r.json().catch(()=>({}));  // a non-JSON error body (e.g. a plain-text 500) must not replace the real failure with a parse error
+    if(!r.ok) throw new Error(d.detail || `Failed to add whisper-server model (HTTP ${r.status})`);
+    refreshLocal();
+  }catch(e){ alert('Error: '+e.message); }
+}
+
 async function loadModel(idx){
  const m = _localModels[idx];
  // Find the button and show loading state

--- a/codai/admin/templates/settings.html
+++ b/codai/admin/templates/settings.html
@@ -69,48 +69,6 @@
    <span class="form-hint">Models will inherit this as default when configured</span>
  </div>
 </div>
-
-<!-- Whisper Server -->
-<div class="card mb-0" style="margin-top:1rem">
-  <div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:.5rem;margin-bottom:1rem">
-    <div class="card-title" style="margin:0">Whisper Server <span class="muted" style="font-size:11px;font-weight:400">(whisper.cpp native binary — recommended for AMD/Vulkan)</span></div>
-    <div style="display:flex;align-items:center;gap:.5rem">
-      <span id="ws-badge" class="muted small">—</span>
-      <button class="btn btn-sm btn-secondary" onclick="wsStart()">Start</button>
-      <button class="btn btn-sm btn-danger" onclick="wsStop()">Stop</button>
-    </div>
-  </div>
-  <div style="display:grid;grid-template-columns:1fr 160px;gap:1rem;align-items:start">
-    <div class="form-row" style="margin:0">
-      <label class="form-label">Model ID <span class="muted">(used in API calls, e.g. whisper-base)</span></label>
-      <input type="text" id="ws-id" class="form-input" placeholder="whisper-server">
-      <span class="form-hint">The name clients use in the <code>model</code> field of transcription requests</span>
-    </div>
-    <div class="form-row" style="margin:0">
-      <label class="form-label">Port</label>
-      <input type="number" id="ws-port" class="form-input" placeholder="8744" min="1024" max="65535">
-    </div>
-  </div>
-  <div style="display:grid;grid-template-columns:1fr 160px;gap:1rem;align-items:start;margin-top:1rem">
-    <div class="form-row" style="margin:0">
-      <label class="form-label">whisper-server binary path</label>
-      <input type="text" id="ws-path" class="form-input" placeholder="/usr/local/bin/whisper-server">
-    </div>
-    <div class="form-row" style="margin:0">
-      <label class="form-label">GPU device index</label>
-      <input type="number" id="ws-gpu" class="form-input" placeholder="0" min="0">
-    </div>
-  </div>
-  <div class="form-row" style="margin-top:1rem;margin-bottom:0">
-    <label class="form-label">Model path <span class="muted">(GGUF whisper model, e.g. ggml-base.bin)</span></label>
-    <input type="text" id="ws-model" class="form-input" placeholder="/path/to/ggml-base.bin">
-    <span class="form-hint">Configure multiple instances by adding entries to <code>models.json</code> with <code>"backend": "whisper-server"</code></span>
-  </div>
-  <p class="form-hint" style="margin-top:.75rem;margin-bottom:0">
-    When configured, the transcription endpoint uses this subprocess instead of the Python faster-whisper module.
-    Saves settings to <code>config.json</code> and takes effect immediately (no restart needed).
-  </p>
-</div>
 {% endblock %}

 {% block scripts %}
@@ -140,65 +98,10 @@ async function loadSettings(){
    document.getElementById('s-hf-cache').value   = d.models?.hf_cache_dir ?? '';
    document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
    document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
-    document.getElementById('ws-path').value = d.whisper?.server_path ?? '';
-    document.getElementById('ws-port').value = d.whisper?.server_port ?? 8744;
    toggleHttps();
  }catch(e){ showAlert('error','Failed to load settings: '+e.message); }
 }

-async function loadWsStatus(){
-  try{
-    const s = await fetch('/admin/api/whisper-server/status').then(r=>r.json());
-    const badge = document.getElementById('ws-badge');
-    // s is now a dict of {model_id: {running, model, url}}
-    const entries = Object.entries(s);
-    if(!entries.length){
-      badge.textContent = '○ not configured';
-      badge.style.color = 'var(--text-2)';
-      return;
-    }
-    const running = entries.filter(([,v])=>v.running);
-    if(running.length){
-      badge.textContent = `● ${running.length} running`;
-      badge.style.color = 'var(--green, #4ade80)';
-    } else {
-      badge.textContent = '○ stopped';
-      badge.style.color = 'var(--text-2)';
-    }
-  }catch(e){}
-}
-
-async function wsStart(){
-  const path = document.getElementById('ws-path').value.trim();
-  if(!path){ showAlert('error','Binary path required'); return; }
-  try{
-    const r = await fetch('/admin/api/whisper-server/start',{
-      method:'POST', headers:{'Content-Type':'application/json'},
-      body: JSON.stringify({
-        model_id: document.getElementById('ws-id').value.trim() || 'whisper-server',
-        server_path: path,
-        model_path: document.getElementById('ws-model').value.trim() || null,
-        port: parseInt(document.getElementById('ws-port').value) || 8744,
-        gpu_device: parseInt(document.getElementById('ws-gpu').value) || 0,
-      })
-    });
-    const d = await r.json();
-    if(d.success) showAlert('info','whisper-server started');
-    else showAlert('error','Failed to start whisper-server');
-    loadWsStatus();
-  }catch(e){ showAlert('error','Error: '+e.message); }
-}
-
-async function wsStop(){
-  const modelId = document.getElementById('ws-id').value.trim() || 'whisper-server';
-  await fetch('/admin/api/whisper-server/stop',{
-    method:'POST', headers:{'Content-Type':'application/json'},
-    body: JSON.stringify({model_id: modelId})
-  });
-  showAlert('info','whisper-server stopped');
-  loadWsStatus();
-}
-
 async function saveSettings(){
  const strOrNull = id => document.getElementById(id).value.trim() || null;
  const data = {
@@ -216,11 +119,7 @@ async function saveSettings(){
    },
    offload:{
      directory: document.getElementById('s-offload-dir').value.trim() || './offload',
-    },
-    whisper:{
-      server_path: document.getElementById('ws-path').value.trim() || null,
-      server_port: parseInt(document.getElementById('ws-port').value) || 8744,
-    },
+    }
  };
  try{
    const r = await fetch('/admin/api/settings',{
@@ -233,7 +132,5 @@ async function saveSettings(){
 }

 loadSettings();
-loadWsStatus();
-setInterval(loadWsStatus, 5000);
 </script>
 {% endblock %}
--- a/codai/api/transcriptions.py
+++ b/codai/api/transcriptions.py
@@ -134,33 +134,30 @@ async def create_transcription(
    if len(file_content) > _MAX_AUDIO_BYTES:
        raise HTTPException(status_code=413, detail="Audio file too large (max 100 MB)")

-    # Check if the requested model is a whisper-server instance
-    wsm = multi_model_manager.whisper_servers.get(model)
-    if wsm is None and multi_model_manager.whisper_server is not None:
-        # Legacy single-instance fallback: use it if no specific match
-        if not multi_model_manager.whisper_servers:
-            wsm = multi_model_manager.whisper_server
-
-    if wsm is not None:
-        ws_key = f"audio:{model}" if model in multi_model_manager.whisper_servers else "audio:whisper-server"
-
-        # Let the VRAM manager evict other models if needed
+    # Check if the requested model maps to a configured whisper-server instance first
+    whisper_server = multi_model_manager.whisper_servers.get(model)
+    if whisper_server is not None:
        multi_model_manager.request_model(requested_model=model, model_type="audio")
-
-        # Start the subprocess if it isn't running (on-demand)
-        if not wsm.is_running():
-            wsm.start(getattr(wsm, '_model_path', None), gpu_device=getattr(wsm, '_gpu_device', 0))
-            if wsm.is_running():
-                multi_model_manager.models[ws_key] = wsm
+        if not whisper_server.is_running():
+            whisper_server.start(
+                getattr(whisper_server, "_model_path", None),
+                gpu_device=getattr(whisper_server, "_gpu_device", 0),
+            )
+            if whisper_server.is_running():
+                ws_key = f"audio:{model}"
+                multi_model_manager.models[ws_key] = whisper_server
                multi_model_manager.active_in_vram = ws_key
                multi_model_manager.models_in_vram.add(ws_key)
-
-        if wsm.is_running():
-            result = wsm.transcribe(file_content, language=language, prompt=prompt)
-            if "error" in result:
-                raise HTTPException(status_code=500, detail=result["error"])
-            return _format_response(response_format, result.get("text", ""), [])
-        # Fall through to Python backends if subprocess failed to start
+        if not whisper_server.is_running():
+            raise HTTPException(status_code=500, detail="whisper-server failed to start")
+        result = whisper_server.transcribe(
+            file_content,
+            language=language,
+            prompt=prompt
+        )
+        if "error" in result:
+            raise HTTPException(status_code=500, detail=result["error"])
+        return _format_response(response_format, result.get("text", ""), [])

    # Use the manager to resolve the model and manage VRAM
    model_info = multi_model_manager.request_model(
@@ -265,4 +262,4 @@ async def create_transcription(
        try:
            os.unlink(tmp_path)
        except Exception:
-            pass
\ No newline at end of file
+            pass
--- a/codai/config.py
+++ b/codai/config.py
@@ -344,10 +344,6 @@ class ConfigManager:
                "vae_tiling": self.config.image.vae_tiling,
                "clip_on_cpu": self.config.image.clip_on_cpu
            },
-            "whisper": {
-                "server_path": self.config.whisper.server_path,
-                "server_port": self.config.whisper.server_port
-            },
            "system_prompt": self.config.system_prompt,
            "tools_closer_prompt": self.config.tools_closer_prompt,
            "grammar_guided": self.config.grammar_guided,
@@ -377,4 +373,4 @@ class ConfigManager:
    
    def reload(self):
        """Reload all configuration files."""
-        return self.load()
\ No newline at end of file
+        return self.load()
--- a/codai/main.py
+++ b/codai/main.py
@@ -370,16 +370,21 @@ def main():
        mid = _model_id(m)
        if not mid:
            continue
-        backend = m.get("backend", "") if isinstance(m, dict) else ""
-        if backend == "whisper-server":
-            # Register as a whisper-server instance
+        if isinstance(m, dict) and m.get("backend") == "whisper-server":
            cfg = _model_cfg(m, "audio")
+            cfg.update({
+                "backend": "whisper-server",
+                "server_path": m.get("server_path", ""),
+                "model_path": m.get("model_path") or None,
+                "port": int(m.get("port", 8744)),
+                "gpu_device": int(m.get("gpu_device", 0)),
+            })
            multi_model_manager.register_whisper_server(
                model_id=mid,
-                server_path=m.get("server_path", config.whisper.server_path or ""),
+                server_path=m.get("server_path", ""),
                model_path=m.get("model_path") or None,
-                port=int(m.get("port", config.whisper.server_port)),
-                gpu_device=int(m.get("gpu_device", config.vulkan.device_id)),
+                port=int(m.get("port", 8744)),
+                gpu_device=int(m.get("gpu_device", 0)),
                config=cfg,
            )
        else:
@@ -680,4 +685,4 @@ def main():


 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -450,6 +450,7 @@ class MultiModelManager:
                self.whisper_server.stop()
            except Exception:
                pass
+        self.whisper_servers.clear()
        
        # Clear all model lists
        self.default_model = None
@@ -649,6 +650,10 @@ class MultiModelManager:
            self.audio_models.append(model_name)
        self.config[f"audio:{model_name}"] = config or {}

+        if isinstance(config, dict) and config.get("backend") == "whisper-server":
+            print(f"Registered whisper-server audio model: {model_name}")
+            return
+
        # Download/cache the model at startup if it's a URL or HF ID
        resolved_model = self.load_model(model_name)
        if resolved_model != model_name:
@@ -1803,16 +1808,22 @@ class MultiModelManager:
            "embedding_models": "embedding",
        }

-        def _add(model_id: str, model_type: str = None):
+        def _add(model_id: str, model_type: str = None, meta: Dict[str, Any] = None):
            if model_id in seen_ids:
                return
            seen_ids.add(model_id)
            caps = detect_model_capabilities(model_id)
            resolved_type = model_type or (caps.to_list()[0].split("_")[0] if caps.to_list() else "text")
+            meta = meta or {}
            models.append(ModelInfo(
                id=model_id,
                type=resolved_type,
                capabilities=caps.to_list(),
+                backend=meta.get("backend"),
+                model_path=meta.get("model_path"),
+                port=meta.get("port"),
+                gpu_device=meta.get("gpu_device"),
+                load_mode=meta.get("load_mode"),
            ))

        # --- Models from config (the authoritative source) ---
@@ -1831,15 +1842,15 @@ class MultiModelManager:
                            mid = m.get("alias") or m.get("path") or m.get("id") or ""
                            raw = m.get("path") or m.get("id") or ""
                            if raw and raw != mid:
-                                _add(raw, mtype)
+                                _add(raw, mtype, m)
                                short = raw.split("/")[-1] if "/" in raw else raw
                                if short != raw:
-                                    _add(short, mtype)
+                                    _add(short, mtype, m)
                        if mid:
-                            _add(mid, mtype)
+                            _add(mid, mtype, m if isinstance(m, dict) else None)
                            short = mid.split("/")[-1] if "/" in mid else mid
                            if short != mid:
-                                _add(short, mtype)
+                                _add(short, mtype, m if isinstance(m, dict) else None)
        except Exception:
            pass

@@ -1901,4 +1912,4 @@ class MultiModelManager:

 # Global singleton instances for convenience
 model_manager = ModelManager()
-multi_model_manager = MultiModelManager()
\ No newline at end of file
+multi_model_manager = MultiModelManager()
--- a/codai/pydantic/textrequest.py
+++ b/codai/pydantic/textrequest.py
@@ -121,8 +121,13 @@ class ModelInfo(BaseModel):
    owned_by: str = "huggingface"
    type: Optional[str] = None          # e.g. "text", "image", "video", "audio", "tts", "vision", "embedding"
    capabilities: Optional[List[str]] = None  # list of capability strings
+    backend: Optional[str] = None
+    model_path: Optional[str] = None
+    port: Optional[int] = None
+    gpu_device: Optional[int] = None
+    load_mode: Optional[str] = None


 class ModelList(BaseModel):
    object: str = "list"
-    data: List[ModelInfo]
\ No newline at end of file
+    data: List[ModelInfo]
--- a/tests/test_whisper_server_local_models.py
+++ b/tests/test_whisper_server_local_models.py