whisper: account a running runner as a loaded model for VRAM eviction

Starting a whisper-server runner loads the gguf onto the GPU, but it was invisible to the VRAM-eviction logic: it never evicted other models to make room, recorded no footprint, and (lacking a cleanup() method) could not itself be evicted.

- WhisperServerManager.cleanup() now calls stop(), so _evict_one/unload_model can free its VRAM like any other model.
- MultiModelManager.start_whisper_server(): estimates the gguf footprint, evicts other models if free VRAM is short, starts the subprocess, and registers it in models/models_in_vram/_measured_vram_gb (and sets active_in_vram). The runner is now both a trigger for eviction and an eviction candidate.
- stop_whisper_server(): stops the runner and clears all of that accounting (frees VRAM).
- Routed every start/stop through these two methods: on-request transcription, engine startup pre-load, admin model-load (Load button), and model-unload/disable.

So: starting a runner = a model load (evicts as needed); unloading = frees VRAM.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

whisper: account a running runner as a loaded model for VRAM eviction
Starting a whisper-server runner loads the gguf onto the GPU, but it was invisible to the VRAM-eviction logic: it never evicted other models to make room, recorded no footprint, and (lacking a cleanup() method) could not itself be evicted.

- WhisperServerManager.cleanup() now calls stop(), so _evict_one/unload_model can free its VRAM like any other model.
- MultiModelManager.start_whisper_server(): estimates the gguf footprint, evicts other models if free VRAM is short, starts the subprocess, and registers it in models/models_in_vram/_measured_vram_gb (and sets active_in_vram). The runner is now both a trigger for eviction and an eviction candidate.
- stop_whisper_server(): stops the runner and clears all of that accounting (frees VRAM).
- Routed every start/stop through these two methods: on-request transcription, engine startup pre-load, admin model-load (Load button), and model-unload/disable.

So: starting a runner = a model load (evicts as needed); unloading = frees VRAM.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2a214215 · Stefy Lanza (nextime / spora ) · 3d551444 · 2a214215 · 2a214215 · 2a214215
Commit 2a214215 authored Jun 19, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 93 additions and 36 deletions

routes.py codai/admin/routes.py +18 -19

transcriptions.py codai/api/transcriptions.py +4 -9

main.py codai/main.py +2 -7

manager.py codai/models/manager.py +69 -1

No files found.
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -2094,17 +2094,12 @@ async def api_model_disable(request: Request, username: str = Depends(require_ad
        for m in removed:
            if isinstance(m, dict) and m.get("backend") == "whisper-server":
                mid = m.get("id")
-                wsm = _mmm.whisper_servers.pop(mid, None) if mid else None
+                if not mid:
-                if wsm is not None:
+                    continue
-                    try:
+                # Stop the subprocess + clear VRAM accounting, then forget the runner
-                        wsm.stop()
+                # entirely (its config is gone, unlike a plain unload).
-                    except Exception:
+                _mmm.stop_whisper_server(mid)
-                        pass
+                _mmm.whisper_servers.pop(mid, None)
-                for k in (f"audio:{mid}", mid):
-                    if k:
-                        _mmm.models.pop(k, None)
-                        _mmm.model_pools.pop(k, None)
-                        _mmm.models_in_vram.discard(k)
    except Exception as e:
        print(f"  [admin] whisper runner teardown failed: {e}")
@@ -2207,6 +2202,16 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
    if not path:
        raise HTTPException(status_code=400, detail="path required")
+    # A whisper-server runner: starting it IS the model load (subprocess onto the
+    # GPU). Route through the accounted start so it evicts for VRAM and registers
+    # as a loaded model (and "Unload" later frees it).
+    for _mid in (path, path.split("audio:")[-1]):
+        if _mid in multi_model_manager.whisper_servers:
+            ok = await asyncio.to_thread(multi_model_manager.start_whisper_server, _mid)
+            if not ok:
+                raise HTTPException(status_code=500, detail="whisper-server failed to start")
+            return {"success": True, "model_key": f"audio:{_mid}"}
    # Find the model config entry to determine its type. A model may be
    # registered in several categories (e.g. a vision LLM advertises image_to_text
    # → also listed under vision_models). The category-bucket loop below would pick
@@ -2397,14 +2402,8 @@ async def api_model_unload(request: Request, username: str = Depends(require_adm
            continue
        mp = getattr(wsm, "_model_path", None) or ""
        if _matches(mid) or _matches(f"audio:{mid}") or _matches(mp):
-            try:
+            # Stops the subprocess AND clears its VRAM accounting (frees VRAM).
-                wsm.stop()
+            multi_model_manager.stop_whisper_server(mid)
-            except Exception:
-                pass
-            for k in (f"audio:{mid}", mid):
-                multi_model_manager.models.pop(k, None)
-                multi_model_manager.model_pools.pop(k, None)
-                multi_model_manager.models_in_vram.discard(k)
            stopped_whisper = True
    if stopped_whisper:
        return {"success": True, "was_loaded": True}

--- a/codai/api/transcriptions.py
+++ b/codai/api/transcriptions.py
@@ -173,15 +173,10 @@ async def _run_transcription(
        await asyncio.to_thread(
            multi_model_manager.request_model, requested_model=model, model_type="audio")
        if not whisper_server.is_running():
-            whisper_server.start(
+            # Treat starting the runner as a model load: evict other models for its
-                getattr(whisper_server, "_model_path", None),
+            # VRAM and register it in the loaded-model maps (so it's evictable too).
-                gpu_device=getattr(whisper_server, "_gpu_device", 0),
+            await asyncio.to_thread(
-            )
+                multi_model_manager.start_whisper_server, whisper_model_id or model)
-            if whisper_server.is_running():
-                ws_key = f"audio:{whisper_model_id or model}"
-                multi_model_manager.models[ws_key] = whisper_server
-                multi_model_manager.active_in_vram = ws_key
-                multi_model_manager.models_in_vram.add(ws_key)
        if not whisper_server.is_running():
            raise HTTPException(status_code=500, detail="whisper-server failed to start")
        result = whisper_server.transcribe(

--- a/codai/main.py
+++ b/codai/main.py
@@ -917,13 +917,8 @@ def main():
                else:
                    print(f"  Warning: {mid} failed to load")
            elif mtype == "audio" and mid in multi_model_manager.whisper_servers:
-                wsm = multi_model_manager.whisper_servers[mid]
+                # Accounted start: evicts for VRAM + registers it as a loaded model.
-                result = wsm.start(wsm._model_path, gpu_device=wsm._gpu_device)
+                if multi_model_manager.start_whisper_server(mid):
-                if wsm.is_running():
-                    ws_key = f"audio:{mid}"
-                    multi_model_manager.models[ws_key] = wsm
-                    multi_model_manager.active_in_vram = ws_key
-                    multi_model_manager.models_in_vram.add(ws_key)
                    print(f"  whisper-server started: {mid}")
                else:
                    print(f"  Warning: whisper-server '{mid}' failed to start")

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -573,7 +573,13 @@ class WhisperServerManager:
                    print(f"Error stopping whisper-server: {e}")
                self.process = None
                self.current_model = None
+    def cleanup(self):
+        """Free VRAM by stopping the subprocess. Lets the generic VRAM-eviction
+        path (_evict_one / unload_model) treat a running whisper-server like any
+        other loaded model and actually release its GPU memory."""
+        self.stop()
    def transcribe(self, audio_data: bytes, language: str = None, prompt: str = None):
        """Send transcription request to whisper-server."""
        if not self.is_running():
@@ -1284,6 +1290,68 @@ class MultiModelManager:
              + (f" alias={alias}" if alias else ""))
        return wsm
+    def _estimate_gguf_vram_gb(self, path: Optional[str]) -> float:
+        """Rough VRAM footprint of a gguf from its on-disk size (weights ~= file)."""
+        try:
+            if path and os.path.isfile(path):
+                return round(os.path.getsize(path) / 1e9 * 1.1, 2)
+        except Exception:
+            pass
+        return 0.0
+    def start_whisper_server(self, model_id: str, model_path: str = None,
+                             gpu_device: int = None) -> bool:
+        """Start a whisper-server RUNNER, treating it as a model load for VRAM
+        accounting: estimate its footprint, evict other loaded models to make room,
+        then register it in the loaded-model maps so a later load can evict IT in
+        turn (and so the dashboard/eviction see its VRAM). 1:1 with its gguf."""
+        wsm = self.whisper_servers.get(model_id)
+        if wsm is None:
+            return False
+        ws_key = f"audio:{model_id}"
+        if wsm.is_running():
+            self.models[ws_key] = wsm
+            self.models_in_vram.add(ws_key)
+            return True
+        mp = model_path or getattr(wsm, "_model_path", None)
+        gd = gpu_device if gpu_device is not None else getattr(wsm, "_gpu_device", 0)
+        needed = self._estimate_gguf_vram_gb(mp)
+        if needed > 0 and self._get_free_vram_gb() < needed:
+            print(f"Whisper start: need ~{needed:.1f} GB VRAM for '{model_id}' — evicting")
+            self._evict_models_for_vram(needed)
+        wsm.start(mp, gpu_device=int(gd or 0))
+        if not wsm.is_running():
+            return False
+        # Register it like a loaded model so eviction/accounting can see + evict it.
+        self.models[ws_key] = wsm
+        self.model_pools.pop(ws_key, None)
+        self.active_in_vram = ws_key
+        self.current_model_key = ws_key
+        self.models_in_vram.add(ws_key)
+        if needed > 0:
+            self._measured_vram_gb.setdefault(ws_key, needed)
+        return True
+    def stop_whisper_server(self, model_id: str) -> bool:
+        """Stop a whisper-server runner and clear its VRAM accounting (frees VRAM)."""
+        wsm = self.whisper_servers.get(model_id)
+        ws_key = f"audio:{model_id}"
+        was = bool(wsm and wsm.is_running())
+        if wsm is not None:
+            try:
+                wsm.stop()
+            except Exception:
+                pass
+        self.models.pop(ws_key, None)
+        self.model_pools.pop(ws_key, None)
+        self.models_in_vram.discard(ws_key)
+        self._measured_vram_gb.pop(ws_key, None)
+        if self.active_in_vram == ws_key:
+            self.active_in_vram = None
+        if self.current_model_key == ws_key:
+            self.current_model_key = None
+        return was
    def resolve_whisper_alias(self, name: str) -> Optional[WhisperServerManager]:
        """Return the next round-robin WhisperServerManager for an alias, or None."""
        ids = self.whisper_aliases.get(name)