whisper: account a running runner as a loaded model for VRAM eviction

Starting a whisper-server runner loads the gguf onto the GPU, but it was
invisible to the VRAM-eviction logic — it never evicted others to make room,
recorded no footprint, and (lacking a cleanup()) couldn't itself be evicted.

- WhisperServerManager.cleanup() -> stop(), so _evict_one/unload_model can
  free its VRAM like any other model.
- MultiModelManager.start_whisper_server(): estimate the gguf footprint, evict
  other models if free VRAM is short, start the subprocess, and register it in
  models/models_in_vram/_measured_vram_gb (active_in_vram). It's now both a
  trigger for eviction and an eviction candidate.
- stop_whisper_server(): stop + clear all that accounting (frees VRAM).
- Routed every start/stop through these: on-request transcription, engine
  startup pre-load, admin model-load (Load button) and model-unload/disable.

So: starting a runner = a model load (evicts as needed); unloading = frees VRAM.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent 3d551444
......@@ -2094,17 +2094,12 @@ async def api_model_disable(request: Request, username: str = Depends(require_ad
for m in removed:
if isinstance(m, dict) and m.get("backend") == "whisper-server":
mid = m.get("id")
wsm = _mmm.whisper_servers.pop(mid, None) if mid else None
if wsm is not None:
try:
wsm.stop()
except Exception:
pass
for k in (f"audio:{mid}", mid):
if k:
_mmm.models.pop(k, None)
_mmm.model_pools.pop(k, None)
_mmm.models_in_vram.discard(k)
if not mid:
continue
# Stop the subprocess + clear VRAM accounting, then forget the runner
# entirely (its config is gone, unlike a plain unload).
_mmm.stop_whisper_server(mid)
_mmm.whisper_servers.pop(mid, None)
except Exception as e:
print(f" [admin] whisper runner teardown failed: {e}")
......@@ -2207,6 +2202,16 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
if not path:
raise HTTPException(status_code=400, detail="path required")
# A whisper-server runner: starting it IS the model load (subprocess onto the
# GPU). Route through the accounted start so it evicts for VRAM and registers
# as a loaded model (and "Unload" later frees it).
for _mid in (path, path.split("audio:")[-1]):
if _mid in multi_model_manager.whisper_servers:
ok = await asyncio.to_thread(multi_model_manager.start_whisper_server, _mid)
if not ok:
raise HTTPException(status_code=500, detail="whisper-server failed to start")
return {"success": True, "model_key": f"audio:{_mid}"}
# Find the model config entry to determine its type. A model may be
# registered in several categories (e.g. a vision LLM advertises image_to_text
# → also listed under vision_models). The category-bucket loop below would pick
......@@ -2397,14 +2402,8 @@ async def api_model_unload(request: Request, username: str = Depends(require_adm
continue
mp = getattr(wsm, "_model_path", None) or ""
if _matches(mid) or _matches(f"audio:{mid}") or _matches(mp):
try:
wsm.stop()
except Exception:
pass
for k in (f"audio:{mid}", mid):
multi_model_manager.models.pop(k, None)
multi_model_manager.model_pools.pop(k, None)
multi_model_manager.models_in_vram.discard(k)
# Stops the subprocess AND clears its VRAM accounting (frees VRAM).
multi_model_manager.stop_whisper_server(mid)
stopped_whisper = True
if stopped_whisper:
return {"success": True, "was_loaded": True}
......
......@@ -173,15 +173,10 @@ async def _run_transcription(
await asyncio.to_thread(
multi_model_manager.request_model, requested_model=model, model_type="audio")
if not whisper_server.is_running():
whisper_server.start(
getattr(whisper_server, "_model_path", None),
gpu_device=getattr(whisper_server, "_gpu_device", 0),
)
if whisper_server.is_running():
ws_key = f"audio:{whisper_model_id or model}"
multi_model_manager.models[ws_key] = whisper_server
multi_model_manager.active_in_vram = ws_key
multi_model_manager.models_in_vram.add(ws_key)
# Treat starting the runner as a model load: evict other models for its
# VRAM and register it in the loaded-model maps (so it's evictable too).
await asyncio.to_thread(
multi_model_manager.start_whisper_server, whisper_model_id or model)
if not whisper_server.is_running():
raise HTTPException(status_code=500, detail="whisper-server failed to start")
result = whisper_server.transcribe(
......
......@@ -917,13 +917,8 @@ def main():
else:
print(f" Warning: {mid} failed to load")
elif mtype == "audio" and mid in multi_model_manager.whisper_servers:
wsm = multi_model_manager.whisper_servers[mid]
result = wsm.start(wsm._model_path, gpu_device=wsm._gpu_device)
if wsm.is_running():
ws_key = f"audio:{mid}"
multi_model_manager.models[ws_key] = wsm
multi_model_manager.active_in_vram = ws_key
multi_model_manager.models_in_vram.add(ws_key)
# Accounted start: evicts for VRAM + registers it as a loaded model.
if multi_model_manager.start_whisper_server(mid):
print(f" whisper-server started: {mid}")
else:
print(f" Warning: whisper-server '{mid}' failed to start")
......
......@@ -573,7 +573,13 @@ class WhisperServerManager:
print(f"Error stopping whisper-server: {e}")
self.process = None
self.current_model = None
def cleanup(self):
    """Release this runner's GPU memory by shutting the subprocess down.

    Thin alias for ``stop()``. It exists so that generic unload/eviction code
    which expects a ``cleanup()`` method on loaded models (the commit names
    ``_evict_one`` / ``unload_model``) can handle a running whisper-server
    the same way as any other loaded model.
    """
    # All teardown logic lives in stop(); nothing extra is needed here.
    self.stop()
def transcribe(self, audio_data: bytes, language: str = None, prompt: str = None):
"""Send transcription request to whisper-server."""
if not self.is_running():
......@@ -1284,6 +1290,68 @@ class MultiModelManager:
+ (f" alias={alias}" if alias else ""))
return wsm
def _estimate_gguf_vram_gb(self, path: Optional[str], *, overhead: float = 1.1) -> float:
    """Estimate the VRAM footprint of a gguf file from its on-disk size.

    The estimate is ``file size in GB (1e9 bytes) * overhead``, rounded to two
    decimals, on the assumption that the loaded weights are roughly the size
    of the file.

    Args:
        path: Filesystem path of the gguf. ``None`` or an empty string yields 0.0.
        overhead: Multiplier applied on top of the raw file size. The default
            of 1.1 matches the previously hard-coded factor.

    Returns:
        The estimated footprint in GB, or 0.0 when the path is missing, is not
        a regular file, or cannot be inspected. The estimate is best-effort
        and this method never raises for an unreadable path.
    """
    import stat  # local import: only needed for the regular-file check

    if not path:
        return 0.0
    try:
        # One stat call gives both the file type and the size, so there is no
        # window between an "is it a file?" check and the size read.
        info = os.stat(path)
    except (OSError, TypeError, ValueError):
        # Missing file, permission problem, or a path value that cannot be
        # interpreted: treat as "unknown footprint".
        return 0.0
    if not stat.S_ISREG(info.st_mode):
        return 0.0
    return round(info.st_size / 1e9 * overhead, 2)
def start_whisper_server(self, model_id: str, model_path: Optional[str] = None,
                         gpu_device: Optional[int] = None) -> bool:
    """Start a whisper-server runner and account for it as a loaded model.

    Starting the runner is treated as a model load. Its footprint is estimated
    from the gguf file. If the reported free VRAM is below that estimate,
    ``_evict_models_for_vram`` is asked to make room. The subprocess is then
    started and registered under the key ``audio:<model_id>`` in the
    loaded-model maps, so later loads can see it and evict it.

    Args:
        model_id: Key of the runner in ``self.whisper_servers``.
        model_path: gguf path to start with. Falls back to the runner's
            ``_model_path`` attribute when omitted.
        gpu_device: GPU index to start on. Falls back to the runner's
            ``_gpu_device`` attribute (default 0) when omitted.

    Returns:
        True if the runner is running when this returns (including the case
        where it was already running). False if ``model_id`` is unknown or the
        start failed. Exceptions raised by the runner's ``start()`` are logged
        and reported as False rather than propagated.
    """
    wsm = self.whisper_servers.get(model_id)
    if wsm is None:
        return False
    ws_key = f"audio:{model_id}"

    if wsm.is_running():
        # Already up: just make sure the accounting knows about it.
        self.models[ws_key] = wsm
        self.models_in_vram.add(ws_key)
        return True

    mp = model_path or getattr(wsm, "_model_path", None)
    gd = gpu_device if gpu_device is not None else getattr(wsm, "_gpu_device", 0)

    needed = self._estimate_gguf_vram_gb(mp)
    if needed > 0 and self._get_free_vram_gb() < needed:
        print(f"Whisper start: need ~{needed:.1f} GB VRAM for '{model_id}' — evicting")
        self._evict_models_for_vram(needed)

    try:
        wsm.start(mp, gpu_device=int(gd or 0))
        running = bool(wsm.is_running())
    except Exception as e:
        # Keep the bool contract: callers only check the return value, so a
        # raising start() must surface as "failed", not as a crash.
        print(f"Whisper start: '{model_id}' failed to start: {e}")
        running = False

    if not running:
        # The runner is not up. Drop any accounting left from an earlier run
        # (e.g. the process died without stop_whisper_server being called) so
        # it is no longer counted as resident in VRAM.
        self.models.pop(ws_key, None)
        self.model_pools.pop(ws_key, None)
        self.models_in_vram.discard(ws_key)
        self._measured_vram_gb.pop(ws_key, None)
        if self.active_in_vram == ws_key:
            self.active_in_vram = None
        if self.current_model_key == ws_key:
            self.current_model_key = None
        return False

    # Register it like a loaded model so eviction/accounting can see + evict it.
    self.models[ws_key] = wsm
    self.model_pools.pop(ws_key, None)
    self.active_in_vram = ws_key
    self.current_model_key = ws_key
    self.models_in_vram.add(ws_key)
    if needed > 0:
        # setdefault: keep an existing recorded footprint if one is present.
        self._measured_vram_gb.setdefault(ws_key, needed)
    return True
def stop_whisper_server(self, model_id: str) -> bool:
    """Stop a whisper-server runner and clear its VRAM accounting.

    The runner object itself stays in ``self.whisper_servers``. Only its
    ``audio:<model_id>`` entries in the loaded-model maps are removed, and
    ``active_in_vram`` / ``current_model_key`` are reset if they pointed at it.

    Args:
        model_id: Key of the runner in ``self.whisper_servers``. An unknown id
            is allowed; the accounting for its key is still cleared.

    Returns:
        True if the runner existed and reported running before the stop,
        otherwise False.
    """
    wsm = self.whisper_servers.get(model_id)
    ws_key = f"audio:{model_id}"
    was = wsm is not None and bool(wsm.is_running())

    if wsm is not None:
        try:
            wsm.stop()
        except Exception as e:
            # Best-effort teardown: keep going so the accounting is cleared,
            # but report the failure instead of hiding it.
            print(f"Whisper stop: '{model_id}' failed to stop cleanly: {e}")

    self.models.pop(ws_key, None)
    self.model_pools.pop(ws_key, None)
    self.models_in_vram.discard(ws_key)
    self._measured_vram_gb.pop(ws_key, None)
    if self.active_in_vram == ws_key:
        self.active_in_vram = None
    if self.current_model_key == ws_key:
        self.current_model_key = None
    return was
def resolve_whisper_alias(self, name: str) -> Optional[WhisperServerManager]:
"""Return the next round-robin WhisperServerManager for an alias, or None."""
ids = self.whisper_aliases.get(name)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment