coderai: global host-RAM cap with leak watch + disk-offload eviction

Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the
existing VRAM budgeting:

- hf_loading clamps the accelerate CPU-offload budget to the headroom under
  the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true-LRU (active_in_vram property stamps
  _last_used), shared _evict_one, and _evict_models_for_ram; idle models are
  evicted before a new load when RSS nears the cap.
- ram_monitor.py: background watcher samples RSS, flags a suspected leak when it
  climbs while the scheduler is idle, and runs a mitigation ladder
  (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; Settings page exposes max RAM + evict/
  leak-watch toggles (applied live); dashboard shows a RAM gauge + leak badge.

Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded
count so an active upscale no longer reports '0 models loaded'.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent a9b6d35e
...@@ -330,6 +330,18 @@ def api_status(username: str = Depends(require_auth)): ...@@ -330,6 +330,18 @@ def api_status(username: str = Depends(require_auth)):
loaded_keys = list(multi_model_manager.models.keys()) loaded_keys = list(multi_model_manager.models.keys())
# Real-ESRGAN / diffusers upscalers live in a private cache (deliberately not
# in multi_model_manager.models — see codai/api/images.py), so the registry
# count alone reports 0 models while an upscale job is actively running. Fold
# the loaded upscalers in so the dashboard count matches the Tasks page.
try:
from codai.api.images import _UPSCALER_CACHE as _upscalers
for _uk in _upscalers.keys():
if _uk not in loaded_keys:
loaded_keys.append(_uk)
except Exception:
pass
# VRAM info # VRAM info
vram = None vram = None
is_cuda = False is_cuda = False
...@@ -464,6 +476,26 @@ def api_status(username: str = Depends(require_auth)): ...@@ -464,6 +476,26 @@ def api_status(username: str = Depends(require_auth)):
except Exception: except Exception:
pass pass
# Host-RAM status: system totals (psutil) + the global-cap watcher snapshot
# (process RSS, configured cap, % of cap, leak-suspected flag, last mitigation).
ram = None
try:
import psutil
vm = psutil.virtual_memory()
ram = {
"total": round(vm.total / 1e9, 2),
"used": round((vm.total - vm.available) / 1e9, 2),
"free": round(vm.available / 1e9, 2),
"percent": vm.percent,
}
try:
from codai.models.ram_monitor import get_status as _ram_status
ram["watch"] = _ram_status()
except Exception:
pass
except Exception:
pass
# Whisper-server status # Whisper-server status
whisper_status = None whisper_status = None
try: try:
...@@ -484,6 +516,7 @@ def api_status(username: str = Depends(require_auth)): ...@@ -484,6 +516,7 @@ def api_status(username: str = Depends(require_auth)):
"enabled_models": enabled_models, "enabled_models": enabled_models,
"enabled_aliases": enabled_aliases, "enabled_aliases": enabled_aliases,
"vram": vram, "vram": vram,
"ram": ram,
"cuda": is_cuda, "cuda": is_cuda,
"requests": { "requests": {
"total": req_total, "total": req_total,
...@@ -2394,6 +2427,9 @@ async def api_get_settings(username: str = Depends(require_admin)): ...@@ -2394,6 +2427,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
"load_in_8bit": c.offload.load_in_8bit, "load_in_8bit": c.offload.load_in_8bit,
"manual_ram_gb": c.offload.manual_ram_gb, "manual_ram_gb": c.offload.manual_ram_gb,
"flash_attention": c.offload.flash_attention, "flash_attention": c.offload.flash_attention,
"max_ram_gb": c.offload.max_ram_gb,
"evict_idle_on_ram": c.offload.evict_idle_on_ram,
"ram_leak_watch": c.offload.ram_leak_watch,
}, },
"vulkan": { "vulkan": {
"n_gpu_layers": c.vulkan.n_gpu_layers, "n_gpu_layers": c.vulkan.n_gpu_layers,
...@@ -2503,6 +2539,21 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad ...@@ -2503,6 +2539,21 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
if "manual_ram_gb" in off: if "manual_ram_gb" in off:
c.offload.manual_ram_gb = off["manual_ram_gb"] or None c.offload.manual_ram_gb = off["manual_ram_gb"] or None
c.offload.flash_attention = bool(off.get("flash_attention", c.offload.flash_attention)) c.offload.flash_attention = bool(off.get("flash_attention", c.offload.flash_attention))
if "max_ram_gb" in off:
c.offload.max_ram_gb = off["max_ram_gb"] or None
c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
# Push the RAM-cap settings to live global_args so the watcher, per-load
# budget clamp and eviction honour them without a restart.
try:
from codai.api.state import get_global_args
ga = get_global_args()
if ga is not None:
ga.max_ram_gb = c.offload.max_ram_gb
ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
ga.ram_leak_watch = c.offload.ram_leak_watch
except Exception:
pass
if "vulkan" in data: if "vulkan" in data:
vk = data["vulkan"] vk = data["vulkan"]
......
...@@ -40,6 +40,20 @@ ...@@ -40,6 +40,20 @@
<div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="vram-total-line"></div> <div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="vram-total-line"></div>
<div class="stat-sub" id="vram-gpu" style="margin-top:.25rem"></div> <div class="stat-sub" id="vram-gpu" style="margin-top:.25rem"></div>
</div> </div>
<div class="stat" id="ram-card" style="display:none">
<div class="stat-label">RAM</div>
<div class="stat-value" id="ram-pct" style="font-size:2rem"></div>
<div class="progress" style="margin-top:.625rem">
<div class="progress-fill" id="ram-bar" style="width:0%"></div>
</div>
<div class="progress-labels" style="color:var(--text-1);font-size:12px;margin-top:.4rem">
<span id="ram-used"></span><span id="ram-free"></span>
</div>
<div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="ram-total-line"></div>
<div class="stat-sub" id="ram-cap" style="margin-top:.25rem"></div>
<div class="stat-sub" id="ram-leak" style="margin-top:.25rem;color:var(--danger,#e5484d);font-weight:600;display:none">⚠ RAM leak suspected</div>
</div>
</div> </div>
<div class="card" style="margin-bottom:1rem"> <div class="card" style="margin-bottom:1rem">
...@@ -116,6 +130,33 @@ async function poll() { ...@@ -116,6 +130,33 @@ async function poll() {
document.getElementById('vram-card').style.display = 'none'; document.getElementById('vram-card').style.display = 'none';
} }
if (d.ram && d.ram.total) {
document.getElementById('ram-card').style.display = '';
const w = d.ram.watch || {};
// When a cap is set, show RSS-vs-cap; otherwise system used-vs-total.
const cap = w.cap_gb || null;
if (cap && w.rss_gb != null) {
const pct = Math.min(100, Math.round(w.rss_gb / cap * 100));
document.getElementById('ram-pct').textContent = pct + '%';
document.getElementById('ram-bar').style.width = pct + '%';
document.getElementById('ram-used').textContent = w.rss_gb.toFixed(1) + ' GB server';
document.getElementById('ram-free').textContent = (d.ram.free != null ? d.ram.free.toFixed(1) + ' GB free' : '');
document.getElementById('ram-total-line').textContent = cap.toFixed(1) + ' GB cap · ' + d.ram.total.toFixed(1) + ' GB total';
document.getElementById('ram-cap').textContent = w.last_action ? ('last: ' + w.last_action) : '';
} else {
const pct = Math.round(d.ram.used / d.ram.total * 100);
document.getElementById('ram-pct').textContent = pct + '%';
document.getElementById('ram-bar').style.width = pct + '%';
document.getElementById('ram-used').textContent = d.ram.used.toFixed(1) + ' GB used';
document.getElementById('ram-free').textContent = d.ram.free.toFixed(1) + ' GB free';
document.getElementById('ram-total-line').textContent = d.ram.total.toFixed(1) + ' GB total (no cap)';
document.getElementById('ram-cap').textContent = '';
}
document.getElementById('ram-leak').style.display = w.leak_suspected ? '' : 'none';
} else {
document.getElementById('ram-card').style.display = 'none';
}
if (d.requests) { if (d.requests) {
document.getElementById('req-total').textContent = d.requests.total ?? 0; document.getElementById('req-total').textContent = d.requests.total ?? 0;
document.getElementById('req-active').textContent = d.requests.active ?? 0; document.getElementById('req-active').textContent = d.requests.active ?? 0;
......
...@@ -68,6 +68,25 @@ ...@@ -68,6 +68,25 @@
<input type="text" id="s-offload-dir" class="form-input" placeholder="./offload"> <input type="text" id="s-offload-dir" class="form-input" placeholder="./offload">
<span class="form-hint">Models will inherit this as default when configured</span> <span class="form-hint">Models will inherit this as default when configured</span>
</div> </div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">Max global RAM (GB) <span class="muted">(blank = no cap)</span></label>
<input type="number" id="s-max-ram" class="form-input" min="0" step="1" placeholder="e.g. 96">
<span class="form-hint">Server-wide ceiling on host RAM (process-tree RSS). New model loads get a CPU-offload budget clamped to the remaining headroom, so the overflow spills to the offload directory (disk) instead of pushing past the limit. Applied live on save.</span>
</div>
<div class="form-row" style="margin:0">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-evict-idle-ram">
<span>Evict idle models when over the RAM limit</span>
</label>
<span class="form-hint">Unload least-recently-used idle models to free real RAM before forcing disk offload.</span>
</div>
<div class="form-row" style="margin:0">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-ram-leak-watch">
<span>Watch for RAM leaks (auto-mitigate)</span>
</label>
<span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
</div>
<div class="form-row" style="margin-top:.75rem"> <div class="form-row" style="margin-top:.75rem">
<label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label> <label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
<input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp"> <input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
...@@ -355,6 +374,9 @@ async function loadSettings(){ ...@@ -355,6 +374,9 @@ async function loadSettings(){
document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? ''; document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? '';
document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? ''; document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload'; document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
document.getElementById('s-tmp-dir').value = d.tmp_dir ?? ''; document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg); document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn); document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
...@@ -424,6 +446,9 @@ async function saveSettings(){ ...@@ -424,6 +446,9 @@ async function saveSettings(){
}, },
offload:{ offload:{
directory: document.getElementById('s-offload-dir').value.trim() || './offload', directory: document.getElementById('s-offload-dir').value.trim() || './offload',
max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
}, },
tmp_dir: strOrNull('s-tmp-dir'), tmp_dir: strOrNull('s-tmp-dir'),
enhance:{ enhance:{
......
...@@ -65,6 +65,13 @@ class OffloadConfig: ...@@ -65,6 +65,13 @@ class OffloadConfig:
load_in_8bit: bool = False load_in_8bit: bool = False
manual_ram_gb: Optional[float] = None manual_ram_gb: Optional[float] = None
flash_attention: bool = False flash_attention: bool = False
# Server-wide ceiling on host RAM (process-tree RSS) the server may use, in GB.
# None = no global cap (per-load budget = available RAM, as before). When set, new
# model loads get a CPU-offload budget clamped to the remaining headroom so the
# overflow spills to the offload directory (disk), and idle models can be evicted.
max_ram_gb: Optional[float] = None
evict_idle_on_ram: bool = True # unload idle LRU models when over the RAM cap
ram_leak_watch: bool = True # background watcher samples RSS + auto-mitigates
@dataclass @dataclass
...@@ -414,7 +421,10 @@ class ConfigManager: ...@@ -414,7 +421,10 @@ class ConfigManager:
"load_in_4bit": self.config.offload.load_in_4bit, "load_in_4bit": self.config.offload.load_in_4bit,
"load_in_8bit": self.config.offload.load_in_8bit, "load_in_8bit": self.config.offload.load_in_8bit,
"manual_ram_gb": self.config.offload.manual_ram_gb, "manual_ram_gb": self.config.offload.manual_ram_gb,
"flash_attention": self.config.offload.flash_attention "flash_attention": self.config.offload.flash_attention,
"max_ram_gb": self.config.offload.max_ram_gb,
"evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
"ram_leak_watch": self.config.offload.ram_leak_watch
}, },
"vulkan": { "vulkan": {
"n_gpu_layers": self.config.vulkan.n_gpu_layers, "n_gpu_layers": self.config.vulkan.n_gpu_layers,
......
...@@ -427,6 +427,15 @@ def main(): ...@@ -427,6 +427,15 @@ def main():
# Migrate any GGUF files that ended up in the HF cache to the GGUF cache # Migrate any GGUF files that ended up in the HF cache to the GGUF cache
_t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start() _t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()
# Start the global host-RAM watcher (leak detection + auto-mitigation). Safe to
# start unconditionally: it idles cheaply and reads the cap/watch flags live, so
# it begins acting as soon as a max_ram_gb is configured (incl. via live reload).
try:
from codai.models.ram_monitor import start as _start_ram_monitor
_start_ram_monitor()
except Exception as _e:
logging.getLogger(__name__).debug("RAM monitor not started: %s", _e)
# Import core modules (only after early exits) # Import core modules (only after early exits)
from codai.api import app from codai.api import app
from codai.api.state import ( from codai.api.state import (
...@@ -802,6 +811,10 @@ def main(): ...@@ -802,6 +811,10 @@ def main():
global_args.load_in_8bit = config.offload.load_in_8bit global_args.load_in_8bit = config.offload.load_in_8bit
global_args.flash_attn = config.offload.flash_attention global_args.flash_attn = config.offload.flash_attention
global_args.max_gpu_percent = config.offload.max_gpu_percent global_args.max_gpu_percent = config.offload.max_gpu_percent
# Global host-RAM cap + leak watch (read live by hf_loading, manager, ram_monitor).
global_args.max_ram_gb = config.offload.max_ram_gb
global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
global_args.ram_leak_watch = config.offload.ram_leak_watch
# Thermal protection settings (read live by codai.models.thermal). # Thermal protection settings (read live by codai.models.thermal).
global_args.thermal_cpu_enabled = config.thermal.cpu_enabled global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
global_args.thermal_gpu_enabled = config.thermal.gpu_enabled global_args.thermal_gpu_enabled = config.thermal.gpu_enabled
......
...@@ -352,6 +352,23 @@ def build_from_pretrained_kwargs( ...@@ -352,6 +352,23 @@ def build_from_pretrained_kwargs(
else: else:
cpu_budget = max(0, psutil.virtual_memory().available - int(4e9)) cpu_budget = max(0, psutil.virtual_memory().available - int(4e9))
# Global RAM cap: clamp the CPU-offload budget to the headroom remaining under
# the server-wide ceiling, so the overflow spills to the disk offload folder
# (set below) instead of pushing process RSS past the cap. Read live from
# global_args (same pattern as pipeline_cache). None = no cap (legacy behaviour).
try:
from codai.api.state import get_global_args as _gga
_ga = _gga()
_cap = getattr(_ga, 'max_ram_gb', None) if _ga else None
if _cap:
_used = psutil.Process().memory_info().rss
_headroom = int(float(_cap) * 1e9) - _used
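# Clamp to the cap headroom: never negative and never above the budget computed
# above. No minimum floor is kept — at or over the cap the CPU budget is 0 and
# everything that does not fit on the GPU goes to disk.
# NOTE(review): _used is the parent process RSS only, while the manager and the
# RAM watcher count the whole process tree — confirm which one the cap means here.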
cpu_budget = max(0, min(cpu_budget, _headroom))
except Exception:
pass
kwargs['device_map'] = 'auto' kwargs['device_map'] = 'auto'
kwargs['max_memory'] = {0: gpu_budget, 'cpu': cpu_budget} kwargs['max_memory'] = {0: gpu_budget, 'cpu': cpu_budget}
......
...@@ -546,7 +546,8 @@ class MultiModelManager: ...@@ -546,7 +546,8 @@ class MultiModelManager:
self.tool_parser = ModelParserAdapter() self.tool_parser = ModelParserAdapter()
self.current_model_key: Optional[str] = None self.current_model_key: Optional[str] = None
self.load_mode: str = "ondemand" self.load_mode: str = "ondemand"
self.active_in_vram: Optional[str] = None # most-recently-used model key self._last_used: Dict[str, float] = {} # model_key -> last-served monotonic ts (LRU)
self._active_in_vram: Optional[str] = None # backing field for active_in_vram property
self.models_in_vram: set = set() # all models currently in VRAM self.models_in_vram: set = set() # all models currently in VRAM
self.model_aliases: Dict[str, str] = {} self.model_aliases: Dict[str, str] = {}
self.whisper_server: Optional[WhisperServerManager] = None # legacy single-instance compat self.whisper_server: Optional[WhisperServerManager] = None # legacy single-instance compat
...@@ -575,6 +576,19 @@ class MultiModelManager: ...@@ -575,6 +576,19 @@ class MultiModelManager:
# the GB it freed (or None). Invoked as a last resort during eviction. # the GB it freed (or None). Invoked as a last resort during eviction.
self._external_vram_releasers: List[Any] = [] self._external_vram_releasers: List[Any] = []
@property
def active_in_vram(self) -> Optional[str]:
    """Key of the model used most recently, or None when no model is active."""
    return self._active_in_vram

@active_in_vram.setter
def active_in_vram(self, key: Optional[str]) -> None:
    # Every "this model was just used" assignment goes through this setter, so
    # the recency timestamp is refreshed here and eviction can sort by actual
    # last use rather than by dict insertion order.
    self._active_in_vram = key
    if not key:
        # Clearing the active model (None / empty key) leaves the LRU clock alone.
        return
    self._last_used[key] = time.monotonic()
def register_external_vram_releaser(self, fn) -> None: def register_external_vram_releaser(self, fn) -> None:
"""Register a callback that frees VRAM held outside the manager. """Register a callback that frees VRAM held outside the manager.
...@@ -2510,6 +2524,146 @@ class MultiModelManager: ...@@ -2510,6 +2524,146 @@ class MultiModelManager:
_trim_cpu_ram() _trim_cpu_ram()
return True return True
def _evict_one(self, key):
    """Fully unload ONE model by key and free its VRAM + host RAM.

    Shared by VRAM- and RAM-driven eviction. Never pulls a model that is still
    mid-request; waits briefly first and skips it if it stays busy.

    Args:
        key: Model key, looked up in ``self.models`` and ``self.model_pools``.

    Returns:
        None in every case — including when the model stayed busy and the
        eviction was skipped, so callers cannot tell the two outcomes apart
        from the return value.
    """
    # Make absolutely sure nothing is mid-request on this model before we
    # pull its tensors off the GPU.
    if self._is_key_busy(key):
        if not self._wait_until_idle(key):
            print(f" Skipping eviction of '{key}' — still busy after wait")
            return
    # Unregister first so the key no longer appears as loaded or in the LRU
    # bookkeeping while the (possibly slow) cleanup below runs.
    model_obj = self.models.pop(key, None)
    self.models_in_vram.discard(key)
    self._last_used.pop(key, None)
    # Debug-only: ground-truth the object state before cleanup so an
    # orphaned/detached backend (VRAM that won't free) is visible.
    try:
        from codai.api.state import get_global_debug
        if get_global_debug():
            # '∅' is a sentinel meaning "the object has no backend attribute at all".
            _be = getattr(model_obj, 'backend', '∅')
            _has_model = (getattr(_be, 'model', None) is not None) if _be not in ('∅', None) else False
            _pool = self.model_pools.get(key)
            print(f" evict-debug '{key}': obj_id={id(model_obj)} "
                  f"backend={'None' if _be is None else ('missing' if _be=='∅' else 'set')} "
                  f"backend.model={'set' if _has_model else 'None'} "
                  f"pool_instances={_pool.count if _pool else 0}")
    except Exception:
        # Diagnostics must never block an eviction.
        pass
    # Clean every instance in the pool (not just the primary) so extra
    # instances don't leak VRAM, then drop the pool.
    pool = self.model_pools.pop(key, None)
    if pool is not None:
        try:
            pool.cleanup_all()
        except Exception as e:
            print(f" Warning cleaning pool for '{key}': {e}")
    if model_obj is not None:
        try:
            if hasattr(model_obj, 'cleanup'):
                model_obj.cleanup()
            elif hasattr(model_obj, 'to'):
                # Diffusers pipeline: move all components to CPU explicitly
                # before dropping the reference so VRAM is freed promptly.
                model_obj.to('cpu')
        except Exception as e:
            print(f" Warning during eviction of '{key}': {e}")
        # Drop this function's own reference so the collector passes below can
        # actually reclaim the object.
        del model_obj
    # Several passes: one collection can expose further garbage for the next.
    for _ in range(3):
        gc.collect()
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
    except Exception:
        # Best effort — torch may be missing or CUDA unusable.
        pass
    # Return freed CPU heap (the evicted model's host-side copy / offloaded
    # weights) to the OS, and let the kernel reclaim any swap backing it —
    # otherwise RSS stays high across evict/load cycles and the machine
    # slowly fills RAM + swap.
    _trim_cpu_ram()
@staticmethod
def _get_process_ram_gb() -> float:
"""Resident-set size of the server process TREE, in GB (0.0 on failure).
Offloaded weights and worker subprocesses count against the global cap, so
sum the parent plus all children (mirrors thermal.read_process_tree_cpu)."""
try:
import psutil
proc = psutil.Process()
total = proc.memory_info().rss
for child in proc.children(recursive=True):
try:
total += child.memory_info().rss
except Exception:
pass
return total / 1e9
except Exception:
return 0.0
@staticmethod
def _ram_cap_gb() -> Optional[float]:
"""The configured global RAM ceiling in GB, or None when no cap is set."""
try:
from codai.api.state import get_global_args
ga = get_global_args()
cap = getattr(ga, 'max_ram_gb', None) if ga else None
return float(cap) if cap else None
except Exception:
return None
@staticmethod
def _evict_idle_on_ram_enabled() -> bool:
"""Whether idle models may be unloaded when over the RAM cap (default True)."""
try:
from codai.api.state import get_global_args
ga = get_global_args()
return bool(getattr(ga, 'evict_idle_on_ram', True)) if ga else True
except Exception:
return True
def _evict_models_for_ram(self, target_gb: float):
"""Unload idle models (LRU first) until process-tree RSS drops to target_gb.
Same safety rules as VRAM eviction: never pulls a busy model, and the active
model is only evicted as a last resort once idle."""
if target_gb <= 0:
return
if self._get_process_ram_gb() <= target_gb:
return
_before = self._get_process_ram_gb()
for key in self._lru_order():
if key == self.active_in_vram:
continue
if self._get_process_ram_gb() <= target_gb:
break
if self._is_key_busy(key):
print(f" '{key}' is busy serving a request — not evicting it (RAM)")
continue
print(f"RAM eviction: unloading '{key}' to free host RAM "
f"(RSS {self._get_process_ram_gb():.1f} GB > cap target {target_gb:.1f} GB)")
self._evict_one(key)
# Last resort: the active model, only if idle.
if (self._get_process_ram_gb() > target_gb and self.active_in_vram
and self.active_in_vram in self.models):
_active = self.active_in_vram
if not self._is_key_busy(_active) and self._wait_until_idle(_active):
print(f"RAM eviction: unloading active model '{_active}' to free host RAM")
self._evict_one(_active)
self.active_in_vram = None
_freed = _before - self._get_process_ram_gb()
if _freed > 0.05:
print(f"RAM eviction freed {_freed:.1f} GB (RSS now {self._get_process_ram_gb():.1f} GB)")
def _lru_order(self):
"""Loaded model keys, least-recently-used first (for eviction)."""
return sorted(self.models.keys(), key=lambda k: self._last_used.get(k, 0.0))
def _evict_models_for_vram(self, needed_gb: float): def _evict_models_for_vram(self, needed_gb: float):
"""Unload loaded models (LRU first) until we have at least needed_gb free VRAM. """Unload loaded models (LRU first) until we have at least needed_gb free VRAM.
...@@ -2520,62 +2674,7 @@ class MultiModelManager: ...@@ -2520,62 +2674,7 @@ class MultiModelManager:
if needed_gb <= 0: if needed_gb <= 0:
return return
def _evict_key(key): _evict_key = self._evict_one
# Make absolutely sure nothing is mid-request on this model before we
# pull its tensors off the GPU.
if self._is_key_busy(key):
if not self._wait_until_idle(key):
print(f" Skipping eviction of '{key}' — still busy after wait")
return
model_obj = self.models.pop(key, None)
self.models_in_vram.discard(key)
# Debug-only: ground-truth the object state before cleanup so an
# orphaned/detached backend (VRAM that won't free) is visible.
try:
from codai.api.state import get_global_debug
if get_global_debug():
_be = getattr(model_obj, 'backend', '∅')
_has_model = (getattr(_be, 'model', None) is not None) if _be not in ('∅', None) else False
_pool = self.model_pools.get(key)
print(f" evict-debug '{key}': obj_id={id(model_obj)} "
f"backend={'None' if _be is None else ('missing' if _be=='∅' else 'set')} "
f"backend.model={'set' if _has_model else 'None'} "
f"pool_instances={_pool.count if _pool else 0}")
except Exception:
pass
# Clean every instance in the pool (not just the primary) so extra
# instances don't leak VRAM, then drop the pool.
pool = self.model_pools.pop(key, None)
if pool is not None:
try:
pool.cleanup_all()
except Exception as e:
print(f" Warning cleaning pool for '{key}': {e}")
if model_obj is not None:
try:
if hasattr(model_obj, 'cleanup'):
model_obj.cleanup()
elif hasattr(model_obj, 'to'):
# Diffusers pipeline: move all components to CPU explicitly
# before dropping the reference so VRAM is freed promptly.
model_obj.to('cpu')
except Exception as e:
print(f" Warning during eviction of '{key}': {e}")
del model_obj
for _ in range(3):
gc.collect()
try:
import torch
if torch.cuda.is_available():
torch.cuda.synchronize()
torch.cuda.empty_cache()
except Exception:
pass
# Return freed CPU heap (the evicted model's host-side copy / offloaded
# weights) to the OS, and let the kernel reclaim any swap backing it —
# otherwise RSS stays high across evict/load cycles and the machine
# slowly fills RAM + swap.
_trim_cpu_ram()
def _size_label(key: str) -> str: def _size_label(key: str) -> str:
m = self._measured_vram_gb.get(key) m = self._measured_vram_gb.get(key)
...@@ -2589,7 +2688,7 @@ class MultiModelManager: ...@@ -2589,7 +2688,7 @@ class MultiModelManager:
_free_before = self._get_free_vram_gb() _free_before = self._get_free_vram_gb()
# First pass: evict idle non-active models in LRU order. # First pass: evict idle non-active models in LRU order.
for key in list(self.models.keys()): for key in self._lru_order():
if key == self.active_in_vram: if key == self.active_in_vram:
continue continue
if self._get_free_vram_gb() >= needed_gb: if self._get_free_vram_gb() >= needed_gb:
...@@ -3096,6 +3195,18 @@ class MultiModelManager: ...@@ -3096,6 +3195,18 @@ class MultiModelManager:
except Exception as e: except Exception as e:
print(f" Warning: Error cleaning up legacy model_manager: {e}") print(f" Warning: Error cleaning up legacy model_manager: {e}")
# Global RAM cap: before loading the new model, if host RSS is near the cap
# and idle-eviction is enabled, unload idle LRU models to make real RAM room.
# Whatever still doesn't fit is handled by the clamped CPU budget in
# hf_loading (overflow spills to the disk offload folder).
_cap = self._ram_cap_gb()
if _cap and self._evict_idle_on_ram_enabled():
_rss = self._get_process_ram_gb()
if _rss > _cap * 0.9:
print(f"Global RAM cap {_cap:.1f} GB — RSS {_rss:.1f} GB near limit; "
f"evicting idle models before load of '{model_key}'")
self._evict_models_for_ram(_cap * 0.85)
# Return info for the caller to load the model # Return info for the caller to load the model
return { return {
'model_key': model_key, 'model_key': model_key,
......
"""Global host-RAM watcher with leak detection and auto-mitigation.
Coderai caps host RAM via ``offload.max_ram_gb`` (see ``OffloadConfig``). The
per-load CPU budget in ``hf_loading`` keeps individual loads under the cap by
spilling to disk, and ``MultiModelManager`` evicts idle models before a new load.
This module adds the *continuous* side: a background thread samples process-tree
RSS, flags a suspected leak when RSS keeps climbing **while the scheduler is idle**
(so growth isn't just an in-flight generation), and runs a mitigation ladder when
RSS crosses a soft threshold — gc → CUDA empty_cache → malloc_trim → drop the
upscaler cache → evict idle models.
Mirrors ``codai.models.thermal`` in shape: module-level state + ``get_status()``
for the admin dashboard, started once from ``codai.main``.
"""
import threading
import time
import logging
from typing import Optional, Dict, Any
_log = logging.getLogger(__name__)
# Tunables for the watcher loop: how often to sample RSS, and how many consecutive
# rising idle samples count as a suspected leak. _SOFT_FRACTION of the cap is where
# the mitigation ladder engages.
_POLL_SECONDS = 15.0
_LEAK_SAMPLES = 4 # consecutive idle increases before flagging a leak
_LEAK_MIN_GROWTH_GB = 0.3 # ignore sub-0.3 GB jitter between samples
_SOFT_FRACTION = 0.90 # mitigate at/above this fraction of the cap
_EVICT_TARGET_FRACTION = 0.85 # evict idle models down to this fraction of the cap
# Guards every read and write of _state (it is written by the watcher thread and
# copied out by get_status()).
_state_lock = threading.Lock()
# Latest published snapshot; get_status() hands out a copy of this dict.
_state: Dict[str, Any] = {
    "rss_gb": 0.0,  # last sampled RSS in GB
    "cap_gb": None,  # configured cap in GB, None when no cap is set
    "percent": None,  # RSS as a percentage of the cap, None when no cap is set
    "leak_suspected": False,  # result of the idle-growth heuristic on the last sample
    "last_action": "",  # description of the most recent mitigation run
    "last_action_ts": 0.0,  # time.time() of the most recent mitigation run
    "samples": 0,  # number of samples published so far
}
_recent: list = [] # recent idle RSS samples (for trend detection)
# Watcher thread handle and a once-only guard, both set by start().
_thread: Optional[threading.Thread] = None
_started = False
def get_status() -> Dict[str, Any]:
    """Return a copy of the current watcher state for the admin status endpoint.

    The copy is taken under the state lock and is shallow, so callers may freely
    modify the returned dict without affecting the watcher.
    """
    with _state_lock:
        snapshot = _state.copy()
    return snapshot
def _cap_gb() -> Optional[float]:
try:
from codai.api.state import get_global_args
ga = get_global_args()
cap = getattr(ga, "max_ram_gb", None) if ga else None
return float(cap) if cap else None
except Exception:
return None
def _watch_enabled() -> bool:
try:
from codai.api.state import get_global_args
ga = get_global_args()
return bool(getattr(ga, "ram_leak_watch", True)) if ga else True
except Exception:
return True
def _scheduler_idle() -> bool:
"""True when no request is being served (so RSS growth isn't a live job)."""
try:
from codai.queue.manager import queue_manager
return not queue_manager.active_leases
except Exception:
return True
def _process_ram_gb() -> float:
try:
from codai.models.manager import multi_model_manager
return multi_model_manager._get_process_ram_gb()
except Exception:
try:
import psutil
return psutil.Process().memory_info().rss / 1e9
except Exception:
return 0.0
def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
    """Run the mitigation steps in order and describe what was applied.

    Steps: garbage collection, CUDA cache release (when CUDA is available),
    CPU heap trim, dropping the upscaler cache (when it is non-empty) and, if
    RSS is still above the soft threshold and idle eviction is enabled,
    evicting idle models. Each step after gc is best-effort: a failure skips
    that step only.

    Args:
        rss_gb: RSS at the time the caller decided to mitigate (not used here).
        cap_gb: Configured RAM cap in GB; thresholds are fractions of it.
        leak: Whether the caller suspects a leak; only affects the description.

    Returns:
        The applied step names joined with "+", prefixed with
        "leak-suspected; " when ``leak`` is true.
    """
    import gc

    for _pass in range(3):
        gc.collect()
    steps = ["gc"]

    try:
        import torch

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            steps.append("empty_cache")
    except Exception:
        pass

    try:
        from codai.models.manager import _trim_cpu_ram as release_heap

        release_heap()
        steps.append("trim")
    except Exception:
        pass

    # The super-resolution upscaler cache can keep a multi-GB ESRGAN/diffusers
    # model alive long after the enhance job that loaded it has finished.
    try:
        from codai.api import images as images_api

        upscalers = getattr(images_api, "_UPSCALER_CACHE", None)
        if upscalers:
            upscalers.clear()
            steps.append("drop_upscalers")
    except Exception:
        pass

    # Final step: unload idle LRU models, but only if the cheaper steps above
    # did not bring RSS back under the soft threshold and eviction is allowed.
    try:
        from codai.models.manager import multi_model_manager as manager

        still_over = manager._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
        if still_over and manager._evict_idle_on_ram_enabled():
            manager._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
            steps.append("evict_idle")
    except Exception as e:
        _log.warning("RAM mitigation eviction failed: %s", e)

    prefix = "leak-suspected; " if leak else ""
    return prefix + "+".join(steps)
def _loop():
    """Sampling loop of the watcher thread; never returns.

    Every ``_POLL_SECONDS`` it samples RSS and the configured cap and publishes
    them to ``_state`` so the dashboard gauge stays current. Leak detection and
    the mitigation ladder only run while ``ram_leak_watch`` is enabled. While
    the watch is switched off the trend window and the ``leak_suspected`` flag
    are cleared, so the status never shows a stale leak badge or a frozen
    RSS/cap, and re-enabling the watch starts from a fresh window instead of
    comparing against samples taken before it was disabled.

    Any exception inside one iteration is logged at debug level and the loop
    carries on with the next sample.
    """
    global _recent
    while True:
        try:
            time.sleep(_POLL_SECONDS)
            watching = _watch_enabled()
            cap = _cap_gb()
            rss = _process_ram_gb()

            # Leak heuristic: only trust growth measured while idle (a live job
            # legitimately inflates RSS). Keep a short rolling window of idle
            # samples; any sample taken while busy or while the watch is off
            # invalidates the trend.
            leak = False
            if watching and _scheduler_idle():
                _recent.append(rss)
                _recent = _recent[-(_LEAK_SAMPLES + 1):]
                if len(_recent) > _LEAK_SAMPLES:
                    leak = all(
                        later - earlier >= _LEAK_MIN_GROWTH_GB
                        for earlier, later in zip(_recent, _recent[1:])
                    )
            else:
                _recent = []

            with _state_lock:
                _state["rss_gb"] = round(rss, 2)
                _state["cap_gb"] = cap
                _state["percent"] = round(100.0 * rss / cap, 1) if cap else None
                _state["leak_suspected"] = leak
                _state["samples"] += 1

            if not watching:
                # Status is refreshed above; auto-mitigation stays off.
                continue

            # Engage the ladder when over the soft threshold or a leak is suspected.
            if cap and (rss >= cap * _SOFT_FRACTION or leak):
                desc = _mitigate(rss, cap, leak)
                new_rss = _process_ram_gb()
                _log.warning(
                    "RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",
                    rss, cap, 100.0 * rss / cap,
                    " LEAK SUSPECTED" if leak else "", desc, new_rss,
                )
                with _state_lock:
                    _state["last_action"] = desc
                    _state["last_action_ts"] = time.time()
                    _state["rss_gb"] = round(new_rss, 2)
                _recent = []  # avoid re-triggering on the same growth
        except Exception as e:
            _log.debug("RAM watch loop error: %s", e)
def start() -> None:
    """Launch the background watcher thread; repeated calls are no-ops.

    Starting without a configured cap is fine: the loop re-reads the cap and
    watch flag on every sample, so it starts acting once a cap is set live.
    """
    global _thread, _started
    if not _started:
        _started = True
        _thread = threading.Thread(target=_loop, name="ram-monitor", daemon=True)
        _thread.start()
        _log.info("RAM monitor started (poll %.0fs)", _POLL_SECONDS)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment