coderai: global host-RAM cap with leak watch + disk-offload eviction

Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the existing VRAM budgeting:
- hf_loading clamps the accelerate CPU-offload budget to the headroom under the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true-LRU ordering (the active_in_vram property setter stamps _last_used), a shared _evict_one, and _evict_models_for_ram; idle models are evicted before a new load when RSS nears the cap.
- ram_monitor.py: background watcher samples RSS, flags a suspected leak when it climbs while the scheduler is idle, and runs a mitigation ladder (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; the Settings page exposes max RAM plus evict/leak-watch toggles (applied live); the dashboard shows a RAM gauge and a leak badge.
Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded count so an active upscale no longer reports '0 models loaded'.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

coderai: global host-RAM cap with leak watch + disk-offload eviction
Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the existing VRAM budgeting:
- hf_loading clamps the accelerate CPU-offload budget to the headroom under the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true-LRU ordering (the active_in_vram property setter stamps _last_used), a shared _evict_one, and _evict_models_for_ram; idle models are evicted before a new load when RSS nears the cap.
- ram_monitor.py: background watcher samples RSS, flags a suspected leak when it climbs while the scheduler is idle, and runs a mitigation ladder (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; the Settings page exposes max RAM plus evict/leak-watch toggles (applied live); the dashboard shows a RAM gauge and a leak badge.
Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded count so an active upscale no longer reports '0 models loaded'.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
99f8ba85 · Stefy Lanza (nextime / spora ) · a9b6d35e · 99f8ba85 · 99f8ba85 · 99f8ba85
Commit 99f8ba85 authored Jun 15, 2026 by Stefy Lanza (nextime / spora )
8 changed files
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -330,6 +330,18 @@ def api_status(username: str = Depends(require_auth)):
    loaded_keys = list(multi_model_manager.models.keys())
+    # Real-ESRGAN / diffusers upscalers live in a private cache (deliberately not
+    # in multi_model_manager.models — see codai/api/images.py), so the registry
+    # count alone reports 0 models while an upscale job is actively running. Fold
+    # the loaded upscalers in so the dashboard count matches the Tasks page.
+    try:
+        from codai.api.images import _UPSCALER_CACHE as _upscalers
+        for _uk in _upscalers.keys():
+            if _uk not in loaded_keys:
+                loaded_keys.append(_uk)
+    except Exception:
+        pass
    # VRAM info
    vram = None
    is_cuda = False
@@ -464,6 +476,26 @@ def api_status(username: str = Depends(require_auth)):
    except Exception:
        pass
+    # Host-RAM status: system totals (psutil) + the global-cap watcher snapshot
+    # (process RSS, configured cap, % of cap, leak-suspected flag, last mitigation).
+    ram = None
+    try:
+        import psutil
+        vm = psutil.virtual_memory()
+        ram = {
+            "total": round(vm.total / 1e9, 2),
+            "used": round((vm.total - vm.available) / 1e9, 2),
+            "free": round(vm.available / 1e9, 2),
+            "percent": vm.percent,
+        }
+        try:
+            from codai.models.ram_monitor import get_status as _ram_status
+            ram["watch"] = _ram_status()
+        except Exception:
+            pass
+    except Exception:
+        pass
    # Whisper-server status
    whisper_status = None
    try:
@@ -484,6 +516,7 @@ def api_status(username: str = Depends(require_auth)):
        "enabled_models": enabled_models,
        "enabled_aliases": enabled_aliases,
        "vram": vram,
+        "ram": ram,
        "cuda": is_cuda,
        "requests": {
            "total": req_total,
@@ -2394,6 +2427,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "load_in_8bit": c.offload.load_in_8bit,
            "manual_ram_gb": c.offload.manual_ram_gb,
            "flash_attention": c.offload.flash_attention,
+            "max_ram_gb": c.offload.max_ram_gb,
+            "evict_idle_on_ram": c.offload.evict_idle_on_ram,
+            "ram_leak_watch": c.offload.ram_leak_watch,
        },
        "vulkan": {
            "n_gpu_layers": c.vulkan.n_gpu_layers,
@@ -2503,6 +2539,21 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
        if "manual_ram_gb" in off:
            c.offload.manual_ram_gb = off["manual_ram_gb"] or None
        c.offload.flash_attention = bool(off.get("flash_attention", c.offload.flash_attention))
+        if "max_ram_gb" in off:
+            c.offload.max_ram_gb = off["max_ram_gb"] or None
+        c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
+        c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
+        # Push the RAM-cap settings to live global_args so the watcher, per-load
+        # budget clamp and eviction honour them without a restart.
+        try:
+            from codai.api.state import get_global_args
+            ga = get_global_args()
+            if ga is not None:
+                ga.max_ram_gb = c.offload.max_ram_gb
+                ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
+                ga.ram_leak_watch = c.offload.ram_leak_watch
+        except Exception:
+            pass
    if "vulkan" in data:
        vk = data["vulkan"]

--- a/codai/admin/templates/dashboard.html
+++ b/codai/admin/templates/dashboard.html
@@ -40,6 +40,20 @@
    <div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="vram-total-line"></div>
    <div class="stat-sub" id="vram-gpu" style="margin-top:.25rem"></div>
  </div>
+  <div class="stat" id="ram-card" style="display:none">
+    <div class="stat-label">RAM</div>
+    <div class="stat-value" id="ram-pct" style="font-size:2rem">—</div>
+    <div class="progress" style="margin-top:.625rem">
+      <div class="progress-fill" id="ram-bar" style="width:0%"></div>
+    </div>
+    <div class="progress-labels" style="color:var(--text-1);font-size:12px;margin-top:.4rem">
+      <span id="ram-used">—</span><span id="ram-free">—</span>
+    </div>
+    <div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="ram-total-line"></div>
+    <div class="stat-sub" id="ram-cap" style="margin-top:.25rem"></div>
+    <div class="stat-sub" id="ram-leak" style="margin-top:.25rem;color:var(--danger,#e5484d);font-weight:600;display:none">⚠ RAM leak suspected</div>
+  </div>
 </div>
 <div class="card" style="margin-bottom:1rem">
@@ -116,6 +130,33 @@ async function poll() {
      document.getElementById('vram-card').style.display = 'none';
    }
+    if (d.ram && d.ram.total) {
+      document.getElementById('ram-card').style.display = '';
+      const w = d.ram.watch || {};
+      // When a cap is set, show RSS-vs-cap; otherwise system used-vs-total.
+      const cap = w.cap_gb || null;
+      if (cap && w.rss_gb != null) {
+        const pct = Math.min(100, Math.round(w.rss_gb / cap * 100));
+        document.getElementById('ram-pct').textContent = pct + '%';
+        document.getElementById('ram-bar').style.width = pct + '%';
+        document.getElementById('ram-used').textContent = w.rss_gb.toFixed(1) + ' GB server';
+        document.getElementById('ram-free').textContent = (d.ram.free != null ? d.ram.free.toFixed(1) + ' GB free' : '');
+        document.getElementById('ram-total-line').textContent = cap.toFixed(1) + ' GB cap · ' + d.ram.total.toFixed(1) + ' GB total';
+        document.getElementById('ram-cap').textContent = w.last_action ? ('last: ' + w.last_action) : '';
+      } else {
+        const pct = Math.round(d.ram.used / d.ram.total * 100);
+        document.getElementById('ram-pct').textContent = pct + '%';
+        document.getElementById('ram-bar').style.width = pct + '%';
+        document.getElementById('ram-used').textContent = d.ram.used.toFixed(1) + ' GB used';
+        document.getElementById('ram-free').textContent = d.ram.free.toFixed(1) + ' GB free';
+        document.getElementById('ram-total-line').textContent = d.ram.total.toFixed(1) + ' GB total (no cap)';
+        document.getElementById('ram-cap').textContent = '';
+      }
+      document.getElementById('ram-leak').style.display = w.leak_suspected ? '' : 'none';
+    } else {
+      document.getElementById('ram-card').style.display = 'none';
+    }
    if (d.requests) {
      document.getElementById('req-total').textContent = d.requests.total ?? 0;
      document.getElementById('req-active').textContent = d.requests.active ?? 0;

--- a/codai/admin/templates/settings.html
+++ b/codai/admin/templates/settings.html
@@ -68,6 +68,25 @@
    <input type="text" id="s-offload-dir" class="form-input" placeholder="./offload">
    <span class="form-hint">Models will inherit this as default when configured</span>
  </div>
+  <div class="form-row" style="margin-top:.75rem">
+    <label class="form-label">Max global RAM (GB) <span class="muted">(blank = no cap)</span></label>
+    <input type="number" id="s-max-ram" class="form-input" min="0" step="1" placeholder="e.g. 96">
+    <span class="form-hint">Server-wide ceiling on host RAM (process-tree RSS). New model loads get a CPU-offload budget clamped to the remaining headroom, so the overflow spills to the offload directory (disk) instead of pushing past the limit. Applied live on save.</span>
+  </div>
+  <div class="form-row" style="margin:0">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-evict-idle-ram">
+      <span>Evict idle models when over the RAM limit</span>
+    </label>
+    <span class="form-hint">Unload least-recently-used idle models to free real RAM before forcing disk offload.</span>
+  </div>
+  <div class="form-row" style="margin:0">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-ram-leak-watch">
+      <span>Watch for RAM leaks (auto-mitigate)</span>
+    </label>
+    <span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
+  </div>
  <div class="form-row" style="margin-top:.75rem">
    <label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
    <input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
@@ -355,6 +374,9 @@ async function loadSettings(){
    document.getElementById('s-hf-cache').value   = d.models?.hf_cache_dir ?? '';
    document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
    document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
+    document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
+    document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
+    document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
    document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
    document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
    document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
@@ -424,6 +446,9 @@ async function saveSettings(){
    },
    offload:{
      directory: document.getElementById('s-offload-dir').value.trim() || './offload',
+      max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
+      evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
+      ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
    },
    tmp_dir: strOrNull('s-tmp-dir'),
    enhance:{

--- a/codai/config.py
+++ b/codai/config.py
@@ -65,6 +65,13 @@ class OffloadConfig:
    load_in_8bit: bool = False
    manual_ram_gb: Optional[float] = None
    flash_attention: bool = False
+    # Server-wide ceiling on host RAM (process-tree RSS) the server may use, in GB.
+    # None = no global cap (per-load budget = available RAM, as before). When set, new
+    # model loads get a CPU-offload budget clamped to the remaining headroom so the
+    # overflow spills to the offload directory (disk), and idle models can be evicted.
+    max_ram_gb: Optional[float] = None
+    evict_idle_on_ram: bool = True   # unload idle LRU models when over the RAM cap
+    ram_leak_watch: bool = True      # background watcher samples RSS + auto-mitigates
 @dataclass
@@ -414,7 +421,10 @@ class ConfigManager:
                "load_in_4bit": self.config.offload.load_in_4bit,
                "load_in_8bit": self.config.offload.load_in_8bit,
                "manual_ram_gb": self.config.offload.manual_ram_gb,
-                "flash_attention": self.config.offload.flash_attention
+                "flash_attention": self.config.offload.flash_attention,
+                "max_ram_gb": self.config.offload.max_ram_gb,
+                "evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
+                "ram_leak_watch": self.config.offload.ram_leak_watch
            },
            "vulkan": {
                "n_gpu_layers": self.config.vulkan.n_gpu_layers,

--- a/codai/main.py
+++ b/codai/main.py
@@ -427,6 +427,15 @@ def main():
    # Migrate any GGUF files that ended up in the HF cache to the GGUF cache
    _t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()
+    # Start the global host-RAM watcher (leak detection + auto-mitigation). Safe to
+    # start unconditionally: it idles cheaply and reads the cap/watch flags live, so
+    # it begins acting as soon as a max_ram_gb is configured (incl. via live reload).
+    try:
+        from codai.models.ram_monitor import start as _start_ram_monitor
+        _start_ram_monitor()
+    except Exception as _e:
+        logging.getLogger(__name__).debug("RAM monitor not started: %s", _e)
    # Import core modules (only after early exits)
    from codai.api import app
    from codai.api.state import (
@@ -802,6 +811,10 @@ def main():
    global_args.load_in_8bit = config.offload.load_in_8bit
    global_args.flash_attn = config.offload.flash_attention
    global_args.max_gpu_percent = config.offload.max_gpu_percent
+    # Global host-RAM cap + leak watch (read live by hf_loading, manager, ram_monitor).
+    global_args.max_ram_gb = config.offload.max_ram_gb
+    global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
+    global_args.ram_leak_watch = config.offload.ram_leak_watch
    # Thermal protection settings (read live by codai.models.thermal).
    global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
    global_args.thermal_gpu_enabled = config.thermal.gpu_enabled

--- a/codai/models/hf_loading.py
+++ b/codai/models/hf_loading.py
@@ -352,6 +352,23 @@ def build_from_pretrained_kwargs(
        else:
            cpu_budget = max(0, psutil.virtual_memory().available - int(4e9))
+        # Global RAM cap: clamp the CPU-offload budget to the headroom remaining under
+        # the server-wide ceiling, so the overflow spills to the disk offload folder
+        # (set below) instead of pushing process RSS past the cap. Read live from
+        # global_args (same pattern as pipeline_cache). None = no cap (legacy behaviour).
+        try:
+            from codai.api.state import get_global_args as _gga
+            _ga = _gga()
+            _cap = getattr(_ga, 'max_ram_gb', None) if _ga else None
+            if _cap:
+                _used = psutil.Process().memory_info().rss
+                _headroom = int(float(_cap) * 1e9) - _used
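+                # Floor the budget at 0: the headroom goes negative once RSS already
+                # exceeds the cap, and a zero CPU budget sends the overflow to disk.
+                # Never raise the budget above the cap headroom.
+                # NOTE(review): _used is this process's RSS only; child processes are
+                # not counted here even though the cap is described as process-tree RSS.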
+                cpu_budget = max(0, min(cpu_budget, _headroom))
+        except Exception:
+            pass
        kwargs['device_map'] = 'auto'
        kwargs['max_memory'] = {0: gpu_budget, 'cpu': cpu_budget}

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -546,7 +546,8 @@ class MultiModelManager:
        self.tool_parser = ModelParserAdapter()
        self.current_model_key: Optional[str] = None
        self.load_mode: str = "ondemand"
-        self.active_in_vram: Optional[str] = None  # most-recently-used model key
+        self._last_used: Dict[str, float] = {}  # model_key -> last-served monotonic ts (LRU)
+        self._active_in_vram: Optional[str] = None  # backing field for active_in_vram property
        self.models_in_vram: set = set()  # all models currently in VRAM
        self.model_aliases: Dict[str, str] = {}
        self.whisper_server: Optional[WhisperServerManager] = None  # legacy single-instance compat
@@ -575,6 +576,19 @@ class MultiModelManager:
        # the GB it freed (or None). Invoked as a last resort during eviction.
        self._external_vram_releasers: List[Any] = []
+    @property
+    def active_in_vram(self) -> Optional[str]:
+        """Most-recently-used model key (None when nothing is active).
+        Reading this property has no side effects; only assignment stamps the
+        LRU clock in ``_last_used``.
+        """
+        return self._active_in_vram
+    @active_in_vram.setter
+    def active_in_vram(self, key: Optional[str]) -> None:
+        # Single chokepoint for "this model was just used" — stamp the LRU clock so
+        # both VRAM and RAM eviction order by true recency, not dict insertion order.
+        self._active_in_vram = key
+        # A falsy key (None or "") clears the active model without touching
+        # _last_used, so clearing never creates or refreshes an LRU entry.
+        if key:
+            self._last_used[key] = time.monotonic()
    def register_external_vram_releaser(self, fn) -> None:
        """Register a callback that frees VRAM held outside the manager.
@@ -2510,6 +2524,146 @@ class MultiModelManager:
        _trim_cpu_ram()
        return True
+    def _evict_one(self, key):
+        """Fully unload ONE model by key and free its VRAM + host RAM.
+        Shared by VRAM- and RAM-driven eviction. Never pulls a model that is still
+        mid-request; waits for it to go idle first and skips it if it stays busy.
+        Args:
+            key: model key to evict. The registry lookups below all use
+                pop/discard with a default, so a key that is not loaded is tolerated.
+        Returns:
+            None in every case, including the busy-skip path. A caller that needs
+            to know whether the model was really unloaded must check ``self.models``.
+        """
+        # Make absolutely sure nothing is mid-request on this model before we
+        # pull its tensors off the GPU.
+        if self._is_key_busy(key):
+            if not self._wait_until_idle(key):
+                print(f"  Skipping eviction of '{key}' — still busy after wait")
+                return
+        # Drop the registry entries first (model table, VRAM set, LRU stamp) so the
+        # key is no longer listed as loaded while the cleanup below runs.
+        model_obj = self.models.pop(key, None)
+        self.models_in_vram.discard(key)
+        self._last_used.pop(key, None)
+        # Debug-only: ground-truth the object state before cleanup so an
+        # orphaned/detached backend (VRAM that won't free) is visible.
+        # '∅' is the getattr default below, i.e. "object has no backend attribute".
+        try:
+            from codai.api.state import get_global_debug
+            if get_global_debug():
+                _be = getattr(model_obj, 'backend', '∅')
+                _has_model = (getattr(_be, 'model', None) is not None) if _be not in ('∅', None) else False
+                _pool = self.model_pools.get(key)
+                print(f"  evict-debug '{key}': obj_id={id(model_obj)} "
+                      f"backend={'None' if _be is None else ('missing' if _be=='∅' else 'set')} "
+                      f"backend.model={'set' if _has_model else 'None'} "
+                      f"pool_instances={_pool.count if _pool else 0}")
+        except Exception:
+            pass
+        # Clean every instance in the pool (not just the primary) so extra
+        # instances don't leak VRAM, then drop the pool.
+        pool = self.model_pools.pop(key, None)
+        if pool is not None:
+            try:
+                pool.cleanup_all()
+            except Exception as e:
+                print(f"  Warning cleaning pool for '{key}': {e}")
+        if model_obj is not None:
+            try:
+                if hasattr(model_obj, 'cleanup'):
+                    model_obj.cleanup()
+                elif hasattr(model_obj, 'to'):
+                    # Diffusers pipeline: move all components to CPU explicitly
+                    # before dropping the reference so VRAM is freed promptly.
+                    model_obj.to('cpu')
+            except Exception as e:
+                print(f"  Warning during eviction of '{key}': {e}")
+        del model_obj
+        # NOTE(review): presumably three passes so objects released by one
+        # collection can be reclaimed by the next — confirm the count is needed.
+        for _ in range(3):
+            gc.collect()
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+                torch.cuda.empty_cache()
+        except Exception:
+            pass
+        # Return freed CPU heap (the evicted model's host-side copy / offloaded
+        # weights) to the OS, and let the kernel reclaim any swap backing it —
+        # otherwise RSS stays high across evict/load cycles and the machine
+        # slowly fills RAM + swap.
+        _trim_cpu_ram()
+    @staticmethod
+    def _get_process_ram_gb() -> float:
+        """Resident-set size of the server process TREE, in GB (0.0 on failure).
+        Offloaded weights and worker subprocesses count against the global cap, so
+        sum the parent plus all children (mirrors thermal.read_process_tree_cpu).
+        GB here means 1e9 bytes. A 0.0 result also covers "psutil unavailable", in
+        which case callers comparing against a cap will see no RAM pressure.
+        NOTE(review): RSS includes shared pages, so memory shared between the parent
+        and its children may be counted more than once — confirm the over-estimate
+        is acceptable for cap enforcement."""
+        try:
+            import psutil
+            proc = psutil.Process()
+            total = proc.memory_info().rss
+            for child in proc.children(recursive=True):
+                try:
+                    total += child.memory_info().rss
+                except Exception:
+                    # Best effort: a child that cannot be read is left out of the
+                    # total instead of failing the whole measurement.
+                    pass
+            return total / 1e9
+        except Exception:
+            return 0.0
+    @staticmethod
+    def _ram_cap_gb() -> Optional[float]:
+        """The configured global RAM ceiling in GB, or None when no cap is set.
+        Read from the live global args on every call, so a changed setting takes
+        effect without a restart. Any failure (import error, non-numeric value)
+        is reported as None, i.e. "no cap"."""
+        try:
+            from codai.api.state import get_global_args
+            ga = get_global_args()
+            cap = getattr(ga, 'max_ram_gb', None) if ga else None
+            # Truthiness check: a cap of 0 (or an empty value) means "no cap".
+            return float(cap) if cap else None
+        except Exception:
+            return None
+    @staticmethod
+    def _evict_idle_on_ram_enabled() -> bool:
+        """Whether idle models may be unloaded when over the RAM cap (default True).
+        The default applies on every fallback path: global args not available,
+        the attribute missing from them, or an exception while reading them."""
+        try:
+            from codai.api.state import get_global_args
+            ga = get_global_args()
+            return bool(getattr(ga, 'evict_idle_on_ram', True)) if ga else True
+        except Exception:
+            return True
+    def _evict_models_for_ram(self, target_gb: float):
+        """Unload idle models (LRU first) until process-tree RSS drops to target_gb.
+        Same safety rules as VRAM eviction: never pulls a busy model, and the active
+        model is only evicted as a last resort once idle.
+        Args:
+            target_gb: RSS level (GB) to get down to; values <= 0 are a no-op.
+        Returns:
+            None. Progress is reported through print() only.
+        """
+        if target_gb <= 0:
+            return
+        # Every _get_process_ram_gb() call walks the whole process tree, so take one
+        # reading per decision and reuse it for the matching log line.
+        _before = self._get_process_ram_gb()
+        if _before <= target_gb:
+            return
+        _rss = _before
+        for key in self._lru_order():
+            if key == self.active_in_vram:
+                continue
+            if _rss <= target_gb:
+                break
+            if self._is_key_busy(key):
+                print(f"  '{key}' is busy serving a request — not evicting it (RAM)")
+                continue
+            print(f"RAM eviction: unloading '{key}' to free host RAM "
+                  f"(RSS {_rss:.1f} GB > cap target {target_gb:.1f} GB)")
+            self._evict_one(key)
+            _rss = self._get_process_ram_gb()
+        # Last resort: the active model, only if idle.
+        _active = self.active_in_vram
+        if _rss > target_gb and _active and _active in self.models:
+            if not self._is_key_busy(_active) and self._wait_until_idle(_active):
+                print(f"RAM eviction: unloading active model '{_active}' to free host RAM")
+                self._evict_one(_active)
+                # _evict_one returns silently when the model turned busy again, so
+                # only clear the active marker once the model is really gone.
+                if _active not in self.models:
+                    self.active_in_vram = None
+                _rss = self._get_process_ram_gb()
+        # One final reading feeds both numbers so the summary is self-consistent.
+        _freed = _before - _rss
+        if _freed > 0.05:
+            print(f"RAM eviction freed {_freed:.1f} GB (RSS now {_rss:.1f} GB)")
+    def _lru_order(self):
+        """Loaded model keys, least-recently-used first (for eviction).
+        Returns a new list, so callers may unload models while iterating it."""
+        # Keys that were never stamped in _last_used default to 0.0 and therefore
+        # sort first, i.e. they are treated as the least recently used.
+        return sorted(self.models.keys(), key=lambda k: self._last_used.get(k, 0.0))
    def _evict_models_for_vram(self, needed_gb: float):
        """Unload loaded models (LRU first) until we have at least needed_gb free VRAM.
@@ -2520,62 +2674,7 @@ class MultiModelManager:
        if needed_gb <= 0:
            return
-        def _evict_key(key):
+        _evict_key = self._evict_one
-            # Make absolutely sure nothing is mid-request on this model before we
-            # pull its tensors off the GPU.
-            if self._is_key_busy(key):
-                if not self._wait_until_idle(key):
-                    print(f"  Skipping eviction of '{key}' — still busy after wait")
-                    return
-            model_obj = self.models.pop(key, None)
-            self.models_in_vram.discard(key)
-            # Debug-only: ground-truth the object state before cleanup so an
-            # orphaned/detached backend (VRAM that won't free) is visible.
-            try:
-                from codai.api.state import get_global_debug
-                if get_global_debug():
-                    _be = getattr(model_obj, 'backend', '∅')
-                    _has_model = (getattr(_be, 'model', None) is not None) if _be not in ('∅', None) else False
-                    _pool = self.model_pools.get(key)
-                    print(f"  evict-debug '{key}': obj_id={id(model_obj)} "
-                          f"backend={'None' if _be is None else ('missing' if _be=='∅' else 'set')} "
-                          f"backend.model={'set' if _has_model else 'None'} "
-                          f"pool_instances={_pool.count if _pool else 0}")
-            except Exception:
-                pass
-            # Clean every instance in the pool (not just the primary) so extra
-            # instances don't leak VRAM, then drop the pool.
-            pool = self.model_pools.pop(key, None)
-            if pool is not None:
-                try:
-                    pool.cleanup_all()
-                except Exception as e:
-                    print(f"  Warning cleaning pool for '{key}': {e}")
-            if model_obj is not None:
-                try:
-                    if hasattr(model_obj, 'cleanup'):
-                        model_obj.cleanup()
-                    elif hasattr(model_obj, 'to'):
-                        # Diffusers pipeline: move all components to CPU explicitly
-                        # before dropping the reference so VRAM is freed promptly.
-                        model_obj.to('cpu')
-                except Exception as e:
-                    print(f"  Warning during eviction of '{key}': {e}")
-            del model_obj
-            for _ in range(3):
-                gc.collect()
-            try:
-                import torch
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                    torch.cuda.empty_cache()
-            except Exception:
-                pass
-            # Return freed CPU heap (the evicted model's host-side copy / offloaded
-            # weights) to the OS, and let the kernel reclaim any swap backing it —
-            # otherwise RSS stays high across evict/load cycles and the machine
-            # slowly fills RAM + swap.
-            _trim_cpu_ram()
        def _size_label(key: str) -> str:
            m = self._measured_vram_gb.get(key)
@@ -2589,7 +2688,7 @@ class MultiModelManager:
        _free_before = self._get_free_vram_gb()
        # First pass: evict idle non-active models in LRU order.
-        for key in list(self.models.keys()):
+        for key in self._lru_order():
            if key == self.active_in_vram:
                continue
            if self._get_free_vram_gb() >= needed_gb:
@@ -3096,6 +3195,18 @@ class MultiModelManager:
                        except Exception as e:
                            print(f"  Warning: Error cleaning up legacy model_manager: {e}")
+        # Global RAM cap: before loading the new model, if host RSS is near the cap
+        # and idle-eviction is enabled, unload idle LRU models to make real RAM room.
+        # Whatever still doesn't fit is handled by the clamped CPU budget in
+        # hf_loading (overflow spills to the disk offload folder).
+        _cap = self._ram_cap_gb()
+        if _cap and self._evict_idle_on_ram_enabled():
+            _rss = self._get_process_ram_gb()
+            if _rss > _cap * 0.9:
+                print(f"Global RAM cap {_cap:.1f} GB — RSS {_rss:.1f} GB near limit; "
+                      f"evicting idle models before load of '{model_key}'")
+                self._evict_models_for_ram(_cap * 0.85)
        # Return info for the caller to load the model
        return {
            'model_key': model_key,

--- a/codai/models/ram_monitor.py
+++ b/codai/models/ram_monitor.py
+"""Global host-RAM watcher with leak detection and auto-mitigation.
+Coderai caps host RAM via ``offload.max_ram_gb`` (see ``OffloadConfig``). The
+per-load CPU budget in ``hf_loading`` keeps individual loads under the cap by
+spilling to disk, and ``MultiModelManager`` evicts idle models before a new load.
+This module adds the *continuous* side: a background thread samples process-tree
+RSS, flags a suspected leak when RSS keeps climbing **while the scheduler is idle**
+(so growth isn't just an in-flight generation), and runs a mitigation ladder when
+RSS crosses a soft threshold — gc → CUDA empty_cache → malloc_trim → drop the
+upscaler cache → evict idle models.
+Mirrors ``codai.models.thermal`` in shape: module-level state + ``get_status()``
+for the admin dashboard, started once from ``codai.main``.
+"""
+import threading
+import time
+import logging
+from typing import Optional, Dict, Any
+_log = logging.getLogger(__name__)
+# How often to sample RSS, and how many consecutive rising idle samples count as a
+# suspected leak. SOFT_FRACTION of the cap is where the mitigation ladder engages.
+_POLL_SECONDS = 15.0       # seconds slept at the top of every watcher iteration
+_LEAK_SAMPLES = 4          # consecutive idle increases before flagging a leak
+_LEAK_MIN_GROWTH_GB = 0.3  # ignore sub-0.3 GB jitter between samples
+_SOFT_FRACTION = 0.90      # mitigate at/above this fraction of the cap
+_EVICT_TARGET_FRACTION = 0.85  # evict idle models down to this fraction of the cap
+# Guards every read and write of ``_state``; the watcher thread writes it and
+# ``get_status()`` copies it.
+_state_lock = threading.Lock()
+# Snapshot published by the watcher loop and returned (as a copy) by get_status().
+_state: Dict[str, Any] = {
+    "rss_gb": 0.0,            # latest RSS sample, rounded to 2 decimals; replaced by the post-mitigation reading
+    "cap_gb": None,           # value of _cap_gb() at the last poll (None when no cap is configured)
+    "percent": None,          # 100 * rss / cap rounded to 1 decimal, or None without a cap
+    "leak_suspected": False,  # result of the idle-growth trend check on the last poll
+    "last_action": "",        # description string returned by the last _mitigate() call
+    "last_action_ts": 0.0,    # time.time() recorded right after the last mitigation
+    "samples": 0,             # number of polls that recorded a sample
+}
+_recent: list = []           # recent idle RSS samples (for trend detection)
+# Watcher thread object, assigned by start(); None until then.
+_thread: Optional[threading.Thread] = None
+# Set to True by the first start() call so later calls return immediately.
+_started = False
+def get_status() -> Dict[str, Any]:
+    """Return a shallow copy of the watcher state for the admin status endpoint / dashboard.
+    The copy is made while holding ``_state_lock`` so callers never observe a
+    half-written update and cannot mutate the shared dict.
+    """
+    with _state_lock:
+        snapshot = _state.copy()
+    return snapshot
+def _cap_gb() -> Optional[float]:
+    """Return the configured RAM cap (``max_ram_gb`` on the global args) as a float.
+    Returns ``None`` when the global args are unavailable, the attribute is
+    missing or falsy (``None``/``0``), or anything raises while looking it up.
+    """
+    try:
+        from codai.api.state import get_global_args
+        global_args = get_global_args()
+        if not global_args:
+            return None
+        configured = getattr(global_args, "max_ram_gb", None)
+        if not configured:
+            return None
+        return float(configured)
+    except Exception:
+        return None
+def _watch_enabled() -> bool:
+    """Return whether the watcher should sample on this poll.
+    Reads ``ram_leak_watch`` from the global args; defaults to ``True`` when the
+    args are unavailable, the attribute is missing, or the lookup raises.
+    """
+    try:
+        from codai.api.state import get_global_args
+        global_args = get_global_args()
+        if not global_args:
+            return True
+        return bool(getattr(global_args, "ram_leak_watch", True))
+    except Exception:
+        return True
+def _scheduler_idle() -> bool:
+    """True when no request is being served (so RSS growth isn't a live job).
+    Idle means ``queue_manager.active_leases`` is falsy. If the queue manager
+    cannot be imported or inspected, the scheduler is treated as idle.
+    """
+    try:
+        from codai.queue.manager import queue_manager
+        busy = bool(queue_manager.active_leases)
+        return not busy
+    except Exception:
+        return True
+def _process_ram_gb() -> float:
+    """Return the current process RAM usage in GB.
+    Prefers the model manager's own accounting; if that is unavailable, falls
+    back to this process's RSS from ``psutil`` (bytes / 1e9); if that fails too,
+    returns ``0.0``.
+    """
+    # Preferred source: the manager's accounting.
+    try:
+        from codai.models.manager import multi_model_manager
+        return multi_model_manager._get_process_ram_gb()
+    except Exception:
+        pass
+    # Fallback: RSS of this process only.
+    try:
+        import psutil
+        rss_bytes = psutil.Process().memory_info().rss
+        return rss_bytes / 1e9
+    except Exception:
+        return 0.0
+def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
+    """Run the mitigation ladder; return a short description of what was done.
+    Steps, in order: three ``gc.collect()`` passes, CUDA ``empty_cache`` (when
+    CUDA is available), ``_trim_cpu_ram``, clearing the images module's
+    ``_UPSCALER_CACHE`` (when non-empty), and finally evicting idle models when
+    RSS is still above ``cap_gb * _SOFT_FRACTION`` and idle eviction is enabled.
+    After the upscaler cache is dropped the gc/empty_cache/trim pass is repeated
+    so the memory it released is reflected in the RSS reading that decides
+    whether models must be evicted.
+    Args:
+        rss_gb: RSS observed by the caller (currently unused; RSS is re-read
+            before the eviction decision).
+        cap_gb: Configured RAM cap in GB.
+        leak: Whether the caller suspects a leak; only affects the description.
+    Returns:
+        ``"+"``-joined action names, prefixed with ``"leak-suspected; "`` when
+        ``leak`` is true. Each action name appears at most once.
+    """
+    import gc
+    actions = []
+    def _record(name: str) -> None:
+        # The reclaim pass may run twice; keep the description free of repeats.
+        if name not in actions:
+            actions.append(name)
+    def _reclaim() -> None:
+        # Collect garbage, release cached CUDA blocks, then trim the CPU heap.
+        for _ in range(3):
+            gc.collect()
+        _record("gc")
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                _record("empty_cache")
+        except Exception as e:
+            _log.debug("RAM mitigation: empty_cache skipped: %s", e)
+        try:
+            from codai.models.manager import _trim_cpu_ram
+            _trim_cpu_ram()
+            _record("trim")
+        except Exception as e:
+            _log.debug("RAM mitigation: heap trim skipped: %s", e)
+    _reclaim()
+    # Drop the in-memory super-resolution upscaler cache — it can hold a multi-GB
+    # ESRGAN/diffusers model alive long after an enhance job finished.
+    dropped_upscalers = False
+    try:
+        from codai.api import images as _img
+        if getattr(_img, "_UPSCALER_CACHE", None):
+            _img._UPSCALER_CACHE.clear()
+            actions.append("drop_upscalers")
+            dropped_upscalers = True
+    except Exception as e:
+        _log.debug("RAM mitigation: upscaler cache drop skipped: %s", e)
+    if dropped_upscalers:
+        # Without a second pass the objects just released are still resident and
+        # the RSS check below could evict models that no longer need to go.
+        _reclaim()
+    # Still over and eviction is enabled → unload idle LRU models.
+    try:
+        from codai.models.manager import multi_model_manager as _mm
+        if (_mm._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
+                and _mm._evict_idle_on_ram_enabled()):
+            _mm._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
+            actions.append("evict_idle")
+    except Exception as e:
+        _log.warning("RAM mitigation eviction failed: %s", e)
+    desc = ("leak-suspected; " if leak else "") + "+".join(actions)
+    return desc
+def _loop():
+    """Watcher thread body: sample RSS forever and mitigate when needed.
+    Each iteration sleeps ``_POLL_SECONDS``, then (if the watch is enabled)
+    reads the cap, the current RSS and whether the scheduler is idle, updates
+    the shared ``_state`` snapshot, and calls ``_mitigate`` when a cap is set and
+    either RSS is at/above ``cap * _SOFT_FRACTION`` or a leak is suspected.
+    A leak is suspected when the last ``_LEAK_SAMPLES + 1`` idle samples each
+    grew by at least ``_LEAK_MIN_GROWTH_GB`` over the previous one. Any exception
+    in an iteration is logged at debug level and the loop continues.
+    """
+    global _recent
+    while True:
+        try:
+            time.sleep(_POLL_SECONDS)
+            if not _watch_enabled():
+                # Nothing is sampled while the watch is off, so samples from
+                # before the pause must not be compared with later ones.
+                _recent = []
+                continue
+            cap = _cap_gb()
+            rss = _process_ram_gb()
+            idle = _scheduler_idle()
+            # Leak heuristic: only trust growth measured while idle (a live job
+            # legitimately inflates RSS). Keep a short rolling window of idle samples.
+            leak = False
+            if idle:
+                _recent.append(rss)
+                _recent = _recent[-(_LEAK_SAMPLES + 1):]
+                if len(_recent) > _LEAK_SAMPLES:
+                    leak = all(
+                        later - earlier >= _LEAK_MIN_GROWTH_GB
+                        for earlier, later in zip(_recent, _recent[1:])
+                    )
+            else:
+                _recent = []  # reset trend while a job runs
+            with _state_lock:
+                _state["rss_gb"] = round(rss, 2)
+                _state["cap_gb"] = cap
+                _state["percent"] = round(100.0 * rss / cap, 1) if cap else None
+                _state["leak_suspected"] = leak
+                _state["samples"] += 1
+            # Engage the ladder when over the soft threshold or a leak is suspected.
+            if cap and (rss >= cap * _SOFT_FRACTION or leak):
+                desc = _mitigate(rss, cap, leak)
+                new_rss = _process_ram_gb()
+                _log.warning(
+                    "RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",
+                    rss, cap, 100.0 * rss / cap,
+                    " LEAK SUSPECTED" if leak else "", desc, new_rss,
+                )
+                with _state_lock:
+                    _state["last_action"] = desc
+                    _state["last_action_ts"] = time.time()
+                    _state["rss_gb"] = round(new_rss, 2)
+                    # Keep the percentage in step with the refreshed RSS so the
+                    # snapshot is internally consistent (cap is truthy here).
+                    _state["percent"] = round(100.0 * new_rss / cap, 1)
+                _recent = []  # avoid re-triggering on the same growth
+        except Exception as e:
+            _log.debug("RAM watch loop error: %s", e)
+def start() -> None:
+    """Launch the background watcher thread; later calls are no-ops.
+    Safe to call when no cap is configured — the loop only mitigates once a cap
+    is present, so it can be started up front and pick up a cap set live.
+    """
+    global _thread, _started
+    if not _started:
+        _started = True
+        watcher = threading.Thread(target=_loop, name="ram-monitor", daemon=True)
+        _thread = watcher
+        watcher.start()
+        _log.info("RAM monitor started (poll %.0fs)", _POLL_SECONDS)