coderai: global host-RAM cap with leak watch + disk-offload eviction

Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the existing VRAM budgeting:

- hf_loading clamps the accelerate CPU-offload budget to the headroom under the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true LRU (the active_in_vram property stamps _last_used), a shared _evict_one, and _evict_models_for_ram; idle models are evicted before a new load when RSS nears the cap.
- ram_monitor.py: a background watcher samples RSS, flags a suspected leak when it climbs while the scheduler is idle, and runs a mitigation ladder (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; the Settings page exposes max RAM plus evict / leak-watch toggles (applied live); the dashboard shows a RAM gauge and a leak badge.

Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded count so an active upscale no longer reports '0 models loaded'.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

coderai: global host-RAM cap with leak watch + disk-offload eviction
Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the existing VRAM budgeting:

- hf_loading clamps the accelerate CPU-offload budget to the headroom under the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true LRU (the active_in_vram property stamps _last_used), a shared _evict_one, and _evict_models_for_ram; idle models are evicted before a new load when RSS nears the cap.
- ram_monitor.py: a background watcher samples RSS, flags a suspected leak when it climbs while the scheduler is idle, and runs a mitigation ladder (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; the Settings page exposes max RAM plus evict / leak-watch toggles (applied live); the dashboard shows a RAM gauge and a leak badge.

Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded count so an active upscale no longer reports '0 models loaded'.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
99f8ba85 · Stefy Lanza (nextime / spora ) · a9b6d35e · 99f8ba85 · 99f8ba85 · 99f8ba85
Commit 99f8ba85 authored Jun 15, 2026 by Stefy Lanza (nextime / spora )
8 changed files
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -330,6 +330,18 @@ def api_status(username: str = Depends(require_auth)):

    loaded_keys = list(multi_model_manager.models.keys())

+    # Real-ESRGAN / diffusers upscalers live in a private cache (deliberately not
+    # in multi_model_manager.models — see codai/api/images.py), so the registry
+    # count alone reports 0 models while an upscale job is actively running. Fold
+    # the loaded upscalers in so the dashboard count matches the Tasks page.
+    try:
+        from codai.api.images import _UPSCALER_CACHE as _upscalers
+        for _uk in _upscalers.keys():
+            if _uk not in loaded_keys:
+                loaded_keys.append(_uk)
+    except Exception:
+        pass
+
    # VRAM info
    vram = None
    is_cuda = False
@@ -464,6 +476,26 @@ def api_status(username: str = Depends(require_auth)):
    except Exception:
        pass

+    # Host-RAM status: system totals (psutil) + the global-cap watcher snapshot
+    # (process RSS, configured cap, % of cap, leak-suspected flag, last mitigation).
+    ram = None
+    try:
+        import psutil
+        vm = psutil.virtual_memory()
+        ram = {
+            "total": round(vm.total / 1e9, 2),
+            "used": round((vm.total - vm.available) / 1e9, 2),
+            "free": round(vm.available / 1e9, 2),
+            "percent": vm.percent,
+        }
+        try:
+            from codai.models.ram_monitor import get_status as _ram_status
+            ram["watch"] = _ram_status()
+        except Exception:
+            pass
+    except Exception:
+        pass
+
    # Whisper-server status
    whisper_status = None
    try:
@@ -484,6 +516,7 @@ def api_status(username: str = Depends(require_auth)):
        "enabled_models": enabled_models,
        "enabled_aliases": enabled_aliases,
        "vram": vram,
+        "ram": ram,
        "cuda": is_cuda,
        "requests": {
            "total": req_total,
@@ -2394,6 +2427,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "load_in_8bit": c.offload.load_in_8bit,
            "manual_ram_gb": c.offload.manual_ram_gb,
            "flash_attention": c.offload.flash_attention,
+            "max_ram_gb": c.offload.max_ram_gb,
+            "evict_idle_on_ram": c.offload.evict_idle_on_ram,
+            "ram_leak_watch": c.offload.ram_leak_watch,
        },
        "vulkan": {
            "n_gpu_layers": c.vulkan.n_gpu_layers,
@@ -2503,6 +2539,21 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
        if "manual_ram_gb" in off:
            c.offload.manual_ram_gb = off["manual_ram_gb"] or None
        c.offload.flash_attention = bool(off.get("flash_attention", c.offload.flash_attention))
+        if "max_ram_gb" in off:
+            c.offload.max_ram_gb = off["max_ram_gb"] or None
+        c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
+        c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
+        # Push the RAM-cap settings to live global_args so the watcher, per-load
+        # budget clamp and eviction honour them without a restart.
+        try:
+            from codai.api.state import get_global_args
+            ga = get_global_args()
+            if ga is not None:
+                ga.max_ram_gb = c.offload.max_ram_gb
+                ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
+                ga.ram_leak_watch = c.offload.ram_leak_watch
+        except Exception:
+            pass

    if "vulkan" in data:
        vk = data["vulkan"]

--- a/codai/admin/templates/dashboard.html
+++ b/codai/admin/templates/dashboard.html
@@ -40,6 +40,20 @@
    <div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="vram-total-line"></div>
    <div class="stat-sub" id="vram-gpu" style="margin-top:.25rem"></div>
  </div>
+
+  <div class="stat" id="ram-card" style="display:none">
+    <div class="stat-label">RAM</div>
+    <div class="stat-value" id="ram-pct" style="font-size:2rem">—</div>
+    <div class="progress" style="margin-top:.625rem">
+      <div class="progress-fill" id="ram-bar" style="width:0%"></div>
+    </div>
+    <div class="progress-labels" style="color:var(--text-1);font-size:12px;margin-top:.4rem">
+      <span id="ram-used">—</span><span id="ram-free">—</span>
+    </div>
+    <div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="ram-total-line"></div>
+    <div class="stat-sub" id="ram-cap" style="margin-top:.25rem"></div>
+    <div class="stat-sub" id="ram-leak" style="margin-top:.25rem;color:var(--danger,#e5484d);font-weight:600;display:none">⚠ RAM leak suspected</div>
+  </div>
 </div>

 <div class="card" style="margin-bottom:1rem">
@@ -116,6 +130,33 @@ async function poll() {
      document.getElementById('vram-card').style.display = 'none';
    }

+    if (d.ram && d.ram.total) {
+      document.getElementById('ram-card').style.display = '';
+      const w = d.ram.watch || {};
+      // When a cap is set, show RSS-vs-cap; otherwise system used-vs-total.
+      const cap = w.cap_gb || null;
+      if (cap && w.rss_gb != null) {
+        const pct = Math.min(100, Math.round(w.rss_gb / cap * 100));
+        document.getElementById('ram-pct').textContent = pct + '%';
+        document.getElementById('ram-bar').style.width = pct + '%';
+        document.getElementById('ram-used').textContent = w.rss_gb.toFixed(1) + ' GB server';
+        document.getElementById('ram-free').textContent = (d.ram.free != null ? d.ram.free.toFixed(1) + ' GB free' : '');
+        document.getElementById('ram-total-line').textContent = cap.toFixed(1) + ' GB cap · ' + d.ram.total.toFixed(1) + ' GB total';
+        document.getElementById('ram-cap').textContent = w.last_action ? ('last: ' + w.last_action) : '';
+      } else {
+        const pct = Math.round(d.ram.used / d.ram.total * 100);
+        document.getElementById('ram-pct').textContent = pct + '%';
+        document.getElementById('ram-bar').style.width = pct + '%';
+        document.getElementById('ram-used').textContent = d.ram.used.toFixed(1) + ' GB used';
+        document.getElementById('ram-free').textContent = d.ram.free.toFixed(1) + ' GB free';
+        document.getElementById('ram-total-line').textContent = d.ram.total.toFixed(1) + ' GB total (no cap)';
+        document.getElementById('ram-cap').textContent = '';
+      }
+      document.getElementById('ram-leak').style.display = w.leak_suspected ? '' : 'none';
+    } else {
+      document.getElementById('ram-card').style.display = 'none';
+    }
+
    if (d.requests) {
      document.getElementById('req-total').textContent = d.requests.total ?? 0;
      document.getElementById('req-active').textContent = d.requests.active ?? 0;

--- a/codai/admin/templates/settings.html
+++ b/codai/admin/templates/settings.html
@@ -68,6 +68,25 @@
    <input type="text" id="s-offload-dir" class="form-input" placeholder="./offload">
    <span class="form-hint">Models will inherit this as default when configured</span>
  </div>
+  <div class="form-row" style="margin-top:.75rem">
+    <label class="form-label">Max global RAM (GB) <span class="muted">(blank = no cap)</span></label>
+    <input type="number" id="s-max-ram" class="form-input" min="0" step="1" placeholder="e.g. 96">
+    <span class="form-hint">Server-wide ceiling on host RAM (process-tree RSS). New model loads get a CPU-offload budget clamped to the remaining headroom, so the overflow spills to the offload directory (disk) instead of pushing past the limit. Applied live on save.</span>
+  </div>
+  <div class="form-row" style="margin:0">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-evict-idle-ram">
+      <span>Evict idle models when over the RAM limit</span>
+    </label>
+    <span class="form-hint">Unload least-recently-used idle models to free real RAM before forcing disk offload.</span>
+  </div>
+  <div class="form-row" style="margin:0">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-ram-leak-watch">
+      <span>Watch for RAM leaks (auto-mitigate)</span>
+    </label>
+    <span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
+  </div>
  <div class="form-row" style="margin-top:.75rem">
    <label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
    <input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
@@ -355,6 +374,9 @@ async function loadSettings(){
    document.getElementById('s-hf-cache').value   = d.models?.hf_cache_dir ?? '';
    document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
    document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
+    document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
+    document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
+    document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
    document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
    document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
    document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
@@ -424,6 +446,9 @@ async function saveSettings(){
    },
    offload:{
      directory: document.getElementById('s-offload-dir').value.trim() || './offload',
+      max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
+      evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
+      ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
    },
    tmp_dir: strOrNull('s-tmp-dir'),
    enhance:{

--- a/codai/config.py
+++ b/codai/config.py
@@ -65,6 +65,13 @@ class OffloadConfig:
    load_in_8bit: bool = False
    manual_ram_gb: Optional[float] = None
    flash_attention: bool = False
+    # Server-wide ceiling on host RAM (process-tree RSS) the server may use, in GB.
+    # None = no global cap (per-load budget = available RAM, as before). When set, new
+    # model loads get a CPU-offload budget clamped to the remaining headroom so the
+    # overflow spills to the offload directory (disk), and idle models can be evicted.
+    max_ram_gb: Optional[float] = None
+    evict_idle_on_ram: bool = True   # unload idle LRU models when over the RAM cap
+    ram_leak_watch: bool = True      # background watcher samples RSS + auto-mitigates


 @dataclass
@@ -414,7 +421,10 @@ class ConfigManager:
                "load_in_4bit": self.config.offload.load_in_4bit,
                "load_in_8bit": self.config.offload.load_in_8bit,
                "manual_ram_gb": self.config.offload.manual_ram_gb,
-                "flash_attention": self.config.offload.flash_attention
+                "flash_attention": self.config.offload.flash_attention,
+                "max_ram_gb": self.config.offload.max_ram_gb,
+                "evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
+                "ram_leak_watch": self.config.offload.ram_leak_watch
            },
            "vulkan": {
                "n_gpu_layers": self.config.vulkan.n_gpu_layers,

--- a/codai/main.py
+++ b/codai/main.py
@@ -427,6 +427,15 @@ def main():
    # Migrate any GGUF files that ended up in the HF cache to the GGUF cache
    _t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()

+    # Start the global host-RAM watcher (leak detection + auto-mitigation). Safe to
+    # start unconditionally: it idles cheaply and reads the cap/watch flags live, so
+    # it begins acting as soon as a max_ram_gb is configured (incl. via live reload).
+    try:
+        from codai.models.ram_monitor import start as _start_ram_monitor
+        _start_ram_monitor()
+    except Exception as _e:
+        logging.getLogger(__name__).debug("RAM monitor not started: %s", _e)
+
    # Import core modules (only after early exits)
    from codai.api import app
    from codai.api.state import (
@@ -802,6 +811,10 @@ def main():
    global_args.load_in_8bit = config.offload.load_in_8bit
    global_args.flash_attn = config.offload.flash_attention
    global_args.max_gpu_percent = config.offload.max_gpu_percent
+    # Global host-RAM cap + leak watch (read live by hf_loading, manager, ram_monitor).
+    global_args.max_ram_gb = config.offload.max_ram_gb
+    global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
+    global_args.ram_leak_watch = config.offload.ram_leak_watch
    # Thermal protection settings (read live by codai.models.thermal).
    global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
    global_args.thermal_gpu_enabled = config.thermal.gpu_enabled

--- a/codai/models/hf_loading.py
+++ b/codai/models/hf_loading.py
@@ -352,6 +352,23 @@ def build_from_pretrained_kwargs(
        else:
            cpu_budget = max(0, psutil.virtual_memory().available - int(4e9))

+        # Global RAM cap: clamp the CPU-offload budget to the headroom remaining under
+        # the server-wide ceiling, so the overflow spills to the disk offload folder
+        # (set below) instead of pushing process RSS past the cap. Read live from
+        # global_args (same pattern as pipeline_cache). None = no cap (legacy behaviour).
+        try:
+            from codai.api.state import get_global_args as _gga
+            _ga = _gga()
+            _cap = getattr(_ga, 'max_ram_gb', None) if _ga else None
+            if _cap:
+                _used = psutil.Process().memory_info().rss
+                _headroom = int(float(_cap) * 1e9) - _used
+                # Floor the budget at zero: once RSS already exceeds the cap nothing
+                # more is placed in RAM and the rest goes to disk. Never raise the
+                # budget above the cap headroom.
+                cpu_budget = max(0, min(cpu_budget, _headroom))
+        except Exception:
+            pass
+
        kwargs['device_map'] = 'auto'
        kwargs['max_memory'] = {0: gpu_budget, 'cpu': cpu_budget}


--- a/codai/models/manager.py
+++ b/codai/models/manager.py
--- a/codai/models/ram_monitor.py
+++ b/codai/models/ram_monitor.py
+"""Global host-RAM watcher with leak detection and auto-mitigation.
+
+Coderai caps host RAM via ``offload.max_ram_gb`` (see ``OffloadConfig``). The
+per-load CPU budget in ``hf_loading`` keeps individual loads under the cap by
+spilling to disk, and ``MultiModelManager`` evicts idle models before a new load.
+This module adds the *continuous* side: a background thread samples process-tree
+RSS, flags a suspected leak when RSS keeps climbing **while the scheduler is idle**
+(so growth isn't just an in-flight generation), and runs a mitigation ladder when
+RSS crosses a soft threshold — gc → CUDA empty_cache → malloc_trim → drop the
+upscaler cache → evict idle models.
+
+Mirrors ``codai.models.thermal`` in shape: module-level state + ``get_status()``
+for the admin dashboard, started once from ``codai.main``.
+"""
+import threading
+import time
+import logging
+from typing import Optional, Dict, Any
+
+_log = logging.getLogger(__name__)
+
+# Tuning knobs for the watcher loop: how often to sample RSS, and how many
+# consecutive rising idle samples count as a suspected leak. _SOFT_FRACTION of the
+# cap is where the mitigation ladder engages.
+_POLL_SECONDS = 15.0       # seconds slept between samples
+_LEAK_SAMPLES = 4          # consecutive idle increases before flagging a leak
+_LEAK_MIN_GROWTH_GB = 0.3  # ignore sub-0.3 GB jitter between samples
+_SOFT_FRACTION = 0.90      # mitigate at/above this fraction of the cap
+_EVICT_TARGET_FRACTION = 0.85  # evict idle models down to this fraction of the cap
+
+# Shared snapshot published by the watcher loop and copied out by get_status().
+# Every read and write goes through _state_lock.
+_state_lock = threading.Lock()
+_state: Dict[str, Any] = {
+    "rss_gb": 0.0,            # last sampled RSS in GB, rounded to 2 decimals
+    "cap_gb": None,           # configured cap in GB, None when no cap is set
+    "percent": None,          # RSS as a percentage of the cap, None without a cap
+    "leak_suspected": False,  # result of the idle-growth heuristic
+    "last_action": "",        # description returned by the last mitigation run
+    "last_action_ts": 0.0,    # time.time() of the last mitigation run
+    "samples": 0,             # number of samples published so far
+}
+_recent: list = []           # recent idle RSS samples (for trend detection)
+_thread: Optional[threading.Thread] = None  # watcher thread, set by start()
+_started = False             # guards start() against launching a second thread
+
+
+def get_status() -> Dict[str, Any]:
+    """Snapshot for the admin status endpoint / dashboard."""
+    with _state_lock:
+        return dict(_state)
+
+
+def _cap_gb() -> Optional[float]:
+    try:
+        from codai.api.state import get_global_args
+        ga = get_global_args()
+        cap = getattr(ga, "max_ram_gb", None) if ga else None
+        return float(cap) if cap else None
+    except Exception:
+        return None
+
+
+def _watch_enabled() -> bool:
+    try:
+        from codai.api.state import get_global_args
+        ga = get_global_args()
+        return bool(getattr(ga, "ram_leak_watch", True)) if ga else True
+    except Exception:
+        return True
+
+
+def _scheduler_idle() -> bool:
+    """True when no request is being served (so RSS growth isn't a live job)."""
+    try:
+        from codai.queue.manager import queue_manager
+        return not queue_manager.active_leases
+    except Exception:
+        return True
+
+
+def _process_ram_gb() -> float:
+    try:
+        from codai.models.manager import multi_model_manager
+        return multi_model_manager._get_process_ram_gb()
+    except Exception:
+        try:
+            import psutil
+            return psutil.Process().memory_info().rss / 1e9
+        except Exception:
+            return 0.0
+
+
+def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
+    """Run the mitigation ladder; return a short description of what was done."""
+    import gc
+    actions = []
+    for _ in range(3):
+        gc.collect()
+    actions.append("gc")
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            actions.append("empty_cache")
+    except Exception:
+        pass
+    try:
+        from codai.models.manager import _trim_cpu_ram
+        _trim_cpu_ram()
+        actions.append("trim")
+    except Exception:
+        pass
+    # Drop the in-memory super-resolution upscaler cache — it can hold a multi-GB
+    # ESRGAN/diffusers model alive long after an enhance job finished.
+    try:
+        from codai.api import images as _img
+        if getattr(_img, "_UPSCALER_CACHE", None):
+            _img._UPSCALER_CACHE.clear()
+            actions.append("drop_upscalers")
+    except Exception:
+        pass
+    # Still over and eviction is enabled → unload idle LRU models.
+    try:
+        from codai.models.manager import multi_model_manager as _mm
+        if (_mm._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
+                and _mm._evict_idle_on_ram_enabled()):
+            _mm._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
+            actions.append("evict_idle")
+    except Exception as e:
+        _log.warning("RAM mitigation eviction failed: %s", e)
+    desc = ("leak-suspected; " if leak else "") + "+".join(actions)
+    return desc
+
+
+def _loop():
+    global _recent
+    while True:
+        try:
+            time.sleep(_POLL_SECONDS)
+            if not _watch_enabled():
+                continue
+            cap = _cap_gb()
+            rss = _process_ram_gb()
+            idle = _scheduler_idle()
+
+            # Leak heuristic: only trust growth measured while idle (a live job
+            # legitimately inflates RSS). Keep a short rolling window of idle samples.
+            leak = False
+            if idle:
+                _recent.append(rss)
+                _recent = _recent[-(_LEAK_SAMPLES + 1):]
+                if len(_recent) > _LEAK_SAMPLES:
+                    rising = all(
+                        _recent[i + 1] - _recent[i] >= _LEAK_MIN_GROWTH_GB
+                        for i in range(len(_recent) - 1)
+                    )
+                    leak = rising
+            else:
+                _recent = []  # reset trend while a job runs
+
+            with _state_lock:
+                _state["rss_gb"] = round(rss, 2)
+                _state["cap_gb"] = cap
+                _state["percent"] = round(100.0 * rss / cap, 1) if cap else None
+                _state["leak_suspected"] = leak
+                _state["samples"] += 1
+
+            # Engage the ladder when over the soft threshold or a leak is suspected.
+            if cap and (rss >= cap * _SOFT_FRACTION or leak):
+                desc = _mitigate(rss, cap, leak)
+                new_rss = _process_ram_gb()
+                _log.warning(
+                    "RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",
+                    rss, cap, 100.0 * rss / cap,
+                    " LEAK SUSPECTED" if leak else "", desc, new_rss,
+                )
+                with _state_lock:
+                    _state["last_action"] = desc
+                    _state["last_action_ts"] = time.time()
+                    _state["rss_gb"] = round(new_rss, 2)
+                _recent = []  # avoid re-triggering on the same growth
+        except Exception as e:
+            _log.debug("RAM watch loop error: %s", e)
+
+
+def start() -> None:
+    """Start the background watcher once. Safe to call when no cap is configured —
+    it idles cheaply and begins acting as soon as a cap is set live."""
+    global _thread, _started
+    if _started:
+        return
+    _started = True
+    _thread = threading.Thread(target=_loop, name="ram-monitor", daemon=True)
+    _thread.start()
+    _log.info("RAM monitor started (poll %.0fs)", _POLL_SECONDS)