coderai: global host-RAM cap with leak watch + disk-offload eviction

Add a server-wide host-RAM ceiling (OffloadConfig.max_ram_gb) alongside the
existing VRAM budgeting:

- hf_loading clamps the accelerate CPU-offload budget to the headroom under
  the cap, so overflow spills to the disk offload folder instead of growing RSS.
- manager: process-tree RSS accounting, true-LRU (active_in_vram property stamps
  _last_used), shared _evict_one, and _evict_models_for_ram; idle models are
  evicted before a new load when RSS nears the cap.
- ram_monitor.py: background watcher samples RSS, flags a suspected leak when it
  climbs while the scheduler is idle, and runs a mitigation ladder
  (gc -> empty_cache -> malloc_trim -> drop upscaler cache -> evict idle).
- admin /status returns a ram block; Settings page exposes max RAM + evict/
  leak-watch toggles (applied live); dashboard shows a RAM gauge + leak badge.

Also fold loaded upscalers (_UPSCALER_CACHE) into the dashboard models-loaded
count so an active upscale no longer reports '0 models loaded'.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent a9b6d35e
......@@ -330,6 +330,18 @@ def api_status(username: str = Depends(require_auth)):
loaded_keys = list(multi_model_manager.models.keys())
# Real-ESRGAN / diffusers upscalers live in a private cache (deliberately not
# in multi_model_manager.models — see codai/api/images.py), so the registry
# count alone reports 0 models while an upscale job is actively running. Fold
# the loaded upscalers in so the dashboard count matches the Tasks page.
try:
from codai.api.images import _UPSCALER_CACHE as _upscalers
for _uk in _upscalers.keys():
if _uk not in loaded_keys:
loaded_keys.append(_uk)
except Exception:
pass
# VRAM info
vram = None
is_cuda = False
......@@ -464,6 +476,26 @@ def api_status(username: str = Depends(require_auth)):
except Exception:
pass
# Host-RAM status: system totals (psutil) + the global-cap watcher snapshot
# (process RSS, configured cap, % of cap, leak-suspected flag, last mitigation).
ram = None
try:
import psutil
vm = psutil.virtual_memory()
ram = {
"total": round(vm.total / 1e9, 2),
"used": round((vm.total - vm.available) / 1e9, 2),
"free": round(vm.available / 1e9, 2),
"percent": vm.percent,
}
try:
from codai.models.ram_monitor import get_status as _ram_status
ram["watch"] = _ram_status()
except Exception:
pass
except Exception:
pass
# Whisper-server status
whisper_status = None
try:
......@@ -484,6 +516,7 @@ def api_status(username: str = Depends(require_auth)):
"enabled_models": enabled_models,
"enabled_aliases": enabled_aliases,
"vram": vram,
"ram": ram,
"cuda": is_cuda,
"requests": {
"total": req_total,
......@@ -2394,6 +2427,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
"load_in_8bit": c.offload.load_in_8bit,
"manual_ram_gb": c.offload.manual_ram_gb,
"flash_attention": c.offload.flash_attention,
"max_ram_gb": c.offload.max_ram_gb,
"evict_idle_on_ram": c.offload.evict_idle_on_ram,
"ram_leak_watch": c.offload.ram_leak_watch,
},
"vulkan": {
"n_gpu_layers": c.vulkan.n_gpu_layers,
......@@ -2503,6 +2539,21 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
if "manual_ram_gb" in off:
c.offload.manual_ram_gb = off["manual_ram_gb"] or None
c.offload.flash_attention = bool(off.get("flash_attention", c.offload.flash_attention))
if "max_ram_gb" in off:
c.offload.max_ram_gb = off["max_ram_gb"] or None
c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
# Push the RAM-cap settings to live global_args so the watcher, per-load
# budget clamp and eviction honour them without a restart.
try:
from codai.api.state import get_global_args
ga = get_global_args()
if ga is not None:
ga.max_ram_gb = c.offload.max_ram_gb
ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
ga.ram_leak_watch = c.offload.ram_leak_watch
except Exception:
pass
if "vulkan" in data:
vk = data["vulkan"]
......
......@@ -40,6 +40,20 @@
<div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="vram-total-line"></div>
<div class="stat-sub" id="vram-gpu" style="margin-top:.25rem"></div>
</div>
<div class="stat" id="ram-card" style="display:none">
<div class="stat-label">RAM</div>
<div class="stat-value" id="ram-pct" style="font-size:2rem"></div>
<div class="progress" style="margin-top:.625rem">
<div class="progress-fill" id="ram-bar" style="width:0%"></div>
</div>
<div class="progress-labels" style="color:var(--text-1);font-size:12px;margin-top:.4rem">
<span id="ram-used"></span><span id="ram-free"></span>
</div>
<div style="font-size:11.5px;color:var(--text-2);margin-top:.2rem;font-family:var(--mono)" id="ram-total-line"></div>
<div class="stat-sub" id="ram-cap" style="margin-top:.25rem"></div>
<div class="stat-sub" id="ram-leak" style="margin-top:.25rem;color:var(--danger,#e5484d);font-weight:600;display:none">⚠ RAM leak suspected</div>
</div>
</div>
<div class="card" style="margin-bottom:1rem">
......@@ -116,6 +130,33 @@ async function poll() {
document.getElementById('vram-card').style.display = 'none';
}
if (d.ram && d.ram.total) {
document.getElementById('ram-card').style.display = '';
const w = d.ram.watch || {};
// When a cap is set, show RSS-vs-cap; otherwise system used-vs-total.
const cap = w.cap_gb || null;
if (cap && w.rss_gb != null) {
const pct = Math.min(100, Math.round(w.rss_gb / cap * 100));
document.getElementById('ram-pct').textContent = pct + '%';
document.getElementById('ram-bar').style.width = pct + '%';
document.getElementById('ram-used').textContent = w.rss_gb.toFixed(1) + ' GB server';
document.getElementById('ram-free').textContent = (d.ram.free != null ? d.ram.free.toFixed(1) + ' GB free' : '');
document.getElementById('ram-total-line').textContent = cap.toFixed(1) + ' GB cap · ' + d.ram.total.toFixed(1) + ' GB total';
document.getElementById('ram-cap').textContent = w.last_action ? ('last: ' + w.last_action) : '';
} else {
const pct = Math.round(d.ram.used / d.ram.total * 100);
document.getElementById('ram-pct').textContent = pct + '%';
document.getElementById('ram-bar').style.width = pct + '%';
document.getElementById('ram-used').textContent = d.ram.used.toFixed(1) + ' GB used';
document.getElementById('ram-free').textContent = d.ram.free.toFixed(1) + ' GB free';
document.getElementById('ram-total-line').textContent = d.ram.total.toFixed(1) + ' GB total (no cap)';
document.getElementById('ram-cap').textContent = '';
}
document.getElementById('ram-leak').style.display = w.leak_suspected ? '' : 'none';
} else {
document.getElementById('ram-card').style.display = 'none';
}
if (d.requests) {
document.getElementById('req-total').textContent = d.requests.total ?? 0;
document.getElementById('req-active').textContent = d.requests.active ?? 0;
......
......@@ -68,6 +68,25 @@
<input type="text" id="s-offload-dir" class="form-input" placeholder="./offload">
<span class="form-hint">Models will inherit this as default when configured</span>
</div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">Max global RAM (GB) <span class="muted">(blank = no cap)</span></label>
<input type="number" id="s-max-ram" class="form-input" min="0" step="1" placeholder="e.g. 96">
<span class="form-hint">Server-wide ceiling on host RAM (process-tree RSS). New model loads get a CPU-offload budget clamped to the remaining headroom, so the overflow spills to the offload directory (disk) instead of pushing past the limit. Applied live on save.</span>
</div>
<div class="form-row" style="margin:0">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-evict-idle-ram">
<span>Evict idle models when over the RAM limit</span>
</label>
<span class="form-hint">Unload least-recently-used idle models to free real RAM before forcing disk offload.</span>
</div>
<div class="form-row" style="margin:0">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-ram-leak-watch">
<span>Watch for RAM leaks (auto-mitigate)</span>
</label>
<span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
</div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
<input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
......@@ -355,6 +374,9 @@ async function loadSettings(){
document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? '';
document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
......@@ -424,6 +446,9 @@ async function saveSettings(){
},
offload:{
directory: document.getElementById('s-offload-dir').value.trim() || './offload',
max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
},
tmp_dir: strOrNull('s-tmp-dir'),
enhance:{
......
......@@ -65,6 +65,13 @@ class OffloadConfig:
load_in_8bit: bool = False
manual_ram_gb: Optional[float] = None
flash_attention: bool = False
# Server-wide ceiling on host RAM (process-tree RSS) the server may use, in GB.
# None = no global cap (per-load budget = available RAM, as before). When set, new
# model loads get a CPU-offload budget clamped to the remaining headroom so the
# overflow spills to the offload directory (disk), and idle models can be evicted.
max_ram_gb: Optional[float] = None
evict_idle_on_ram: bool = True # unload idle LRU models when over the RAM cap
ram_leak_watch: bool = True # background watcher samples RSS + auto-mitigates
@dataclass
......@@ -414,7 +421,10 @@ class ConfigManager:
"load_in_4bit": self.config.offload.load_in_4bit,
"load_in_8bit": self.config.offload.load_in_8bit,
"manual_ram_gb": self.config.offload.manual_ram_gb,
"flash_attention": self.config.offload.flash_attention
"flash_attention": self.config.offload.flash_attention,
"max_ram_gb": self.config.offload.max_ram_gb,
"evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
"ram_leak_watch": self.config.offload.ram_leak_watch
},
"vulkan": {
"n_gpu_layers": self.config.vulkan.n_gpu_layers,
......
......@@ -427,6 +427,15 @@ def main():
# Migrate any GGUF files that ended up in the HF cache to the GGUF cache
_t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()
# Start the global host-RAM watcher (leak detection + auto-mitigation). Safe to
# start unconditionally: it idles cheaply and reads the cap/watch flags live, so
# it begins acting as soon as a max_ram_gb is configured (incl. via live reload).
try:
from codai.models.ram_monitor import start as _start_ram_monitor
_start_ram_monitor()
except Exception as _e:
logging.getLogger(__name__).debug("RAM monitor not started: %s", _e)
# Import core modules (only after early exits)
from codai.api import app
from codai.api.state import (
......@@ -802,6 +811,10 @@ def main():
global_args.load_in_8bit = config.offload.load_in_8bit
global_args.flash_attn = config.offload.flash_attention
global_args.max_gpu_percent = config.offload.max_gpu_percent
# Global host-RAM cap + leak watch (read live by hf_loading, manager, ram_monitor).
global_args.max_ram_gb = config.offload.max_ram_gb
global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
global_args.ram_leak_watch = config.offload.ram_leak_watch
# Thermal protection settings (read live by codai.models.thermal).
global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
global_args.thermal_gpu_enabled = config.thermal.gpu_enabled
......
......@@ -352,6 +352,23 @@ def build_from_pretrained_kwargs(
else:
cpu_budget = max(0, psutil.virtual_memory().available - int(4e9))
# Global RAM cap: clamp the CPU-offload budget to the headroom remaining under
# the server-wide ceiling, so the overflow spills to the disk offload folder
# (set below) instead of pushing process RSS past the cap. Read live from
# global_args (same pattern as pipeline_cache). None = no cap (legacy behaviour).
try:
from codai.api.state import get_global_args as _gga
_ga = _gga()
_cap = getattr(_ga, 'max_ram_gb', None) if _ga else None
if _cap:
_used = psutil.Process().memory_info().rss
_headroom = int(float(_cap) * 1e9) - _used
# Floor the budget at 0: once RSS is already at or over the cap, nothing more
# lands in RAM and the whole overflow goes to disk. Never raise the budget above the cap headroom.
cpu_budget = max(0, min(cpu_budget, _headroom))
except Exception:
pass
kwargs['device_map'] = 'auto'
kwargs['max_memory'] = {0: gpu_budget, 'cpu': cpu_budget}
......
This diff is collapsed.
"""Global host-RAM watcher with leak detection and auto-mitigation.
Coderai caps host RAM via ``offload.max_ram_gb`` (see ``OffloadConfig``). The
per-load CPU budget in ``hf_loading`` keeps individual loads under the cap by
spilling to disk, and ``MultiModelManager`` evicts idle models before a new load.
This module adds the *continuous* side: a background thread samples process-tree
RSS, flags a suspected leak when RSS keeps climbing **while the scheduler is idle**
(so growth isn't just an in-flight generation), and runs a mitigation ladder when
RSS crosses a soft threshold — gc → CUDA empty_cache → malloc_trim → drop the
upscaler cache → evict idle models.
Mirrors ``codai.models.thermal`` in shape: module-level state + ``get_status()``
for the admin dashboard, started once from ``codai.main``.
"""
import threading
import time
import logging
from typing import Optional, Dict, Any
# Module logger; used for the watcher's start, mitigation and loop-error messages.
_log = logging.getLogger(__name__)
# How often to sample RSS, and how many consecutive rising idle samples count as a
# suspected leak. SOFT_FRACTION of the cap is where the mitigation ladder engages.
_POLL_SECONDS = 15.0
_LEAK_SAMPLES = 4 # consecutive idle increases before flagging a leak
_LEAK_MIN_GROWTH_GB = 0.3 # ignore sub-0.3 GB jitter between samples
_SOFT_FRACTION = 0.90 # mitigate at/above this fraction of the cap
_EVICT_TARGET_FRACTION = 0.85 # evict idle models down to this fraction of the cap
# Guards every access to _state: the watcher loop writes it, get_status() copies it.
_state_lock = threading.Lock()
# Latest watcher snapshot; get_status() hands out a shallow copy of this dict.
_state: Dict[str, Any] = {
    # Last sampled RSS in GB (2 decimals); re-measured after a mitigation run.
    "rss_gb": 0.0,
    # Cap read via _cap_gb() on the last sample; None when no cap is configured.
    "cap_gb": None,
    # RSS as a percentage of the cap (1 decimal); None when no cap is configured.
    "percent": None,
    # True when the last idle window rose by >= _LEAK_MIN_GROWTH_GB at every step.
    "leak_suspected": False,
    # Description string returned by the last _mitigate() call ("" until one runs).
    "last_action": "",
    # time.time() of the last mitigation run (0.0 until one runs).
    "last_action_ts": 0.0,
    # Count of samples published by the watcher loop.
    "samples": 0,
}
_recent: list = [] # recent idle RSS samples (for trend detection)
# Watcher thread handle and once-only guard; both are set by start().
_thread: Optional[threading.Thread] = None
_started = False
def get_status() -> Dict[str, Any]:
    """Return a shallow copy of the watcher state for the admin status endpoint.

    The copy is taken while holding the state lock, so callers never observe a
    half-updated snapshot and cannot mutate the live dict.
    """
    with _state_lock:
        snapshot = _state.copy()
    return snapshot
def _cap_gb() -> Optional[float]:
try:
from codai.api.state import get_global_args
ga = get_global_args()
cap = getattr(ga, "max_ram_gb", None) if ga else None
return float(cap) if cap else None
except Exception:
return None
def _watch_enabled() -> bool:
try:
from codai.api.state import get_global_args
ga = get_global_args()
return bool(getattr(ga, "ram_leak_watch", True)) if ga else True
except Exception:
return True
def _scheduler_idle() -> bool:
"""True when no request is being served (so RSS growth isn't a live job)."""
try:
from codai.queue.manager import queue_manager
return not queue_manager.active_leases
except Exception:
return True
def _process_ram_gb() -> float:
try:
from codai.models.manager import multi_model_manager
return multi_model_manager._get_process_ram_gb()
except Exception:
try:
import psutil
return psutil.Process().memory_info().rss / 1e9
except Exception:
return 0.0
def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
    """Walk the mitigation ladder and describe which rungs actually ran.

    Rungs, in order: three gc passes, CUDA cache release, CPU heap trim,
    dropping the upscaler cache, and (only if still above the soft threshold
    and eviction is enabled) evicting idle models. Every rung is best-effort.

    Args:
        rss_gb: RSS at the time the ladder was triggered (not used here).
        cap_gb: Configured RAM cap in GB; the eviction thresholds derive from it.
        leak: Whether a leak is suspected; only affects the description prefix.

    Returns:
        The performed steps joined with "+", prefixed with "leak-suspected; "
        when ``leak`` is true.
    """
    import gc

    for _pass in (1, 2, 3):
        gc.collect()
    steps = ["gc"]

    try:
        import torch
        cuda_ready = torch.cuda.is_available()
        if cuda_ready:
            torch.cuda.empty_cache()
            steps.append("empty_cache")
    except Exception:
        pass

    try:
        from codai.models.manager import _trim_cpu_ram as trim_heap
        trim_heap()
        steps.append("trim")
    except Exception:
        pass

    # The super-resolution upscaler cache can keep a multi-GB ESRGAN/diffusers
    # model alive long after the enhance job that loaded it has finished.
    try:
        from codai.api import images as images_api
        if getattr(images_api, "_UPSCALER_CACHE", None):
            images_api._UPSCALER_CACHE.clear()
            steps.append("drop_upscalers")
    except Exception:
        pass

    # Last rung: if RSS is still above the soft threshold and eviction is
    # enabled, unload idle LRU models down to the eviction target.
    try:
        from codai.models.manager import multi_model_manager as manager
        still_over = manager._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
        if still_over and manager._evict_idle_on_ram_enabled():
            manager._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
            steps.append("evict_idle")
    except Exception as e:
        _log.warning("RAM mitigation eviction failed: %s", e)

    prefix = "leak-suspected; " if leak else ""
    return prefix + "+".join(steps)
def _loop():
    """Watcher thread body: sample RSS forever, detect leaks, trigger mitigation.

    Each cycle sleeps ``_POLL_SECONDS``, then (if the watch is enabled) samples
    the cap, the RSS and the scheduler's idle state, publishes them to
    ``_state`` and, when a cap is set and RSS is at/above the soft threshold or
    a leak is suspected, runs ``_mitigate``.

    While the watch is disabled the loop clears the idle trend window and the
    published ``leak_suspected`` flag. Otherwise a flag raised just before the
    watch was switched off would stay on the dashboard indefinitely, and
    samples taken before the pause would be chained with samples taken after it.

    Never returns; any per-cycle exception is logged at debug level and the
    loop carries on.
    """
    global _recent
    while True:
        try:
            time.sleep(_POLL_SECONDS)
            if not _watch_enabled():
                # Drop stale trend data and the stale leak badge while paused.
                _recent = []
                with _state_lock:
                    _state["leak_suspected"] = False
                continue
            cap = _cap_gb()
            rss = _process_ram_gb()
            idle = _scheduler_idle()
            # Leak heuristic: only trust growth measured while idle (a live job
            # legitimately inflates RSS). Keep a short rolling window of idle samples.
            leak = False
            if idle:
                _recent.append(rss)
                _recent = _recent[-(_LEAK_SAMPLES + 1):]
                if len(_recent) > _LEAK_SAMPLES:
                    # Every consecutive pair in the window must have grown by
                    # at least the jitter threshold.
                    leak = all(
                        later - earlier >= _LEAK_MIN_GROWTH_GB
                        for earlier, later in zip(_recent, _recent[1:])
                    )
            else:
                _recent = []  # reset trend while a job runs
            with _state_lock:
                _state["rss_gb"] = round(rss, 2)
                _state["cap_gb"] = cap
                _state["percent"] = round(100.0 * rss / cap, 1) if cap else None
                _state["leak_suspected"] = leak
                _state["samples"] += 1
            # Engage the ladder when over the soft threshold or a leak is suspected.
            if cap and (rss >= cap * _SOFT_FRACTION or leak):
                desc = _mitigate(rss, cap, leak)
                new_rss = _process_ram_gb()
                _log.warning(
                    "RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",
                    rss, cap, 100.0 * rss / cap,
                    " LEAK SUSPECTED" if leak else "", desc, new_rss,
                )
                with _state_lock:
                    _state["last_action"] = desc
                    _state["last_action_ts"] = time.time()
                    _state["rss_gb"] = round(new_rss, 2)
                _recent = []  # avoid re-triggering on the same growth
        except Exception as e:
            _log.debug("RAM watch loop error: %s", e)
def start() -> None:
    """Launch the background watcher thread the first time this is called.

    Later calls are no-ops. Starting without a configured cap is fine: the
    loop reads the cap on every cycle and only mitigates once one is set.
    """
    global _thread, _started
    if not _started:
        _started = True
        watcher = threading.Thread(name="ram-monitor", target=_loop, daemon=True)
        _thread = watcher
        watcher.start()
        _log.info("RAM monitor started (poll %.0fs)", _POLL_SECONDS)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment