front/engine split, ds4 + media tooling, gemma-4 native tools; ignore runtime artifacts

- frontproxy: torch-free front proxy + per-vendor engine supervisor with auth, localhost binding, model routing; Ctrl-C now force-kills engines (own session + PDEATHSIG, SIGKILL of engine process groups, watchdog on hung drain)
- gemma-4 tool calling: prompt via native tools= template, parse call:NAME{...} into tool_calls, honour generation_config EOS so it stops instead of looping
- ds4 external worker, parler/expressive TTS backends, video editor tooling
- --debug-requests: full client<->API request/response logging + live snapshots
- stop tracking runtime artifacts (video_editor/sessions/, tools/coderai_media/)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

front/engine split, ds4 + media tooling, gemma-4 native tools; ignore runtime artifacts
- frontproxy: torch-free front proxy + per-vendor engine supervisor with auth, localhost binding, model routing; Ctrl-C now force-kills engines (own session + PDEATHSIG, SIGKILL of engine process groups, watchdog on hung drain)
- gemma-4 tool calling: prompt via native tools= template, parse call:NAME{...} into tool_calls, honour generation_config EOS so it stops instead of looping
- ds4 external worker, parler/expressive TTS backends, video editor tooling
- --debug-requests: full client<->API request/response logging + live snapshots
- stop tracking runtime artifacts (video_editor/sessions/, tools/coderai_media/)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
b297b25f · Stefy Lanza (nextime / spora ) · 2fb085f4 · b297b25f · b297b25f · b297b25f
Commit b297b25f authored Jun 18, 2026 by Stefy Lanza (nextime / spora )
46 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,7 @@ township_output/
 # Packaging build cache + runtime temp (large artifacts)
 .packaging-cache/
 tmp/
+
+# Video editor sessions + generated media (runtime artifacts)
+video_editor/sessions/
+tools/coderai_media/
--- a/build.sh
+++ b/build.sh
@@ -35,6 +35,7 @@ BACKEND="${1:-all}"
 FLASH=false
 CUSTOM_VENV=""
 PACKAGE=false
+DS4=false

 # Parse arguments
 i=1
@@ -50,6 +51,9 @@ for arg in "$@"; do
        --package)
            PACKAGE=true
            ;;
+        --ds4)
+            DS4=true
+            ;;
    esac
    i=$((i + 1))
 done
@@ -68,6 +72,7 @@ if [[ "$BACKEND" != "nvidia" && "$BACKEND" != "vulkan" && "$BACKEND" != "vulkan-
    echo ""
    echo "Options:"
    echo "  --flash     - Install Flash Attention 2 for faster inference (NVIDIA only)"
+    echo "  --ds4       - Clone + build the ds4 (DeepSeek V4) native engine"
    exit 1
 fi

@@ -755,6 +760,35 @@ package_app() {
    echo -e "${YELLOW}Note: The target machine must still provide compatible system GPU/runtime libraries.${NC}"
 }

+# Optionally clone + build ds4 (DeepSeek V4 native engine). Opt-in via --ds4.
+# coderai can also auto-build this at runtime on first use, but doing it here lets
+# the OCI/Docker packaging bundle the prebuilt ds4-server binary.
+build_ds4() {
+    local DS4_DIR="${CODERAI_DS4_DIR:-$HOME/.coderai/ds4}"  # CODERAI_DS4_DIR overrides the default checkout dir
+    echo -e "${YELLOW}Building ds4 (DeepSeek V4 engine) → $DS4_DIR ...${NC}"
+    if [ ! -e "$DS4_DIR/Makefile" ]; then  # no Makefile → treat the checkout as missing and clone it
+        mkdir -p "$(dirname "$DS4_DIR")"
+        git clone --depth 1 https://github.com/antirez/ds4 "$DS4_DIR" || {
+            echo -e "${YELLOW}Warning: could not clone ds4; skipping.${NC}"; return 0; }  # non-fatal: warn, report success
+    fi
+    local TARGET="cpu"  # fallback when neither CUDA nor macOS is detected below
+    if command -v nvcc &> /dev/null || [ -d "/usr/local/cuda" ]; then  # nvcc on PATH or a CUDA install dir
+        TARGET="cuda-generic"
+    elif [ "$(uname -s)" = "Darwin" ]; then
+        TARGET=""   # bare `make` builds the macOS Metal backend
+    fi
+    ( cd "$DS4_DIR" && make $TARGET ) || {  # subshell keeps the caller's cwd; $TARGET is unquoted so "" adds no argument
+        echo -e "${YELLOW}Warning: ds4 build failed; it can still be built at runtime.${NC}"; return 0; }
+    if [ -x "$DS4_DIR/ds4-server" ]; then  # success is only announced when the server binary exists and is executable
+        echo -e "${GREEN}✓ ds4-server built at $DS4_DIR/ds4-server${NC}"
+        echo -e "${YELLOW}Note: DeepSeek V4 weights are downloaded on first use (multi-GB).${NC}"
+    fi
+}
+
+if [ "$DS4" = true ]; then
+    build_ds4
+fi
+
 # Create .backend file to track which backend was used
 echo "$BACKEND" > .backend


--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -16,6 +16,7 @@

 """Admin dashboard routes."""
 from pathlib import Path
+import asyncio
 import re
 import shutil
 from typing import Optional
@@ -830,14 +831,35 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
    # JSON lines on stdout, which we relay onto this session's SSE queue.
    import subprocess as _sp
    import sys as _sys
-
+    import collections as _collections
+    import pathlib as _pathlib
+
+    # The worker runs as `python -m codai.admin.download_worker`; when coderai is
+    # run from source (not pip-installed) the child won't find the `codai`
+    # package unless the repo root is on its path. routes.py lives at
+    # <repo>/codai/admin/routes.py, so parents[2] is the repo root.
+    _repo_root = str(_pathlib.Path(__file__).resolve().parents[2])
+
+    def _attempt(disable_xet: bool):
+        """Spawn the worker once; relay its events. Returns (terminal, rc, tail)."""
+        env = dict(os.environ)
+        env["PYTHONPATH"] = _repo_root + (os.pathsep + env["PYTHONPATH"]
+                                          if env.get("PYTHONPATH") else "")
+        # hf_xet (the accelerated transfer) bypasses our tqdm progress hook — so
+        # the bar freezes near 100% while a big file silently downloads — and can
+        # hard-crash the worker (segfault / signal kill) with no traceback. The
+        # plain HTTPS path reports byte-accurate progress and is reliable, so we
+        # default to it unless the operator explicitly opted in (set
+        # HF_HUB_DISABLE_XET=0). A crash retry always disables it.
+        if disable_xet or os.environ.get("HF_HUB_DISABLE_XET") is None:
+            env["HF_HUB_DISABLE_XET"] = "1"
        proc = _sp.Popen(
            [_sys.executable, "-m", "codai.admin.download_worker", model_id, file_pattern or ""],
-        stdout=_sp.PIPE, stderr=_sp.STDOUT, text=True, bufsize=1,
+            stdout=_sp.PIPE, stderr=_sp.STDOUT, text=True, bufsize=1, env=env, cwd=_repo_root,
        )
        _download_procs[session_id] = proc
-
-    terminal = None  # set to "done"/"error" once the child reports a final event
+        terminal = None
+        recent = _collections.deque(maxlen=12)
        try:
            for line in proc.stdout:
                line = line.strip()
@@ -846,7 +868,9 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
                try:
                    evt = _j.loads(line)
                except Exception:
-                # Non-JSON output (warnings / tracebacks) → surface as info.
+                    # Non-JSON output (warnings / tracebacks) → surface as info
+                    # and keep a tail so a hard crash can report what it printed.
+                    recent.append(line)
                    push({"type": "info", "message": line})
                    continue
                etype = evt.get("type")
@@ -855,12 +879,11 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
                    terminal = etype
        except Exception as exc:
            push({"type": "error", "message": str(exc)})
+            terminal = "error"
        finally:
-        # Ensure the child is gone (cancel, crash, or normal exit).
            if proc.poll() is None:
                try:
-                proc.terminate()
-                proc.wait(timeout=10)
+                    proc.terminate(); proc.wait(timeout=10)
                except Exception:
                    pass
            if proc.poll() is None:
@@ -869,15 +892,32 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
                except Exception:
                    pass
            _download_procs.pop(session_id, None)
+        return terminal, proc.poll(), " | ".join(list(recent)[-4:]).strip()
+
+    try:
+        terminal, rc, tail = _attempt(disable_xet=False)
+        # A hard crash (no done/error event, not a user cancel) is the classic
+        # hf_xet failure — retry once with Xet disabled before giving up.
+        crashed = (terminal is None and session_id not in _download_cancelled)
+        if crashed and "HF_HUB_DISABLE_XET" not in os.environ:
+            push({"type": "info",
+                  "message": "Transfer crashed; retrying without the Xet accelerator…"})
+            _download_status.get(session_id, {}).update({"status": "downloading", "percent": 0})
+            terminal, rc, tail = _attempt(disable_xet=True)

        if terminal is None:
-            # Child ended without a done/error event → cancelled or died.
+            # Still no final event → cancelled or died for good.
            if session_id in _download_cancelled:
                pq.put({"type": "cancelled", "message": "Download cancelled by user"})
                _download_status.get(session_id, {}).update({"status": "cancelled"})
            else:
-                push({"type": "error", "message": "Download process exited unexpectedly"})
-
+                detail = f"Download process exited unexpectedly (exit code {rc})"
+                if rc is not None and rc < 0:
+                    detail += f" — killed by signal {-rc} (often out-of-memory)"
+                if tail:
+                    detail += f". Last output: {tail}"
+                push({"type": "error", "message": detail})
+    finally:
        _download_cancelled.discard(session_id)

        def _gc():
@@ -1115,7 +1155,28 @@ def _scan_caches() -> dict:
                    continue
                if p not in configured_settings:
                    configured_settings[p] = (s, cat)
-                all_configs.setdefault(p, []).append({"settings": s, "cat": cat})
+                # A single logical config can be registered under multiple
+                # categories via model_types (for example text+vision). It is
+                # stored once per category in models.json with the same
+                # config_id, but the UI should show it as one editable config,
+                # not duplicate pills that appear to delete each other.
+                _cfg_list = all_configs.setdefault(p, [])
+                _cid = s.get("config_id") if isinstance(s, dict) else None
+                _existing = None
+                if _cid:
+                    for _cfg in _cfg_list:
+                        _settings = _cfg.get("settings") or {}
+                        if isinstance(_settings, dict) and _settings.get("config_id") == _cid:
+                            _existing = _cfg
+                            break
+                if _existing is not None:
+                    _cats = _existing.setdefault("cats", [])
+                    if cat not in _cats:
+                        _cats.append(cat)
+                    if not _existing.get("cat"):
+                        _existing["cat"] = cat
+                else:
+                    _cfg_list.append({"settings": s, "cat": cat, "cats": [cat]})

    # Secondary index: basename → (settings_tuple, original_path)
    # Used to reconnect a config to a re-downloaded file that landed at a different path.
@@ -1678,7 +1739,6 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
                    multi_model_manager.add_model(model_key, pipeline)
                    multi_model_manager.record_vram_delta(model_key, _snap)
        elif model_type == "video":
-            import asyncio
            from codai.api.video import _load_video_pipeline, _derive_device
            model_key = f"video:{path}"
            device = _derive_device()
@@ -1693,7 +1753,6 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
            multi_model_manager.models_in_vram.add(model_key)
            multi_model_manager.record_vram_delta(model_key, _snap)
        elif model_type == "audio_gen":
-            import asyncio
            from codai.api.audio_gen import _load_musicgen, _load_audioldm, _detect_audio_gen_type, _derive_device
            model_key = f"audio_gen:{path}"
            device = _derive_device()
@@ -1711,32 +1770,26 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
            multi_model_manager.models_in_vram.add(model_key)
            multi_model_manager.record_vram_delta(model_key, _snap)
        elif model_type == "tts":
-            import asyncio
            model_key = f"tts:{path}"
            _snap = multi_model_manager.vram_before_load()
+            # Use the same backend factory as a real request so every engine is
+            # handled identically — in particular a Parler model boots its managed
+            # worker here, so "loading" it from the interface starts the service.
+            cfg = (multi_model_manager.config.get(model_key)
+                   or multi_model_manager.config.get(f"tts:{path}")
+                   or model_cfg or {})
            def _load_tts():
-                try:
-                    from kokoro import Kokoro
-                    return Kokoro(path)
-                except ImportError:
-                    pass
-                try:
-                    from bark import preload_models
-                    preload_models()
-                    return {"bark": True}
-                except ImportError:
-                    pass
-                return None
+                from codai.api import tts_backends
+                return tts_backends.load_backend(path, path, cfg)
            tts_obj = await asyncio.to_thread(_load_tts)
            if tts_obj is None:
-                raise RuntimeError("No supported TTS backend found (kokoro / bark)")
+                raise RuntimeError("TTS model failed to load")
            multi_model_manager.models[model_key] = tts_obj
            multi_model_manager.current_model_key = model_key
            multi_model_manager.active_in_vram = model_key
            multi_model_manager.models_in_vram.add(model_key)
            multi_model_manager.record_vram_delta(model_key, _snap)
        elif model_type in ("embedding", "spatial", "vision"):
-            import asyncio
            from codai.api.images import _load_diffusers_pipeline
            from codai.api.state import get_global_args
            model_key = f"{model_type}:{path}"
@@ -1797,6 +1850,78 @@ async def api_model_unload(request: Request, username: str = Depends(require_adm
    return {"success": True, "was_loaded": True}


+def _sanitize_engine_int_overrides(raw) -> dict:
+    """Clean a {engine_name: int} override map: keep ints >= 1, silently drop everything else."""
+    out = {}
+    if isinstance(raw, dict):  # any non-dict input yields an empty map
+        for name, val in raw.items():
+            if val in (None, ""):  # treated as "no override" for this engine
+                continue
+            try:
+                iv = int(val)
+            except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf")), e.g. JSON `Infinity`
+                continue
+            if iv >= 1:
+                out[str(name)] = iv  # keys are normalised to str
+    return out
+
+
+def _resolve_engine_spec(engine_name: str, engine_specs):
+    """Return the first dict spec whose name or backend equals ``engine_name`` (case-insensitive), else None."""
+    for candidate in filter(lambda item: isinstance(item, dict), engine_specs or []):
+        wanted = engine_name.lower()
+        if (candidate.get("name") or "").lower() == wanted:
+            return candidate
+        if (candidate.get("backend") or "").lower() == wanted:
+            return candidate
+    return None
+
+
+def validate_engine_pin(engine_name: str, model_path: str, engine_specs,
+                        model_backend: str = None, ds4_cfg=None) -> list:
+    """Return human-readable warnings if pinning ``model_path`` to ``engine_name``
+    is wrong (unknown engine, or an engine that can't run this model's format).
+
+    Empty list = the pin is fine. Used to *notify* the admin instead of silently
+    ignoring a bad pin (the router would otherwise just fall back)."""
+    engine_name = (engine_name or "").strip()  # None/empty → "", surrounding whitespace dropped
+    if not engine_name:
+        return []  # no pin requested → nothing to warn about
+    from codai.frontproxy.registry import _DEFAULT_CAPS  # function-scope imports: only reached when a pin is set
+    from codai.frontproxy.router import required_capability
+    specs = engine_specs or []  # None → []
+    if specs:  # engines are explicitly declared → resolve the pin against them
+        spec = _resolve_engine_spec(engine_name, specs)
+        if spec is None:  # neither a declared name nor a declared backend matched
+            names = [s.get("name") for s in specs if isinstance(s, dict) and s.get("name")]
+            return [f"Engine '{engine_name}' is not declared. Known engines: "
+                    f"{', '.join(names) or '(none)'}."]
+        backend = (spec.get("backend") or "auto").lower()  # a spec without a backend is treated as "auto"
+        caps = set(spec.get("capabilities")  # explicit capabilities on the spec win
+                   or _DEFAULT_CAPS.get(backend, {"transformers", "gguf"}))  # else backend defaults, else transformers+gguf
+    else:
+        # Auto-detection: no engine_specs to resolve against — infer the engine's
+        # capabilities from its vendor/backend name so we can still catch an
+        # impossible pin (e.g. a transformers model pinned to the Radeon engine).
+        key = engine_name.lower()
+        backend = {"radeon": "vulkan", "amd": "vulkan", "intel": "vulkan",  # vendor aliases → backend name
+                   "cuda": "nvidia"}.get(key, key)  # any other name is used as the backend as-is
+        caps = _DEFAULT_CAPS.get(backend)
+        if caps is None:
+            return []   # unknown name, nothing to validate against — accept silently
+        caps = set(caps)
+    req = required_capability(  # presumably the capability this model needs — semantics live in frontproxy.router
+        model_path, backend=model_backend,
+        ds4_model_id=getattr(ds4_cfg, "model_id", None) if ds4_cfg else None,  # ds4 fields are read only when a config is given
+        ds4_enabled=bool(getattr(ds4_cfg, "enabled", False)) if ds4_cfg else False)
+    if req and req not in caps:  # a falsy req skips the capability check entirely
+        return [f"Engine '{engine_name}' (backend '{backend}') can't run this model: "
+                f"it needs '{req}' capability but the engine only provides "
+                f"{sorted(caps)}. The request would fall back to a compatible engine — "
+                f"pick a different engine or adjust the engine's capabilities."]
+    return []
+
+
 @router.post("/admin/api/model-configure", summary="Update a model's configuration")
 async def api_model_configure(request: Request, username: str = Depends(require_admin)):
    """Save per-model configuration and register/update in models.json."""
@@ -1897,6 +2022,11 @@ async def api_model_configure(request: Request, username: str = Depends(require_
        if existing_cid and not is_new_config:
            # Targeted removal: only the entry that shares this config_id
            return existing_cid == config_id
+        if existing_cid and is_new_config:
+            # Adding a new configuration for the same model must preserve modern
+            # sibling configs. Only legacy entries without config_id fall through
+            # to path-based replacement because they cannot be targeted safely.
+            return False
        # Path-based removal (no config_id on either side, or new entry replacing old)
        key = m_entry.get("path", m_entry.get("id", ""))
        return key in paths_to_remove or (fnames_to_remove and _os.path.basename(key) in fnames_to_remove)
@@ -1938,7 +2068,7 @@ async def api_model_configure(request: Request, username: str = Depends(require_
                "max_vram", "sdcpp_flash_attn", "sdcpp_diffusion_flash_attn", "vae_tiling",
                "component_quantization", "output_crf", "force_vram_update",
                "balanced_gpu_percent", "acceleration",
-                "cache_type_k", "cache_type_v", "turboquant"):
+                "cache_type_k", "cache_type_v", "turboquant", "engine"):
        if key in data:
            entry[key] = data[key]

@@ -1966,7 +2096,15 @@ async def api_model_configure(request: Request, username: str = Depends(require_
        applied = apply_model_entry_live(entry, model_types)
    except Exception as e:
        print(f"  [admin] live config apply failed (restart to apply): {e}")
-    return {"success": True, "applied_live": applied}
+    warnings = []
+    if entry.get("engine"):
+        warnings = validate_engine_pin(
+            entry["engine"], path, config_manager.config.server.engine_specs,
+            model_backend=entry.get("backend"),
+            ds4_cfg=getattr(config_manager.config, "ds4", None))
+        for w in warnings:
+            print(f"  [admin] engine-pin warning: {w}")
+    return {"success": True, "applied_live": applied, "warnings": warnings}


 @router.get("/admin/api/accel-presets", summary="List acceleration / distillation presets")
@@ -2244,6 +2382,21 @@ def _read_vram_info() -> Optional[dict]:
    return None


+@router.get("/admin/api/gpu-stats", summary="Per-card GPU utilization, VRAM and temperature")
+def api_gpu_stats(username: str = Depends(require_auth)):
+    """Report live stats for every physical GPU installed, whichever engine owns it.
+
+    Delegates to ``gpu_stats()`` (NVIDIA via nvidia-smi, AMD via sysfs); any failure yields empty
+    ``cards`` plus ``error``. Kept a SYNC handler so the blocking probe stays off the event loop."""
+    payload = {"cards": []}
+    try:
+        from codai.frontproxy.gpu_detect import gpu_stats
+        payload["cards"] = gpu_stats()
+    except Exception as exc:
+        payload["error"] = str(exc)
+    return payload
+
+
 @router.get("/admin/api/system-stats", summary="Live CPU / GPU / RAM / VRAM usage and temperatures")
 def api_system_stats(username: str = Depends(require_admin)):
    """Lightweight hardware telemetry for the Tasks page header: CPU & GPU
@@ -2406,6 +2559,12 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "https_cert_path": c.server.https_cert_path,
            "queue_max_size": c.server.queue_max_size,
            "max_parallel_requests": c.server.max_parallel_requests,
+            "max_parallel_requests_overrides": c.server.max_parallel_requests_overrides,
+            "internal_port_base": c.server.internal_port_base,
+            "default_engine": c.server.default_engine,
+            # Engine names available to pick as the default (for the settings UI).
+            "engine_names": [s.get("name") for s in (c.server.engine_specs or [])
+                             if isinstance(s, dict) and s.get("name")],
        },
        "backend": {
            "type": c.backend.type,
@@ -2417,6 +2576,8 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "default_load_mode": c.models.default_load_mode,
            "hf_cache_dir": c.models.hf_cache_dir,
            "gguf_cache_dir": c.models.gguf_cache_dir,
+            "max_model_instances": c.models.max_model_instances,
+            "max_model_instances_overrides": c.models.max_model_instances_overrides,
        },
        "offload": {
            "directory": c.offload.directory,
@@ -2430,6 +2591,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "max_ram_gb": c.offload.max_ram_gb,
            "evict_idle_on_ram": c.offload.evict_idle_on_ram,
            "ram_leak_watch": c.offload.ram_leak_watch,
+            "ram_watch_poll_seconds": c.offload.ram_watch_poll_seconds,
+            "ram_watch_soft_fraction": c.offload.ram_watch_soft_fraction,
+            "ram_watch_cuda": c.offload.ram_watch_cuda,
        },
        "vulkan": {
            "n_gpu_layers": c.vulkan.n_gpu_layers,
@@ -2449,6 +2613,7 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "cpu_resume": c.thermal.cpu_resume,
            "gpu_high": c.thermal.gpu_high,
            "gpu_resume": c.thermal.gpu_resume,
+            "gpu_overrides": c.thermal.gpu_overrides,
            "poll_seconds": c.thermal.poll_seconds,
            "soft_throttle_enabled": c.thermal.soft_throttle_enabled,
            "soft_throttle_temp": c.thermal.soft_throttle_temp,
@@ -2461,6 +2626,19 @@ async def api_get_settings(username: str = Depends(require_admin)):
            "allow_ffmpeg": c.enhance.allow_ffmpeg,
            "allow_rife_ncnn": c.enhance.allow_rife_ncnn,
        },
+        "ds4": {
+            "enabled": c.ds4.enabled,
+            "repo_url": c.ds4.repo_url,
+            "install_dir": c.ds4.install_dir,
+            "build_target": c.ds4.build_target,
+            "model_variant": c.ds4.model_variant,
+            "model_id": c.ds4.model_id,
+            "host": c.ds4.host,
+            "port": c.ds4.port,
+            "ctx": c.ds4.ctx,
+            "extra_args": c.ds4.extra_args,
+            "auto_build": c.ds4.auto_build,
+        },
        "broker": {
            "enabled": c.broker.enabled,
            "base_url": c.broker.base_url,
@@ -2495,6 +2673,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad

    data = await request.json()
    c = config_manager.config
+    _settings_warnings: list = []

    if "server" in data:
        srv = data["server"]
@@ -2511,6 +2690,28 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
            c.server.max_parallel_requests = int(srv["max_parallel_requests"])
            from codai.queue.manager import queue_manager
            queue_manager.max_parallel_requests = c.server.max_parallel_requests
+        if "max_parallel_requests_overrides" in srv:
+            c.server.max_parallel_requests_overrides = _sanitize_engine_int_overrides(
+                srv["max_parallel_requests_overrides"])
+        if "internal_port_base" in srv:
+            try:
+                c.server.internal_port_base = max(1, min(65535, int(srv["internal_port_base"])))
+            except (TypeError, ValueError):
+                pass
+        if "default_engine" in srv:
+            c.server.default_engine = (srv.get("default_engine") or "").strip() or None
+            # Only validate against engine_specs when they're explicitly declared.
+            # With auto-detection engine_specs is empty and the engines (nvidia/
+            # radeon/…) are only known to the front, so don't false-warn there — the
+            # front validates the name at routing time and logs if it can't honour it.
+            if (c.server.default_engine and c.server.engine_specs
+                    and _resolve_engine_spec(c.server.default_engine,
+                                             c.server.engine_specs) is None):
+                names = [s.get("name") for s in (c.server.engine_specs or [])
+                         if isinstance(s, dict) and s.get("name")]
+                _settings_warnings.append(
+                    f"Default engine '{c.server.default_engine}' is not declared. "
+                    f"Known engines: {', '.join(names) or '(none)'}.")

    if "backend" in data:
        bk = data["backend"]
@@ -2526,6 +2727,14 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
            c.models.hf_cache_dir = mdl["hf_cache_dir"] or None
        if "gguf_cache_dir" in mdl:
            c.models.gguf_cache_dir = mdl["gguf_cache_dir"] or None
+        if "max_model_instances" in mdl:
+            try:
+                c.models.max_model_instances = max(1, int(mdl["max_model_instances"]))
+            except (TypeError, ValueError):
+                pass
+        if "max_model_instances_overrides" in mdl:
+            c.models.max_model_instances_overrides = _sanitize_engine_int_overrides(
+                mdl["max_model_instances_overrides"])

    if "offload" in data:
        off = data["offload"]
@@ -2543,6 +2752,11 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
            c.offload.max_ram_gb = off["max_ram_gb"] or None
        c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
        c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
+        if "ram_watch_poll_seconds" in off:
+            c.offload.ram_watch_poll_seconds = float(off["ram_watch_poll_seconds"] or c.offload.ram_watch_poll_seconds)
+        if "ram_watch_soft_fraction" in off:
+            c.offload.ram_watch_soft_fraction = float(off["ram_watch_soft_fraction"] or c.offload.ram_watch_soft_fraction)
+        c.offload.ram_watch_cuda = bool(off.get("ram_watch_cuda", c.offload.ram_watch_cuda))
        # Push the RAM-cap settings to live global_args so the watcher, per-load
        # budget clamp and eviction honour them without a restart.
        try:
@@ -2552,6 +2766,9 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
                ga.max_ram_gb = c.offload.max_ram_gb
                ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
                ga.ram_leak_watch = c.offload.ram_leak_watch
+                ga.ram_watch_poll_seconds = c.offload.ram_watch_poll_seconds
+                ga.ram_watch_soft_fraction = c.offload.ram_watch_soft_fraction
+                ga.ram_watch_cuda = c.offload.ram_watch_cuda
        except Exception:
            pass

@@ -2607,6 +2824,22 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
        c.thermal.cpu_resume = float(th.get("cpu_resume", c.thermal.cpu_resume))
        c.thermal.gpu_high = float(th.get("gpu_high", c.thermal.gpu_high))
        c.thermal.gpu_resume = float(th.get("gpu_resume", c.thermal.gpu_resume))
+        if "gpu_overrides" in th and isinstance(th["gpu_overrides"], dict):
+            # Sanitize: {vendor: {high, resume}} with numeric values only.
+            clean = {}
+            for vendor, ov in th["gpu_overrides"].items():
+                if not isinstance(ov, dict):
+                    continue
+                entry = {}
+                for k in ("high", "resume"):
+                    if ov.get(k) not in (None, ""):
+                        try:
+                            entry[k] = float(ov[k])
+                        except (TypeError, ValueError):
+                            pass
+                if entry:
+                    clean[str(vendor).lower()] = entry
+            c.thermal.gpu_overrides = clean
        c.thermal.poll_seconds = max(1.0, float(th.get("poll_seconds", c.thermal.poll_seconds)))
        c.thermal.soft_throttle_enabled = bool(th.get("soft_throttle_enabled", c.thermal.soft_throttle_enabled))
        c.thermal.soft_throttle_temp = float(th.get("soft_throttle_temp", c.thermal.soft_throttle_temp))
@@ -2622,6 +2855,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
                ga.thermal_cpu_resume = c.thermal.cpu_resume
                ga.thermal_gpu_high = c.thermal.gpu_high
                ga.thermal_gpu_resume = c.thermal.gpu_resume
+                ga.thermal_gpu_overrides = c.thermal.gpu_overrides
                ga.thermal_poll_seconds = c.thermal.poll_seconds
                ga.thermal_soft_throttle_enabled = c.thermal.soft_throttle_enabled
                ga.thermal_soft_throttle_temp = c.thermal.soft_throttle_temp
@@ -2656,6 +2890,30 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
        except Exception:
            pass

+    if "ds4" in data:
+        d = data["ds4"]
+        c.ds4.enabled = bool(d.get("enabled", c.ds4.enabled))
+        if "repo_url" in d:
+            c.ds4.repo_url = (d.get("repo_url") or c.ds4.repo_url or "").strip()
+        if "install_dir" in d:
+            c.ds4.install_dir = (d.get("install_dir") or "").strip() or None
+        if "build_target" in d:
+            c.ds4.build_target = (d.get("build_target") or "auto").strip()
+        if "model_variant" in d:
+            c.ds4.model_variant = (d.get("model_variant") or c.ds4.model_variant).strip()
+        if "model_id" in d:
+            c.ds4.model_id = (d.get("model_id") or c.ds4.model_id or "deepseek-v4").strip()
+        if "host" in d:
+            c.ds4.host = (d.get("host") or "127.0.0.1").strip()
+        if "port" in d:
+            c.ds4.port = int(d.get("port") or 0)
+        if "ctx" in d:
+            c.ds4.ctx = max(1024, int(d.get("ctx") or c.ds4.ctx))
+        if "extra_args" in d:
+            c.ds4.extra_args = (d.get("extra_args") or "").strip()
+        if "auto_build" in d:
+            c.ds4.auto_build = bool(d["auto_build"])
+
    if "broker" in data:
        bro = data["broker"]
        c.broker.enabled = bool(bro.get("enabled", c.broker.enabled))
@@ -2688,7 +2946,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
            raise HTTPException(status_code=400, detail=str(error)) from error

    config_manager.save_config()
-    return {"success": True}
+    return {"success": True, "warnings": _settings_warnings}


 # =============================================================================

--- a/codai/admin/templates/chat.html
+++ b/codai/admin/templates/chat.html
@@ -2372,7 +2372,7 @@ const STUDIO_CAPABILITIES = {
    optional:[],
    notes:[
      'Requires <code>insightface</code> and <code>onnxruntime</code>: <code>pip install insightface onnxruntime</code>.',
-      'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
+      'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="' + (window.ROOT_PATH||'') + '/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
      'No AI model selection needed — this feature uses its own dedicated backend.',
    ],
    backendPath: ROOT_PATH + '/v1/images/faceswap',
@@ -2386,7 +2386,7 @@ const STUDIO_CAPABILITIES = {
    optional:[],
    notes:[
      'Requires <code>insightface</code> and <code>onnxruntime</code>: <code>pip install insightface onnxruntime</code>.',
-      'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
+      'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="' + (window.ROOT_PATH||'') + '/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
      'No AI model selection needed — this feature uses its own dedicated backend.',
    ],
    backendPath: ROOT_PATH + '/v1/images/faceswap',
@@ -2461,14 +2461,14 @@ function capSearchUrl(cap) {
  const s = CAP_TO_HF_SEARCH[cap];
  if (!s) return null;
  const p = new URLSearchParams({ tab:'search', q: s.q, pipeline: s.pipeline, gguf: s.gguf });
-  return '/admin/models?' + p.toString();
+  return (window.ROOT_PATH || '') + '/admin/models?' + p.toString();
 }
 function capMissingHtml(caps, label) {
  if (!caps.length) return '';
  const links = caps.map(cap => {
    const chip = `<span class="cap-chip dim">${cap.replace(/_/g,' ')}</span>`;
    if (_localCapSet.has(cap)) {
-      const url = `/admin/models?local_cap=${encodeURIComponent(cap)}`;
+      const url = `${window.ROOT_PATH || ''}/admin/models?local_cap=${encodeURIComponent(cap)}`;
      return `<a href="${url}" class="cap-find-link" title="You have a local model with ${cap.replace(/_/g,' ')} — click to configure it">${chip}<span class="cap-find-icon" style="color:#6ecf7e">↑ configure</span></a>`;
    }
    const url = capSearchUrl(cap);

--- a/codai/admin/templates/models.html
+++ b/codai/admin/templates/models.html
@@ -577,6 +577,13 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
          </select>
        </div>
      </div>
+      <div class="form-row" id="cfg-engine-row" style="margin-top:.75rem;display:none">
+        <label class="form-label">Engine / card</label>
+        <select id="cfg-engine" class="form-input">
+          <option value="">Default (auto — by capability)</option>
+        </select>
+        <span class="form-hint" style="font-size:11px">Pin this model to a specific engine/card. Overrides the default engine. Only shown when multiple engines are running.</span>
+      </div>
      <div style="display:grid;grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.75rem">
        <div class="form-row" style="margin:0">
          <label class="form-label">Used VRAM <span class="muted">(GB)</span></label>
@@ -1441,8 +1448,7 @@ function handleProgressEvent(evt){
    showDownloadError(evt.message);
  }else if(evt.type==='cancelled'){
    _dlDone=true;
-    if(_dlEs){_dlEs.close();_dlEs=null;}
-    showDownloadError('Download cancelled');
+    showDownloadCancelled();
  }
  // keepalive: ignore
 }
@@ -1483,7 +1489,7 @@ async function reopenDownload(session_id){
        if(s.rate) document.getElementById('dl-speed').textContent=fmtRate(s.rate);
        if(s.eta!=null) document.getElementById('dl-eta').textContent=fmtEta(s.eta);
        if(s.status==='done'){handleProgressEvent({type:'done'});return;}
-        if(s.status==='cancelled'){showDownloadError('Download cancelled');return;}
+        if(s.status==='cancelled'){_dlDone=true;showDownloadCancelled();return;}
        if(s.status==='error'){showDownloadError(s.error||'Download failed');return;}
      }
    }
@@ -1501,15 +1507,27 @@ async function reopenDownload(session_id){
  };
 }

+function showDownloadCancelled(){   // close any open progress stream, then swap the progress panel for the form
+  if(_dlEs){ _dlEs.close(); _dlEs = null; }
+  document.getElementById('dl-progress').style.display='none';
+  document.getElementById('dl-form').style.display='block';
+}
+
 async function stopDownload(session_id){
  if(!confirm('Cancel this download?')) return;
  try{
-    await fetch(ROOT_PATH + '/admin/api/download-cancel/'+session_id, {method:'POST'});
+    const r = await fetch(ROOT_PATH + '/admin/api/download-cancel/'+session_id, {method:'POST'});
+    if(!r.ok){
+      let detail = r.status+' '+r.statusText;
+      try{ const j = await r.json(); if(j&&j.detail) detail = j.detail; }catch{}
+      alert('Could not cancel download: '+detail);
+      return;
+    }
    if(_dlSessionId===session_id){
-      if(_dlEs){_dlEs.close();_dlEs=null;}
      _dlDone=true;
-      showDownloadError('Download cancelled');
+      showDownloadCancelled();
    }
+    pollDownloads(); // refresh the active-downloads strip immediately
  }catch(e){
    alert('Could not cancel download: '+e.message);
  }
@@ -1798,12 +1816,53 @@ let _localModels = [];
 let _ggufFiles = [];
 let _hfModels = [];

+// Names of the engines (cards) known to the front, plus the configured default engine.
+// Both stay empty until _loadEngineInfo() resolves; the per-model card tags read them.
+let _engineNames = [];
+let _defaultEngine = '';
+async function _loadEngineInfo(){
+  // 1) Ask the front for its live engine list first: it also reports auto-detected
+  //    engines that were never declared in engine_specs.
+  try {
+    const live = await fetch(ROOT_PATH + '/admin/api/engines');
+    if (live.ok) _engineNames = ((await live.json()).engines || []).map(eng => eng.name);
+  } catch(e) {}
+  try {   // 2) settings: fallback engine names (only if step 1 found none) + default engine
+    const server = (await (await fetch(ROOT_PATH + '/admin/api/settings')).json()).server;
+    if (_engineNames.length === 0) _engineNames = (server && server.engine_names) || [];
+    _defaultEngine = (server && server.default_engine) || '';
+  } catch(e) {}
+}
+// Small per-config badge naming the engine/card a model runs on. An explicit pin
+// (settings.engine) is shown verbatim with 📌; with no pin the engine is guessed from
+// the model: ds4/transformers → "nvidia", gguf/whisper → the default engine or "any".
+// Returns '' when fewer than two engines are known, so single-card setups stay compact.
+function _engineTagHtml(m, s){
+  if(!_engineNames || _engineNames.length < 2) return '';
+  const pinnedName = ((s && s.engine) || '').trim();
+  const pinned = pinnedName !== '';
+  let eng = pinnedName;
+  if(!pinned){
+    const ref = (((m && (m.path || m.id || m.filename)) || '') + '').toLowerCase();
+    const ggufLike = ref.includes('gguf');               // also matches a ".gguf" suffix
+    const whisper = ((s && s.backend) || '') === 'whisper-server';
+    const ds4 = ref.includes('deepseek-v4');
+    eng = (ds4 || !(ggufLike || whisper)) ? 'nvidia' : (_defaultEngine || 'any');
+  }
+  const key = eng.toLowerCase();
+  let color = 'var(--text-3)';                           // neutral unless a vendor is recognised
+  if(key.includes('nv')) color = '#76b900';
+  else if(key.includes('rad') || key.includes('amd')) color = '#ed1c24';
+  const title = pinned ? ('Pinned to engine: ' + eng) : ('Runs on: ' + eng + ' (auto)');
+  return `<span class="badge" title="${esc(title)}" style="font-size:9px;padding:.05rem .3rem;margin:.1rem .1rem 0 0;vertical-align:middle;border:1px solid ${color};color:${color};background:transparent">${esc(eng)}${pinned?' 📌':''}</span>`;
+}
+
 function _renderConfigPills(idx, m) {
  const configs = m.configs || [];
  if (!configs.length) return '';
  const pills = configs.map((c, cfgIdx) => {
    const label = (c.settings && (c.settings.config_name || c.settings.alias)) || `Config ${cfgIdx + 1}`;
-    return `<span class="badge badge-user" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem .1rem 0 0" onclick="openCfgModal(${idx},${cfgIdx})" title="Edit this configuration">${esc(label)}</span>`;
+    return `<span class="badge badge-user" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem .1rem 0 0" onclick="openCfgModal(${idx},${cfgIdx})" title="Edit this configuration">${esc(label)}</span>${_engineTagHtml(m, c.settings)}`;
  }).join('');
  const addPill = `<span class="badge" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem 0 0 0;background:var(--raised);border:1px dashed var(--border);color:var(--text-2)" onclick="openCfgModalNew(${idx})" title="Add another configuration for this model">+ Config</span>`;
  return `<br style="line-height:.5rem">${pills}${addPill}`;
@@ -2338,6 +2397,9 @@ async function refreshLocal(){
 }

 loadGlobalSettings();
+// Load engine/card info first so the per-model card tags render on the first paint,
+// then re-render once it's available (covers the fetch resolving after the list).
+_loadEngineInfo().then(() => loadCachedModels());
 refreshLocal();

 // Toggle the acceleration / TurboQuant sections as model types are checked/unchecked.
@@ -2731,6 +2793,7 @@ function openCfgModal(idx, cfgIdx){
  document.getElementById('cfg-noram').checked = !!s.no_ram;
  document.getElementById('cfg-offload-strategy').value = s.offload_strategy || 'auto';
  document.getElementById('cfg-offload-dir').value = s.offload_dir || _defaultOffloadDir;
+  _populateEnginePin(s.engine || '');
  document.getElementById('cfg-sysprompt').value = s.system_prompt || '';
  document.getElementById('cfg-parser').value = s.parser || (!m.in_config ? _autoDetectParser(m.path) : 'auto');
  document.getElementById('cfg-tools').checked = !!s.tools_closer_prompt;
@@ -3027,6 +3090,21 @@ async function removeThisConfig(){
  } catch(e) { alert('Error: ' + e.message); }
 }

+// Engine-pin field: fill the <select> from the known engines; the row shows only with 2+ engines.
+// A saved pin missing from that list is kept as an extra option so the next save cannot clear it.
+async function _populateEnginePin(desired){
+  const row = document.getElementById('cfg-engine-row'), sel = document.getElementById('cfg-engine');
+  try {
+    if (!_engineNames || !_engineNames.length) await _loadEngineInfo();
+    const want = ((desired !== undefined) ? desired : sel.value) || '';
+    const names = (want && !_engineNames.includes(want)) ? [..._engineNames, want] : _engineNames;
+    sel.querySelectorAll('option:not([value=""])').forEach(o => o.remove());
+    names.forEach(n => { const o=document.createElement('option'); o.value=n; o.textContent=n; sel.appendChild(o); });
+    sel.value = want;   // set AFTER options exist so the selection sticks
+    row.style.display = _engineNames.length > 1 ? '' : 'none';
+  } catch(e) { row.style.display = 'none'; }
+}
+
 async function saveModelConfig(){
  const path = document.getElementById('cfg-path').value;
  const maxGpu = parseFloat(document.getElementById('cfg-max-gpu').value);
@@ -3063,6 +3141,7 @@ async function saveModelConfig(){
    no_ram:            document.getElementById('cfg-noram').checked,
    offload_strategy:  document.getElementById('cfg-offload-strategy').value,
    offload_dir:       document.getElementById('cfg-offload-dir').value.trim() || './offload',
+    engine:            document.getElementById('cfg-engine').value.trim() || null,
    system_prompt:     document.getElementById('cfg-sysprompt').value.trim() || null,
    parser:            document.getElementById('cfg-parser').value,
    tools_closer_prompt: document.getElementById('cfg-tools').checked,
@@ -3094,7 +3173,12 @@ async function saveModelConfig(){
      body: JSON.stringify(data)
    });
    const d = await r.json();
-    if(d.success){ closeModal('cfg-modal'); loadCachedModels(); }
+    if(d.success){
+      if (d.warnings && d.warnings.length) {
+        alert('Saved, but check this:\n\n• ' + d.warnings.join('\n• '));
+      }
+      closeModal('cfg-modal'); loadCachedModels();
+    }
    else alert('Error: '+(d.detail||'Unknown'));
  }catch(e){ alert('Error: '+e.message); }
 }

--- a/codai/admin/templates/settings.html
+++ b/codai/admin/templates/settings.html
@@ -50,6 +50,40 @@
    <input type="number" id="s-queue-max" class="form-input" placeholder="6" min="1" max="1000" style="max-width:160px">
    <span class="form-hint">Maximum number of concurrent queued requests. Authenticated requests arriving when the queue is full receive a 429 response.</span>
  </div>
+  <div class="form-row" style="margin-top:1rem;margin-bottom:0">
+    <label class="form-label">Engine base port</label>
+    <input type="number" id="s-internal-port-base" class="form-input" placeholder="8780" min="1" max="65535" style="max-width:160px">
+    <span class="form-hint">First internal port for engine subprocesses (the front assigns this and the next free ports). Keep it different from the listen port above. Restart to apply.</span>
+  </div>
+  <div class="form-row" id="default-engine-row" style="margin-top:1rem;margin-bottom:0;display:none">
+    <label class="form-label">Default engine</label>
+    <select id="s-default-engine" class="form-input" style="max-width:260px">
+      <option value="">Auto (least-loaded compatible)</option>
+    </select>
+    <span class="form-hint">When a model can run on more than one card (e.g. a GGUF on either the NVIDIA or Radeon engine), prefer this one. A per-model <em>engine</em> set on the Models page overrides this. Only shown when multiple engines are declared.</span>
+  </div>
+</div>
+
+<!-- Concurrency -->
+<div class="card mb-0" style="margin-top:1rem">
+  <div class="card-title">Concurrency</div>
+  <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem">
+    <div class="form-row" style="margin:0">
+      <label class="form-label">Max parallel requests <span class="muted">(per engine)</span></label>
+      <input type="number" id="s-max-parallel" class="form-input" min="1" max="64" placeholder="2" style="max-width:160px">
+      <span class="form-hint">How many requests each engine runs at once. Each engine is its own process, so this applies per-engine — total concurrency is the sum across engines.</span>
+    </div>
+    <div class="form-row" style="margin:0">
+      <label class="form-label">Max instances per model <span class="muted">(per engine)</span></label>
+      <input type="number" id="s-max-instances" class="form-input" min="1" max="16" placeholder="1" style="max-width:160px">
+      <span class="form-hint">Default number of concurrent copies of one model. Override per model on the Models page.</span>
+    </div>
+  </div>
+  <div id="concurrency-overrides" style="margin-top:.7rem;display:none">
+    <label class="form-label" style="margin-bottom:.3rem">Per-engine overrides <span class="muted">(blank = use the defaults above)</span></label>
+    <span class="form-hint" style="margin-bottom:.4rem">Give a bigger card more concurrency than a smaller one — e.g. more parallel requests on a 24 GB NVIDIA than an 8 GB Radeon.</span>
+    <div id="concurrency-override-rows" style="display:grid;grid-template-columns:110px 1fr 1fr;gap:.5rem;align-items:center"></div>
+  </div>
 </div>

 <!-- Storage -->
@@ -87,6 +121,23 @@
    </label>
    <span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
  </div>
+  <div class="form-row" style="margin:0">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-ram-watch-cuda">
+      <span>Allow CUDA cache release in mitigation</span>
+    </label>
+    <span class="form-hint">When the watcher mitigates, let it call <code>torch.cuda.empty_cache()</code>. Turn this <b>off</b> on a GPU that drops off the bus (Xid 79) to stop the background thread from touching CUDA — host-RAM mitigation (gc / heap trim / idle eviction) still runs. Loads are always skipped regardless.</span>
+  </div>
+  <div class="form-row" style="margin-top:.75rem">
+    <label class="form-label">RAM watch poll interval (s) <span class="muted">(default: 15)</span></label>
+    <input type="number" id="s-ram-watch-poll" class="form-input" min="1" step="1" placeholder="15">
+    <span class="form-hint">How often the watcher samples process-tree RSS.</span>
+  </div>
+  <div class="form-row" style="margin-top:.75rem">
+    <label class="form-label">RAM watch soft threshold <span class="muted">(0–1, default: 0.90)</span></label>
+    <input type="number" id="s-ram-watch-soft" class="form-input" min="0" max="1" step="0.05" placeholder="0.90">
+    <span class="form-hint">Fraction of the RAM cap at which the mitigation ladder engages.</span>
+  </div>
  <div class="form-row" style="margin-top:.75rem">
    <label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
    <input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
@@ -172,6 +223,12 @@
      <input type="number" id="s-therm-gpu-resume" class="form-input" min="30" max="120" step="1" placeholder="87">
    </div>
  </div>
+  <div id="therm-gpu-overrides" class="form-row" style="margin-top:.5rem;display:none">
+    <label class="form-label" style="margin-bottom:.3rem">Per-card overrides <span class="muted">(optional — blank uses the limits above)</span></label>
+    <span class="form-hint" style="margin-bottom:.4rem">Set a different pause/resume limit per GPU vendor detected on this machine — e.g. let a Radeon run hotter than an NVIDIA card. Each card is judged against its own vendor's limit.</span>
+    <!-- One row per GPU vendor actually present, built from detected hardware. -->
+    <div id="therm-gpu-overrides-rows" style="display:grid;grid-template-columns:90px 1fr 1fr;gap:.5rem;align-items:center"></div>
+  </div>

  <div class="form-row" style="margin-top:.5rem">
    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
@@ -321,10 +378,148 @@
    </div>
  </div>
 </div>
+
+<!-- DeepSeek V4 (ds4) -->
+<div class="card mb-0" style="margin-top:1rem">
+  <div class="card-title">DeepSeek V4 (ds4)</div>
+  <p class="form-hint" style="margin-bottom:.6rem">Run DeepSeek V4 through antirez's native <a href="https://github.com/antirez/ds4" target="_blank" rel="noopener">ds4 / DwarfStar</a> engine as a managed subprocess. When enabled, requests for a matching model name are proxied to ds4-server. First use clones + builds ds4 and downloads several GB of weights — this is slow and needs lots of RAM (96 GB+).</p>
+  <div class="form-row">
+    <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+      <input type="checkbox" id="s-ds4-enabled" onchange="toggleDs4Fields()">
+      <span style="font-size:13px;font-weight:500">Enable ds4 (DeepSeek V4)</span>
+    </label>
+  </div>
+  <div id="ds4-fields" style="display:none">
+    <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;align-items:start">
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Model id / alias</label>
+        <input type="text" id="s-ds4-model-id" class="form-input" placeholder="deepseek-v4">
+        <span class="form-hint">Requests for this id (or any name containing "deepseek-v4") route to ds4.</span>
+      </div>
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Weight variant</label>
+        <select id="s-ds4-variant" class="form-input">
+          <option value="q2-imatrix">q2-imatrix (96/128 GB)</option>
+          <option value="q2-q4-imatrix">q2-q4-imatrix (96/128 GB)</option>
+          <option value="q4-imatrix">q4-imatrix (256 GB+)</option>
+          <option value="pro-q2-imatrix">pro-q2-imatrix (512 GB)</option>
+        </select>
+      </div>
+    </div>
+    <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;align-items:start">
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Build target</label>
+        <select id="s-ds4-build-target" class="form-input">
+          <option value="auto">auto-detect</option>
+          <option value="cuda-generic">cuda-generic</option>
+          <option value="cuda-spark">cuda-spark (DGX Spark/GB10)</option>
+          <option value="metal">metal (macOS)</option>
+          <option value="cpu">cpu</option>
+        </select>
+      </div>
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Install dir</label>
+        <input type="text" id="s-ds4-install-dir" class="form-input" placeholder="~/.coderai/ds4">
+      </div>
+    </div>
+    <div class="form-row">
+      <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
+        <input type="checkbox" id="s-ds4-auto-build">
+        <span>Auto clone + build the ds4-server binary if missing</span>
+      </label>
+    </div>
+    <div style="display:grid;grid-template-columns:1fr 120px 140px;gap:1rem;align-items:start">
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Bind host</label>
+        <input type="text" id="s-ds4-host" class="form-input" placeholder="127.0.0.1">
+      </div>
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Port</label>
+        <input type="number" id="s-ds4-port" class="form-input" min="0" placeholder="0 = auto">
+      </div>
+      <div class="form-row" style="margin:0">
+        <label class="form-label">Context (--ctx)</label>
+        <input type="number" id="s-ds4-ctx" class="form-input" min="1024" placeholder="100000">
+      </div>
+    </div>
+    <div class="form-row">
+      <label class="form-label">Repo URL</label>
+      <input type="text" id="s-ds4-repo-url" class="form-input" placeholder="https://github.com/antirez/ds4">
+    </div>
+    <div class="form-row">
+      <label class="form-label">Extra ds4-server args</label>
+      <input type="text" id="s-ds4-extra-args" class="form-input" placeholder="--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192">
+    </div>
+  </div>
+</div>
 {% endblock %}

 {% block scripts %}
 <script>
+// Build one per-vendor threshold-override row for each GPU vendor actually present
+// on this machine (detected via /admin/api/gpu-stats) — never hardcode the mix.
+const _VENDOR_LABEL = {nvidia:'NVIDIA', amd:'Radeon', intel:'Intel'};
+const _VENDOR_COLOR = {nvidia:'#76b900', amd:'#ed1c24', intel:'#0071c5'};
+async function _buildGpuOverrideRows(overrides){
+  const wrap = document.getElementById('therm-gpu-overrides'), rows = document.getElementById('therm-gpu-overrides-rows');
+  const h = s => String(s).replace(/[&<>"']/g, ch => '&#' + ch.charCodeAt(0) + ';');  // HTML-escape: vendor keys come from the API / saved config
+  let vendors = [];
+  try {
+    const cards = ((await fetch(ROOT_PATH + '/admin/api/gpu-stats').then(r=>r.json())).cards) || [];
+    vendors = [...new Set(cards.map(c => c.vendor).filter(Boolean))];
+  } catch(e) {}
+  // Also include any vendor that already has a saved override, even if not detected now.
+  Object.keys(overrides || {}).forEach(v => { if(!vendors.includes(v)) vendors.push(v); });
+  if(!vendors.length){ wrap.style.display = 'none'; rows.innerHTML = ''; return; }
+  wrap.style.display = '';
+  rows.innerHTML = vendors.map(v => {
+    const ov = (overrides && overrides[v]) || {};
+    const label = _VENDOR_LABEL[v] || v;
+    const color = _VENDOR_COLOR[v] || 'var(--text-2)';
+    return `<span style="font-size:12px;color:${h(color)}">${h(label)}</span>
+      <input type="number" class="form-input therm-ov" data-vendor="${h(v)}" data-k="high"
+             min="40" max="120" step="1" placeholder="pause °C" value="${h(ov.high ?? '')}">
+      <input type="number" class="form-input therm-ov" data-vendor="${h(v)}" data-k="resume"
+             min="30" max="120" step="1" placeholder="resume °C" value="${h(ov.resume ?? '')}">`;
+  }).join('');
+}
+function _collectGpuOverrides(){   // -> { vendor: { high?: number, resume?: number } }, blank inputs omitted
+  const result = {};
+  for (const field of document.querySelectorAll('#therm-gpu-overrides-rows .therm-ov')) {
+    const temp = parseFloat(field.value);
+    if (Number.isNaN(temp)) continue;   // empty / non-numeric input: no override for this key
+    const slot = result[field.dataset.vendor] || (result[field.dataset.vendor] = {});
+    slot[field.dataset.k] = temp;
+  }
+  return result;
+}
+// Per-engine concurrency overrides — one row per running engine (by name).
+function _buildConcurrencyOverrides(engNames, parOv, instOv){
+  const wrap = document.getElementById('concurrency-overrides'), rows = document.getElementById('concurrency-override-rows');
+  const h = s => String(s).replace(/[&<>"']/g, ch => '&#' + ch.charCodeAt(0) + ';');  // HTML-escape: engine names are server-supplied
+  if(!engNames || engNames.length < 2){ wrap.style.display='none'; rows.innerHTML=''; return; }
+  wrap.style.display = '';
+  rows.innerHTML =
+    `<span class="muted" style="font-size:11px">Engine</span>
+     <span class="muted" style="font-size:11px">Max parallel</span>
+     <span class="muted" style="font-size:11px">Max instances</span>` +
+    engNames.map(n => `
+      <span style="font-size:12px">${h(n)}</span>
+      <input type="number" class="form-input conc-par" data-engine="${h(n)}" min="1" max="64" placeholder="default" value="${h((parOv || {})[n] ?? '')}">
+      <input type="number" class="form-input conc-inst" data-engine="${h(n)}" min="1" max="16" placeholder="default" value="${h((instOv || {})[n] ?? '')}">`).join('');
+}
+function _collectConcOverrides(cls){   // -> { engineName: int >= 1 } for inputs carrying class `cls`
+  const picked = {};
+  for (const field of document.querySelectorAll('#concurrency-override-rows .' + cls)) {
+    const count = parseInt(field.value);
+    if (count >= 1) picked[field.dataset.engine] = count;   // NaN (blank input) fails this comparison too
+  }
+  return picked;
+}
+function toggleDs4Fields(){   // show the ds4 detail fields only while the enable checkbox is ticked
+  const panel = document.getElementById('ds4-fields');
+  panel.style.display = document.getElementById('s-ds4-enabled').checked ? 'block' : 'none';
+}
 function toggleHttps(){
  document.getElementById('https-fields').style.display =
    document.getElementById('s-https').checked ? 'block' : 'none';
@@ -371,12 +566,41 @@ async function loadSettings(){
    document.getElementById('s-key').value   = d.server?.https_key_path ?? '';
    document.getElementById('s-cert').value  = d.server?.https_cert_path ?? '';
    document.getElementById('s-queue-max').value = d.server?.queue_max_size ?? 6;
+    document.getElementById('s-internal-port-base').value = d.server?.internal_port_base ?? 8780;
+    // Default engine — surfaced when 2+ engines are running. Sourced from the
+    // front's live engine list so it also covers AUTO-DETECTED engines (no
+    // engine_specs needed); falls back to declared engine_specs names.
+    let engNames = [];
+    try {
+      const er = await fetch(ROOT_PATH + '/admin/api/engines');
+      if (er.ok) engNames = ((await er.json()).engines || []).map(e => e.name);
+    } catch(e) {}
+    if (!engNames.length) engNames = d.server?.engine_names || [];
+    const engRow = document.getElementById('default-engine-row');
+    const engSel = document.getElementById('s-default-engine');
+    if (engNames.length > 1) {
+      engSel.querySelectorAll('option:not([value=""])').forEach(o => o.remove());
+      engNames.forEach(n => { const o=document.createElement('option'); o.value=n; o.textContent=n; engSel.appendChild(o); });
+      engSel.value = d.server?.default_engine || '';
+      engRow.style.display = '';
+    } else {
+      engRow.style.display = 'none';
+    }
+    // Concurrency: defaults + per-engine override rows.
+    document.getElementById('s-max-parallel').value = d.server?.max_parallel_requests ?? 2;
+    document.getElementById('s-max-instances').value = d.models?.max_model_instances ?? 1;
+    _buildConcurrencyOverrides(engNames,
+      d.server?.max_parallel_requests_overrides || {},
+      d.models?.max_model_instances_overrides || {});
    document.getElementById('s-hf-cache').value   = d.models?.hf_cache_dir ?? '';
    document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
    document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
    document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
    document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
    document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
+    document.getElementById('s-ram-watch-cuda').checked = d.offload?.ram_watch_cuda !== false;
+    document.getElementById('s-ram-watch-poll').value = d.offload?.ram_watch_poll_seconds ?? '';
+    document.getElementById('s-ram-watch-soft').value = d.offload?.ram_watch_soft_fraction ?? '';
    document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
    document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
    document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
@@ -416,6 +640,7 @@ async function loadSettings(){
    document.getElementById('s-therm-cpu-enabled').checked = therm.cpu_enabled !== false;
    document.getElementById('s-therm-gpu-high').value = therm.gpu_high ?? 90;
    document.getElementById('s-therm-gpu-resume').value = therm.gpu_resume ?? 87;
+    await _buildGpuOverrideRows(therm.gpu_overrides || {});
    document.getElementById('s-therm-cpu-high').value = therm.cpu_high ?? 90;
    document.getElementById('s-therm-cpu-resume').value = therm.cpu_resume ?? 87;
    document.getElementById('s-therm-poll').value = therm.poll_seconds ?? 5;
@@ -426,6 +651,20 @@ async function loadSettings(){
    // Background jobs
    const jobs = d.jobs || {};
    document.getElementById('s-jobs-resume').checked = jobs.resume_on_restart !== false;
+    // DeepSeek V4 (ds4)
+    const ds4 = d.ds4 || {};
+    document.getElementById('s-ds4-enabled').checked = !!ds4.enabled;
+    document.getElementById('s-ds4-model-id').value = ds4.model_id ?? 'deepseek-v4';
+    document.getElementById('s-ds4-variant').value = ds4.model_variant ?? 'q4-imatrix';
+    document.getElementById('s-ds4-build-target').value = ds4.build_target ?? 'auto';
+    document.getElementById('s-ds4-install-dir').value = ds4.install_dir ?? '';
+    document.getElementById('s-ds4-auto-build').checked = ds4.auto_build !== false;
+    document.getElementById('s-ds4-host').value = ds4.host ?? '127.0.0.1';
+    document.getElementById('s-ds4-port').value = ds4.port ?? 0;
+    document.getElementById('s-ds4-ctx').value = ds4.ctx ?? 100000;
+    document.getElementById('s-ds4-repo-url').value = ds4.repo_url ?? 'https://github.com/antirez/ds4';
+    document.getElementById('s-ds4-extra-args').value = ds4.extra_args ?? '';
+    toggleDs4Fields();
  }catch(e){ showAlert('error','Failed to load settings: '+e.message); }
 }

@@ -439,16 +678,25 @@ async function saveSettings(){
      https_key_path:  strOrNull('s-key'),
      https_cert_path: strOrNull('s-cert'),
      queue_max_size: parseInt(document.getElementById('s-queue-max').value) || 6,
+      internal_port_base: parseInt(document.getElementById('s-internal-port-base').value) || 8780,
+      max_parallel_requests: parseInt(document.getElementById('s-max-parallel').value) || 2,
+      max_parallel_requests_overrides: _collectConcOverrides('conc-par'),
+      default_engine: document.getElementById('s-default-engine').value || null,
    },
    models:{
      hf_cache_dir:   strOrNull('s-hf-cache'),
      gguf_cache_dir: strOrNull('s-gguf-cache'),
+      max_model_instances: parseInt(document.getElementById('s-max-instances').value) || 1,
+      max_model_instances_overrides: _collectConcOverrides('conc-inst'),
    },
    offload:{
      directory: document.getElementById('s-offload-dir').value.trim() || './offload',
      max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
      evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
      ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
+      ram_watch_cuda: document.getElementById('s-ram-watch-cuda').checked,
+      ram_watch_poll_seconds: (parseFloat(document.getElementById('s-ram-watch-poll').value) || null),
+      ram_watch_soft_fraction: (parseFloat(document.getElementById('s-ram-watch-soft').value) || null),
    },
    tmp_dir: strOrNull('s-tmp-dir'),
    enhance:{
@@ -465,6 +713,7 @@ async function saveSettings(){
      cpu_enabled: document.getElementById('s-therm-cpu-enabled').checked,
      gpu_high:   parseFloat(document.getElementById('s-therm-gpu-high').value)   || 90,
      gpu_resume: parseFloat(document.getElementById('s-therm-gpu-resume').value) || 87,
+      gpu_overrides: _collectGpuOverrides(),
      cpu_high:   parseFloat(document.getElementById('s-therm-cpu-high').value)   || 90,
      cpu_resume: parseFloat(document.getElementById('s-therm-cpu-resume').value) || 87,
      poll_seconds: parseFloat(document.getElementById('s-therm-poll').value) || 5,
@@ -475,6 +724,19 @@ async function saveSettings(){
    jobs:{
      resume_on_restart: document.getElementById('s-jobs-resume').checked,
    },
+    ds4:{
+      enabled: document.getElementById('s-ds4-enabled').checked,
+      model_id: document.getElementById('s-ds4-model-id').value.trim() || 'deepseek-v4',
+      model_variant: document.getElementById('s-ds4-variant').value,
+      build_target: document.getElementById('s-ds4-build-target').value,
+      install_dir: document.getElementById('s-ds4-install-dir').value.trim(),
+      auto_build: document.getElementById('s-ds4-auto-build').checked,
+      host: document.getElementById('s-ds4-host').value.trim() || '127.0.0.1',
+      port: parseInt(document.getElementById('s-ds4-port').value) || 0,
+      ctx: parseInt(document.getElementById('s-ds4-ctx').value) || 100000,
+      repo_url: document.getElementById('s-ds4-repo-url').value.trim() || 'https://github.com/antirez/ds4',
+      extra_args: document.getElementById('s-ds4-extra-args').value.trim(),
+    },
    broker:{
      enabled: document.getElementById('s-broker-enabled').checked,
      base_url: document.getElementById('s-broker-base-url').value.trim(),
@@ -499,7 +761,14 @@ async function saveSettings(){
      method:'POST', headers:{'Content-Type':'application/json'},
      body: JSON.stringify(data)
    });
-    if(r.ok) showAlert('info','Settings saved. Archive and thermal-protection changes take effect immediately; restart CoderAI for other changes.');
+    if(r.ok){
+      const d = await r.json().catch(()=>({}));
+      if(d.warnings && d.warnings.length){
+        showAlert('error', 'Saved, but: ' + d.warnings.join(' '));
+      } else {
+        showAlert('info','Settings saved. Archive and thermal-protection changes take effect immediately; restart CoderAI for other changes.');
+      }
+    }
    else{ const e=await r.json(); showAlert('error', e.detail||'Save failed'); }
  }catch(e){ showAlert('error','Error: '+e.message); }
 }

--- a/codai/admin/templates/tasks.html
+++ b/codai/admin/templates/tasks.html
@@ -30,11 +30,23 @@
 <div id="sys-stats" style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));
     gap:.75rem;margin:0 0 1.25rem">
  <div class="sys-tile" id="tile-cpu"></div>
-  <div class="sys-tile" id="tile-gpu"></div>
  <div class="sys-tile" id="tile-ram"></div>
+  <!-- Per-card GPU tiles (util + VRAM) injected here when cards are detected. -->
+  <div id="tile-cards" style="display:contents"></div>
+  <!-- Fallback single tiles when per-card stats are unavailable. -->
+  <div class="sys-tile" id="tile-gpu"></div>
  <div class="sys-tile" id="tile-vram"></div>
 </div>

+<!-- Engines (only shown in front/multi-engine mode) -->
+<div id="engines-card" style="display:none;margin:0 0 1.25rem">
+  <div style="display:flex;align-items:baseline;gap:.5rem;margin-bottom:.5rem">
+    <h2 style="font-size:14px;margin:0">Engines</h2>
+    <span class="dim small">restart a stuck engine — the supervisor respawns it</span>
+  </div>
+  <div id="engines-body" style="display:grid;grid-template-columns:repeat(auto-fit,minmax(240px,1fr));gap:.6rem"></div>
+</div>
+
 <style>
 .sys-tile{border:1px solid var(--border,#2a2a2a);border-radius:10px;padding:.7rem .85rem;
  background:var(--card-bg,rgba(255,255,255,.02))}
@@ -76,7 +88,7 @@ function fmtTime(s) {
  } catch { return ''; }
 }

-const KIND_LABEL = {training:'Training', image:'Image', video:'Video', upscale:'Upscale', interpolate:'Interpolate', audio:'Audio', text:'Text', pipeline:'Pipeline', request:'Request', loading:'Loading'};
+const KIND_LABEL = {training:'Training', image:'Image', video:'Video', upscale:'Upscale', interpolate:'Interpolate', audio:'Audio', text:'Text', tts:'Speech (TTS)', transcription:'Transcription', embedding:'Embedding', spatial:'3D / Spatial', pipeline:'Pipeline', request:'Request', loading:'Loading'};
 const STATUS_BADGE = {
  running:'badge-admin', queued:'badge-user', done:'badge-ok', error:'badge-err',
  cancelled:'badge-user', interrupted:'badge-warn'
@@ -140,18 +152,89 @@ function _memTile(name, used, total, pct){
  return `<div class="sys-head"><span class="sys-name">${name}</span><span class="sys-val">${valTxt}</span></div>`
    + _bar(p) + `<div class="sys-sub"><span>${p == null ? '' : Math.round(p)+'% used'}</span><span></span></div>`;
 }
+// Render a single physical GPU as one tile: utilization (with temperature when
+// known) on the first bar, VRAM usage on the second.
+function _cardTile(c){
+  let accent;
+  switch (c.vendor) {
+    case 'nvidia': accent = '#76b900'; break;
+    case 'amd':    accent = '#ed1c24'; break;
+    default:       accent = 'var(--text-3)';
+  }
+  const known = v => v != null;
+  // Percentage is only computed when a (non-zero) total is reported.
+  const vramPct = c.mem_total ? (c.mem_used / c.mem_total * 100) : null;
+  const tempTxt = known(c.temp) ? ' · '+Math.round(c.temp)+'°C' : '';
+  const utilTxt = known(c.util) ? Math.round(c.util)+'%' : '—';
+  const usedTxt = known(c.mem_used) ? c.mem_used.toFixed(1) : '—';
+  const totalTxt = known(c.mem_total) ? c.mem_total.toFixed(0) : '—';
+  const pctTxt = known(vramPct) ? Math.round(vramPct)+'% used' : '';
+  return `<div class="sys-tile">
+    <div class="sys-head"><span class="sys-name" style="color:${accent}">${esc(c.name)}</span>
+      <span class="sys-val">${utilTxt}${tempTxt}</span></div>
+    ${_bar(c.util)}
+    <div class="sys-sub"><span>VRAM ${usedTxt}/${totalTxt} GB</span>
+      <span>${pctTxt}</span></div>
+    ${_bar(vramPct)}
+  </div>`;
+}
+
 async function loadSystemStats(){
  try {
    const s = await fetch(ROOT_PATH + '/admin/api/system-stats').then(r => r.json());
    const cpu = s.cpu || {}, gpu = s.gpu || {}, ram = s.ram || {}, vram = s.vram || {};
    document.getElementById('tile-cpu').innerHTML = _utilTile('CPU', cpu.util, cpu.temp, (cpu.cores || 1) * 100);
-    document.getElementById('tile-gpu').innerHTML = _utilTile('GPU', gpu.util, gpu.temp);
    document.getElementById('tile-ram').innerHTML = _memTile('RAM', ram.used, ram.total, ram.percent);
-    document.getElementById('tile-vram').innerHTML =
-      _memTile('VRAM', vram.used, vram.total, vram.percent);
+
+    // Per-card GPU+VRAM tiles for every physical card; fall back to single tiles.
+    let cards = [];
+    try { cards = ((await fetch(ROOT_PATH + '/admin/api/gpu-stats').then(r => r.json())).cards) || []; } catch(e){}
+    const cardsEl = document.getElementById('tile-cards');
+    const gpuEl = document.getElementById('tile-gpu');
+    const vramEl = document.getElementById('tile-vram');
+    if (cards.length) {
+      cardsEl.innerHTML = cards.map(_cardTile).join('');
+      gpuEl.style.display = 'none'; vramEl.style.display = 'none';
+    } else {
+      cardsEl.innerHTML = '';
+      gpuEl.style.display = ''; vramEl.style.display = '';
+      gpuEl.innerHTML = _utilTile('GPU', gpu.util, gpu.temp);
+      vramEl.innerHTML = _memTile('VRAM', vram.used, vram.total, vram.percent);
+    }
  } catch(e){ /* keep last render on transient errors */ }
 }

+// Engines panel — only present in front/multi-engine mode (404 in single-process).
+// Fetches /admin/api/engines and renders one tile per engine. The card is hidden
+// when the request fails, the response is not OK, or no engines are reported.
+async function loadEngines(){
+  const card = document.getElementById('engines-card');
+  let engines = null;
+  try {
+    const r = await fetch(ROOT_PATH + '/admin/api/engines');
+    if (!r.ok) { card.style.display = 'none'; return; }
+    engines = (await r.json()).engines || [];
+  } catch(e) { card.style.display = 'none'; return; }
+  if (!engines.length) { card.style.display = 'none'; return; }
+  card.style.display = '';
+  document.getElementById('engines-body').innerHTML = engines.map(e => {
+    const dot = e.healthy ? '#3fb950' : '#e5534b';
+    const state = e.healthy ? 'healthy' : 'down / starting';
+    // Only call toFixed() on a real number: a null/missing `used` would otherwise
+    // throw and abort the render of every engine tile.
+    let vram = '';
+    if (e.vram) {
+      const used = e.vram.used;
+      const usedTxt = (typeof used === 'number') ? used.toFixed(1) : (used ?? 0);
+      vram = `${usedTxt}/${e.vram.total ?? '—'} GB`;
+    }
+    const cool = e.cooling ? ` <span class="badge badge-warn" style="font-size:9px">❄ cooling</span>` : '';
+    const prim = e.primary ? ` <span class="badge badge-user" style="font-size:9px">primary</span>` : '';
+    const models = (e.loaded_models||[]).length;
+    // The id/name travel as data-* attributes instead of being spliced into the
+    // inline handler's JS source: HTML entities are decoded before the handler is
+    // parsed, so HTML-escaping alone cannot protect a quoted JS string.
+    // NOTE(review): assumes esc() escapes double quotes for attribute context — confirm.
+    return `<div class="sys-tile">
+      <div class="sys-head">
+        <span class="sys-name">${esc(e.name)} <span class="dim" style="text-transform:none">(${esc(e.backend)})</span>${prim}${cool}</span>
+        <span style="width:9px;height:9px;border-radius:50%;background:${dot};display:inline-block" title="${state}"></span>
+      </div>
+      <div class="sys-sub"><span>${esc(state)}${vram?' · '+esc(vram):''}</span><span>${models} model${models!==1?'s':''}</span></div>
+      <div style="margin-top:.5rem;text-align:right">
+        <button class="btn btn-ghost" style="font-size:11px;padding:.15rem .5rem;color:var(--error,#e55)"
+                data-engine-id="${esc(String(e.id))}" data-engine-name="${esc(e.name)}"
+                onclick="restartEngine(this.dataset.engineId, this.dataset.engineName)" title="Kill and respawn this engine">↻ Restart</button>
+      </div>
+    </div>`;
+  }).join('');
+}
+
+// Ask for confirmation, POST the restart request for engine `id`, surface any
+// failure via alert(), then refresh the engines panel shortly afterwards.
+async function restartEngine(id, name){
+  const confirmed = confirm(`Restart engine "${name}"? In-flight requests on it will fail; the supervisor respawns it immediately.`);
+  if (!confirmed) return;
+  try {
+    const resp = await fetch(ROOT_PATH + '/admin/api/engines/' + id + '/restart', {method:'POST'});
+    if (!resp.ok) {
+      // The error body may not be JSON; fall back to a generic message.
+      const err = await resp.json().catch(()=>({}));
+      alert(err.detail || 'Restart failed');
+    }
+    setTimeout(loadEngines, 800);
+  } catch(err) {
+    alert(err.message);
+  }
+}
+
 let _refreshing = false;
 async function loadTasks() {
  if (_refreshing) return;
@@ -165,7 +248,19 @@ async function loadTasks() {

    const therm = data.thermal || {};
    const banner = document.getElementById('thermal-banner');
-    if (therm.active) {
+    // Multi-engine: name which engine(s) are cooling and on what (GPU vs CPU).
+    const cooling = data.cooling_engines || [];
+    if (cooling.length) {
+      const parts = cooling.map(c => {
+        const what = (c.gpu != null && c.cpu == null) ? `GPU ${Math.round(c.gpu)}°C`
+                   : (c.cpu != null && c.gpu == null) ? `CPU ${Math.round(c.cpu)}°C`
+                   : (c.message || 'cooling');
+        return `${esc(c.engine)} (${esc(what)})`;
+      });
+      document.getElementById('thermal-banner-msg').textContent =
+        ' Cooling down: ' + parts.join(', ');
+      banner.style.display = '';
+    } else if (therm.active) {
      document.getElementById('thermal-banner-msg').textContent = ' ' + (therm.message || '');
      banner.style.display = '';
    } else {
@@ -207,7 +302,7 @@ async function loadTasks() {
      }
      return `<tr>
        <td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
-        <td><div class="td-name">${esc(title)}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
+        <td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
        <td>${statusCell}</td>
        <td>${progressBar(t)}</td>
        <td class="dim small">${fmtTime(t.started_at)}</td>
@@ -248,7 +343,9 @@ async function removeTask(id) {

 loadTasks();
 loadSystemStats();
+loadEngines();
 setInterval(loadTasks, 2000);
 setInterval(loadSystemStats, 2000);
+setInterval(loadEngines, 5000);
 </script>
 {% endblock %}
--- a/codai/api/app.py
+++ b/codai/api/app.py
@@ -160,6 +160,32 @@ except ImportError:
    pass


+class _InternalAuthMiddleware:
+    """Reject any HTTP request that doesn't carry the front's internal token.
+
+    Active only when CODERAI_INTERNAL_TOKEN is set (i.e. this process is an engine
+    spawned by the front). It binds 127.0.0.1, but this also blocks anything else on
+    localhost from talking to the engine directly and bypassing the front. In
+    single-process mode the token is unset and this is a no-op.
+
+    Only ASGI scopes of type ``"http"`` are checked; every other scope type is
+    passed straight through to the wrapped app. The token is read once, at
+    construction time, and compared against the ``x-coderai-internal`` header
+    in constant time.
+    """
+
+    def __init__(self, app):
+        import hmac
+        self._app = app
+        self._token = os.environ.get("CODERAI_INTERNAL_TOKEN")
+        # Header values arrive as bytes; pre-encode the token once so the
+        # per-request check is a single bytes comparison. A token that cannot be
+        # represented in latin-1 can never equal a header value, so it is stored as
+        # None and every request is rejected (same outcome as a str comparison).
+        self._token_bytes = None
+        if self._token:
+            try:
+                self._token_bytes = self._token.encode("latin-1")
+            except UnicodeEncodeError:
+                self._token_bytes = None
+        # compare_digest avoids leaking how many leading bytes matched via timing.
+        self._compare = hmac.compare_digest
+
+    async def __call__(self, scope, receive, send):
+        if self._token and scope.get("type") == "http":
+            headers = dict(scope.get("headers", []))
+            got = bytes(headers.get(b"x-coderai-internal", b""))
+            expected = self._token_bytes
+            if expected is None or not self._compare(got, expected):
+                await send({"type": "http.response.start", "status": 403,
+                            "headers": [(b"content-type", b"application/json")]})
+                await send({"type": "http.response.body",
+                            "body": b'{"error":"forbidden: engines are reachable only '
+                                    b'through the front proxy"}'})
+                return
+        await self._app(scope, receive, send)
+
+
 class _ForwardedPrefixMiddleware:
    """Populate ASGI root_path from X-Forwarded-Prefix / X-Script-Name headers."""

@@ -180,6 +206,9 @@ class _ForwardedPrefixMiddleware:


 app.add_middleware(_ForwardedPrefixMiddleware)
+# Added last → outermost: the internal-token gate runs before anything else, so a
+# request without the front's token never reaches a route.
+app.add_middleware(_InternalAuthMiddleware)

 # Mount static files for admin dashboard
 from fastapi.staticfiles import StaticFiles
@@ -193,6 +222,77 @@ from fastapi.responses import FileResponse, Response as _FaviconResponse
 _favicon_path = admin_static_dir / "favicon.ico"


+@app.get("/healthz", include_in_schema=False)
+async def healthz():
+    """Liveness probe: report ``ok`` and this process's PID, nothing else.
+
+    Polled by the front proxy's engine supervisor to tell a *slow* engine (busy
+    loading a model, so the answer may arrive late) apart from a *dead* one
+    (connection refused). Deliberately touches no torch/model state and has no
+    dependencies, so it answers as soon as the event loop is free.
+    """
+    import os as _os
+    current_pid = _os.getpid()
+    return dict(ok=True, pid=current_pid)
+
+
+def _engine_state_loaded_models():
+    """Return the keys of ``multi_model_manager.models`` ([] on any failure)."""
+    try:
+        return list(multi_model_manager.models.keys())
+    except Exception:
+        return []
+
+
+def _engine_state_vram():
+    """Summarise VRAM over every CUDA device visible to this process.
+
+    Returns None when torch is unavailable, CUDA is unavailable, or any query
+    fails. Sizes are reported in units of 1e9 bytes, rounded to two decimals.
+    """
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            return None
+        # An engine may own more than one GPU (e.g. two cards sharding one large
+        # model), so totals are summed instead of reporting device 0 only.
+        count = torch.cuda.device_count()
+        used_b = free_b = total_b = 0
+        per_device = []
+        for idx in range(count):
+            dev_free, dev_total = torch.cuda.mem_get_info(idx)
+            used_b += dev_total - dev_free
+            free_b += dev_free
+            total_b += dev_total
+            per_device.append({"index": idx, "name": torch.cuda.get_device_name(idx),
+                               "free": round(dev_free / 1e9, 2),
+                               "total": round(dev_total / 1e9, 2)})
+        if count == 1:
+            gpu_label = torch.cuda.get_device_name(0)
+        else:
+            gpu_label = f"{count}× CUDA"
+        return {"used": round(used_b / 1e9, 2), "free": round(free_b / 1e9, 2),
+                "total": round(total_b / 1e9, 2), "gpu": gpu_label,
+                "devices": per_device, "device_count": count}
+    except Exception:
+        return None
+
+
+def _engine_state_active_tasks():
+    """Return registry tasks whose status is running, queued or paused."""
+    try:
+        from codai.tasks import task_registry
+        return [task for task in task_registry.list()
+                if task.get("status") in ("running", "queued", "paused")]
+    except Exception:
+        return []
+
+
+def _engine_state_cooling():
+    """Return the active thermal cooldown (gpu/cpu/message), or None if inactive."""
+    try:
+        from codai.models import thermal
+        state = thermal.get_cooldown_state()
+        if not state.get("active"):
+            return None
+        return {"gpu": state.get("gpu"), "cpu": state.get("cpu"),
+                "message": state.get("message")}
+    except Exception:
+        return None
+
+
+@app.get("/internal/engine-state", include_in_schema=False)
+async def internal_engine_state():
+    """Auth-free engine introspection for the front proxy's router/aggregator.
+
+    Engines bind 127.0.0.1 only, so this is not publicly reachable. The response
+    carries the resident models (for model→engine routing), this engine's GPU/VRAM
+    summary (for cross-engine status aggregation), its active tasks, and its
+    thermal cooldown state. Each section is best-effort: a failing probe yields an
+    empty/None value rather than an error response.
+    """
+    import os as _os
+    loaded = _engine_state_loaded_models()
+    vram = _engine_state_vram()
+    # Running tasks let the front show cross-engine activity without needing a
+    # session on this engine (sessions live only on the primary).
+    tasks = _engine_state_active_tasks()
+    # Lets the front show WHICH engine is cooling (each engine pauses on its own
+    # GPUs; CPU pauses everything).
+    cooling = _engine_state_cooling()
+    return {"ok": True, "pid": _os.getpid(), "loaded_models": loaded,
+            "vram": vram, "tasks": tasks, "cooling": cooling}
+
+
 @app.get("/favicon.ico", include_in_schema=False)
 async def favicon():
    if _favicon_path.exists():

--- a/codai/api/ds4_worker.py
+++ b/codai/api/ds4_worker.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Fully-managed ds4 (DeepSeek V4) worker.
+
+ds4 / DwarfStar (https://github.com/antirez/ds4) is a native inference engine for
+DeepSeek V4 that ships its own OpenAI-compatible HTTP server (``ds4-server``). It is
+not a Python package — it's a C/CUDA/Metal binary built with ``make`` — so coderai
+owns the whole lifecycle here, mirroring :mod:`codai.api.parler_worker`:
+
+* :func:`ensure_built` clones the repo and runs the right ``make`` target so the
+  ``ds4-server`` binary exists (idempotent).
+* :func:`ensure_model` runs the project's ``download_model.sh`` for the configured
+  weight variant if the model isn't already present.
+* :func:`ensure_service` launches ``ds4-server`` on a free port, health-checks its
+  ``/v1/models`` endpoint, and returns the base URL.
+
+The matching ``Ds4Backend.cleanup()`` calls :func:`stop_service`, so the model
+manager's normal eviction tears the process down.
+"""
+
+import os
+import platform
+import shutil
+import socket
+import subprocess
+import sys
+import threading
+import time
+import collections
+from pathlib import Path
+from typing import Optional
+
+# Re-entrant lock guarding the _services registry below.
+_lock = threading.RLock()
+# Single managed server (ds4 serves one DeepSeek V4 model). Keyed by model_id so a
+# config change to a different id restarts cleanly.
+_services: dict[str, dict] = {}   # model_id -> {"proc","port","url"}
+# Set to True by ensure_built() once the ds4-server binary is known to exist.
+_built = False
+
+
+def default_install_dir() -> Path:
+    """Return the ds4 install directory used when the config does not set one.
+
+    A non-empty ``CODERAI_DS4_DIR`` environment variable wins; otherwise the
+    directory is ``~/.coderai/ds4`` with the home directory expanded.
+    """
+    override = os.environ.get("CODERAI_DS4_DIR")
+    if override:
+        return Path(override)
+    return Path(os.path.expanduser("~/.coderai/ds4"))
+
+
+def _install_dir(cfg) -> Path:
+    """Resolve the install directory: ``cfg.install_dir`` if set, else the default.
+
+    A configured path has ``~`` expanded; a missing or empty value falls back to
+    :func:`default_install_dir`.
+    """
+    configured = getattr(cfg, "install_dir", None)
+    if not configured:
+        return default_install_dir()
+    return Path(configured).expanduser()
+
+
+def _server_bin(install_dir: Path) -> Path:
+    """Return the expected location of the ``ds4-server`` binary in *install_dir*."""
+    return install_dir.joinpath("ds4-server")
+
+
+def _detect_build_target() -> str:
+    """Guess a build target from the host.
+
+    macOS → ``"metal"``; an ``nvcc`` on PATH or a ``/usr/local/cuda`` directory →
+    ``"cuda-generic"``; anything else → ``"cpu"``.
+    """
+    on_macos = platform.system() == "Darwin"
+    if on_macos:
+        return "metal"
+    has_cuda = bool(shutil.which("nvcc")) or os.path.isdir("/usr/local/cuda")
+    return "cuda-generic" if has_cuda else "cpu"
+
+
+def _resolve_target(cfg) -> str:
+    """Return the ``make`` target to build, or ``""`` for a bare ``make``.
+
+    ``cfg.build_target`` is used when set; a missing, empty or ``"auto"`` value is
+    replaced by the host-detected target. The ``"metal"`` → ``""`` translation is
+    applied to the final value, so an auto-detected Metal host gets the same bare
+    ``make`` as an explicit ``build_target="metal"``.
+    """
+    target = (getattr(cfg, "build_target", "auto") or "auto").strip()
+    if target in ("", "auto"):
+        target = _detect_build_target()
+    # ds4's macOS Metal target is the bare ``make`` (no suffix).
+    return "" if target == "metal" else target
+
+
+def _run_logged(cmd, cwd, label, tail, **kw):
+    """Run a subprocess, streaming its output with a ``[ds4]`` prefix into ``tail``.
+
+    stdout and stderr are merged. Every non-empty line (trailing whitespace
+    stripped) is appended to ``tail`` and echoed to this process's stdout.
+
+    Args:
+        cmd: Argument list for the command.
+        cwd: Working directory for the child process.
+        label: Short name used in the error message when the command fails.
+        tail: Container with an ``append`` method that receives the output lines.
+        **kw: Extra keyword arguments forwarded to ``subprocess.Popen``.
+
+    Raises:
+        RuntimeError: If the command exits non-zero; the message includes the
+            last five lines held in ``tail``.
+    """
+    print(f"[ds4] $ {' '.join(str(c) for c in cmd)}", flush=True)
+    # Build/download tools may emit bytes that are not valid in the locale
+    # encoding; replace them instead of aborting the run with UnicodeDecodeError.
+    kw.setdefault("errors", "replace")
+    # The context manager closes the pipe and reaps the child even when reading
+    # the output is interrupted by an exception.
+    with subprocess.Popen(cmd, cwd=str(cwd), stdout=subprocess.PIPE,
+                          stderr=subprocess.STDOUT, text=True, bufsize=1,
+                          **kw) as proc:
+        for line in proc.stdout:
+            line = line.rstrip()
+            if line:
+                tail.append(line)
+                print(f"[ds4] {line}", flush=True)
+    if proc.returncode != 0:
+        joined = " | ".join(list(tail)[-5:])
+        raise RuntimeError(f"{label} failed (exit {proc.returncode}). {joined}")
+
+
+def ensure_built(cfg) -> Path:
+    """Make sure the ``ds4-server`` binary exists, cloning and building if needed.
+
+    Returns the binary's path. When the binary is already present nothing is run.
+    Otherwise the repo is cloned (unless the install dir already has a ``.git`` or
+    a ``Makefile``) and ``make`` is invoked with the resolved target.
+
+    Raises:
+        RuntimeError: If the binary is missing and ``auto_build`` is disabled, if
+            a subprocess fails, or if the build finishes without producing the
+            binary.
+    """
+    global _built
+    root = _install_dir(cfg)
+    server = _server_bin(root)
+    if server.exists():
+        _built = True
+        return server
+
+    auto_build = getattr(cfg, "auto_build", True)
+    if not auto_build:
+        raise RuntimeError(
+            f"ds4-server not found at {server} and auto_build is disabled. Build ds4 "
+            f"manually (git clone {cfg.repo_url}; make <target>) or enable auto_build.")
+
+    # Shared by clone and build so a failure message can show recent output.
+    log_tail = collections.deque(maxlen=40)
+    root.parent.mkdir(parents=True, exist_ok=True)
+    has_checkout = (root / ".git").exists() or (root / "Makefile").exists()
+    if not has_checkout:
+        print(f"[ds4] cloning {cfg.repo_url} → {root} …", flush=True)
+        clone_cmd = ["git", "clone", "--depth", "1", cfg.repo_url, str(root)]
+        _run_logged(clone_cmd, cwd=root.parent, label="git clone", tail=log_tail)
+
+    target = _resolve_target(cfg)
+    # An empty target means a bare ``make``.
+    build_cmd = ["make", target] if target else ["make"]
+    print(f"[ds4] building ds4 (make {target or 'metal'}) — this can take a while …",
+          flush=True)
+    _run_logged(build_cmd, cwd=root, label="make", tail=log_tail)
+
+    if not server.exists():
+        recent = " | ".join(list(log_tail)[-5:])
+        raise RuntimeError(
+            f"ds4 build completed but {server} is missing. Last output: " + recent)
+    _built = True
+    print(f"[ds4] built {server}", flush=True)
+    return server
+
+
+def ensure_model(cfg) -> None:
+    """Fetch the configured GGUF weight variant unless a model is already present.
+
+    ds4's ``download_model.sh`` writes into ``<install_dir>/gguf/`` and updates the
+    ``ds4flash.gguf`` symlink that ``ds4-server`` loads by default. If that path
+    already exists this function returns immediately. If the download script is
+    missing, the repo is pulled (existing checkout) or shallow-cloned next to the
+    install dir and copied in, skipping anything that already exists there.
+
+    Raises:
+        RuntimeError: If the download script still cannot be found, or a
+            subprocess fails.
+    """
+    root = _install_dir(cfg)
+    # The default-loaded model file already resolves: nothing to do.
+    if (root / "ds4flash.gguf").exists():
+        return
+
+    downloader = root / "download_model.sh"
+    if not downloader.exists():
+        # The binary may have been bundled (e.g. into a Docker image) without the
+        # repo scripts. Obtaining them does not rebuild the already-present binary.
+        repo_log = collections.deque(maxlen=20)
+        is_checkout = (root / ".git").exists() or (root / "Makefile").exists()
+        if is_checkout:
+            _run_logged(["git", "-C", str(root), "pull", "--ff-only"],
+                        cwd=root, label="git pull", tail=repo_log)
+        else:
+            scratch = root.parent / (root.name + ".repo")
+            shutil.rmtree(scratch, ignore_errors=True)
+            _run_logged(["git", "clone", "--depth", "1", cfg.repo_url, str(scratch)],
+                        cwd=root.parent, label="git clone (scripts)", tail=repo_log)
+            # Copy repo entries into the install dir without clobbering anything
+            # already there (notably the built binary).
+            for entry in scratch.iterdir():
+                target = root / entry.name
+                if target.exists():
+                    continue
+                copy = shutil.copytree if entry.is_dir() else shutil.copy2
+                copy(entry, target)
+            shutil.rmtree(scratch, ignore_errors=True)
+
+    if not downloader.exists():
+        raise RuntimeError(f"ds4 download script not found at {downloader}")
+
+    variant = (getattr(cfg, "model_variant", "") or "q4-imatrix").strip()
+    download_log = collections.deque(maxlen=40)
+    print(f"[ds4] downloading DeepSeek V4 weights (variant '{variant}', multi-GB, "
+          f"resumable) …", flush=True)
+    _run_logged(["bash", str(downloader), variant], cwd=root,
+                label="download_model.sh", tail=download_log)
+
+
+def _free_port() -> int:
+    """Return a TCP port on 127.0.0.1 that was free at the time of the call.
+
+    The OS picks the port (bind to port 0). The probe socket is closed before
+    returning, so another process could take the port before the caller binds it.
+    """
+    # ``with`` guarantees the probe socket is closed even if bind() raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+        probe.bind(("127.0.0.1", 0))
+        return probe.getsockname()[1]
+
+
+def _pump_logs(proc, tail):
+    """Forward ``proc.stdout`` line by line until the stream ends.
+
+    Each non-empty line (trailing whitespace stripped) is appended to ``tail`` and
+    echoed to stdout with a ``[ds4]`` prefix; blank lines are dropped.
+    """
+    for raw in proc.stdout:
+        text = raw.rstrip()
+        if not text:
+            continue
+        tail.append(text)
+        print(f"[ds4] {text}", flush=True)
+
+
+def _health_ok(url: str) -> bool:
+    """Return True if ``GET <url>/v1/models`` answers with an OK status within 3 s.
+
+    Any exception while issuing the request is treated as "not healthy".
+    """
+    import requests
+    try:
+        response = requests.get(url + "/v1/models", timeout=3)
+    except Exception:
+        return False
+    return response.ok
+
+
+def ensure_service(cfg, ready_timeout: float = 3600.0) -> str:
+    """Build + download (as needed), then start (or reuse) ds4-server.
+
+    Returns the base URL. First call clones, builds, and downloads several GB, so the
+    timeout is generous.
+
+    A registered server that is alive but not answering health checks yet (still
+    loading, or being waited on by another caller) is reused: this call waits for
+    that same process instead of spawning a second ``ds4-server`` and orphaning the
+    first.
+
+    Args:
+        cfg: ds4 config object; the attributes read here are ``model_id``,
+            ``host``, ``port``, ``ctx`` and ``extra_args`` (all optional).
+        ready_timeout: Seconds to wait for the health check after the process is
+            known to be running.
+
+    Raises:
+        RuntimeError: If the server exits before becoming ready, or never becomes
+            ready within ``ready_timeout`` (the server is stopped in that case).
+    """
+    model_id = getattr(cfg, "model_id", "deepseek-v4") or "deepseek-v4"
+    with _lock:
+        svc = _services.get(model_id)
+        if svc and svc["proc"].poll() is not None:
+            _services.pop(model_id, None)   # died — restart below
+            svc = None
+        if svc and _health_ok(svc["url"]):
+            return svc["url"]
+
+        if svc is None:
+            binary = ensure_built(cfg)
+            ensure_model(cfg)
+
+            install_dir = _install_dir(cfg)
+            host = getattr(cfg, "host", "127.0.0.1") or "127.0.0.1"
+            port = int(getattr(cfg, "port", 0) or 0) or _free_port()
+            # ds4-server binds the requested host; build the URL from a loopback
+            # address for our own health checks / proxying when it's bound to
+            # 0.0.0.0.
+            connect_host = "127.0.0.1" if host in ("0.0.0.0", "::") else host
+            url = f"http://{connect_host}:{port}"
+
+            cmd = [str(binary), "--host", host, "--port", str(port),
+                   "--ctx", str(int(getattr(cfg, "ctx", 100000) or 100000)),
+                   "--chdir", str(install_dir)]
+            extra = (getattr(cfg, "extra_args", "") or "").strip()
+            if extra:
+                import shlex
+                cmd += shlex.split(extra)
+
+            proc = subprocess.Popen(
+                cmd, cwd=str(install_dir), stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT, text=True, bufsize=1,
+            )
+            tail = collections.deque(maxlen=15)
+            threading.Thread(target=_pump_logs, args=(proc, tail), daemon=True).start()
+            # The log tail is kept with the entry so a later caller waiting on
+            # this same process can still report its recent output.
+            svc = {"proc": proc, "port": port, "url": url, "tail": tail}
+            _services[model_id] = svc
+
+        # Either the server just spawned above, or one that is already running
+        # but not healthy yet — in both cases wait on that process below.
+        proc = svc["proc"]
+        url = svc["url"]
+        tail = svc.get("tail", ())
+
+    def _tail_msg():
+        joined = " | ".join(list(tail)[-5:]).strip()
+        return f". Last output: {joined}" if joined else ""
+
+    # Poll outside the lock so stop_service() is not blocked while waiting.
+    deadline = time.time() + ready_timeout
+    while time.time() < deadline:
+        if proc.poll() is not None:
+            raise RuntimeError(
+                f"ds4-server exited (code {proc.returncode}) before becoming ready"
+                + _tail_msg())
+        if _health_ok(url):
+            print(f"[ds4] service ready for {model_id} at {url}", flush=True)
+            return url
+        time.sleep(2)
+    stop_service(model_id)
+    raise RuntimeError(f"ds4-server for {model_id} did not become ready in time"
+                       + _tail_msg())
+
+
+def stop_service(model_id: str) -> None:
+    """Stop and unregister the managed server for *model_id*, if there is one.
+
+    The entry is removed from the registry first. A still-running process is
+    asked to terminate and given 10 seconds; if it is still alive it is killed and
+    then reaped. Teardown is best-effort: errors while signalling or waiting are
+    ignored so eviction and interpreter-exit cleanup never fail here.
+    """
+    with _lock:
+        svc = _services.pop(model_id, None)
+    if not svc:
+        return
+    proc = svc["proc"]
+    if proc.poll() is None:
+        try:
+            proc.terminate()
+            proc.wait(timeout=10)
+        except Exception:
+            pass   # best-effort; fall through to kill below if still alive
+    if proc.poll() is None:
+        try:
+            proc.kill()
+            # Reap the killed child so it does not linger as a zombie; bounded so
+            # a process stuck in the kernel cannot hang teardown.
+            proc.wait(timeout=5)
+        except Exception:
+            pass   # best-effort teardown
+    print(f"[ds4] service for {model_id} stopped", flush=True)
+
+
+def stop_all() -> None:
+    """Stop every service currently in the registry."""
+    # Snapshot the ids first: stop_service() removes entries as it goes.
+    registered_ids = list(_services)
+    for registered_id in registered_ids:
+        stop_service(registered_id)
+
+
+# Registered at import time so every ds4-server still in the registry is stopped
+# when the interpreter exits.
+import atexit as _atexit
+_atexit.register(stop_all)
--- a/codai/api/embeddings.py
+++ b/codai/api/embeddings.py
@@ -106,6 +106,27 @@ async def create_embeddings(request: EmbeddingsRequest, http_request: Request =
    """
    OpenAI-compatible embeddings endpoint.
    """
+    # Register a task so embeddings appear in the unified task list, like every
+    # other model type. Finished on success or error below.
+    from codai.tasks import task_registry
+    _title = request.input if isinstance(request.input, str) else "embeddings"
+    _tid = task_registry.register(
+        "embedding", title=str(_title)[:80], model=(request.model or "embedding"))
+    task_registry.start(_tid)
+    try:
+        _resp = await _run_embeddings(request, http_request)
+        task_registry.finish(_tid, "done")
+        return _resp
+    except HTTPException:
+        task_registry.finish(_tid, "error")
+        raise
+    except Exception as e:
+        task_registry.finish(_tid, "error", str(e)[:200])
+        raise
+
+
+async def _run_embeddings(request: EmbeddingsRequest, http_request: Request = None):
+    """Core embeddings logic; registered as a task by create_embeddings()."""
    model_info = await asyncio.to_thread(
        multi_model_manager.request_model, request.model, model_type="embedding")
    model_name = model_info.get('model_name')

--- a/codai/api/parler_worker.py
+++ b/codai/api/parler_worker.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Fully-managed Parler-TTS worker.
+
+parler-tts pins an old transformers/tokenizers/huggingface-hub that conflict with
+the coderai server's stack, so it can't share this venv. Instead coderai owns the
+whole lifecycle here: on first use it bootstraps a dedicated venv (installing
+parler-tts), launches ``tools/parler_tts_service.py`` in it as a local HTTP
+service, health-checks it, and hands back the URL. The matching
+``_RemoteParlerBackend.cleanup()`` calls :func:`stop_service`, so the model
+manager's normal eviction tears the process down — no manual setup or config.
+"""
+
+import os
+import socket
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_SERVICE_SCRIPT = _REPO_ROOT / "tools" / "parler_tts_service.py"
+
+# Dedicated venv for the (incompatible) parler-tts stack. It is fully isolated
+# (no system site-packages): parler's pinned transformers/tokenizers must not
+# mix with the server's stack, so the venv installs its own torch as well.
+_VENV_DIR = Path(os.environ.get("CODERAI_PARLER_VENV")
+                 or os.path.expanduser("~/.coderai/parler_venv"))
+
+_lock = threading.RLock()
+_services: dict[str, dict] = {}   # model_name -> {"proc","port","url"}
+_bootstrapped = False
+
+
+def _venv_python() -> Path:
+    """Return the path of the venv's Python interpreter for this platform."""
+    if os.name == "nt":
+        return _VENV_DIR / "Scripts" / "python.exe"
+    return _VENV_DIR / "bin" / "python"
+
+
+def _pip_ok(py: Path) -> bool:
+    """Report whether ``parler_tts`` and ``soundfile`` import under ``py``.
+
+    Any failure to launch the interpreter counts as "not installed"."""
+    try:
+        probe = [str(py), "-c", "import parler_tts, soundfile"]
+        completed = subprocess.run(probe, capture_output=True)
+    except Exception:
+        return False
+    return completed.returncode == 0
+
+
+def _venv_is_system_site() -> bool:
+    """True if the venv was built with --system-site-packages (can't isolate).
+
+    An unreadable or missing ``pyvenv.cfg`` is reported as False."""
+    marker = "include-system-site-packages = true"
+    try:
+        cfg_text = (_VENV_DIR / "pyvenv.cfg").read_text()
+        return marker in cfg_text.lower()
+    except Exception:
+        return False
+
+
+def _bootstrap_venv() -> Path:
+    """Create a fully-isolated venv and install parler-tts (idempotent).
+
+    Isolation is the whole point: parler-tts pins an old transformers/tokenizers
+    that must NOT be shared with — or shadowed by — the server's stack, so the
+    venv gets its own copy of everything (torch included). Returns its python.
+
+    Raises ``subprocess.CalledProcessError`` when venv creation or the pip
+    install exits non-zero (both run with ``check=True``), and ``RuntimeError``
+    when the install succeeds but the packages still cannot be imported."""
+    global _bootstrapped
+    py = _venv_python()
+    # Fast path: already verified in this process and the interpreter still exists.
+    if _bootstrapped and py.exists():
+        return py
+    # A previously-created shared-site venv leaks the server's transformers in;
+    # rebuild it isolated.
+    if py.exists() and _venv_is_system_site():
+        import shutil
+        print("[parler] rebuilding venv as fully isolated …", flush=True)
+        shutil.rmtree(_VENV_DIR, ignore_errors=True)
+    # Re-check on disk: the interpreter may have just been removed above.
+    if not _venv_python().exists():
+        print(f"[parler] creating isolated venv at {_VENV_DIR} …", flush=True)
+        _VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
+        subprocess.run([sys.executable, "-m", "venv", str(_VENV_DIR)], check=True)
+    py = _venv_python()
+    # Install only when the import probe fails, so repeated calls are cheap.
+    if not _pip_ok(py):
+        print("[parler] installing parler-tts + torch into the isolated venv "
+              "(first run, downloads several GB, this can take a while) …", flush=True)
+        subprocess.run([str(py), "-m", "pip", "install",
+                        "git+https://github.com/huggingface/parler-tts.git",
+                        "soundfile"], check=True)
+        # pip can exit 0 and still leave an unimportable package; verify again.
+        if not _pip_ok(py):
+            raise RuntimeError("parler-tts install did not yield an importable package")
+    _bootstrapped = True
+    return py
+
+
+def _free_port() -> int:
+    """Ask the OS for a currently-unused TCP port on 127.0.0.1 and return it.
+
+    The probe socket is closed before returning, so the port is only *likely*
+    to still be free by the time the caller binds it."""
+    # The context manager closes the socket even when bind() raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _pump_logs(proc: subprocess.Popen, tail):
+    """Echo the worker's output with a ``[parler]`` prefix.
+
+    Each non-empty line (trailing whitespace removed) is also appended to
+    ``tail``; blank lines are skipped entirely."""
+    for raw in proc.stdout:
+        text = raw.rstrip()
+        if not text:
+            continue
+        tail.append(text)
+        print(f"[parler] {text}", flush=True)
+
+
+def _health_ok(url: str) -> bool:
+    """Return True when ``GET <url>/health`` succeeds with a truthy ``"ok"``.
+
+    Any request or decoding error is treated as "not healthy"."""
+    import requests
+    try:
+        resp = requests.get(url + "/health", timeout=3)
+        if not resp.ok:
+            return False
+        return bool(resp.json().get("ok"))
+    except Exception:
+        return False
+
+
+def ensure_service(model_name: str, ready_timeout: float = 1800.0) -> str:
+    """Start (or reuse) the worker for ``model_name`` and return its base URL.
+
+    The first call bootstraps the venv and the worker then loads the model, so
+    the timeout is generous. The worker runs in offline mode and never
+    downloads anything itself.
+
+    A worker that is alive but not answering ``/health`` yet (still loading) is
+    joined rather than duplicated: the caller waits on the existing process
+    instead of spawning a second one that would orphan the first.
+
+    Raises RuntimeError if the worker exits before becoming ready or does not
+    become ready within ``ready_timeout`` seconds (it is stopped in that case).
+    """
+    import collections
+
+    with _lock:
+        svc = _services.get(model_name)
+        if svc and svc["proc"].poll() is not None:
+            _services.pop(model_name, None)   # died — restart below
+            svc = None
+        if svc and _health_ok(svc["url"]):
+            return svc["url"]
+
+        if svc:
+            # Alive but still starting up (another caller launched it): wait on
+            # that same process below instead of launching a duplicate.
+            proc, url = svc["proc"], svc["url"]
+            tail = svc.get("tail")
+            if tail is None:
+                tail = collections.deque(maxlen=15)
+        else:
+            py = _bootstrap_venv()
+            port = _free_port()
+            url = f"http://127.0.0.1:{port}"
+            env = dict(os.environ)
+            # The worker must use the model already pulled via coderai's HF
+            # download interface — it never downloads anything itself. Point it
+            # at coderai's cache and force offline mode, so a missing model
+            # fails fast instead of silently fetching.
+            try:
+                from codai.models.cache import get_hf_hub_cache_dir
+                hub = get_hf_hub_cache_dir()
+                env["HF_HUB_CACHE"] = hub
+                env["HUGGINGFACE_HUB_CACHE"] = hub
+            except Exception:
+                pass
+            env["HF_HUB_OFFLINE"] = "1"
+            env["TRANSFORMERS_OFFLINE"] = "1"
+            proc = subprocess.Popen(
+                [str(py), str(_SERVICE_SCRIPT), "--model", model_name,
+                 "--host", "127.0.0.1", "--port", str(port)],
+                stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
+                bufsize=1, env=env, cwd=str(_REPO_ROOT),
+            )
+            tail = collections.deque(maxlen=15)
+            threading.Thread(target=_pump_logs, args=(proc, tail), daemon=True).start()
+            # Keep the log tail in the registry so callers that join this
+            # worker can report its last output too.
+            _services[model_name] = {"proc": proc, "port": port, "url": url,
+                                     "tail": tail}
+
+    def _tail_msg():
+        joined = " | ".join(list(tail)[-5:]).strip()
+        low = joined.lower()
+        if "offline" in low or ("not" in low and "found" in low):
+            return (f". The model isn't in coderai's cache — download "
+                    f"'{model_name}' from the model interface first. ({joined})")
+        return f". Last output: {joined}" if joined else ""
+
+    # Wait (outside the lock) for the service to load the model and answer.
+    deadline = time.time() + ready_timeout
+    while time.time() < deadline:
+        if proc.poll() is not None:
+            raise RuntimeError(
+                f"Parler worker exited (code {proc.returncode}) before becoming ready"
+                + _tail_msg())
+        if _health_ok(url):
+            print(f"[parler] service ready for {model_name} at {url}", flush=True)
+            return url
+        time.sleep(2)
+    stop_service(model_name)
+    raise RuntimeError(f"Parler worker for {model_name} did not become ready in time"
+                       + _tail_msg())
+
+
+def stop_service(model_name: str) -> None:
+    """Stop the Parler worker registered for ``model_name`` (no-op if none).
+
+    The registry entry is removed under the lock. The process is then asked to
+    terminate and given 10 s to exit; if it is still alive it is force-killed
+    and reaped. Shutdown is best-effort: failures are swallowed, never raised.
+    """
+    with _lock:
+        svc = _services.pop(model_name, None)
+    if not svc:
+        return
+    proc = svc["proc"]
+    if proc.poll() is None:
+        try:
+            proc.terminate()
+            proc.wait(timeout=10)
+        except Exception:
+            pass
+    if proc.poll() is None:
+        try:
+            proc.kill()
+            # Reap the killed child; without a wait() it lingers as a zombie
+            # for as long as this process keeps running.
+            proc.wait(timeout=5)
+        except Exception:
+            pass
+    print(f"[parler] service for {model_name} stopped", flush=True)
+
+
+def stop_all() -> None:
+    """Stop every Parler worker currently in the registry."""
+    # Snapshot the names first: stop_service() removes entries from _services.
+    pending = list(_services.keys())
+    for model_name in pending:
+        stop_service(model_name)
+
+
+import atexit as _atexit
+_atexit.register(stop_all)
--- a/codai/api/spatial.py
+++ b/codai/api/spatial.py
@@ -45,6 +45,31 @@ global_args = None
 global_file_path = None


+def _spatial_task(title: str):
+    """Build a decorator that tracks a spatial/3D endpoint in the task registry.
+
+    Every call of the wrapped coroutine registers a ``"spatial"`` task titled
+    ``title``, starts it, and finishes it as ``"done"`` or ``"error"``. The
+    original exception is always re-raised."""
+    from functools import wraps
+
+    def decorate(endpoint):
+        @wraps(endpoint)
+        async def tracked(*args, **kwargs):
+            from codai.tasks import task_registry
+            task_id = task_registry.register("spatial", title=title, model="spatial")
+            task_registry.start(task_id)
+            try:
+                outcome = await endpoint(*args, **kwargs)
+                task_registry.finish(task_id, "done")
+                return outcome
+            except HTTPException:
+                # HTTP errors are finished without a recorded message.
+                task_registry.finish(task_id, "error")
+                raise
+            except Exception as exc:
+                task_registry.finish(task_id, "error", str(exc)[:200])
+                raise
+        return tracked
+    return decorate
+
+
 def set_global_args(args):
    global global_args
    global_args = args
@@ -500,6 +525,7 @@ class ImageTo3DRequest(BaseModel):


 @router.post("/v1/images/to3d", summary="Image to 3D model")
+@_spatial_task("Image → 3D")
 async def image_to_3d(request: ImageTo3DRequest, http_request: Request = None):
    """Convert a 2D image to a 3D representation.

@@ -568,6 +594,7 @@ class ImageFrom3DRequest(BaseModel):


 @router.post("/v1/images/from3d", summary="Render a 3D model to an image")
+@_spatial_task("3D → image")
 async def image_from_3d(request: ImageFrom3DRequest, http_request: Request = None):
    """Render a 3D model (GLB/OBJ) to a 2D PNG image from a specified camera angle."""
    raw = _decode_b64(request.model_data)
@@ -601,6 +628,7 @@ class VideoTo3DRequest(BaseModel):


 @router.post("/v1/video/to3d", summary="Video to 3D model")
+@_spatial_task("Video → 3D")
 async def video_to_3d(request: VideoTo3DRequest, http_request: Request = None):
    """Convert a 2D video to a 3D video frame-by-frame.

@@ -642,6 +670,7 @@ class VideoFrom3DRequest(BaseModel):


 @router.post("/v1/video/from3d", summary="Render a 3D model to a video")
+@_spatial_task("3D → video")
 async def video_from_3d(request: VideoFrom3DRequest, http_request: Request = None):
    """Render a 3D model as a 360° turntable video."""
    raw = _decode_b64(request.model_data)
@@ -675,6 +704,7 @@ class Generate3DRequest(BaseModel):


 @router.post("/v1/3d/generate", summary="Generate a 3D model from a prompt")
+@_spatial_task("Generate 3D")
 async def generate_3d(request: Generate3DRequest, http_request: Request = None):
    """Generate a 3D model (GLB) from a text prompt and/or an image.


--- a/codai/api/text.py
+++ b/codai/api/text.py
@@ -86,6 +86,105 @@ def set_grammar_guided_gen(enabled: bool):
    _set_grammar_guided_gen(enabled)


+def _debug_requests_enabled() -> bool:
+    """True when --debug-requests is set (full client<->API payload logging)."""
+    if not global_args:
+        return False
+    return bool(getattr(global_args, 'debug_requests', False))
+
+
+def _summarize_tool_calls(tool_calls):
+    """Render OpenAI tool_calls as a list of ``name(arguments)`` strings.
+
+    Accepts dict-shaped or attribute-shaped (e.g. pydantic) calls. Non-string
+    arguments are JSON-encoded, falling back to ``str()`` when that fails."""
+    def _field(obj, key):
+        # One lookup for both dict-shaped and attribute-shaped objects.
+        if isinstance(obj, dict):
+            return obj.get(key, '')
+        return getattr(obj, key, '')
+
+    summaries = []
+    for call in (tool_calls or []):
+        if isinstance(call, dict):
+            fn = call.get('function')
+        else:
+            fn = getattr(call, 'function', None)
+        fn = fn or {}
+        name = _field(fn, 'name')
+        args = _field(fn, 'arguments')
+        if not isinstance(args, str):
+            try:
+                args = json.dumps(args)
+            except Exception:
+                args = str(args)
+        summaries.append(f"{name}({args})")
+    return summaries
+
+
+def log_request_exchange(request):
+    """Dump the incoming chat request (messages + tools) when --debug-requests.
+
+    Shows exactly what an agentic client (opencode, etc.) sends each turn —
+    including whether it replays prior assistant tool_calls and `role:tool`
+    results — so tool-call loops can be diagnosed from the wire, not guesswork.
+
+    Each message is printed on one line (content capped at 2000 characters),
+    followed by the names of the offered tools. Logging never raises: any
+    failure is reported on stdout instead."""
+    if not _debug_requests_enabled():
+        return
+    try:
+        print(f"\n{'#'*80}\n# >>> REQUEST  model={getattr(request, 'model', '?')}  "
+              f"stream={getattr(request, 'stream', False)}  "
+              f"tools={len(getattr(request, 'tools', None) or [])}\n{'#'*80}")
+        for i, m in enumerate(getattr(request, 'messages', []) or []):
+            role = getattr(m, 'role', '?')
+            content = getattr(m, 'content', '') or ''
+            if isinstance(content, list):
+                # default=str keeps one non-JSON-serializable content part from
+                # aborting the dump of the whole request.
+                content = json.dumps(content, default=str)
+            line = f"[{i}] {role}: {str(content)[:2000]}"
+            tcs = getattr(m, 'tool_calls', None)
+            if tcs:
+                line += "  tool_calls=" + json.dumps(_summarize_tool_calls(tcs))
+            tcid = getattr(m, 'tool_call_id', None)
+            if tcid:
+                line += f"  tool_call_id={tcid}"
+            name = getattr(m, 'name', None)
+            if name:
+                line += f"  name={name}"
+            print(line)
+        tools = getattr(request, 'tools', None) or []
+        if tools:
+            names = []
+            for t in tools:
+                fn = t.get('function', {}) if isinstance(t, dict) else getattr(t, 'function', None)
+                names.append((fn.get('name') if isinstance(fn, dict) else getattr(fn, 'name', '?')))
+            print(f"# tools offered: {names}")
+        print(f"{'#'*80}\n", flush=True)
+    except Exception as e:
+        print(f"[debug-requests] failed to log request: {e}", flush=True)
+
+
+def log_response_exchange(content, tool_calls=None, finish_reason=None,
+                          streamed=False, stage="pre-format"):
+    """Print the extracted assistant message when --debug-requests is on.
+
+    This is the content and tool_calls coderai pulled out of the generation,
+    i.e. the model's decision **before** the OpenAI formatter runs. Pair it with
+    :func:`log_response_payload` to see what the client actually receives.
+    Logging never raises: any failure is reported on stdout instead."""
+    if not _debug_requests_enabled():
+        return
+    try:
+        if streamed:
+            tag = "STREAM"
+        else:
+            tag = "RESPONSE"
+        print(f"\n{'#'*80}\n# <<< {tag} [{stage}]  finish_reason={finish_reason}\n{'#'*80}")
+        if content:
+            print(f"content: {str(content)[:2000]}")
+        for summary in (_summarize_tool_calls(tool_calls) if tool_calls else ()):
+            print(f"tool_call: {summary}")
+        if not (content or tool_calls):
+            print("(empty)")
+        print(f"{'#'*80}\n", flush=True)
+    except Exception as e:
+        print(f"[debug-requests] failed to log response: {e}", flush=True)
+
+
+def log_response_payload(payload, streamed=False):
+    """Print the exact payload sent to the client when --debug-requests is on.
+
+    ``payload`` is the post-formatter object: the SSE chunk dict for streaming
+    or the full JSON body otherwise. It is pretty-printed as JSON and capped at
+    4000 characters, so a formatter that rewrites or drops tool_calls or
+    content shows up here. Logging never raises."""
+    if not _debug_requests_enabled():
+        return
+    try:
+        if streamed:
+            tag = "STREAM CHUNK"
+        else:
+            tag = "RESPONSE BODY"
+        print(f"\n{'#'*80}\n# <<< {tag} [post-format, sent to client]\n{'#'*80}")
+        rendered = json.dumps(payload, indent=2, default=str)
+        print(rendered[:4000])
+        print(f"{'#'*80}\n", flush=True)
+    except Exception as e:
+        print(f"[debug-requests] failed to log payload: {e}", flush=True)
+
+
 # =============================================================================
 # Router and Endpoints
 # =============================================================================
@@ -96,6 +195,7 @@ router = APIRouter()
 @router.post("/v1/chat/completions", summary="Chat completions")
 async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
    """Chat completions endpoint with streaming and tool support."""
+    log_request_exchange(request)

    # Check if we should use litellm backend
    parser_type = getattr(global_args, 'parser', 'auto') if global_args else 'auto'
@@ -344,6 +444,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
            mm = _candidate
            break

+        _load_err = None
+        if _model_key:
+            _load_err = getattr(multi_model_manager, '_last_load_errors', {}).get(_model_key)
+        if _load_err:
+            raise HTTPException(status_code=503, detail=(
+                f"Model '{requested_model}' failed to load: {_load_err}"))
+
        print(f"Text model '{requested_model}' not ready, retrying in 5s "
              f"(attempt {_attempt + 1}/{_MAX_WAIT_TRIES})…")
        await asyncio.sleep(5)
@@ -1188,6 +1295,7 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
            }
        
        from fastapi.responses import JSONResponse
+        log_response_payload(formatted_response, streamed=False)
        return JSONResponse(content=formatted_response, headers=headers)

    # Compute prefix key for prompt-aggregation scheduling
@@ -1235,6 +1343,65 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
        finally:
            _release_instance()

+import re as _re
+
+_TOOL_SPAN_RE = _re.compile(r'<(tool|tool_call)\b[\s\S]*?</\1\s*>', _re.IGNORECASE)
+_TOOL_OPEN_RE = _re.compile(r'<(?:tool|tool_call)\b', _re.IGNORECASE)
+_TOOL_OPEN_TAGS = ('<tool>', '<tool_call>')
+# gemma-4 native tool call: `call:NAME{…}` (the <|tool_call> markers are stripped
+# by skip_special_tokens). Once it starts we withhold everything to the end of the
+# stream — the call is surfaced as structured tool_calls after generation.
+_GEMMA_CALL_OPEN_RE = _re.compile(r'call:\s*[A-Za-z_]\w*\s*\{')
+# Trailing text that can still grow into a gemma call: a prefix of `call:`, or
+# `call:` plus an optional NAME and optional whitespace with no `{` yet.
+_GEMMA_CALL_PARTIAL_RE = _re.compile(
+    r'(?:call:\s*(?:[A-Za-z_]\w*\s*)?|call|cal|ca|c)\Z')
+
+
+def _gate_tool_content(buffer: str, final: bool = False):
+    """Split accumulated stream text into (content_to_emit, held_buffer).
+
+    During tool-enabled streaming the model emits ``<tool>{json}</tool>`` spans
+    (or gemma-4 ``call:NAME{…}`` calls) inline. Those must NOT reach the client
+    as visible ``content`` (they're surfaced separately as structured
+    ``tool_calls``); otherwise the raw markup leaks into the chat.
+
+    Complete tool spans are removed. An in-progress span or gemma call is held
+    back from its start, as is a trailing fragment that could still grow into
+    one (a partial ``<tool`` tag, a prefix of ``call:``, or ``call:NAME`` with
+    optional whitespace before the ``{``). Only the safe text around them is
+    returned for emission. With ``final=True`` any leftover (possibly unclosed)
+    tool span or call is dropped, nothing is held, and the rest is emitted.
+
+    Returns a ``(emit, held)`` tuple of strings; ``held`` must be prepended to
+    the next chunk before calling again.
+    """
+    emit = []
+    # Strip complete tool spans, emitting the text around each.
+    while True:
+        m = _TOOL_SPAN_RE.search(buffer)
+        if not m:
+            break
+        emit.append(buffer[:m.start()])
+        buffer = buffer[m.end():]
+    # An open tag with no close yet → hold from there (a call is in progress).
+    m = _TOOL_OPEN_RE.search(buffer)
+    if m:
+        emit.append(buffer[:m.start()])
+        held = '' if final else buffer[m.start():]
+        return ''.join(emit), held
+    # gemma-4 `call:NAME{…}` — withhold from the call onward (extracted at the end).
+    gm = _GEMMA_CALL_OPEN_RE.search(buffer)
+    if gm:
+        emit.append(buffer[:gm.start()])
+        held = '' if final else buffer[gm.start():]
+        return ''.join(emit), held
+    if not final:
+        # Hold back a trailing '<…' that could still become a tool open tag.
+        lt = buffer.rfind('<')
+        if lt != -1:
+            tail = buffer[lt:].lower()
+            if any(t.startswith(tail) for t in _TOOL_OPEN_TAGS):
+                emit.append(buffer[:lt])
+                return ''.join(emit), buffer[lt:]
+        # Hold a trailing partial gemma call. Whitespace between NAME and '{'
+        # and a split 'call' prefix are covered too, so the start of a call
+        # cannot slip out as content before the '{' arrives.
+        cm = _GEMMA_CALL_PARTIAL_RE.search(buffer)
+        if cm:
+            emit.append(buffer[:cm.start()])
+            return ''.join(emit), buffer[cm.start():]
+    emit.append(buffer)
+    return ''.join(emit), ''
+
+
 async def stream_chat_response(
    messages: List[Dict],
    model_name: str,
@@ -1350,6 +1517,12 @@ async def stream_chat_response(
    
    try:
        chunk_count = 0
+        # Buffer for withholding in-progress tool tags from the content stream.
+        content_buffer = ""
+        # Exact content deltas actually streamed to the client (post-format,
+        # post tool-gating) — logged once at the end under --debug-requests so we
+        # see the real reply, not just what we extracted internally.
+        client_sent_content = ""

        # Debug: Print what is being passed to the model
        if get_global_debug():
@@ -1399,6 +1572,30 @@ async def stream_chat_response(
            # Pass through all content including whitespace - it's essential for message composition
            generated_text += filtered_chunk

+            # Live progress under --debug-requests so a non-terminating / looping
+            # generation is visible AS IT HAPPENS — the end-of-stream response logs
+            # below never fire if the model never stops. The front pumps engine
+            # stdout line-by-line, so emit newline-terminated snapshots (every N
+            # chunks) of the accumulated tail; a loop shows up as the same text
+            # repeating across snapshots.
+            if _debug_requests_enabled():
+                if chunk_count == 1:
+                    print(f"# <<< STREAMING [live] model={model_name} "
+                          f"(snapshots every 64 tokens until stop)", flush=True)
+                if chunk_count % 64 == 0:
+                    _tail = generated_text[-220:].replace("\n", "\\n")
+                    print(f"# <<< [live @{chunk_count} tok] …{_tail}", flush=True)
+
+            # When tools are enabled, gate the content so in-progress <tool>…</tool>
+            # spans are never streamed as visible text (they're surfaced as
+            # structured tool_calls after the stream). Without tools, stream as-is.
+            if tools:
+                content_buffer += filtered_chunk
+                filtered_chunk, content_buffer = _gate_tool_content(content_buffer)
+                if not filtered_chunk:
+                    await asyncio.sleep(0)
+                    continue
+
            data = {
                "id": completion_id,
                "object": "chat.completion.chunk",
@@ -1410,10 +1607,30 @@ async def stream_chat_response(
                    "finish_reason": None,
                }],
            }
+            client_sent_content += filtered_chunk
            yield f"data: {json.dumps(data)}\n\n"
            # Explicitly flush to ensure data is sent immediately
            await asyncio.sleep(0)

+        # Flush any safe trailing text held back by the tool-content gate
+        # (dropping leftover/unclosed tool tags — they become tool_calls below).
+        if tools and content_buffer:
+            tail_content, content_buffer = _gate_tool_content(content_buffer, final=True)
+            if tail_content:
+                data = {
+                    "id": completion_id,
+                    "object": "chat.completion.chunk",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {"content": tail_content},
+                        "finish_reason": None,
+                    }],
+                }
+                client_sent_content += tail_content
+                yield f"data: {json.dumps(data)}\n\n"
+

        # In debug mode, dump the full generated text
        if get_global_debug():
@@ -1484,6 +1701,9 @@ async def stream_chat_response(
                    print(f"{'='*80}\n")
                # Tool calls were extracted and stripped from content during streaming
                # Just send the tool_calls chunk
+                log_response_exchange(generated_text, tool_calls=tool_calls,
+                                      finish_reason="tool_calls", streamed=True,
+                                      stage="pre-format extracted")
                data = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
@@ -1497,6 +1717,7 @@ async def stream_chat_response(
                        "native_finish_reason": "tool_calls",
                    }],
                }
+                log_response_payload(data, streamed=True)
                yield f"data: {json.dumps(data)}\n\n"
            else:
                # Calculate token counts for usage in final chunk
@@ -1514,7 +1735,12 @@ async def stream_chat_response(
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens,
                }
+                log_response_exchange(generated_text, finish_reason="stop",
+                                      streamed=True, stage="pre-format extracted")
+                log_response_exchange(client_sent_content, finish_reason="stop",
+                                      streamed=True, stage="post-format sent to client")
                final_chunk = formatter.format_litellm_chunk("", is_final=True, usage=usage_details, context_size=context_size)
+                log_response_payload(final_chunk, streamed=True)
                yield f"data: {json.dumps(final_chunk)}\n\n"
        else:
            # Calculate token counts for usage in final chunk
@@ -1571,6 +1797,11 @@ async def stream_chat_response(
                },
                "system_fingerprint": None,
            }
+            log_response_exchange(generated_text, finish_reason="stop",
+                                  streamed=True, stage="pre-format extracted")
+            log_response_exchange(client_sent_content, finish_reason="stop",
+                                  streamed=True, stage="post-format sent to client")
+            log_response_payload(final_chunk, streamed=True)
            yield f"data: {json.dumps(final_chunk)}\n\n"

        yield "data: [DONE]\n\n"
@@ -1740,6 +1971,10 @@ async def generate_chat_response(
        context_size = current_manager.get_context_size()
        
        # Use OpenAIFormatter for final sanitization
+        log_response_exchange(response_message.get("content", ""),
+                              tool_calls=response_message.get("tool_calls"),
+                              finish_reason=finish_reason, streamed=False,
+                              stage="pre-format extracted")
        formatter = OpenAIFormatter(model_name)
        formatted_response = formatter.format_litellm_full(
            text=response_message.get("content", ""),
@@ -1789,6 +2024,7 @@ async def generate_chat_response(
            print(json.dumps(formatted_response, indent=2))
            print(f"{'='*80}\n")

+        log_response_payload(formatted_response, streamed=False)
        return formatted_response
    except Exception as e:
        print(f"Error during generation: {e}")

--- a/codai/api/transcriptions.py
+++ b/codai/api/transcriptions.py
@@ -135,6 +135,32 @@ async def create_transcription(
    if len(file_content) > _MAX_AUDIO_BYTES:
        raise HTTPException(status_code=413, detail="Audio file too large (max 100 MB)")

+    # Register a task so transcription appears in the unified task list, like
+    # every other model type. Finished on success or error below.
+    from codai.tasks import task_registry
+    _tid = task_registry.register(
+        "transcription",
+        title=(file.filename or "audio")[:80],
+        model=model or "",
+    )
+    task_registry.start(_tid)
+    try:
+        _resp = await _run_transcription(
+            file_content, model, language, prompt, response_format, temperature, file)
+        task_registry.finish(_tid, "done")
+        return _resp
+    except HTTPException:
+        task_registry.finish(_tid, "error")
+        raise
+    except Exception as e:
+        task_registry.finish(_tid, "error", str(e)[:200])
+        raise
+
+
+async def _run_transcription(
+    file_content: bytes, model: str, language, prompt, response_format, temperature, file
+):
+    """Core transcription logic; registered as a task by create_transcription()."""
    # Check if the requested model maps to a configured whisper-server instance first.
    # Try alias round-robin resolution before direct ID lookup.
    whisper_model_id = multi_model_manager.resolve_whisper_alias_model_id(model)

--- a/codai/api/tts.py
+++ b/codai/api/tts.py
@@ -28,6 +28,7 @@ from pydantic import BaseModel, ConfigDict

 # Import from codai modules
 from codai.models.manager import multi_model_manager
+from codai.api import tts_backends


 # Global reference to be set by coderai
@@ -40,6 +41,20 @@ def set_global_args(args):
    global_args = args


+# Substrings that mark a model as a text/classifier/embedding model wrongly routed
+# to TTS (e.g. an emotion classifier exposed under a stray ``tts:`` alias).
+_NON_TTS_HINTS = (
+    "go_emotions", "roberta", "bert", "embedding", "e5-", "minilm",
+    "classifier", "toxic", "reranker", "sentence-transformers",
+)
+
+
+def _family_is_text_model(model_name: str) -> bool:
+    """Heuristic guard: True when the name marks a non-speech (text) model.
+
+    The match is a case-insensitive substring test against ``_NON_TTS_HINTS``;
+    an empty or missing name is never flagged."""
+    lowered = (model_name or "").lower()
+    for hint in _NON_TTS_HINTS:
+        if hint in lowered:
+            return True
+    return False
+
+
 # =============================================================================
 # Router and Endpoints
 # =============================================================================
@@ -72,6 +87,16 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
    Supports:
    - Kokoro TTS models (when --tts-model is specified)
    """
+    # Register a task so TTS shows up in the unified task list / dashboard,
+    # like every other model type. Finished on success or error below.
+    from codai.tasks import task_registry, loading_task
+    _tid = task_registry.register(
+        "tts",
+        title=(request.input or "")[:80],
+        model=(request.model or request.voice_profile or "tts"),
+    )
+    task_registry.start(_tid)
+    try:
        # If a voice profile is requested, delegate to voice cloning (F5-TTS)
        if request.voice_profile:
            from codai.api.voice_clone import _load_voice, _f5tts_clone
@@ -96,6 +121,7 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
            except Exception as e:
                raise HTTPException(status_code=500, detail=f"Voice cloning failed: {e}")
            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+            task_registry.finish(_tid, "done")
            return {"audio": audio_base64}

        # Use the manager to resolve the model and manage VRAM
@@ -111,7 +137,7 @@ async def create_speech(request: TTSRequest, http_request: Request = None):

        model_name = model_info['model_name']
        model_key = model_info['model_key']
-    kokoro_model = model_info['model_object']
+        tts_backend = model_info['model_object']

        # If no TTS model configured, return an error
        if not model_name:
@@ -120,35 +146,42 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
                detail="TTS not configured. Use --tts-model to specify a model."
            )

-    # Try to use kokoro if available
-    try:
-        from kokoro import Kokoro
+        # Reject text/classifier models that aren't actually speech synthesizers.
+        if _family_is_text_model(model_name):
+            raise HTTPException(
+                status_code=404,
+                detail=(f"Model '{model_name}' is a text model and cannot be used for "
+                        "tts generation. Use a TTS model (e.g. a kokoro/XTTS/Bark model).")
+            )

-        if kokoro_model is None:
-            print(f"Loading Kokoro TTS model: {model_name}")
+        try:
+            from codai.api import tts_backends

-            # Check if model_name is a URL - download it (with caching)
-            model_path = None
-            if model_name.startswith('http://') or model_name.startswith('https://'):
-                print(f"Loading model from URL: {model_name}")
-                from codai.models.cache import load_model
-                model_path = load_model(model_name)
-                if not model_path:
-                    raise Exception(f"Failed to load model from {model_name}")
-            else:
-                # Use local path or model name
+            if tts_backend is None:
+                print(f"Loading TTS model: {model_name}")
                model_path = model_name
-            
-            # Load the Kokoro model
-            kokoro_model = Kokoro(model_path if model_path else model_name)
-            multi_model_manager.add_model(model_key, kokoro_model)
+                if model_name.startswith(('http://', 'https://')):
+                    from codai.models.cache import load_model
+                    model_path = load_model(model_name) or model_name
+                cfg = multi_model_manager.config.get(model_key) or \
+                    multi_model_manager.config.get(f"tts:{model_name}") or {}
+                with loading_task(model_name, model_type="tts"):
+                    tts_backend = await asyncio.to_thread(
+                        tts_backends.load_backend, model_name, model_path, cfg)
+                multi_model_manager.add_model(model_key, tts_backend)
                multi_model_manager.current_model_key = model_key

-        # Generate speech
-        voice = request.voice or "af_sarah"
+            voice = request.voice or getattr(tts_backend, "default_voice", "")
            speed = request.speed or 1.0
+            lang = getattr(request, "language", None) or "en-us"
+            emotion = getattr(request, "emotion", None) or ""
+            style = getattr(request, "style", None) or ""
+            fmt = request.response_format or "wav"

-        audio_bytes = kokoro_model.generate(request.input, voice=voice, speed=speed)
+            samples, sample_rate = await asyncio.to_thread(
+                tts_backend.synthesize, request.input, voice, speed, lang, emotion, style)
+            audio_bytes, out_fmt = await asyncio.to_thread(
+                tts_backends.encode_audio, samples, sample_rate, fmt)

            try:
                from codai.api.archive import archive_manager
@@ -157,27 +190,29 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
                    "tts", "/v1/audio/speech",
                    model_name,
                    request.input,
-                {"voice": voice, "speed": speed, "response_format": request.response_format},
-                [(audio_bytes, request.response_format or "mp3")],
+                    {"voice": voice, "speed": speed, "response_format": out_fmt},
+                    [(audio_bytes, out_fmt)],
                ))
            except Exception:
                pass

-        # Convert to base64
            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+            task_registry.finish(_tid, "done")
+            return {"audio": audio_base64}

-        return {
-            "audio": audio_base64
-        }
-        
-    except ImportError as e:
-        # kokoro not installed
-        raise HTTPException(
-            status_code=501,
-            detail=f"TTS not available. Install kokoro: pip install kokoro. Error: {str(e)}"
-        )
+        except HTTPException:
+            raise
+        except tts_backends.MissingEngineError as e:
+            # Missing optional engine (e.g. coqui-tts) → actionable 501.
+            raise HTTPException(status_code=501, detail=str(e))
        except Exception as e:
            print(f"TTS error: {e}")
            import traceback
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=f"TTS error: {str(e)}")
+    except HTTPException:
+        task_registry.finish(_tid, "error")
+        raise
+    except Exception as e:
+        task_registry.finish(_tid, "error", str(e)[:200])
+        raise
\ No newline at end of file
--- a/codai/api/tts_backends.py
+++ b/codai/api/tts_backends.py
+"""Pluggable text-to-speech backends for the /v1/audio/speech endpoint.
+
+Dispatches a TTS request to the right engine based on the model family:
+
+* **kokoro**  → ``kokoro-onnx`` (ONNX runtime, no torch/spaCy needed). Requires a
+  ``kokoro-*.onnx`` model file and a ``voices-*.bin`` file; both are auto-resolved
+  from a local dir / HF repo, or downloaded from the kokoro-onnx release.
+* **coqui / XTTS** → ``coqui-TTS`` (``pip install coqui-tts``) when installed.
+* **parler** → ``parler-tts`` (expressive; voice/emotion/delivery/speed are steered
+  through a natural-language description prompt) when installed.
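+* **bark** → transformers ``AutoProcessor`` + ``BarkModel`` (expressive through
+  in-text cues and voice presets).
+* **anything else** → transformers ``pipeline("text-to-speech")`` (SpeechT5,
+  VITS / MMS-TTS, …).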
+
+Every backend returns ``(samples: np.float32 [-1, 1], sample_rate: int)`` which is
+then encoded to the requested container by :func:`encode_audio`.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+from pathlib import Path
+from typing import Optional, Tuple
+
+import numpy as np
+
+# Official kokoro-onnx model + voices release (used when files aren't local).
+_KOKORO_MODEL_URL = (
+    "https://github.com/thewh1teagle/kokoro-onnx/releases/download/"
+    "model-files-v1.0/kokoro-v1.0.onnx"
+)
+_KOKORO_VOICES_URL = (
+    "https://github.com/thewh1teagle/kokoro-onnx/releases/download/"
+    "model-files-v1.0/voices-v1.0.bin"
+)
+
+
+class MissingEngineError(RuntimeError):
+    """Raised when the optional engine for a TTS family isn't installed.
+
+    The optional-engine backends in this module raise it, with installation
+    guidance as the message, when importing their engine package fails.
+    """
+
+
+def _family(model_name: str) -> str:
+    """Map a TTS model name onto the backend family that should serve it.
+
+    Matching is a case-insensitive substring test tried in a fixed order; a
+    name matching none of the markers belongs to the generic "transformers"
+    family.
+    """
+    lowered = (model_name or "").lower()
+    # (markers, family) pairs — the first pair with a matching marker wins.
+    rules = (
+        (("kokoro",), "kokoro"),
+        (("xtts", "coqui"), "coqui"),
+        (("parler",), "parler"),
+        (("bark",), "bark"),
+    )
+    for markers, family in rules:
+        if any(marker in lowered for marker in markers):
+            return family
+    return "transformers"
+
+
+# Discrete emotion presets a family can steer at synthesis time. Empty unless an
+# engine actually supports it — clients surface an emotion picker only when this
+# is non-empty, so the control stays hidden for engines that can't honour it.
+_FAMILY_EMOTIONS: dict[str, list[str]] = {
+    # Parler steers these through its natural-language description prompt.
+    "parler": ["neutral", "happy", "sad", "angry", "excited", "calm", "fearful"],
+    # Bark has no true emotion knob; it inserts matching non-verbal cues in text.
+    "bark": ["neutral", "laughter", "sigh", "gasp"],
+}
+
+# Delivery / vocal styles a family can steer (whisper, shout/scream, tone, …).
+# Empty unless an engine actually honours it — kept separate from emotions so a
+# client can offer "how it's said" independently of "what's felt".
+_FAMILY_STYLES: dict[str, list[str]] = {
+    "parler": ["normal", "whispering", "shouting", "monotone", "expressive"],
+    "bark": ["normal", "whispering", "singing", "emphasis"],
+}
+
+
+def family_emotions(model_name: str) -> list[str]:
+    """List the emotion presets steerable for ``model_name`` ([] when unsupported).
+
+    A fresh list is built on every call, so callers may mutate the result.
+    """
+    presets = _FAMILY_EMOTIONS.get(_family(model_name), [])
+    return [*presets]
+
+
+def family_styles(model_name: str) -> list[str]:
+    """List the delivery styles (whisper/shout/tone/…) steerable for ``model_name``.
+
+    Returns [] when the model's family has none; the list is a fresh copy.
+    """
+    presets = _FAMILY_STYLES.get(_family(model_name), [])
+    return [*presets]
+
+
+# --------------------------------------------------------------------------- #
+# kokoro-onnx
+# --------------------------------------------------------------------------- #
+
+def _cache_dir() -> Path:
+    """Return the TTS asset cache directory, creating it if it is missing.
+
+    The location comes from ``CODERAI_TTS_CACHE`` when that variable is set to
+    a non-empty value, otherwise ``~/.coderai/tts_cache`` is used.
+    """
+    configured = os.environ.get("CODERAI_TTS_CACHE")
+    root = Path(configured or os.path.expanduser("~/.coderai/tts_cache"))
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def _download(url: str, dest: Path) -> Path:
+    """Fetch ``url`` into ``dest`` unless a non-empty copy is already there.
+
+    The payload is written to a sibling ``.part`` file first and renamed onto
+    ``dest`` once the transfer has finished.
+    """
+    already_cached = dest.exists() and dest.stat().st_size > 0
+    if not already_cached:
+        from urllib.request import urlretrieve
+        partial = dest.with_suffix(dest.suffix + ".part")
+        print(f"  Downloading TTS asset: {url}")
+        urlretrieve(url, partial)
+        partial.replace(dest)
+    return dest
+
+
+def _resolve_kokoro_files(model_path: str, config: dict) -> Tuple[str, str]:
+    """Return (onnx_model_path, voices_path) for kokoro-onnx.
+
+    Order of resolution: explicit config fields (``model_path`` /
+    ``voices_path``) → files found inside a local directory or next to a local
+    ``.onnx`` file → the official release files downloaded into the TTS cache.
+
+    Args:
+        model_path: Local directory, ``.onnx`` file path, or a model name.
+        config: Per-model config; ``None`` is treated as empty.
+
+    Returns:
+        The ONNX model path and the voices path, both as strings.
+    """
+    config = config or {}
+    voices_path = config.get("voices_path")
+    onnx_path = config.get("model_path") or model_path
+
+    cand = Path(onnx_path) if onnx_path else None
+    if cand is not None and cand.is_dir():
+        # A directory: take the first (sorted) model / voices file inside it.
+        onnx = next(iter(sorted(cand.glob("*.onnx"))), None)
+        vb = next(iter(sorted(cand.glob("voices*.bin"))), None)
+        if onnx:
+            onnx_path = str(onnx)
+        if vb and not voices_path:
+            voices_path = str(vb)
+    elif cand is not None and cand.suffix == ".onnx" and cand.exists():
+        onnx_path = str(cand)
+        if not voices_path:
+            # Look for a voices file sitting next to the model file.
+            sib = next(iter(sorted(cand.parent.glob("voices*.bin"))), None)
+            if sib:
+                voices_path = str(sib)
+
+    # Fall back to the official release files in the cache. The check is
+    # is_file(), not exists(): a directory that holds no *.onnx (or a
+    # voices_path pointing at a directory) still "exists" and would otherwise
+    # be handed to kokoro-onnx as if it were the asset itself.
+    if not (onnx_path and Path(onnx_path).is_file()):
+        onnx_path = str(_download(_KOKORO_MODEL_URL, _cache_dir() / "kokoro-v1.0.onnx"))
+    if not (voices_path and Path(voices_path).is_file()):
+        voices_path = str(_download(_KOKORO_VOICES_URL, _cache_dir() / "voices-v1.0.bin"))
+    return onnx_path, voices_path
+
+
+class _KokoroBackend:
+    """TTS backend that synthesizes speech through ``kokoro_onnx.Kokoro``."""
+    family = "kokoro"
+    default_voice = "af_sarah"
+
+    def __init__(self, model_path: str, config: dict):
+        """Resolve the model/voices files and build the kokoro-onnx engine."""
+        from kokoro_onnx import Kokoro
+        model_file, voices_file = _resolve_kokoro_files(model_path, config)
+        print(f"  kokoro-onnx model={model_file} voices={voices_file}")
+        self._kokoro = Kokoro(model_file, voices_file)
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """Synthesize ``text`` and return (float32 samples, sample rate).
+
+        Falsy ``voice`` / ``speed`` / ``lang`` fall back to the default voice,
+        1.0 and "en-us". ``emotion`` and ``style`` are accepted but not used.
+        """
+        chosen_voice = voice if voice else self.default_voice
+        rate = speed if speed else 1.0
+        language = lang if lang else "en-us"
+        samples, sr = self._kokoro.create(
+            text, voice=chosen_voice, speed=rate, lang=language)
+        return np.asarray(samples, dtype=np.float32), int(sr)
+
+    def voices(self):
+        """Return the engine's voice names sorted, or [] if they can't be read."""
+        try:
+            names = self._kokoro.get_voices()
+            return sorted(names)
+        except Exception:
+            return []
+
+
+# --------------------------------------------------------------------------- #
+# transformers pipeline("text-to-speech")
+# --------------------------------------------------------------------------- #
+
+class _TransformersBackend:
+    """Generic backend built on the transformers ``text-to-speech`` pipeline."""
+    family = "transformers"
+    default_voice = ""
+
+    def __init__(self, model_name: str, config: dict):
+        """Create the pipeline on GPU 0 when CUDA is available, else on CPU."""
+        from transformers import pipeline
+        import torch
+        device = 0 if torch.cuda.is_available() else -1
+        print(f"  transformers TTS pipeline model={model_name} device={device}")
+        self._pipe = pipeline("text-to-speech", model=model_name, device=device)
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """Run the pipeline on ``text`` and return (float32 samples, sample rate).
+
+        Only ``text`` is forwarded; the remaining arguments are ignored.
+        """
+        result = self._pipe(text)
+        waveform = np.asarray(result["audio"], dtype=np.float32)
+        # Multi-dimensional output has its singleton axes squeezed away.
+        waveform = waveform.squeeze() if waveform.ndim > 1 else waveform
+        return waveform, int(result["sampling_rate"])
+
+    def voices(self):
+        """This backend exposes no selectable voices."""
+        return []
+
+
+# --------------------------------------------------------------------------- #
+# coqui / XTTS (optional)
+# --------------------------------------------------------------------------- #
+
+class _CoquiBackend:
+    """Backend for Coqui / XTTS models, served through ``TTS.api.TTS``."""
+    family = "coqui"
+    default_voice = ""
+
+    def __init__(self, model_name: str, config: dict):
+        """Load the model, placing it on CUDA when available.
+
+        Raises:
+            MissingEngineError: when the coqui-tts package cannot be imported.
+        """
+        try:
+            from TTS.api import TTS  # coqui-tts
+        except ImportError as e:
+            raise MissingEngineError(
+                "Coqui/XTTS models need the coqui-tts package: "
+                "pip install coqui-tts"
+            ) from e
+        import torch
+        self._cfg = config or {}
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._tts = TTS(model_name).to(device)
+
+    def _speaker_kwargs(self, voice: str) -> dict:
+        """Pick the speaker-related keyword argument for one synthesis call.
+
+        `voice` may be a wav path, a built-in speaker name, or neither (e.g. a
+        Kokoro id like "af_sarah"), so it only counts as a clone source when
+        it is an actual file. Priority: voice file → configured
+        ``speaker_wav`` file → built-in speaker named by ``voice`` → a default
+        speaker for multi-speaker models → nothing.
+        """
+        known = list(getattr(self._tts, "speakers", None) or [])
+        reference = self._cfg.get("speaker_wav")
+        if voice and os.path.isfile(voice):
+            return {"speaker_wav": voice}
+        if reference and os.path.isfile(reference):
+            return {"speaker_wav": reference}
+        if voice and voice in known:
+            return {"speaker": voice}
+        if known:
+            # Multi-speaker model (e.g. XTTS-v2) needs *a* speaker; pick a default.
+            return {"speaker": self._cfg.get("speaker") or known[0]}
+        return {}
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """Synthesize ``text`` and return (float32 samples, sample rate).
+
+        The language is cut to its first two characters ("en" when empty).
+        ``emotion`` and ``style`` are accepted but not used.
+        """
+        request = {"text": text, "language": (lang or "en")[:2]}
+        request.update(self._speaker_kwargs(voice))
+        try:
+            request["speed"] = float(speed) if speed else 1.0
+            wav = np.asarray(self._tts.tts(**request), dtype=np.float32)
+        except TypeError:
+            # Some coqui models don't accept speed: retry without it.
+            request.pop("speed", None)
+            wav = np.asarray(self._tts.tts(**request), dtype=np.float32)
+        rate = int(getattr(self._tts.synthesizer, "output_sample_rate", 24000))
+        return wav, rate
+
+    def voices(self):
+        """Return the model's built-in speaker names ([] when it has none)."""
+        return list(getattr(self._tts, "speakers", None) or [])
+
+
+# --------------------------------------------------------------------------- #
+# Parler-TTS (optional) — expressive, description-prompt driven
+# --------------------------------------------------------------------------- #
+
+class _ParlerBackend:
+    """In-process Parler-TTS engine, steered by a free-text voice description."""
+    family = "parler"
+    default_voice = ""
+
+    def __init__(self, model_name: str, config: dict):
+        """Load the Parler model and tokenizer, on CUDA when available.
+
+        Raises:
+            MissingEngineError: when parler_tts / transformers can't be imported.
+        """
+        try:
+            from parler_tts import ParlerTTSForConditionalGeneration
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise MissingEngineError(
+                "Parler-TTS isn't installed. NOTE: parler-tts pins an old "
+                "transformers/tokenizers/huggingface-hub that conflict with this "
+                "server — do NOT pip install it into this environment. Run it in a "
+                "separate venv as its own service, or use an expressive engine that "
+                "works with this stack (e.g. Bark via the transformers pipeline)."
+            ) from e
+        import torch
+        self._cfg = config or {}
+        self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
+        self._model = model.to(self._device)
+        self._tok = AutoTokenizer.from_pretrained(model_name)
+        self._sr = int(self._model.config.sampling_rate)
+
+    def _describe(self, voice: str, speed: float, emotion: str, style: str) -> str:
+        """Turn the UI controls into one description sentence for Parler.
+
+        The voice name, emotion, delivery style and speed each contribute one
+        phrase; a fixed recording-quality sentence is appended at the end.
+        """
+        kokoro_prefixes = ("af_", "am_", "bf_", "bm_")
+        name = (voice or "").strip()
+        # A file path or a Kokoro id is not a Parler speaker name.
+        if name and (os.sep in name or name.lower().startswith(kokoro_prefixes)):
+            name = ""
+        subject = name or self._cfg.get("speaker") or "A speaker"
+        phrases = [f"{subject} speaks"]
+
+        if emotion and emotion != "neutral":
+            phrases.append(f"in a {emotion} tone")
+
+        style_phrases = {
+            "whispering": "whispering softly",
+            "shouting": "shouting loudly",
+            "monotone": "in a flat monotone",
+            "expressive": "in a very expressive, animated way",
+        }
+        if style and style not in ("", "normal"):
+            # Unknown styles are passed through verbatim.
+            phrases.append(style_phrases.get(style, style))
+
+        try:
+            rate = float(speed or 1.0)
+        except (TypeError, ValueError):
+            rate = 1.0
+        if rate < 0.9:
+            pace = "slow"
+        elif rate > 1.15:
+            pace = "fast"
+        else:
+            pace = "moderate"
+        phrases.append(f"at a {pace} pace")
+
+        closing = (". The recording is very high quality, the voice clear and close up "
+                   "with no background noise.")
+        return " ".join(phrases) + closing
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """Generate speech for ``text`` and return (float32 samples, sample rate).
+
+        A ``description`` in the config overrides the one derived from the
+        voice/speed/emotion/style controls. ``lang`` is not used.
+        """
+        description = (self._cfg.get("description")
+                       or self._describe(voice, speed, emotion, style))
+
+        def _token_ids(content):
+            return self._tok(content, return_tensors="pt").input_ids.to(self._device)
+
+        description_ids = _token_ids(description)
+        text_ids = _token_ids(text)
+        generated = self._model.generate(
+            input_ids=description_ids, prompt_input_ids=text_ids)
+        waveform = np.asarray(generated.cpu().numpy().squeeze(), dtype=np.float32)
+        return waveform, self._sr
+
+    def voices(self):
+        """This backend exposes no selectable voices."""
+        return []
+
+
+# --------------------------------------------------------------------------- #
+# Bark (suno/bark) — expressive via text markup; works with current transformers
+# --------------------------------------------------------------------------- #
+
+class _BarkBackend:
+    """Bark (suno/bark) backend driven through ``AutoProcessor`` + ``BarkModel``.
+
+    Expressiveness comes from cues inserted into the text (``_markup``) and
+    from the voice preset (``_resolve_preset``); speed is not controllable.
+    """
+    family = "bark"
+    default_voice = "v2/en_speaker_6"
+    # Curated English Bark presets by gender (speaker_6 is a clear male, speaker_9
+    # is the commonly-used female). Override via config: "bark_voice_male" /
+    # "bark_voice_female", or "bark_voices": {"male": ..., "female": ...}.
+    _BARK_MALE = "v2/en_speaker_6"
+    _BARK_FEMALE = "v2/en_speaker_9"
+
+    def __init__(self, model_name: str, config: dict):
+        """Load the Bark processor and model, on CUDA when available."""
+        # Uses the stable AutoProcessor + BarkModel API (not the pipeline) so
+        # voice presets and generation params are passed reliably.
+        from transformers import AutoProcessor, BarkModel
+        import torch
+        self._cfg = config or {}
+        self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._proc = AutoProcessor.from_pretrained(model_name)
+        self._model = BarkModel.from_pretrained(model_name).to(self._device)
+        self._sr = int(getattr(self._model.generation_config, "sample_rate", 24000))
+
+    def _markup(self, text: str, emotion: str, style: str) -> str:
+        """Decorate ``text`` with the in-text cues Bark is steered by.
+
+        Style: "emphasis" upper-cases the text, "singing" wraps it in ♪ marks,
+        "whispering" prefixes "[whispers]". Emotion: "laughter" / "sigh" /
+        "gasp" prefix a matching cue; any other value adds nothing.
+        """
+        if style == "emphasis":
+            text = text.upper()
+        elif style == "singing":
+            text = f"♪ {text} ♪"
+        elif style == "whispering":
+            text = f"[whispers] {text}"
+        cue = {"laughter": "[laughs] ", "sigh": "[sighs] ", "gasp": "[gasps] "}.get(emotion, "")
+        return cue + text
+
+    def _resolve_preset(self, voice: str) -> str:
+        """Map a requested voice onto a Bark voice preset.
+
+        Resolution order:
+          1. An explicit Bark preset (contains "speaker" or starts with "v2/")
+             passes straight through.
+          2. A Kokoro-style id ("af_sarah", "bm_george", …) maps to the male or
+             female preset, honouring "bark_voice_male" / "bark_voice_female"
+             and then "bark_voices" from the config.
+          3. Anything else falls back to the configured "voice_preset", then to
+             ``default_voice``.
+        """
+        v = (voice or "").strip()
+        # An explicit Bark preset passes straight through.
+        if v and ("speaker" in v or v.startswith("v2/")):
+            return v
+        # The editor sends Kokoro-style ids whose 2nd char is the gender
+        # (af_/bf_ = female, am_/bm_ = male). Require the underscore after the
+        # two-letter prefix: without it, any unrelated name whose second letter
+        # is "m" or "f" ("emma", "afternoon") would be read as a gender and
+        # override the configured preset.
+        lv = v.lower()
+        gender = ""
+        if len(lv) >= 3 and lv[2] == "_":
+            gender = {"m": "male", "f": "female"}.get(lv[1], "")
+        vmap = self._cfg.get("bark_voices") or {}
+        if gender == "male":
+            return self._cfg.get("bark_voice_male") or vmap.get("male") or self._BARK_MALE
+        if gender == "female":
+            return self._cfg.get("bark_voice_female") or vmap.get("female") or self._BARK_FEMALE
+        return self._cfg.get("voice_preset") or self.default_voice
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """Generate speech for ``text`` and return (float32 samples, sample rate).
+
+        ``speed`` and ``lang`` are not used; ``voice`` selects the preset and
+        ``emotion`` / ``style`` become in-text cues.
+        """
+        import torch
+        # Speed isn't controllable in Bark; the voice maps to a gendered preset.
+        preset = self._resolve_preset(voice)
+        prompt = self._markup(text, emotion, style)
+        inputs = self._proc(prompt, voice_preset=preset)
+        # Move every entry that supports .to() onto the model's device.
+        inputs = {k: (v.to(self._device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+        with torch.no_grad():
+            audio = self._model.generate(**inputs)
+        arr = np.asarray(audio.cpu().numpy().squeeze(), dtype=np.float32)
+        return arr, self._sr
+
+    def voices(self):
+        """Return the ten English v2 preset names."""
+        return [f"v2/en_speaker_{i}" for i in range(10)]
+
+
+# --------------------------------------------------------------------------- #
+# Parler over HTTP — the real engine runs in an isolated venv as a microservice
+# (parler-tts pins an old transformers that conflicts with this server's stack).
+# --------------------------------------------------------------------------- #
+
+class _RemoteParlerBackend:
+    """Parler backend that forwards synthesis to an HTTP service."""
+    family = "parler"
+    default_voice = ""
+
+    def __init__(self, config: dict, managed_model: Optional[str] = None):
+        """Store the config and the service base URL (``config["service_url"]``).
+
+        ``managed_model`` names the model of a worker coderai launched itself,
+        so that ``cleanup()`` — called by the manager on eviction — can stop it.
+        """
+        self._cfg = config or {}
+        base_url = str(self._cfg["service_url"])
+        self._url = base_url.rstrip("/")
+        self._managed_model = managed_model
+
+    def synthesize(self, text: str, voice: str, speed: float, lang: str,
+                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
+        """POST the request to ``<service_url>/speak`` and decode the reply.
+
+        Returns (float32 samples, sample rate); audio with more than one
+        dimension is averaged along axis 1.
+        """
+        import requests
+        import soundfile as sf
+        body = {"text": text, "voice": voice, "speed": speed,
+                "emotion": emotion, "style": style, "language": lang}
+        fixed_description = self._cfg.get("description")
+        if fixed_description:
+            body["description"] = fixed_description
+        reply = requests.post(f"{self._url}/speak", json=body, timeout=600)
+        reply.raise_for_status()
+        audio, rate = sf.read(io.BytesIO(reply.content), dtype="float32")
+        if getattr(audio, "ndim", 1) > 1:
+            audio = audio.mean(axis=1)
+        return np.asarray(audio, dtype=np.float32), int(rate)
+
+    def voices(self):
+        """This backend exposes no selectable voices."""
+        return []
+
+    def cleanup(self):
+        """Stop the worker we launched, if any; failures are ignored."""
+        if not self._managed_model:
+            return
+        try:
+            from codai.api import parler_worker
+            parler_worker.stop_service(self._managed_model)
+        except Exception:
+            pass
+
+
+def load_backend(model_name: str, model_path: Optional[str], config: Optional[dict]):
+    """Build the TTS backend matching ``model_name``'s family.
+
+    The caller is expected to cache the returned object. ``model_path`` is
+    only used by the kokoro family (falling back to ``model_name``); a missing
+    ``config`` is treated as empty.
+    """
+    settings = config or {}
+    family = _family(model_name)
+
+    if family == "parler":
+        # An explicit service_url points at an externally-run service. Otherwise
+        # coderai fully manages the worker: bootstrap its venv, spawn it, and
+        # route to it — no manual setup needed.
+        if settings.get("service_url"):
+            return _RemoteParlerBackend(settings)
+        from codai.api import parler_worker
+        service_url = parler_worker.ensure_service(model_name)
+        return _RemoteParlerBackend({**settings, "service_url": service_url},
+                                    managed_model=model_name)
+
+    if family == "kokoro":
+        return _KokoroBackend(model_path or model_name, settings)
+
+    # The remaining families share the (model_name, config) constructor.
+    by_family = {"coqui": _CoquiBackend, "bark": _BarkBackend}
+    backend_cls = by_family.get(family, _TransformersBackend)
+    return backend_cls(model_name, settings)
+
+
+# --------------------------------------------------------------------------- #
+# encoding
+# --------------------------------------------------------------------------- #
+
+def encode_audio(samples: np.ndarray, sample_rate: int, fmt: str) -> Tuple[bytes, str]:
+    """Encode float samples to the requested container, returning (bytes, fmt).
+
+    WAV/FLAC/OGG go straight through soundfile; mp3 (and anything else) is muxed
+    via ffmpeg when available, else falls back to WAV.
+
+    Args:
+        samples: Audio samples; converted to float32 and clipped to [-1, 1].
+        sample_rate: Sample rate written into the container.
+        fmt: Requested format name (case-insensitive); empty means "wav".
+
+    Returns:
+        The encoded bytes and the format actually produced — "wav" whenever
+        ffmpeg is missing or the transcode fails, so callers must use the
+        returned name rather than the requested one.
+    """
+    import shutil
+    import subprocess
+    import soundfile as sf
+
+    fmt = (fmt or "wav").lower()
+    samples = np.clip(np.asarray(samples, dtype=np.float32), -1.0, 1.0)
+
+    sf_formats = {"wav": "WAV", "flac": "FLAC", "ogg": "OGG"}
+    if fmt in sf_formats:
+        buf = io.BytesIO()
+        sf.write(buf, samples, sample_rate, format=sf_formats[fmt])
+        return buf.getvalue(), fmt
+
+    # mp3/other: write WAV then transcode with ffmpeg if present.
+    wav = io.BytesIO()
+    sf.write(wav, samples, sample_rate, format="WAV")
+    wav_bytes = wav.getvalue()
+    # ffmpeg has no muxer called "aac" or "pcm": raw AAC is written by the
+    # "adts" muxer and raw 16-bit PCM by "s16le". Without this mapping those
+    # two request formats always failed the transcode and came back as WAV.
+    muxer = {"aac": "adts", "pcm": "s16le"}.get(fmt, fmt)
+    if shutil.which("ffmpeg"):
+        try:
+            proc = subprocess.run(
+                ["ffmpeg", "-hide_banner", "-loglevel", "error",
+                 "-f", "wav", "-i", "pipe:0", "-f", muxer, "pipe:1"],
+                input=wav_bytes, stdout=subprocess.PIPE, check=True,
+            )
+            return proc.stdout, fmt
+        except Exception as exc:
+            # Best-effort: an unknown format or missing codec degrades to WAV.
+            print(f"  ffmpeg transcode to {fmt} failed ({exc}); returning WAV")
+    return wav_bytes, "wav"
--- a/codai/backends/cuda.py
+++ b/codai/backends/cuda.py
@@ -161,6 +161,272 @@ class NvidiaBackend(ModelBackend):
            print(f"Warning: Could not estimate model size: {e}")
            return None
    
+    def _model_head_dim(self, model_name: str) -> Optional[int]:
+        """Return the model's attention head dimension from its config.
+
+        Prefers the explicit ``head_dim`` field (Gemma sets it directly, decoupled
+        from hidden_size/num_heads); otherwise derives hidden_size // num_heads.
+        The top-level config and its ``text_config`` / ``vision_config``
+        sub-configs are all inspected and the largest value found is returned.
+
+        Returns None when the config can't be read (including when transformers
+        itself cannot be imported) or when no sub-config exposes usable fields.
+        """
+        try:
+            # Imported inside the try so an import failure degrades to None,
+            # matching the other config-probing helpers in this class.
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        except Exception as e:
+            print(f"Warning: Could not read head dimension from config: {e}")
+            return None
+        # Multimodal models (Gemma, Qwen-VL) nest the real attention dims under
+        # text_config/vision_config; the top level reports None. Return the max
+        # head dim across all sub-configs so the FA2 limit check can't be fooled.
+        dims = []
+        for cfg in (config,
+                    getattr(config, 'text_config', None),
+                    getattr(config, 'vision_config', None)):
+            if cfg is None:
+                continue
+            head_dim = getattr(cfg, 'head_dim', None)
+            if head_dim:
+                dims.append(int(head_dim))
+                continue
+            hidden = getattr(cfg, 'hidden_size', None)
+            heads = getattr(cfg, 'num_attention_heads', None)
+            if hidden and heads:
+                dims.append(int(hidden) // int(heads))
+        return max(dims) if dims else None
+
+    def _estimate_kv_cache_bytes(self, model_name: str, n_ctx) -> int:
+        """Estimate how many bytes the KV cache needs for ``n_ctx`` tokens.
+
+        Formula: 2 (key + value) × cached tokens summed over layers × kv_heads ×
+        head_dim × 2 bytes per element. When the text config lists
+        ``layer_types``, each layer contributes according to its type name:
+        names containing "linear" contribute nothing, names containing
+        "sliding" contribute at most ``sliding_window`` tokens, everything else
+        contributes the full context. Without ``layer_types`` every one of
+        ``num_hidden_layers`` is counted at full context.
+
+        Dimensions are read from ``text_config`` when present, else from the
+        top-level config. Returns 0 when ``n_ctx`` is not a positive integer,
+        ``model_name`` is empty, the needed fields are missing, or the config
+        cannot be loaded (a warning is printed in the last case).
+        """
+        try:
+            ctx_len = int(n_ctx)
+        except (TypeError, ValueError):
+            return 0
+        if ctx_len <= 0 or not model_name:
+            return 0
+
+        bytes_per_elem = 2  # KV cache is fp16/bf16 regardless of weight quant
+
+        def _tokens_for(kind: str, window) -> int:
+            # Cached tokens one layer of this attention type contributes.
+            if 'linear' in kind:
+                return 0  # recurrent state — negligible KV
+            if 'sliding' in kind and window:
+                return min(ctx_len, window)
+            return ctx_len
+
+        try:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+            text_cfg = getattr(config, 'text_config', None) or config
+            n_layers = getattr(text_cfg, 'num_hidden_layers', None)
+            n_kv_heads = (getattr(text_cfg, 'num_key_value_heads', None)
+                          or getattr(text_cfg, 'num_attention_heads', None))
+            per_head = getattr(text_cfg, 'head_dim', None)
+            if not per_head:
+                width = getattr(text_cfg, 'hidden_size', None)
+                q_heads = getattr(text_cfg, 'num_attention_heads', None)
+                if width and q_heads:
+                    per_head = int(width) // int(q_heads)
+            try:
+                window = int(getattr(text_cfg, 'sliding_window', None) or 0) or None
+            except (TypeError, ValueError):
+                window = None
+
+            kinds = getattr(text_cfg, 'layer_types', None)
+            if kinds:
+                total_tokens = sum(_tokens_for(str(k).lower(), window) for k in kinds)
+            elif n_layers:
+                total_tokens = int(n_layers) * ctx_len
+            else:
+                return 0
+
+            if not n_kv_heads or not per_head or total_tokens <= 0:
+                return 0
+            return 2 * int(total_tokens) * int(n_kv_heads) * int(per_head) * bytes_per_elem
+        except Exception as e:
+            print(f"Warning: could not estimate KV cache size: {e}")
+            return 0
+
+    def _kv_quant_nbits(self):
+        """Pick the KV-cache quantization width: 2, 4, or None for unquantized.
+
+        Returns None outright when ``_kv_quant_compatible()`` is false, whatever
+        cache type was requested. Otherwise the pending ``cache_type_k`` (or,
+        failing that, ``cache_type_v``) decides:
+
+        * empty / f16 / fp16 / bf16 / f32 / none / auto → 4 bits only when the
+          estimated fp16 KV cache exceeds 6 GiB, else None;
+        * a spec starting with "q2", containing "int2", or equal to "2" → 2;
+        * any other spec → 4.
+        """
+        if not self._kv_quant_compatible():
+            return None
+
+        requested = (
+            getattr(self, '_pending_cache_type_k', None)
+            or getattr(self, '_pending_cache_type_v', None)
+            or ''
+        )
+        requested = str(requested).lower()
+
+        unquantized_specs = ('', 'f16', 'fp16', 'bf16', 'f32', 'none', 'auto')
+        if requested in unquantized_specs:
+            # No explicit quantized type: auto-enable int4 only for a large cache.
+            estimated = self._estimate_kv_cache_bytes(
+                getattr(self, '_pending_model_name', None),
+                getattr(self, '_pending_ctx', None),
+            )
+            if estimated > 6 * 1024 ** 3:
+                return 4
+            return None
+
+        wants_two_bit = (
+            requested.startswith('q2') or 'int2' in requested or requested == '2'
+        )
+        return 2 if wants_two_bit else 4
+
+    def _kv_quant_compatible(self) -> bool:
+        """Report whether the quantized KV cache may be used with this model.
+
+        The answer is False when any entry of the text config's ``layer_types``
+        contains "linear" (hybrid linear-attention models such as
+        Qwen3.5/Qwen3-Next), when ``_is_sliding_window_model()`` is true, when
+        no config is available (no loaded model and no pending model name), or
+        when anything raises while inspecting the config. Otherwise True.
+        """
+        try:
+            config = getattr(self.model, 'config', None)
+            if config is None:
+                # Model not loaded yet: fall back to the pending checkpoint name.
+                from transformers import AutoConfig
+                pending = getattr(self, '_pending_model_name', None)
+                if not pending:
+                    return False
+                config = AutoConfig.from_pretrained(pending, trust_remote_code=True)
+
+            text_cfg = getattr(config, 'text_config', None) or config
+            for kind in (getattr(text_cfg, 'layer_types', None) or []):
+                if 'linear' in str(kind).lower():
+                    return False
+            return not self._is_sliding_window_model()
+        except Exception:
+            return False
+
+    def _is_sliding_window_model(self) -> bool:
+        """Detect hybrid / sliding-window-attention models (gemma family).
+
+        Uses the loaded model's config when there is one, otherwise loads the
+        config for the pending model name. A model counts as sliding-window
+        when its text config's ``model_type`` starts with "gemma", its
+        ``sliding_window`` is not None, or its ``cache_implementation`` is
+        "hybrid" or "sliding_window". Returns False if no config can be
+        obtained or inspection raises.
+        """
+        try:
+            config = getattr(self.model, 'config', None)
+            if config is None:
+                from transformers import AutoConfig
+                pending = getattr(self, '_pending_model_name', None)
+                if not pending:
+                    return False
+                config = AutoConfig.from_pretrained(pending, trust_remote_code=True)
+
+            text_cfg = getattr(config, 'text_config', None) or config
+            family = (getattr(text_cfg, 'model_type', '') or '').lower()
+            impl = (getattr(text_cfg, 'cache_implementation', '') or '').lower()
+
+            if family.startswith('gemma'):
+                return True
+            if getattr(text_cfg, 'sliding_window', None) is not None:
+                return True
+            return impl in {'hybrid', 'sliding_window'}
+        except Exception:
+            return False
+
+    def _kv_cache_reserve_bytes(self) -> int:
+        """Bytes of VRAM to set aside for the KV cache, quantization-aware.
+
+        Starts from the fp16 estimate for the pending model/context. When a
+        quantization width is selected, the estimate is scaled by nbits/16 and
+        then by 1.5 to cover the fp16 residual window and group metadata.
+        Returns 0 when the fp16 estimate is unknown (non-positive).
+        """
+        baseline = self._estimate_kv_cache_bytes(
+            getattr(self, '_pending_model_name', None),
+            getattr(self, '_pending_ctx', None),
+        )
+        if baseline <= 0:
+            return 0
+
+        bits = self._kv_quant_nbits()
+        if not bits:
+            return baseline
+        return int(baseline * (bits / 16.0) * 1.5)
+
+    def _kv_offload_threshold_bytes(self) -> int:
+        """KV-cache size (bytes) above which the cache should live on CPU.
+
+        Derived from the free VRAM reported at call time minus ~2 GiB kept for
+        activations/compute, with a floor of 2 GiB. Falls back to a fixed
+        8 GiB when free VRAM can't be read.
+        """
+        gib = 1024 ** 3
+        try:
+            import torch
+            free_bytes, _total = torch.cuda.mem_get_info()
+            usable = int(free_bytes - 2 * gib)
+            floor = int(2 * gib)
+            return max(floor, usable)
+        except Exception:
+            return 8 * gib
+
+    def _offloaded_cache_impl(self) -> str:
+        """Choose the offloaded-cache name for sliding-window / hybrid models.
+
+        Returns ``'offloaded_static'`` when the installed transformers lists it
+        in ``ALL_CACHE_IMPLEMENTATIONS`` (newer releases fold the hybrid
+        offloaded cache into it and deprecate ``offloaded_hybrid``); otherwise,
+        or when that registry can't be imported, returns ``'offloaded_hybrid'``.
+        """
+        try:
+            from transformers.generation.configuration_utils import (
+                ALL_CACHE_IMPLEMENTATIONS as known_impls,
+            )
+            has_static = 'offloaded_static' in known_impls
+        except Exception:
+            has_static = False
+        return 'offloaded_static' if has_static else 'offloaded_hybrid'
+
+    def _cache_gen_kwargs(self, using_prefix: bool, plain: bool = False) -> dict:
+        """Build the generate() kwargs that select a KV-cache strategy.
+
+        Returns ``{}`` (default cache) when a prefilled prefix cache is in use
+        or when ``plain=True`` forces the fallback path. Otherwise, in order:
+
+        1. a quanto quantized cache when ``_kv_quant_nbits()`` picks a width;
+        2. an offloaded cache when the estimated KV size is positive and
+           exceeds ``_kv_offload_threshold_bytes()`` — the sliding-window
+           variant for sliding-window models, plain ``'offloaded'`` otherwise;
+        3. ``{}``.
+
+        The chosen strategy is printed once per instance; the
+        ``_cache_strategy_announced`` flag suppresses repeats.
+        """
+        if using_prefix or plain:
+            return {}
+
+        def _announce_once(message: str) -> None:
+            # Log the strategy the first time only, then remember that we did.
+            if not getattr(self, '_cache_strategy_announced', False):
+                print(message)
+                self._cache_strategy_announced = True
+
+        # 1. Quantized cache (only offered when the model is compatible).
+        bits = self._kv_quant_nbits()
+        if bits:
+            _announce_once(
+                f"KV cache quantization enabled: quanto int{bits} (residual_length=128)"
+            )
+            quant_settings = {
+                'backend': 'quanto',
+                'nbits': bits,
+                'q_group_size': 64,
+                'residual_length': 128,
+            }
+            return {'cache_implementation': 'quantized', 'cache_config': quant_settings}
+
+        # 2. Offloaded cache when the KV is too large to fit in free VRAM.
+        estimated = self._estimate_kv_cache_bytes(
+            getattr(self, '_pending_model_name', None),
+            getattr(self, '_pending_ctx', None),
+        )
+        if estimated <= 0 or estimated <= self._kv_offload_threshold_bytes():
+            return {}
+
+        if self._is_sliding_window_model():
+            impl = self._offloaded_cache_impl()
+        else:
+            impl = 'offloaded'
+        _announce_once(
+            f"KV cache offloaded to CPU: cache_implementation={impl} "
+            f"(est ~{estimated/1e9:.1f}GB exceeds free VRAM)"
+        )
+        return {'cache_implementation': impl}
+
    def _get_gpu_memory_map(self) -> Dict:
        """Get max_memory dict for Accelerate."""
        import torch
@@ -193,6 +459,7 @@ class NvidiaBackend(ModelBackend):
        from transformers import AutoModelForCausalLM
        
        try:
+            load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
            model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
            if device == "cpu" and load_kwargs.get('device_map') is None:
                model = model.to(device)
@@ -226,6 +493,61 @@ class NvidiaBackend(ModelBackend):
                        raise e
            raise
    
+    def _prequant_method(self, model_name: str):
+        """Return the ``quant_method`` embedded in the checkpoint config, or None.
+
+        Checkpoints shipped already-quantized (FP8 / GPTQ / AWQ /
+        compressed-tensors) carry a ``quantization_config`` in their config and
+        must be loaded with it; bitsandbytes cannot be stacked on top.
+
+        Some repositories (notably MLX checkpoints) publish a partial
+        ``quantization_config`` with no ``quant_method``. Those yield None here
+        so callers do not treat them as natively quantized. None is also
+        returned when there is no ``quantization_config`` or the config cannot
+        be loaded.
+        """
+        try:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+            quant_cfg = getattr(config, 'quantization_config', None)
+            if not quant_cfg:
+                return None
+            # The field may be a plain dict or a config object.
+            if isinstance(quant_cfg, dict):
+                method = quant_cfg.get('quant_method')
+            else:
+                method = getattr(quant_cfg, 'quant_method', None)
+            return method
+        except Exception:
+            return None
+
+    def _strip_invalid_native_quant_config(self, model_name: str, load_kwargs: dict) -> dict:
+        """Drop a malformed checkpoint ``quantization_config`` before loading.
+
+        When the checkpoint config holds a dict ``quantization_config`` that
+        lacks ``quant_method``, the attribute is deleted from a freshly loaded
+        config and that config is passed on as ``config=`` in a copy of
+        ``load_kwargs``, so ``from_pretrained`` does not abort on it.
+
+        ``load_kwargs`` is returned unchanged when the caller already supplies
+        its own ``quantization_config``, when the checkpoint's one is absent,
+        not a dict, or has a ``quant_method``, or when the config cannot be
+        loaded. The input dict is never mutated.
+        """
+        if 'quantization_config' in load_kwargs:
+            return load_kwargs
+        try:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+            quant_cfg = getattr(config, 'quantization_config', None)
+            malformed = isinstance(quant_cfg, dict) and not quant_cfg.get('quant_method')
+            if not malformed:
+                return load_kwargs
+            if hasattr(config, 'quantization_config'):
+                delattr(config, 'quantization_config')
+            cleaned = dict(load_kwargs)
+            cleaned['config'] = config
+            print("Ignoring invalid checkpoint quantization_config without quant_method; "
+                  "using explicit loader quantization/settings instead.")
+            return cleaned
+        except Exception:
+            return load_kwargs
+
    def _make_bnb_config(self, model_name: str, load_in_4bit: bool, load_in_8bit: bool):
        """Build a transformers BitsAndBytesConfig (the modern quant API).

@@ -236,6 +558,14 @@ class NvidiaBackend(ModelBackend):
        Always go through quantization_config instead.
        """
        ml = model_name.lower()
+        # Already-quantized checkpoints must load with their own config; bnb on top
+        # is rejected by transformers. Skip bnb and let from_pretrained use the
+        # embedded quantization_config.
+        pq = self._prequant_method(model_name)
+        if pq:
+            print(f"Model is pre-quantized ({pq}); skipping bitsandbytes and loading "
+                  f"with its native quantization config.")
+            return None
        if 'qwen3.5' in ml and ('a3b' in ml or 'moe' in ml):
            print(f"Warning: {model_name} does not support bitsandbytes quantization")
            return None
@@ -365,6 +695,9 @@ class NvidiaBackend(ModelBackend):
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

+        # Re-evaluate KV-prefix support for the model about to be loaded.
+        self._kv_prefix_ok = None
+
        offload_dir = kwargs.get('offload_dir')
        load_in_4bit = kwargs.get('load_in_4bit', False)
        load_in_8bit = kwargs.get('load_in_8bit', False)
@@ -386,6 +719,10 @@ class NvidiaBackend(ModelBackend):
                pass

        self._pending_ram_gb = manual_ram_gb
+        self._pending_model_name = model_name
+        self._pending_ctx = kwargs.get('ctx')
+        self._pending_cache_type_k = kwargs.get('cache_type_k')
+        self._pending_cache_type_v = kwargs.get('cache_type_v')

        print(f"Loading HuggingFace model: {model_name}")

@@ -417,6 +754,36 @@ class NvidiaBackend(ModelBackend):
        self.use_flash_attn = flash_attn and self._fa2_safe
        self.check_flash_attn_support()

+        # FlashAttention-2's forward kernel supports a head dimension of at most
+        # 256. Gemma (and some other large-head-dim models) exceed this, so FA2
+        # raises "FlashAttention forward only supports head dimension at most
+        # 256" on EVERY forward pass — both the KV-prefix build and the actual
+        # model.generate (whose error is swallowed by the streamer thread, so the
+        # request silently produces no output and appears to hang). Fall back to
+        # SDPA, which handles any head dimension and still uses flash kernels.
+        if self.use_flash_attn:
+            head_dim = self._model_head_dim(model_name)
+            fa2_bad = bool(head_dim and head_dim > 256)
+            reason = f"head dimension {head_dim} exceeds FA2's limit of 256" if fa2_bad else None
+            if not fa2_bad:
+                # Gemma reports head_dim==256 but still raises "FlashAttention
+                # forward only supports head dimension at most 256" on every
+                # forward (its sliding-window attention path), producing empty
+                # replies. Treat the whole gemma family as FA2-incompatible.
+                try:
+                    from transformers import AutoConfig
+                    _cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                    _tc = getattr(_cfg, 'text_config', None) or _cfg
+                    _mt = (getattr(_tc, 'model_type', '') or getattr(_cfg, 'model_type', '') or '').lower()
+                    if _mt.startswith('gemma'):
+                        fa2_bad = True
+                        reason = f"gemma family (model_type={_mt}) is incompatible with FA2"
+                except Exception:
+                    pass
+            if fa2_bad:
+                self.use_flash_attn = False
+                print(f"  Flash Attention 2 disabled: {reason} → using SDPA instead.")
+
        self.device = self._detect_device()
        
        self.tokenizer = AutoTokenizer.from_pretrained(
@@ -454,6 +821,7 @@ class NvidiaBackend(ModelBackend):
                    load_kwargs['quantization_config'] = _qc
            
            try:
+                load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
                model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
                self.model = model
                self.model.eval()
@@ -514,6 +882,7 @@ class NvidiaBackend(ModelBackend):
            load_kwargs.pop('dtype', None)
            
            try:
+                load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
                model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
            except Exception as e:
                raise RuntimeError(
@@ -521,9 +890,56 @@ class NvidiaBackend(ModelBackend):
                    f"The model may be too large for available VRAM. Error: {e}"
                )
        else:
+            # 'auto'/'auto-borderline': honour the dropdown's documented contract —
+            # "over-VRAM → straight to model offload" means that when the model's
+            # peak (quantized weights + KV reserve + activations) FITS in free VRAM
+            # we load it full-GPU on a single device (fast, no device_map split, no
+            # CPU staging), and only fall through to the device_map=auto offload
+            # ladder below when it genuinely doesn't fit. The full-GPU attempt is
+            # non-fatal: on OOM it falls back to the ladder (unlike strategy 'none',
+            # which hard-errors). Honours the large-context KV reserve so a model
+            # that fits *with* its 64k KV stays resident.
+            if (model is None and self.device == "cuda"
+                    and offload_strategy in ('auto', 'auto-borderline')):
+                _fits = False
+                try:
+                    if torch.cuda.is_available() and expected_vram_gb > 0:
+                        _free, _ = torch.cuda.mem_get_info(0)
+                        _free_gb = _free / 1e9
+                        _kv_gb = self._kv_cache_reserve_bytes() / 1e9
+                        _act_gb = 1.5 if _kv_gb > 0 else 0.0
+                        _need_gb = expected_vram_gb + _kv_gb + _act_gb
+                        _borderline = 3.0 if offload_strategy == 'auto-borderline' else 0.0
+                        _fits = _need_gb <= (_free_gb - 0.5 + _borderline)
+                        if _fits:
+                            print(f"\n  Auto: peak VRAM need {_need_gb:.1f} GB "
+                                  f"(weights {expected_vram_gb:.1f} + KV {_kv_gb:.1f} "
+                                  f"+ act {_act_gb:.1f}) fits in {_free_gb:.1f} GB free "
+                                  f"— loading full-GPU (no offload)")
+                        else:
+                            print(f"\n  Auto: peak VRAM need {_need_gb:.1f} GB > "
+                                  f"{_free_gb:.1f} GB free — going straight to "
+                                  f"device_map offload")
+                except Exception:
+                    _fits = False
+                if _fits:
+                    cuda_device = self._derive_cuda_device()
+                    _fg_kwargs = dict(load_kwargs)
+                    _fg_kwargs['device_map'] = cuda_device
+                    _fg_kwargs['low_cpu_mem_usage'] = True
+                    _fg_kwargs = self._strip_invalid_native_quant_config(model_name, _fg_kwargs)
+                    model = self._try_load_model(model_name, _fg_kwargs, self.device)
+                    if model is not None:
+                        print(f"  ✓ Model loaded full-GPU on {cuda_device}")
+                    else:
+                        print("  ✗ Full-GPU load OOMed — falling back to "
+                              "device_map offload ladder")
+
            first_vram_pct = vram_percentages[0] if vram_percentages else 0.93

            for vram_pct in vram_percentages:
+                if model is not None:
+                    break
                if self.device != "cuda":
                    # No CUDA device — go straight to CPU+disk loading below.
                    break
@@ -627,6 +1043,14 @@ class NvidiaBackend(ModelBackend):
        import torch
        max_memory = {}

+        # Reserve VRAM for the KV cache (grows with context) plus a fixed
+        # activation/compute buffer, so device_map offloads enough weight layers
+        # to CPU instead of packing VRAM with weights and OOMing at generation.
+        # Uses the quantization-aware reserve so an int4 KV cache doesn't force a
+        # needless heavy offload.
+        kv_reserve = self._kv_cache_reserve_bytes()
+        activation_reserve = int(1.5 * 1024 ** 3) if kv_reserve > 0 else 0
+
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
@@ -638,7 +1062,24 @@ class NvidiaBackend(ModelBackend):
                headroom = 512 * 1024 * 1024  # 512 MB for CUDA driver overhead
                limit_by_fraction = int(total_vram * vram_fraction)
                limit_by_free     = max(0, free_vram - headroom)
-                max_memory[i] = min(limit_by_fraction, limit_by_free)
+                weight_budget = min(limit_by_fraction, limit_by_free)
+                # Cap the reservation so a large/mis-estimated KV cache can never
+                # crush the weight budget: never reserve more than 60% of the GPU
+                # budget for context. If the KV genuinely doesn't fit in the
+                # remaining 40%, KV quantization (see _kv_quant_nbits) is the lever,
+                # not starving the weights onto CPU.
+                reserved = min(kv_reserve + activation_reserve, int(weight_budget * 0.6))
+                if reserved > 0:
+                    new_budget = max(weight_budget - reserved, int(weight_budget * 0.4))
+                    print(
+                        f"  GPU {i}: reserving {reserved/1e9:.1f}GB for KV+activations "
+                        f"(KV~{kv_reserve/1e9:.1f}GB, ctx={getattr(self, '_pending_ctx', None)}, "
+                        f"quant={self._kv_quant_nbits()}); "
+                        f"weight budget {weight_budget/1e9:.1f}→{new_budget/1e9:.1f}GB "
+                        f"(rest spills to CPU)"
+                    )
+                    weight_budget = new_budget
+                max_memory[i] = weight_budget

        manual_ram_gb = getattr(self, '_pending_ram_gb', None)
        if manual_ram_gb:
@@ -965,6 +1406,9 @@ class NvidiaBackend(ModelBackend):
        if repeat_penalty != 1.0:
            generation_kwargs["repetition_penalty"] = repeat_penalty

+        # Quantize the KV cache when enabled (completions never use a prefix cache).
+        generation_kwargs.update(self._cache_gen_kwargs(using_prefix=False))
+        
        # Mid-generation thermal checkpoint (runs on the generate thread).
        _criteria = []
        _therm = _make_thermal_criteria()
@@ -998,12 +1442,36 @@ class NvidiaBackend(ModelBackend):
                    torch.cuda.empty_cache()
                else:
                    generation_error = str(e)
+            except Exception as e:
+                # Any other failure (shape/cache mismatch, transformers API change, …)
+                # must still be recorded — otherwise it is silently swallowed.
+                generation_error = str(e)
+                print(f"Error during streaming generation: {e}")
+            finally:
+                # generate() only calls streamer.end() on its success path. If it
+                # raised before finishing, end the streamer here so the consumer is
+                # never left blocked forever on an empty queue (which freezes the
+                # whole event loop).
+                streamer.end()

        thread = Thread(target=generate_with_error_handling)
        thread.start()

+        # Pull each token from a worker thread so a blocking streamer.__next__
+        # never runs on (and freezes) the asyncio event loop between tokens.
+        import asyncio
+        _SENT = object()
+        _it = iter(streamer)
+        def _next_token():
+            try:
+                return next(_it)
+            except StopIteration:
+                return _SENT
        try:
-            for text in streamer:
+            while True:
+                text = await asyncio.to_thread(_next_token)
+                if text is _SENT:
+                    break
                yield text
        except Exception as e:
            print(f"Error during stream iteration: {e}")
@@ -1059,6 +1527,79 @@ class NvidiaBackend(ModelBackend):
            del self._kv_past_key_values
        self._kv_past_key_values = None
        self._kv_prefix_len = 0
        self._kv_timestamp = 0.0
+
+    def _kv_prefix_supported(self) -> bool:
+        """Whether this model can safely reuse a manually-prefilled KV cache.
+
+        The prefix fast-path builds a cache with a plain forward pass and then
+        continues it via generate(input_ids=suffix, past_key_values=cache). That
+        only works for models that use a simple growing (Dynamic) cache. Models
+        with a sliding-window / hybrid cache (e.g. the gemma family) build a
+        different cache object during generate() and raise before the first
+        token when handed our prefix — so disable the fast-path for them and let
+        the full forward pass handle the request.
+
+        The fast-path is also disabled when the estimated KV cache for the
+        configured context exceeds 2 GiB, or when the config can't be
+        inspected. The verdict is memoised in ``self._kv_prefix_ok`` (reset to
+        None to force re-evaluation); the reason for a negative verdict is kept
+        in ``self._kv_prefix_off_reason`` and printed once.
+        """
+        cached = getattr(self, "_kv_prefix_ok", None)
+        if cached is not None:
+            return cached
+        ok = True
+        try:
+            cfg = getattr(self.model, "config", None)
+            # Multimodal wrappers nest the LM config under text_config.
+            text_cfg = getattr(cfg, "text_config", None) or cfg
+            model_type = (getattr(text_cfg, "model_type", "") or "").lower()
+            cache_impl = (getattr(text_cfg, "cache_implementation", "") or "").lower()
+            sliding = getattr(text_cfg, "sliding_window", None)
+            reason = None
+            if (
+                model_type.startswith("gemma")
+                or cache_impl in {"hybrid", "static", "sliding_window"}
+                or sliding is not None
+            ):
+                reason = "hybrid/sliding-window cache"
+            else:
+                # A large configured context means the stored prefix KV is several
+                # GB and lives *alongside* the generation cache — doubling KV
+                # memory and risking OOM on a single GPU. Not worth it: disable the
+                # fast-path so only one KV cache is ever resident.
+                kv_bytes = self._estimate_kv_cache_bytes(
+                    getattr(self, '_pending_model_name', None),
+                    getattr(self, '_pending_ctx', None),
+                )
+                if kv_bytes > 2 * 1024 ** 3:
+                    reason = f"large KV cache (~{kv_bytes/1e9:.1f}GB at configured ctx)"
+            if reason is not None:
+                ok = False
+                self._kv_prefix_off_reason = reason
+        except Exception:
+            # If we can't introspect the config, stay safe and skip the fast-path.
+            ok = False
+            self._kv_prefix_off_reason = "config introspection failed"
+        if not ok:
+            print(
+                "KV-prefix fast-path disabled for this model "
+                f"({getattr(self, '_kv_prefix_off_reason', 'unsupported')}); "
+                "using full forward pass"
+            )
+        self._kv_prefix_ok = ok
+        return ok
+
+    def _kv_prefix_headroom_ok(self, min_free_gb: float = 1.5) -> bool:
+        """Whether there is enough free VRAM to safely build/store a KV prefix.
+
+        The prefix path runs an extra forward pass and keeps a second copy of the
+        prefix KV alongside the live model. On a nearly-full card that extra
+        allocation OOMs (the build is caught and we fall back, but it wastes a
+        forward pass and risks fragmentation). Skip it when headroom is low and
+        let normal generation — which doesn't keep a separate stored prefix —
+        handle the request.
+
+        Args:
+            min_free_gb: Minimum free VRAM, in units of 1e9 bytes, required to
+                allow the prefix build.
+
+        Returns:
+            True when at least ``min_free_gb`` is free; False when it is not or
+            when free VRAM can't be queried (torch unavailable, no CUDA, …).
+        """
+        try:
+            # Imported inside the try so a missing/broken torch degrades to the
+            # safe answer instead of raising.
+            import torch
+            free, _total = torch.cuda.mem_get_info()
+            return free / 1e9 >= min_free_gb
+        except Exception:
+            return False

    # ------------------------------------------------------------------
@@ -1092,8 +1633,21 @@ class NvidiaBackend(ModelBackend):
                ids.add(int(self.tokenizer.eos_token_id))
        except Exception:
            pass
+        # The model's own generation_config is authoritative for the turn-end
+        # token(s) — e.g. gemma-4's turn terminator is <turn|> (id 106), which has
+        # no recognisable name in the loop below, so without this the model never
+        # stops after a tool call and loops to max_tokens.
+        try:
+            gc_eos = getattr(getattr(self.model, 'generation_config', None),
+                             'eos_token_id', None)
+            if isinstance(gc_eos, int):
+                ids.add(gc_eos)
+            elif isinstance(gc_eos, (list, tuple)):
+                ids.update(int(t) for t in gc_eos if isinstance(t, int))
+        except Exception:
+            pass
        for tok in ('<|im_end|>', '<|eot_id|>', '<|end|>', '<|endoftext|>',
-                    '<|end_of_text|>', '<end_of_turn>'):
+                    '<|end_of_text|>', '<end_of_turn>', '<turn|>'):
            try:
                tid = self.tokenizer.convert_tokens_to_ids(tok)
                if isinstance(tid, int) and tid >= 0 and tid != getattr(
@@ -1103,25 +1657,189 @@ class NvidiaBackend(ModelBackend):
                pass
        return list(ids) if ids else self.tokenizer.eos_token_id

+    def supports_native_tools(self) -> bool:
+        """True when the loaded model's chat template understands `tools=` natively
+        (gemma-4, Qwen, Llama-3.1, …). For those we pass the structured tools to the
+        template instead of injecting coderai's custom <tool>{…} text prompt, so the
+        model is prompted in — and replies in — its own trained tool-call format.
+
+        The tokenizer's ``chat_template`` may be a single template string or a
+        dict of named templates (e.g. ``default`` / ``tool_use``). For a dict the
+        template *bodies* are inspected; a plain ``in`` test on the dict would
+        only look at its keys and miss a dedicated tool template.
+
+        Returns:
+            True if any template body mentions ``tools`` or ``tool_calls``;
+            False when there is no template or none references tools.
+        """
+        tmpl = getattr(self.tokenizer, 'chat_template', None)
+        if isinstance(tmpl, str):
+            sources = [tmpl]
+        elif isinstance(tmpl, dict):
+            sources = [body for body in tmpl.values() if isinstance(body, str)]
+        else:
+            # None or an unrecognised container: nothing we can inspect.
+            sources = []
+        return any('tools' in body or 'tool_calls' in body for body in sources)
+
+    def _native_tools_payload(self, tools):
+        """Normalise tools to the OpenAI [{'type','function':{…}}] dicts that chat
+        templates expect.
+
+        Each tool, and its ``function`` member, may independently be a dict or an
+        attribute-style object (e.g. a pydantic Tool); both are read through the
+        same accessor, so mixed shapes are accepted too. Tools without a
+        function name are skipped.
+
+        Args:
+            tools: Iterable of tool descriptions, or None/empty.
+
+        Returns:
+            A list of ``{'type', 'function': {'name', 'description',
+            'parameters'}}`` dicts, or None if there's nothing usable.
+        """
+        if not tools:
+            return None
+
+        def _field(obj, key, default=None):
+            # Uniform read for dicts and attribute-style objects.
+            if isinstance(obj, dict):
+                return obj.get(key, default)
+            return getattr(obj, key, default)
+
+        payload = []
+        for tool in tools:
+            fn = _field(tool, 'function')
+            name = _field(fn, 'name') if fn else None
+            if not name:
+                continue
+            payload.append({
+                'type': _field(tool, 'type', 'function'),
+                'function': {
+                    'name': name,
+                    # Missing/None description or parameters collapse to the
+                    # empty defaults.
+                    'description': _field(fn, 'description') or '',
+                    'parameters': _field(fn, 'parameters') or {},
+                },
+            })
+        return payload or None
+
+    def _build_native_tool_prompt(self, messages, native_tools, enable_thinking,
+                                  add_generation_prompt):
+        """Render the prompt via the model's template with native `tools=`, keeping
+        structured `tool_calls` and `role:tool` turns intact so the template emits
+        the model's own tool-call/tool-response format.
+
+        Args:
+            messages: Chat messages as dicts or attribute-style objects.
+            native_tools: Tool declarations already normalised to OpenAI-shaped
+                dicts; forwarded to the template as ``tools=``.
+            enable_thinking: Forwarded to the template as ``enable_thinking``
+                on the first attempt.
+            add_generation_prompt: Forwarded to the template unchanged.
+
+        Returns:
+            The rendered prompt string, or None if the template can't handle it
+            (caller falls back).
+        """
+        import json as _json
+        import re as _re
+
+        def _get(m, k, default=None):
+            return m.get(k, default) if isinstance(m, dict) else getattr(m, k, default)
+
+        # Going native: strip coderai's custom <tool>{…} text instruction that
+        # format_tools_for_prompt() prepends to the system prompt, so the model
+        # isn't told to use two different tool formats at once (native tool
+        # declarations are supplied via tools= below).
+        def _strip_injected(text):
+            if not text or 'You have access to the following tools:' not in text:
+                return text
+            return _re.sub(
+                r"You have access to the following tools:.*?example\.txt.*?</tool>\s*",
+                "", text, count=1, flags=_re.DOTALL).lstrip()
+
+        # Chat templates index into / serialise `arguments` themselves, so they
+        # need a mapping. OpenAI clients send a JSON *string*; handing that to a
+        # template that applies `tojson` double-encodes it. Decode when the
+        # string is a JSON object; leave anything else untouched.
+        def _as_mapping(args):
+            if args is None:
+                return {}
+            if isinstance(args, str):
+                if not args.strip():
+                    return {}
+                try:
+                    decoded = _json.loads(args)
+                except ValueError:
+                    return args
+                return decoded if isinstance(decoded, dict) else args
+            return args
+
+        norm = []
+        for m in messages:
+            role = _get(m, 'role')
+            content = _get(m, 'content') or ''
+            if isinstance(content, list):
+                # Multi-part content: keep the text of each part, one per line.
+                content = '\n'.join(
+                    str(p.get('text', '')) if isinstance(p, dict) else str(p)
+                    for p in content)
+            if role in ('system', 'developer'):
+                content = _strip_injected(content)
+            entry = {'role': role, 'content': content}
+            tcs = _get(m, 'tool_calls')
+            if tcs:
+                # Pass tool_calls through in the OpenAI shape, but with
+                # function.arguments as a mapping (see _as_mapping above).
+                norm_tcs = []
+                for tc in tcs:
+                    fn = _get(tc, 'function') or {}
+                    name = fn.get('name') if isinstance(fn, dict) else getattr(fn, 'name', '')
+                    args = _as_mapping(_get(fn, 'arguments'))
+                    # Synthesise a positional id when the client omitted one.
+                    tcid = _get(tc, 'id') or f"call_{len(norm_tcs)}"
+                    norm_tcs.append({'id': tcid, 'type': 'function',
+                                     'function': {'name': name, 'arguments': args}})
+                entry['tool_calls'] = norm_tcs
+            tcid = _get(m, 'tool_call_id')
+            if tcid:
+                entry['tool_call_id'] = tcid
+            name = _get(m, 'name')
+            if name:
+                entry['name'] = name
+            norm.append(entry)
+        # Try with enable_thinking first; if apply_chat_template raises TypeError
+        # retry without it. Any other failure means the template can't render
+        # this conversation natively.
+        for kwargs in ({'tools': native_tools, 'add_generation_prompt': add_generation_prompt,
+                        'enable_thinking': enable_thinking},
+                       {'tools': native_tools, 'add_generation_prompt': add_generation_prompt}):
+            try:
+                return self.tokenizer.apply_chat_template(norm, tokenize=False, **kwargs)
+            except TypeError:
+                continue
+            except Exception:
+                return None
+        return None
+
    def _build_chat_prompt(self, messages, enable_thinking: bool = False,
-                           add_generation_prompt: bool = True) -> str:
+                           add_generation_prompt: bool = True, tools=None) -> str:
        """Build the prompt string using the MODEL's own chat template when it has
        one (correct special tokens + proper `enable_thinking` handling for Qwen3).
        Falls back to the legacy custom formatter when no template is available.

        `enable_thinking=True` keeps reasoning <think> blocks available for callers
        that ask for them; `False` (default) suppresses them via the template.
+
+        When `tools` is given and the template natively supports tools, the tools
+        and the structured tool_calls/tool-role turns are passed straight to the
+        template (native format) — see :meth:`supports_native_tools`.
        """
+        import json
        tmpl = getattr(self.tokenizer, 'chat_template', None)
+
+        # Native-tools fast path: hand structured tools + tool turns to the model's
+        # own template so it renders (and the model emits) its trained tool-call
+        # format, instead of folding everything into custom <tool>{…} text.
+        native_tools = self._native_tools_payload(tools) if (
+            tmpl and tools and self.supports_native_tools()) else None
+        if native_tools:
+            prompt = self._build_native_tool_prompt(
+                messages, native_tools, enable_thinking, add_generation_prompt)
+            if prompt is not None:
+                return prompt
+            # else: native render failed — fall through to the generic path.
+
        if tmpl:
            # Normalise to plain {role, content} dicts for apply_chat_template.
+            #
+            # Most chat templates (gemma, mistral, …) only understand the
+            # system/user/assistant roles and a plain `content` string — they
+            # ignore `tool_calls`/`tool_call_id` and reject (or silently drop) the
+            # `tool` role. If we simply stripped those, an agentic client
+            # (opencode, etc.) would lose the record of the tool call it already
+            # made *and* the result it got back, so the model re-issues the same
+            # call every turn — an infinite tool-call loop. So we fold tool turns
+            # back into `content` using the same `<tool>{…}</tool>` convention the
+            # tool-injection prompt teaches, and render tool results as readable
+            # text under a role the template accepts.
+            def _get(m, k, default=None):
+                return m.get(k, default) if isinstance(m, dict) else getattr(m, k, default)
+
            norm = []
            for m in messages:
-                if isinstance(m, dict):
-                    norm.append({'role': m.get('role'), 'content': m.get('content') or ''})
+                role = _get(m, 'role')
+                content = _get(m, 'content') or ''
+                if isinstance(content, list):
+                    content = '\n'.join(
+                        str(p.get('text', '')) if isinstance(p, dict) else str(p)
+                        for p in content)
+                if role == 'assistant':
+                    tcs = _get(m, 'tool_calls') or []
+                    for tc in tcs:
+                        fn = (tc.get('function') if isinstance(tc, dict)
+                              else getattr(tc, 'function', None)) or {}
+                        name = fn.get('name', '') if isinstance(fn, dict) else getattr(fn, 'name', '')
+                        args = fn.get('arguments', '{}') if isinstance(fn, dict) else getattr(fn, 'arguments', '{}')
+                        if not isinstance(args, str):
+                            try:
+                                args = json.dumps(args)
+                            except Exception:
+                                args = '{}'
+                        block = f'<tool>{{"name": "{name}", "arguments": {args}}}</tool>'
+                        content = (content + '\n' + block) if content else block
+                    norm.append({'role': 'assistant', 'content': content})
+                elif role == 'tool':
+                    # Templates that lack a `tool` role would error/drop this.
+                    # Render the result as a user turn so the model sees it.
+                    name = _get(m, 'name') or ''
+                    label = f'Tool result ({name})' if name else 'Tool result'
+                    text = f'{label}: {content}'
+                    # Merge into the previous turn if it's also a synthesised
+                    # user/tool message, to avoid consecutive same-role turns that
+                    # strict templates (gemma) reject.
+                    if norm and norm[-1]['role'] == 'user':
+                        norm[-1]['content'] = norm[-1]['content'] + '\n' + text
+                    else:
+                        norm.append({'role': 'user', 'content': text})
                else:
-                    norm.append({'role': getattr(m, 'role', None),
-                                 'content': getattr(m, 'content', '') or ''})
+                    norm.append({'role': role, 'content': content})
            try:
                return self.tokenizer.apply_chat_template(
                    norm, tokenize=False,
@@ -1154,7 +1872,7 @@ class NvidiaBackend(ModelBackend):
            max_tokens = 512

        full_prompt = self._build_chat_prompt(messages, enable_thinking=enable_thinking,
-                                              add_generation_prompt=True)
+                                              add_generation_prompt=True, tools=tools)
        total_input_ids = self.tokenizer(full_prompt, return_tensors="pt")['input_ids']
        total_prompt_len = int(total_input_ids.shape[1])

@@ -1168,9 +1886,9 @@ class NvidiaBackend(ModelBackend):
        past_kv = None
        cached_len = 0

-        if prefix_msgs and self._model_on_cuda():
+        if prefix_msgs and self._model_on_cuda() and self._kv_prefix_supported() and self._kv_prefix_headroom_ok():
            prefix_text = self._build_chat_prompt(
-                prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False)
+                prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False, tools=tools)
            if self._kv_cache_valid() and self._kv_prefix_text == prefix_text:
                past_kv = self._kv_past_key_values
                cached_len = self._kv_prefix_len
@@ -1223,6 +1941,7 @@ class NvidiaBackend(ModelBackend):
                        input_ids=total_input_ids,
                        attention_mask=attn_mask,
                        **gen_kwargs,
+                        **self._cache_gen_kwargs(using_prefix=False),
                    )
                new_tokens = outputs[0][total_prompt_len:]
            generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
@@ -1300,7 +2019,7 @@ class NvidiaBackend(ModelBackend):
            max_tokens = 512

        full_prompt = self._build_chat_prompt(messages, enable_thinking=enable_thinking,
-                                              add_generation_prompt=True)
+                                              add_generation_prompt=True, tools=tools)
        total_input_ids = self.tokenizer(full_prompt, return_tensors="pt")['input_ids']
        total_prompt_len = int(total_input_ids.shape[1])

@@ -1312,9 +2031,9 @@ class NvidiaBackend(ModelBackend):
        past_kv = None
        cached_len = 0

-        if prefix_msgs and self._model_on_cuda():
+        if prefix_msgs and self._model_on_cuda() and self._kv_prefix_supported() and self._kv_prefix_headroom_ok():
            prefix_text = self._build_chat_prompt(
-                prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False)
+                prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False, tools=tools)
            if self._kv_cache_valid() and self._kv_prefix_text == prefix_text:
                past_kv = self._kv_past_key_values
                cached_len = self._kv_prefix_len
@@ -1329,21 +2048,46 @@ class NvidiaBackend(ModelBackend):
        temperature, top_p, do_sample = self._validate_params(temperature, top_p)

        total_input_ids = total_input_ids.to(self.model.device)
-        if past_kv is not None and 0 < cached_len < total_prompt_len:
+
+        # Stopping criteria (thermal checkpoint + optional stop sequences) are
+        # independent of the KV-prefix path, so build them once and reuse across
+        # both the cached attempt and any full-forward fallback.
+        _criteria = []
+        _therm = _make_thermal_criteria()
+        if _therm is not None:
+            _criteria.append(_therm)
+        if stop:
+            class _StopOnSeq(StoppingCriteria):
+                def __init__(self, seqs, tok):
+                    self.seqs = seqs
+                    self.tok = tok
+                def __call__(self, input_ids, scores, **kw):
+                    decoded = self.tok.decode(input_ids[0][-20:], skip_special_tokens=True)
+                    return any(s in decoded for s in self.seqs)
+            _criteria.append(_StopOnSeq(stop, self.tokenizer))
+        stopping = StoppingCriteriaList(_criteria) if _criteria else None
+
+        import asyncio
+        _SENT = object()
+
+        def _build_attempt(use_prefix, plain_cache=False):
+            """Build (streamer, gen_kwargs, used_cached_len) for one attempt.
+            use_prefix=False forces a clean full forward pass (no past_key_values);
+            plain_cache=True forces the default in-GPU cache (fallback path)."""
+            if use_prefix and past_kv is not None and 0 < cached_len < total_prompt_len:
                gen_input_ids = total_input_ids[:, cached_len:]
                full_attn = torch.ones(
                    1, total_prompt_len, dtype=torch.long, device=self.model.device
                )
                extra_gen = {'past_key_values': past_kv, 'attention_mask': full_attn}
+                used_cached = cached_len
            else:
-            cached_len = 0
                gen_input_ids = total_input_ids
                extra_gen = {'attention_mask': torch.ones_like(total_input_ids)}
-
+                used_cached = 0
            streamer = TextIteratorStreamer(
                self.tokenizer, skip_prompt=True, skip_special_tokens=True
            )
-
            gen_kwargs = dict(
                input_ids=gen_input_ids,
                max_new_tokens=max_tokens,
@@ -1356,26 +2100,25 @@ class NvidiaBackend(ModelBackend):
                use_cache=True,
                **extra_gen,
            )
-
-        # Mid-generation thermal checkpoint (runs on the generate thread).
-        _criteria = []
-        _therm = _make_thermal_criteria()
-        if _therm is not None:
-            _criteria.append(_therm)
-        if stop:
-            class _StopOnSeq(StoppingCriteria):
-                def __init__(self, seqs, tok):
-                    self.seqs = seqs
-                    self.tok = tok
-                def __call__(self, input_ids, scores, **kw):
-                    decoded = self.tok.decode(input_ids[0][-20:], skip_special_tokens=True)
-                    return any(s in decoded for s in self.seqs)
-            _criteria.append(_StopOnSeq(stop, self.tokenizer))
-        if _criteria:
-            gen_kwargs['stopping_criteria'] = StoppingCriteriaList(_criteria)
+            if stopping is not None:
+                gen_kwargs['stopping_criteria'] = stopping
+            # Select the KV-cache strategy (quantized / offloaded) on the
+            # full-forward path. Not combinable with a prefix cache; plain_cache
+            # forces the default cache for the guaranteed-working fallback.
+            gen_kwargs.update(self._cache_gen_kwargs(
+                using_prefix=used_cached > 0, plain=plain_cache))
+            return streamer, gen_kwargs, used_cached

        gen_error = [None]
        comp_tokens = [0]
+        final_cached_len = [0]
+
+        async def _attempt(use_prefix, plain_cache=False):
+            """Run one generation attempt, yielding decoded text. Records any
+            failure in gen_error[0] and the prefix length actually used."""
+            gen_error[0] = None
+            streamer, gen_kwargs, used_cached = _build_attempt(use_prefix, plain_cache)
+            final_cached_len[0] = used_cached

            def _run():
                try:
@@ -1383,27 +2126,71 @@ class NvidiaBackend(ModelBackend):
                        self.model.generate(**gen_kwargs)
                except Exception as e:
                    gen_error[0] = str(e)
+                    print(f"Error during {'KV-cached' if use_prefix else 'full'} stream generation: {e}")
+                    # Release whatever the failed pass reserved so the next request
+                    # starts from a clean allocator state (esp. on OOM).
+                    if "out of memory" in str(e).lower():
+                        try:
+                            torch.cuda.empty_cache()
+                        except Exception:
+                            pass
+                finally:
+                    # generate() only ends the streamer on success; if it raised
+                    # before finishing, end it here so the consumer never deadlocks
+                    # on an empty queue (which would freeze the whole event loop).
+                    streamer.end()

            thread = Thread(target=_run)
            thread.start()
-
+            # Pull each token from a worker thread so a blocking streamer.__next__
+            # never runs on (and freezes) the asyncio event loop between tokens.
+            _it = iter(streamer)
+            def _next_token():
                try:
-            for text in streamer:
+                    return next(_it)
+                except StopIteration:
+                    return _SENT
+            try:
+                while True:
+                    text = await asyncio.to_thread(_next_token)
+                    if text is _SENT:
+                        break
                    comp_tokens[0] += 1
                    yield text
-        except Exception as e:
-            print(f"Error during KV-cached stream iteration: {e}")
            finally:
                thread.join()
+
+        try:
+            # First attempt: use the cached KV prefix when one is available.
+            async for text in _attempt(use_prefix=True):
+                yield text
+            # If the first attempt failed before emitting any token — whether from
+            # a stale prefix or an unsupported quantized/offloaded cache — retry
+            # once with a clean full forward pass and the default in-GPU cache,
+            # which is guaranteed to work if the model works at all.
+            if gen_error[0] and comp_tokens[0] == 0:
+                print("generation failed before first token; retrying with plain full forward pass")
+                self.invalidate_kv_cache()
+                async for text in _attempt(use_prefix=False, plain_cache=True):
+                    yield text
+        finally:
            self._last_usage = {
                'prompt_tokens': total_prompt_len,
                'completion_tokens': comp_tokens[0],
-                'cached_tokens': cached_len,
+                'cached_tokens': final_cached_len[0],
            }

        if gen_error[0]:
-            print(f"Warning: KV-cached stream generation error: {gen_error[0]}")
+            print(f"Warning: stream generation error (after fallback): {gen_error[0]}")
            self.invalidate_kv_cache()
+            # If we produced nothing, the client would otherwise receive an empty
+            # but successful completion. Stream a visible error notice instead.
+            if comp_tokens[0] == 0:
+                if "out of memory" in gen_error[0].lower():
+                    yield ("[error: GPU ran out of memory during generation — "
+                           "try a shorter prompt/context or a smaller model]")
+                else:
+                    yield "[error: text generation failed — see server logs]"

    def get_model_name(self) -> str:
        return self.model_name or "unknown"

--- a/codai/backends/ds4.py
+++ b/codai/backends/ds4.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""ds4 (DeepSeek V4) proxy backend.
+
+ds4-server already speaks the OpenAI HTTP API, so this backend is a thin proxy: it
+forwards chat/completion requests to the managed ``ds4-server`` subprocess (whose
+lifecycle is owned by :mod:`codai.api.ds4_worker`) and adapts the responses to the
+:class:`~codai.backends.base.ModelBackend` contract the model manager expects.
+
+Tool/think parsing is handled the same way as the other backends — by
+``ModelParserAdapter`` over the returned text — so tools are not forwarded to
+ds4-server; the text-level ``DeepSeekParser`` extracts ``<think>`` and tool calls.
+"""
+
+import asyncio
+import threading
+from typing import AsyncGenerator, Dict, List, Optional
+
+from codai.backends.base import ModelBackend
+
+
+class Ds4Backend(ModelBackend):
+    """Proxy backend that routes generation to a managed ds4-server.
+
+    The backend holds no model itself: `load_model` obtains the server's base
+    URL from :mod:`codai.api.ds4_worker`, and every generation call is an HTTP
+    request to that server's ``/v1/chat/completions`` endpoint. ``tools`` are
+    accepted for interface compatibility but are not forwarded.
+    """
+
+    def __init__(self, cfg=None):
+        # cfg is a codai.config.Ds4Config. When omitted, resolve the active one.
+        if cfg is None:
+            from codai.config import Ds4Config
+            cfg = Ds4Config()
+        self._cfg = cfg
+        self._model_id = getattr(cfg, "model_id", "deepseek-v4") or "deepseek-v4"
+        # Base URL of the running ds4-server; set by load_model().
+        self._url: Optional[str] = None
+        self._ctx = int(getattr(cfg, "ctx", 100000) or 100000)
+        # Token accounting of the most recent request (see _store_usage).
+        self._last_usage: Dict = {}
+
+    # ------------------------------------------------------------------ #
+    # lifecycle
+    # ------------------------------------------------------------------ #
+    def load_model(self, model_name: str, **kwargs) -> None:
+        """Record the model id and obtain the ds4-server base URL.
+
+        Extra keyword arguments are accepted and ignored.
+        """
+        from codai.api import ds4_worker
+        if model_name:
+            self._model_id = model_name
+        self._url = ds4_worker.ensure_service(self._cfg)
+
+    def get_model_name(self) -> str:
+        """Return the model id sent in request payloads."""
+        return self._model_id
+
+    def get_context_size(self) -> int:
+        """Return the configured context size (``cfg.ctx``, default 100000)."""
+        return self._ctx
+
+    def get_last_usage(self) -> dict:
+        """Return a copy of the usage recorded for the most recent request."""
+        return dict(self._last_usage)
+
+    def cleanup(self) -> None:
+        """Ask ds4_worker to stop the service and forget its URL."""
+        from codai.api import ds4_worker
+        ds4_worker.stop_service(getattr(self._cfg, "model_id", self._model_id))
+        self._url = None
+
+    # ------------------------------------------------------------------ #
+    # helpers
+    # ------------------------------------------------------------------ #
+    def _base(self) -> str:
+        """Return the server base URL.
+
+        Raises:
+            RuntimeError: if `load_model` has not provided a URL yet.
+        """
+        if not self._url:
+            raise RuntimeError("ds4 service not started")
+        return self._url
+
+    def _store_usage(self, usage: dict) -> None:
+        """Remember token counts from a response; empty usage is ignored."""
+        if usage:
+            prompt = usage.get("prompt_tokens") or 0
+            completion = usage.get("completion_tokens") or 0
+            self._last_usage = {
+                "prompt_tokens": prompt,
+                "completion_tokens": completion,
+                # Derive the total when the server omits it, rather than
+                # reporting a misleading 0.
+                "total_tokens": usage.get("total_tokens") or (prompt + completion),
+            }
+
+    def format_messages(self, messages) -> str:
+        """Flatten messages to ``"role: content"`` lines."""
+        # ds4-server applies DeepSeek V4's own chat template server-side; this is only
+        # used by callers that need a flat prompt string.
+        parts = []
+        for m in messages:
+            role = m.get("role") if isinstance(m, dict) else getattr(m, "role", "")
+            content = m.get("content") if isinstance(m, dict) else getattr(m, "content", "")
+            parts.append(f"{role}: {content}")
+        return "\n".join(parts)
+
+    def _chat_payload(self, messages, max_tokens, temperature, top_p, stop, stream,
+                      response_format=None):
+        """Build the JSON body for ``/v1/chat/completions``.
+
+        ``max_tokens`` and ``stop`` are only included when truthy. Of
+        ``response_format`` only ``{"type": "json_object"}`` is forwarded; it
+        may be a dict or an attribute-style object.
+        """
+        payload = {
+            "model": self._model_id,
+            "messages": messages,
+            "temperature": temperature,
+            "top_p": top_p,
+            "stream": stream,
+        }
+        if max_tokens:
+            payload["max_tokens"] = max_tokens
+        if stop:
+            payload["stop"] = stop
+        fmt_type = (response_format.get("type") if isinstance(response_format, dict)
+                    else getattr(response_format, "type", None))
+        if fmt_type == "json_object":
+            payload["response_format"] = {"type": "json_object"}
+        return payload
+
+    # ------------------------------------------------------------------ #
+    # chat-level generation (preferred by the manager)
+    # ------------------------------------------------------------------ #
+    def generate_chat(self, messages: List[Dict], max_tokens=None, temperature=0.7,
+                      top_p=1.0, stop=None, tools=None, response_format=None):
+        """Run one blocking chat completion and return the assistant text.
+
+        Returns:
+            The first choice's message content, or ``""`` when it is empty.
+
+        Raises:
+            RuntimeError: if the service has not been started.
+            requests.HTTPError: if the server answers with an error status.
+        """
+        import requests
+        payload = self._chat_payload(messages, max_tokens, temperature, top_p, stop,
+                                     False, response_format)
+        r = requests.post(self._base() + "/v1/chat/completions", json=payload, timeout=3600)
+        r.raise_for_status()
+        data = r.json()
+        self._store_usage(data.get("usage", {}))
+        return data["choices"][0]["message"].get("content") or ""
+
+    async def generate_chat_stream(self, messages: List[Dict], max_tokens=None,
+                                   temperature=0.7, top_p=1.0, stop=None, tools=None,
+                                   response_format=None) -> AsyncGenerator[str, None]:
+        """Stream a chat completion, yielding content deltas as they arrive.
+
+        ``response_format`` is forwarded the same way as in `generate_chat`.
+        """
+        payload = self._chat_payload(messages, max_tokens, temperature, top_p, stop,
+                                     True, response_format)
+        async for chunk in self._stream(self._base() + "/v1/chat/completions", payload,
+                                        delta_key="delta"):
+            yield chunk
+
+    # ------------------------------------------------------------------ #
+    # plain completion (fallback path)
+    # ------------------------------------------------------------------ #
+    def generate(self, prompt: str, max_tokens=None, temperature: float = 0.7,
+                 top_p: float = 1.0, stop=None, repeat_penalty: float = 1.0,
+                 presence_penalty: float = 0.0, frequency_penalty: float = 0.0) -> str:
+        """Complete ``prompt`` by sending it as a single user chat message.
+
+        The penalty parameters are accepted but not forwarded to the server.
+        """
+        return self.generate_chat([{"role": "user", "content": prompt}],
+                                  max_tokens, temperature, top_p, stop)
+
+    async def generate_stream(self, prompt: str, max_tokens=None, temperature: float = 0.7,
+                              top_p: float = 1.0, stop=None, repeat_penalty: float = 1.0,
+                              presence_penalty: float = 0.0,
+                              frequency_penalty: float = 0.0) -> AsyncGenerator[str, None]:
+        """Streaming counterpart of `generate`; penalties are not forwarded."""
+        async for chunk in self.generate_chat_stream(
+                [{"role": "user", "content": prompt}], max_tokens, temperature, top_p, stop):
+            yield chunk
+
+    # ------------------------------------------------------------------ #
+    # SSE streaming: iterate the blocking requests stream on a worker thread
+    # and hand chunks to the event loop through an asyncio.Queue.
+    # ------------------------------------------------------------------ #
+    async def _stream(self, url: str, payload: dict, delta_key: str
+                      ) -> AsyncGenerator[str, None]:
+        """POST ``payload`` to ``url`` and yield the text of each SSE chunk.
+
+        Args:
+            url: Full endpoint URL.
+            payload: JSON request body (expected to have ``stream: True``).
+            delta_key: Key inside each choice that holds the ``content`` dict.
+
+        Raises:
+            Exception: whatever the worker thread hit (connection or HTTP
+                error) is re-raised in the consumer.
+        """
+        import json
+        loop = asyncio.get_running_loop()
+        queue: asyncio.Queue = asyncio.Queue()
+        _SENTINEL = object()
+        # Set when the consumer goes away so the worker stops reading and the
+        # HTTP connection is released instead of streaming to nobody.
+        cancelled = threading.Event()
+
+        def _emit(item):
+            try:
+                loop.call_soon_threadsafe(queue.put_nowait, item)
+            except RuntimeError:
+                # Event loop already closed: nobody is listening any more.
+                cancelled.set()
+
+        def _worker():
+            import requests
+            try:
+                with requests.post(url, json=payload, stream=True, timeout=3600) as r:
+                    r.raise_for_status()
+                    for raw in r.iter_lines(decode_unicode=True):
+                        if cancelled.is_set():
+                            break
+                        if not raw or not raw.startswith("data:"):
+                            continue
+                        data = raw[len("data:"):].strip()
+                        if data == "[DONE]":
+                            break
+                        try:
+                            obj = json.loads(data)
+                        except ValueError:
+                            # Skip malformed/partial events rather than abort.
+                            continue
+                        choice = (obj.get("choices") or [{}])[0]
+                        text = (choice.get(delta_key) or {}).get("content") or ""
+                        if text:
+                            _emit(text)
+                        if obj.get("usage"):
+                            self._store_usage(obj["usage"])
+                        # NOTE(review): a usage-only chunk sent *after* the
+                        # finish_reason chunk is not read — confirm against
+                        # ds4-server's streaming behaviour.
+                        if choice.get("finish_reason"):
+                            break
+            except Exception as exc:  # surface to the consumer
+                _emit(exc)
+            finally:
+                _emit(_SENTINEL)
+
+        threading.Thread(target=_worker, daemon=True).start()
+        try:
+            while True:
+                item = await queue.get()
+                if item is _SENTINEL:
+                    break
+                if isinstance(item, Exception):
+                    raise item
+                yield item
+        finally:
+            # Runs on normal end, on error, and when the generator is closed
+            # early (client disconnect / cancellation).
+            cancelled.set()
--- a/codai/backends/vulkan.py
+++ b/codai/backends/vulkan.py
@@ -621,6 +621,27 @@ class VulkanBackend(ModelBackend):
            else:
                raise ValueError(f"Could not cache model from URL: {model_path}")
        
+        # Fallback: a configured .gguf path that no longer exists (e.g. the file was
+        # downloaded into the GGUF cache rather than the HF-hub snapshot the entry
+        # points at, or a stale snapshot hash). Look for the same filename in the
+        # GGUF cache dir before giving up — the model loads without re-editing the
+        # config entry.
+        if model_path.endswith('.gguf') and not os.path.exists(model_path):
+            try:
+                from codai.models.cache import get_model_cache_dir
+                _base = os.path.basename(model_path)
+                _cache = get_model_cache_dir()
+                _cand = os.path.join(_cache, _base)
+                if not os.path.exists(_cand):
+                    import glob as _glob
+                    _hits = _glob.glob(os.path.join(_cache, "**", _base), recursive=True)
+                    _cand = _hits[0] if _hits else _cand
+                if os.path.exists(_cand):
+                    print(f"  Model path missing; resolved from GGUF cache: {_cand}")
+                    model_path = _cand
+            except Exception:
+                pass
+
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")
        

--- a/codai/broker/capabilities.py
+++ b/codai/broker/capabilities.py
@@ -49,7 +49,13 @@ def build_hardware_summary() -> Dict[str, Any]:
    total_vram_mb = 0
    available_vram_mb = 0

+    # Only use torch if it's ALREADY loaded (i.e. we're in an engine). Never import
+    # it here — the front is torch-free and must stay that way (importing torch in
+    # the front is heavy and would initialise CUDA in the wrong process).
+    import sys as _sys
    try:
+        if "torch" not in _sys.modules:
+            raise ImportError("torch not loaded (front) — using torch-free path")
        import torch

        if torch.cuda.is_available():
@@ -76,6 +82,23 @@ def build_hardware_summary() -> Dict[str, Any]:
    except Exception:
        pass

+    # Torch-free path (e.g. the front, which imports no torch): enumerate every
+    # physical card via nvidia-smi + sysfs so VRAM is reported for the whole node.
+    if not gpus:
+        try:
+            from codai.frontproxy.gpu_detect import gpu_stats
+            for c in gpu_stats():
+                total_mb = int(round((c.get("mem_total") or 0) * 1024))
+                used_mb = int(round((c.get("mem_used") or 0) * 1024))
+                if total_mb <= 0:
+                    continue
+                gpus.append({"name": c.get("name") or c.get("vendor"),
+                             "total_vram_mb": total_mb})
+                total_vram_mb += total_mb
+                available_vram_mb += max(0, total_mb - used_mb)
+        except Exception:
+            pass
+
    if not gpus:
        for total_path in sorted(glob.glob("/sys/class/drm/card*/device/mem_info_vram_total")):
            used_path = total_path.replace("vram_total", "vram_used")

--- a/codai/broker/dispatcher.py
+++ b/codai/broker/dispatcher.py
@@ -60,8 +60,13 @@ def _is_text_response(content_type: str | None) -> bool:
    )


-async def execute_broker_request(app, envelope):
-    """Validate and execute a broker request envelope."""
+async def execute_broker_request(app, envelope, executor=None):
+    """Validate and execute a broker request envelope.
+
+    ``executor`` is an ``async (method, path, headers, query, body) -> {status_code,
+    headers, body}`` callable. When omitted the request is run in-process against
+    ``app`` via the ASGI bridge (engine / single-process mode). The front passes its
+    own executor that proxies to the right engine over HTTP."""

    logger.debug(
        "broker dispatch → op=%s request_id=%s path=%r method=%r stream=%s",
@@ -136,6 +141,12 @@ async def execute_broker_request(app, envelope):
        headers["content-type"] = envelope.content_type

    started_at = perf_counter()
+    if executor is not None:
+        response = await executor(
+            method=envelope.method, path=envelope.path, headers=headers,
+            query=envelope.query, body=body,
+        )
+    else:
        response = await execute_internal_request(
            app,
            method=envelope.method,

--- a/codai/cli.py
+++ b/codai/cli.py
@@ -224,6 +224,13 @@ configuration directory (--config DIR, default: OS-specific CoderAI directory).
        action="store_true",
        help="Dump model output: raw output, parsed output, and litellm debug info",
    )
+    parser.add_argument(
+        "--debug-requests",
+        action="store_true",
+        help="Log the full request/response payloads exchanged with API clients "
+             "(opencode, etc.): incoming messages + tools and the outgoing "
+             "content/tool_calls. Use to diagnose agentic tool-call loops.",
+    )
    parser.add_argument(
        "--list-cached-models",
        action="store_true",
@@ -278,4 +285,39 @@ configuration directory (--config DIR, default: OS-specific CoderAI directory).
        help="Ignore any existing pipeline cache and rebuild it from scratch this "
             "run (use after changing a model's quantization/precision config).",
    )
+    # ─── Frontend/engine split ───────────────────────────────────────────────
+    parser.add_argument(
+        "--single-process",
+        action="store_true",
+        help="Run the legacy single-process server (UI/API and all model work in "
+             "one process). Default boots a front proxy + supervised engine "
+             "subprocess(es) so the web UI stays responsive during model work.",
+    )
+    parser.add_argument(
+        "--engine-only",
+        action="store_true",
+        help="Run this process as an engine (binds an internal localhost port, no "
+             "front proxy). Normally launched automatically by the front; not "
+             "intended to be run by hand.",
+    )
+    parser.add_argument(
+        "--internal-port",
+        type=int,
+        default=None,
+        help="Internal port for --engine-only mode (the front assigns one per engine).",
+    )
+    parser.add_argument(
+        "--debug-engine",
+        action="store_true",
+        help="General engine debugging in the front/engine split (engine lifecycle, "
+             "spawn details, health transitions). Does NOT include the internal "
+             "HTTP access log — use --debug-engine-web for that.",
+    )
+    parser.add_argument(
+        "--debug-engine-web",
+        action="store_true",
+        help="Show the internal front↔engine HTTP requests in an engine's access log "
+             "(proxied calls, /internal/engine-state, /healthz, …). Suppressed by "
+             "default since every engine only ever serves internal front traffic.",
+    )
    return parser.parse_args()
--- a/codai/config.py
+++ b/codai/config.py
@@ -34,6 +34,43 @@ class ServerConfig:
    https_cert_path: Optional[str] = None
    queue_max_size: int = 6
    max_parallel_requests: int = 2
+    # Per-engine overrides for max_parallel_requests, keyed by engine name
+    # (e.g. {"nvidia": 4, "radeon": 1}). Each engine is a separate process and
+    # enforces this on itself, so the default already applies per-engine; the
+    # override lets a bigger card run more concurrently than a smaller one. Blank =
+    # use the default above.
+    max_parallel_requests_overrides: dict = field(default_factory=dict)
+    # ─── Frontend/engine split ───────────────────────────────────────────────
+    # By default coderai boots a thin, always-responsive *front* reverse proxy on
+    # the public host/port and supervises one or more *engine* subprocesses (which
+    # do all GPU/model work) on internal localhost ports. This keeps the web UI
+    # responsive while a model loads or generates. Set single_process=True (or pass
+    # --single-process) to keep the legacy one-process behavior.
+    single_process: bool = False
+    internal_port_base: int = 8780      # first engine binds here; +1 per extra engine
+    engines: int = 0                    # 0 = auto (one per detected GPU, min 1)
+    engine_gpus: Optional[list] = None  # explicit GPU indices, e.g. [0, 1]; None = auto
+    proxy_status_timeout: float = 2.0   # short timeout for UI/status proxying (seconds)
+    proxy_max_inflight: int = 64        # max concurrent proxied requests through the front
+    # Explicit, heterogeneous engine declarations. Auto GPU detection only finds
+    # NVIDIA cards and assumes one backend, and CUDA vs Vulkan device enumeration is
+    # inconsistent — so for mixed setups (e.g. an NVIDIA + a Radeon card, where the
+    # NVIDIA engine also serves GGUF via Vulkan) declare each engine with its own
+    # env block. When non-empty this overrides `engines`/`engine_gpus`. Each item:
+    #   {
+    #     "name": "nvidia",          # label for logs
+    #     "backend": "nvidia",       # nvidia | vulkan (forces this engine's backend)
+    #     "capabilities": [...],     # optional; defaults from backend (see below)
+    #     "env": { "CUDA_VISIBLE_DEVICES": "0", "GGML_VK_VISIBLE_DEVICES": "0",
+    #              "VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json" }
+    #   }
+    # Default capabilities: nvidia → ["transformers","gguf"]; vulkan → ["gguf"].
+    engine_specs: Optional[list] = None
+    # Preferred engine (by name or backend) when a model is compatible with more
+    # than one — e.g. a GGUF that could run on either an NVIDIA or a Radeon engine.
+    # None = spread to the least-loaded compatible engine. A per-model "engine" set
+    # in models.json overrides this for that model.
+    default_engine: Optional[str] = None


 @dataclass
@@ -52,6 +89,9 @@ class ModelsConfig:
    hf_cache_dir: Optional[str] = None
    gguf_cache_dir: Optional[str] = None
    max_model_instances: int = 1  # max concurrent instances per model (global default; overridable per-model via "max_instances")
+    # Per-engine overrides for max_model_instances, keyed by engine name
+    # (e.g. {"nvidia": 2, "radeon": 1}). Applied per-engine process; blank = default.
+    max_model_instances_overrides: dict = field(default_factory=dict)


 @dataclass
@@ -72,6 +112,13 @@ class OffloadConfig:
    max_ram_gb: Optional[float] = None
    evict_idle_on_ram: bool = True   # unload idle LRU models when over the RAM cap
    ram_leak_watch: bool = True      # background watcher samples RSS + auto-mitigates
+    # Leak-watch mitigation tuning. The watcher runs a mitigation ladder when RSS
+    # crosses ram_watch_soft_fraction of the cap (or a leak is suspected). On a
+    # marginal GPU the cross-thread CUDA call in that ladder can be undesirable, so
+    # ram_watch_cuda gates whether mitigation is allowed to call torch.cuda.empty_cache().
+    ram_watch_poll_seconds: float = 15.0    # how often the watcher samples RSS
+    ram_watch_soft_fraction: float = 0.90   # mitigate at/above this fraction of the cap
+    ram_watch_cuda: bool = True             # allow mitigation to call CUDA empty_cache()


 @dataclass
@@ -130,6 +177,11 @@ class ThermalConfig:
    cpu_resume: float = 87.0    # resume once CPU drops back to/below this
    gpu_high: float = 90.0      # pause when GPU reaches this temperature
    gpu_resume: float = 87.0    # resume once GPU drops back to/below this
+    # Per-vendor GPU threshold overrides, e.g. {"amd": {"high": 95, "resume": 92}}.
+    # A card uses its vendor's override when present, else the gpu_high/gpu_resume
+    # defaults above — so e.g. a Radeon that runs hotter can have a higher limit
+    # than an NVIDIA card. Keyed by vendor: "nvidia" | "amd" | "intel".
+    gpu_overrides: dict = field(default_factory=dict)
    poll_seconds: float = 5.0   # how often to re-check while cooling down
    # Proactive soft-throttle: before a hard pause, when a sensor enters the warm
    # band [soft_throttle_temp, *_high) insert a short per-step sleep (scaled by
@@ -162,6 +214,30 @@ class EnhanceConfig:
    allow_rife_ncnn: bool = False     # allow the external rife-ncnn-vulkan binary instead of a torch model


+@dataclass
+class Ds4Config:
+    """DeepSeek V4 via ds4 (antirez/DwarfStar) external-worker configuration.
+
+    ds4 is a native inference engine built specifically for DeepSeek V4 that exposes
+    an OpenAI-compatible HTTP server (``ds4-server``). When ``enabled``, coderai owns
+    the whole lifecycle: on first use it clones + builds ds4, downloads the chosen
+    GGUF weight variant, launches ``ds4-server`` as a managed subprocess, and proxies
+    text requests to it. Any requested model whose name matches ``model_id`` (or
+    contains ``deepseek-v4``) is routed to ds4 instead of the normal backends.
+
+    ``ConfigManager`` loads these fields from, and writes them back to, the
+    ``"ds4"`` section of the configuration file; every field has a default, so
+    the section may be omitted entirely.
+    """
+    enabled: bool = False
+    repo_url: str = "https://github.com/antirez/ds4"
+    install_dir: Optional[str] = None      # None = ~/.coderai/ds4
+    build_target: str = "auto"             # auto|cuda-generic|cuda-spark|metal|cpu
+    model_variant: str = "q4-imatrix"      # download_model.sh variant
+    model_id: str = "deepseek-v4"          # model id/alias that routes to ds4
+    host: str = "127.0.0.1"
+    port: int = 0                          # 0 = auto-pick a free port
+    ctx: int = 100000                      # ds4-server --ctx context window
+    extra_args: str = ""                   # extra flags passed to ds4-server
+    auto_build: bool = True                # clone+build the binary if it's missing
+
+
 @dataclass
 class Config:
    """Main configuration class."""
@@ -177,6 +253,7 @@ class Config:
    thermal: ThermalConfig = field(default_factory=ThermalConfig)
    jobs: JobsConfig = field(default_factory=JobsConfig)
    enhance: EnhanceConfig = field(default_factory=EnhanceConfig)
+    ds4: Ds4Config = field(default_factory=Ds4Config)
    broker: BrokerConfig = field(default_factory=BrokerConfig)
    system_prompt: Optional[str] = None
    tools_closer_prompt: bool = False
@@ -338,6 +415,7 @@ class ConfigManager:
                thermal=ThermalConfig(**config_data.get("thermal", {})),
                jobs=JobsConfig(**config_data.get("jobs", {})),
                enhance=EnhanceConfig(**config_data.get("enhance", {})),
+                ds4=Ds4Config(**config_data.get("ds4", {})),
                broker=BrokerConfig(**config_data.get("broker", {})),
                system_prompt=config_data.get("system_prompt"),
                tools_closer_prompt=config_data.get("tools_closer_prompt", False),
@@ -401,6 +479,15 @@ class ConfigManager:
                "https_cert_path": self.config.server.https_cert_path,
                "queue_max_size": self.config.server.queue_max_size,
                "max_parallel_requests": self.config.server.max_parallel_requests,
+                "max_parallel_requests_overrides": self.config.server.max_parallel_requests_overrides,
+                "single_process": self.config.server.single_process,
+                "internal_port_base": self.config.server.internal_port_base,
+                "engines": self.config.server.engines,
+                "engine_gpus": self.config.server.engine_gpus,
+                "proxy_status_timeout": self.config.server.proxy_status_timeout,
+                "proxy_max_inflight": self.config.server.proxy_max_inflight,
+                "engine_specs": self.config.server.engine_specs,
+                "default_engine": self.config.server.default_engine,
            },
            "backend": {
                "type": self.config.backend.type,
@@ -412,6 +499,8 @@ class ConfigManager:
                "default_load_mode": self.config.models.default_load_mode,
                "hf_cache_dir": self.config.models.hf_cache_dir,
                "gguf_cache_dir": self.config.models.gguf_cache_dir,
+                "max_model_instances": self.config.models.max_model_instances,
+                "max_model_instances_overrides": self.config.models.max_model_instances_overrides,
            },
            "offload": {
                "directory": self.config.offload.directory,
@@ -424,7 +513,10 @@ class ConfigManager:
                "flash_attention": self.config.offload.flash_attention,
                "max_ram_gb": self.config.offload.max_ram_gb,
                "evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
-                "ram_leak_watch": self.config.offload.ram_leak_watch
+                "ram_leak_watch": self.config.offload.ram_leak_watch,
+                "ram_watch_poll_seconds": self.config.offload.ram_watch_poll_seconds,
+                "ram_watch_soft_fraction": self.config.offload.ram_watch_soft_fraction,
+                "ram_watch_cuda": self.config.offload.ram_watch_cuda
            },
            "vulkan": {
                "n_gpu_layers": self.config.vulkan.n_gpu_layers,
@@ -458,6 +550,7 @@ class ConfigManager:
                "cpu_resume": self.config.thermal.cpu_resume,
                "gpu_high": self.config.thermal.gpu_high,
                "gpu_resume": self.config.thermal.gpu_resume,
+                "gpu_overrides": self.config.thermal.gpu_overrides,
                "poll_seconds": self.config.thermal.poll_seconds,
                "soft_throttle_enabled": self.config.thermal.soft_throttle_enabled,
                "soft_throttle_temp": self.config.thermal.soft_throttle_temp,
@@ -470,6 +563,19 @@ class ConfigManager:
                "allow_ffmpeg": self.config.enhance.allow_ffmpeg,
                "allow_rife_ncnn": self.config.enhance.allow_rife_ncnn,
            },
+            "ds4": {
+                "enabled": self.config.ds4.enabled,
+                "repo_url": self.config.ds4.repo_url,
+                "install_dir": self.config.ds4.install_dir,
+                "build_target": self.config.ds4.build_target,
+                "model_variant": self.config.ds4.model_variant,
+                "model_id": self.config.ds4.model_id,
+                "host": self.config.ds4.host,
+                "port": self.config.ds4.port,
+                "ctx": self.config.ds4.ctx,
+                "extra_args": self.config.ds4.extra_args,
+                "auto_build": self.config.ds4.auto_build,
+            },
            "broker": {
                "enabled": self.config.broker.enabled,
                "base_url": self.config.broker.base_url,

--- a/codai/frontproxy/__init__.py
+++ b/codai/frontproxy/__init__.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Front proxy package: always-responsive web/API front + supervised engines.
+
+See ``docs/frontend-engine-split.md`` and ``docs/process-isolation-plans.md``.
+"""
+
+from codai.frontproxy.app import run_front, build_app
+
+__all__ = ["run_front", "build_app"]
--- a/codai/frontproxy/app.py
+++ b/codai/frontproxy/app.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""The front: a thin, always-responsive reverse proxy in front of the engines.
+
+It imports no torch/transformers/diffusers, so its event loop is never blocked by
+model work. It streams requests/responses (incl. SSE) to the engine chosen by
+:mod:`codai.frontproxy.router`, and serves an aggregated, cached status so the web
+UI stays live even while an engine is busy loading a model.
+"""
+
+import json
+import time
+from typing import Optional
+
+import httpx
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response, StreamingResponse
+from starlette.background import BackgroundTask
+
+from codai.frontproxy.registry import EngineRegistry
+from codai.frontproxy.engine_supervisor import EngineSupervisor
+from codai.frontproxy import router as _router
+
+# Hop-by-hop headers (RFC 7230 §6.1): meaningful for a single connection only,
+# so a proxy must not forward them verbatim in either direction.
+_HOP_BY_HOP = {
+    "connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
+    "te", "trailers", "transfer-encoding", "upgrade",
+}
+# Request side: additionally drop host/content-length (left to the outgoing
+# client to set) and any client-supplied internal token so a caller can't
+# spoof/override the real one the front injects — only the front's httpx default
+# header reaches engines.
+_DROP_REQ = _HOP_BY_HOP | {"host", "content-length", "x-coderai-internal"}
+# Response side: additionally drop content-length (left to the response object
+# that relays the body).
+_DROP_RESP = _HOP_BY_HOP | {"content-length"}
+
+
+class FrontProxy:
+    def __init__(self, config, config_dir=None):
+        """Create the front's routing state, engine registry and HTTP clients.
+
+        Args:
+            config: Loaded configuration; ``config.server`` supplies
+                ``default_engine`` (optional) and ``proxy_status_timeout``.
+            config_dir: Directory holding ``models.json``. When falsy, per-model
+                lookups are disabled (``_models_path`` is ``None``).
+        """
+        import os
+        import secrets
+
+        self.config = config
+        self.default_engine = getattr(config.server, "default_engine", None)
+
+        # models.json is parsed lazily (torch-free) and re-read whenever its
+        # mtime changes, so admin edits apply without restarting the front.
+        if config_dir:
+            self._models_path = os.path.join(config_dir, "models.json")
+        else:
+            self._models_path = None
+        self._pins: dict = {}
+        self._pins_mtime: float = -1.0
+
+        self.registry = EngineRegistry()
+        self.supervisor: Optional[EngineSupervisor] = None
+
+        # Fresh secret for this run, sent as a default header on every request
+        # the front makes to an engine. Engines are expected to refuse requests
+        # without it, so other localhost processes cannot bypass the front.
+        self.internal_token = secrets.token_urlsafe(32)
+        internal_auth = {"x-coderai-internal": self.internal_token}
+
+        # Two clients: a short-timeout one for status/UI traffic, and one with
+        # no read/write/pool timeout for generation that may wait on a model load.
+        generation_timeout = httpx.Timeout(connect=10.0, read=None,
+                                           write=None, pool=None)
+        self._short = httpx.AsyncClient(
+            timeout=config.server.proxy_status_timeout, headers=internal_auth)
+        self._long = httpx.AsyncClient(timeout=generation_timeout,
+                                       headers=internal_auth)
+
+        self._status_cache: Optional[dict] = None
+        self._status_cache_at: float = 0.0
+        self._broker = None
+        # Toggled by --debug-engine: verbose engine lifecycle output.
+        self.debug_engine = False
+
+    async def aclose(self):
+        """Close both engine-facing HTTP clients.
+
+        The long-timeout client is closed even when closing the short one
+        raises, so one failure cannot leak the other's connection pool. The
+        first exception still propagates to the caller.
+        """
+        try:
+            await self._short.aclose()
+        finally:
+            await self._long.aclose()
+
+    # ------------------------------------------------------------------ broker
+    def start_broker(self):
+        """Run the AISBF broker here in the front (always-responsive, one
+        registration for the whole node) instead of inside a model engine. Brokered
+        requests are dispatched to the right engine through the same router/proxy.
+
+        Does nothing when the config has no ``broker`` section, when that section
+        (or the runtime config built from it) is not enabled, when the broker
+        package cannot be imported, or when the broker config is invalid; the
+        last two cases print a ``[front]`` diagnostic instead of raising."""
+        cfg = getattr(self.config, "broker", None)
+        if cfg is None or not getattr(cfg, "enabled", False):
+            return
+        # Imported lazily: an import failure only disables the broker, it does
+        # not prevent the front from starting.
+        try:
+            from codai.broker import build_broker_runtime_config, BrokerConfigError
+            from codai.broker.client import BrokerClient
+            from codai.broker.service import BrokerService
+            from codai.broker.dispatcher import execute_broker_request
+        except Exception as exc:
+            print(f"[front] broker not available: {exc}", flush=True)
+            return
+        try:
+            runtime = build_broker_runtime_config(cfg)
+        except BrokerConfigError as exc:
+            print(f"[front] broker disabled (invalid config): {exc}", flush=True)
+            return
+        if not runtime.enabled:
+            return
+        client = BrokerClient(runtime)
+
+        # Each brokered message becomes a request envelope executed through
+        # self.broker_execute (HTTP to an engine) rather than in-process: the
+        # dispatcher is handed app=None together with an explicit executor.
+        async def _dispatch(message):
+            envelope = client.message_to_envelope(message)
+            return await execute_broker_request(None, envelope,
+                                                executor=self.broker_execute)
+        client.dispatcher = _dispatch
+        self._broker = BrokerService(client)   # app=None → keep our dispatcher
+        self._broker.start()
+        print("[front] AISBF broker started (front-managed, routes to engines)",
+              flush=True)
+
+    async def stop_broker(self):
+        """Stop the front-managed broker service, if one was started.
+
+        The reference is cleared only after ``stop()`` returns; calling this
+        again afterwards (or without a started broker) is a no-op.
+        """
+        service = self._broker
+        if service is None:
+            return
+        await service.stop()
+        self._broker = None
+
+    async def collect_models(self, headers):
+        """Union of every healthy engine's /v1/models.
+
+        Engines are queried one after another with the short-timeout client.
+        Entries are de-duplicated by ``id`` (first engine wins) and returned in
+        first-seen order. An engine that is unreachable, returns unparsable JSON,
+        or returns JSON of an unexpected shape is skipped instead of failing the
+        whole listing.
+
+        Args:
+            headers: Request headers to forward to each engine.
+
+        Returns:
+            ``("ok", {"object": "list", "data": [...]})``, or
+            ``("passthrough", response)`` with the first non-200 engine response
+            when no model at all was collected (so an auth/error reply is relayed
+            instead of an empty list).
+        """
+        merged: dict = {}   # model id → entry; dicts preserve first-seen order
+        relay = None
+        for engine in self.registry.healthy():
+            try:
+                resp = await self._short.get(engine.url + "/v1/models",
+                                             headers=headers)
+            except Exception:
+                continue
+            if resp.status_code != 200:
+                if relay is None:
+                    relay = resp
+                continue
+            try:
+                payload = resp.json()
+            except Exception:
+                continue
+            # Same tolerance for wrong-shaped JSON as for unparsable JSON: one
+            # misbehaving engine must not hide every other engine's models.
+            entries = payload.get("data") if isinstance(payload, dict) else None
+            if not isinstance(entries, list):
+                continue
+            for entry in entries:
+                if not isinstance(entry, dict):
+                    continue
+                mid = entry.get("id")
+                if mid and mid not in merged:
+                    merged[mid] = entry
+        if not merged and relay is not None:
+            return ("passthrough", relay)
+        return ("ok", {"object": "list", "data": list(merged.values())})
+
+    async def broker_execute(self, *, method, path, headers, query, body):
+        """Executor handed to the broker dispatcher.
+
+        ``GET /v1/models`` (query string and trailing slash ignored) is answered
+        with the union across all engines, so a brokered model listing reflects
+        the whole node rather than one engine's subset. Every other request is
+        delegated to ``_broker_execute_route``.
+
+        Returns:
+            ``{"status_code": int, "headers": dict, "body": bytes}``.
+        """
+        wants_model_list = (
+            method.upper() == "GET"
+            and path.split("?", 1)[0].rstrip("/") == "/v1/models"
+        )
+        if not wants_model_list:
+            return await self._broker_execute_route(
+                method=method, path=path, headers=headers, query=query, body=body)
+
+        forwarded = {}
+        for name, value in (headers or {}).items():
+            if name.lower() not in _DROP_REQ:
+                forwarded[name] = value
+        kind, val = await self.collect_models(forwarded)
+        if kind != "ok":
+            # Relay the engine's own error/auth response unchanged.
+            return {"status_code": val.status_code, "headers": dict(val.headers),
+                    "body": val.content}
+        import json as _json
+        return {"status_code": 200,
+                "headers": {"content-type": "application/json"},
+                "body": _json.dumps(val).encode()}
+
+    async def _broker_execute_route(self, *, method, path, headers, query, body):
+        """Executor for brokered requests: route to an engine over HTTP and return
+        the buffered response (the broker dispatcher base64s/relays it).
+
+        The target model is read from the JSON body only for POSTs to inference
+        paths; an unparsable body simply routes without a model.
+
+        Returns:
+            ``{"status_code": int, "headers": dict, "body": bytes}``: the engine's
+            response, a 503 when no engine could be picked, or a 502 when the
+            chosen engine could not be reached.
+        """
+        import json as _json
+        model = None
+        if method.upper() == "POST" and _router.is_inference_path(path):
+            try:
+                model = (_json.loads(body or b"{}") or {}).get("model")
+            except Exception:
+                model = None
+        engine = _router.pick_engine(
+            self.registry, path, method, model,
+            required_cap=self._required_cap(path, model),
+            default_engine=self.default_engine, pinned=self._pin_for(model))
+        if engine is None:
+            return {"status_code": 503, "headers": {"content-type": "application/json"},
+                    "body": b'{"error":"No engine is ready yet."}'}
+        send_headers = {k: v for k, v in (headers or {}).items()
+                        if k.lower() not in _DROP_REQ}
+        try:
+            r = await self._long.request(method, engine.url + path,
+                                         headers=send_headers, params=query or {},
+                                         content=body or b"")
+        except Exception as exc:
+            # Serialise with json.dumps: exception text may contain quotes,
+            # backslashes or newlines that would make a hand-built body invalid.
+            detail = "engine#%s unreachable: %s" % (engine.id, exc)
+            return {"status_code": 502,
+                    "headers": {"content-type": "application/json"},
+                    "body": _json.dumps({"error": detail},
+                                        separators=(",", ":")).encode()}
+        return {"status_code": r.status_code, "headers": dict(r.headers),
+                "body": r.content}
+
+    # ------------------------------------------------------------------ helpers
+    @staticmethod
+    def _filter_headers(headers, drop) -> list:
+        """Return ``(name, value)`` pairs of ``headers`` whose lower-cased name
+        is not in ``drop``, preserving the original order and name casing."""
+        kept = []
+        for name, value in headers.items():
+            if name.lower() in drop:
+                continue
+            kept.append((name, value))
+        return kept
+
+    def _model_info(self, model: Optional[str]) -> dict:
+        """Return {"engine": pin, "backend": backend} for a model from models.json.
+
+        Builds a {model id / alias / short-name → info} map, refreshed on file mtime
+        change. Used for per-model engine pins and capability detection (e.g. a
+        ``whisper-server`` backend → the ``whisper`` capability).
+
+        Lookup is case-insensitive and falls back to the part after the last
+        ``/``. Returns ``{}`` when the model is empty or not a string, when no
+        models.json path is configured, when the file cannot be stat'ed, or when
+        the model is unknown."""
+        # ``model`` comes from a client-supplied JSON body, so it may be any JSON
+        # type; only strings can name a model.
+        if not model or not isinstance(model, str) or not self._models_path:
+            return {}
+        import os
+        try:
+            mtime = os.path.getmtime(self._models_path)
+        except OSError:
+            return {}
+        if mtime != self._pins_mtime:
+            self._pins = self._load_pins()
+            self._pins_mtime = mtime
+        m = model.lower()
+        return self._pins.get(m) or self._pins.get(m.split("/")[-1]) or {}
+
+    def _pin_for(self, model: Optional[str]) -> Optional[str]:
+        """Return the engine the model is pinned to in models.json, or ``None``
+        when it has no pin (or is unknown)."""
+        info = self._model_info(model)
+        return info.get("engine")
+
+    def _load_pins(self) -> dict:
+        """Parse models.json into a lower-cased ``name → {"engine", "backend"}`` map.
+
+        Every top-level value that is a list is scanned for dict entries. Each
+        entry is indexed under its ``path``, ``id`` and ``alias`` (when set), both
+        in full and by the part after the last ``/``. Blank ``engine``/``backend``
+        values become ``None``.
+
+        Returns:
+            The map, or ``{}`` when the file is missing, unreadable, not valid
+            JSON, or not a JSON object at the top level.
+        """
+        import json as _json
+
+        def _clean(value):
+            # Missing / empty / whitespace-only → None; str() keeps an oddly
+            # typed value (e.g. a number) from crashing the whole load.
+            return (str(value).strip() or None) if value else None
+
+        info: dict = {}
+        try:
+            # JSON is UTF-8; the context manager closes the handle promptly.
+            with open(self._models_path, encoding="utf-8") as fh:
+                data = _json.load(fh)
+        except Exception:
+            return info
+        if not isinstance(data, dict):
+            return info
+        for entries in data.values():
+            if not isinstance(entries, list):
+                continue
+            for m in entries:
+                if not isinstance(m, dict):
+                    continue
+                rec = {"engine": _clean(m.get("engine")),
+                       "backend": _clean(m.get("backend"))}
+                for field_ in (m.get("path"), m.get("id"), m.get("alias")):
+                    if field_:
+                        info[str(field_).lower()] = rec
+                        info[str(field_).split("/")[-1].lower()] = rec
+        return info
+
+    def _required_cap(self, path: str, model: Optional[str]) -> Optional[str]:
+        """Ask the router which engine capability this request needs.
+
+        Passes along the model's ``backend`` from models.json and the ds4
+        settings (``model_id`` / ``enabled``) from the config; a missing ds4
+        section counts as disabled with no model id.
+        """
+        ds4 = getattr(self.config, "ds4", None)
+        if ds4:
+            ds4_model_id = getattr(ds4, "model_id", None)
+            ds4_enabled = bool(getattr(ds4, "enabled", False))
+        else:
+            ds4_model_id = None
+            ds4_enabled = False
+        backend = self._model_info(model).get("backend")
+        return _router.required_capability(
+            model,
+            path=path,
+            backend=backend,
+            ds4_model_id=ds4_model_id,
+            ds4_enabled=ds4_enabled,
+        )
+
+    @staticmethod
+    def _peek_model(body: bytes, content_type: str) -> Optional[str]:
+        """Extract the ``model`` field from a JSON request body, if any.
+
+        Args:
+            body: Raw request body.
+            content_type: Value of the Content-Type header (may be ``None``).
+
+        Returns:
+            The ``model`` string, or ``None`` when the body is empty, is not
+            declared as ``application/json``, does not parse, is not a JSON
+            object, or carries a ``model`` that is not a string.
+        """
+        if not body or "application/json" not in (content_type or "").lower():
+            return None
+        try:
+            parsed = json.loads(body)
+        except Exception:
+            return None
+        model = parsed.get("model") if isinstance(parsed, dict) else None
+        # The body is client-controlled: hand only real strings to routing code
+        # that goes on to call string methods on the model name.
+        return model if isinstance(model, str) else None
+
+    # High-frequency dashboard pollers: serve with a short timeout and a graceful
+    # fallback so a momentarily-blocked engine loop can never hang the web UI.
+    _POLL_PATHS = {"/admin/api/tasks", "/admin/api/system-stats"}
+
+    async def poll(self, request: Request) -> Response:
+        """Proxy a dashboard poll to the primary engine without ever hanging.
+
+        The request path and query are forwarded as a GET through the
+        short-timeout client. For a ``.../tasks`` path answered with 200, the
+        JSON is enriched with the merged engine tasks and a ``cooling_engines``
+        entry; any other response is relayed as received (minus dropped
+        headers). When no primary engine exists, or the engine call fails, a
+        small placeholder JSON document is returned instead of an error."""
+        prim = self.registry.primary()
+        if prim is None:
+            return JSONResponse({"engine": "down", "tasks": [], "queue": []})
+        is_tasks = request.url.path.rstrip("/").endswith("/tasks")
+        try:
+            headers = self._filter_headers(request.headers, _DROP_REQ)
+            r = await self._short.get(prim.url + request.url.path, headers=headers,
+                                      params=request.query_params)
+            if is_tasks and r.status_code == 200:
+                try:
+                    data = r.json()
+                    data["tasks"] = self._merge_engine_tasks(prim, data.get("tasks") or [])
+                    data["cooling_engines"] = self._cooling_engines()
+                    return JSONResponse(data)
+                except Exception:
+                    # Enrichment failed (e.g. unexpected payload shape): fall
+                    # through and relay the engine's raw response below.
+                    pass
+            return Response(content=r.content, status_code=r.status_code,
+                            headers=dict(self._filter_headers(r.headers, _DROP_RESP)),
+                            media_type=r.headers.get("content-type"))
+        except Exception:
+            # Engine busy (event loop blocked by GIL-heavy work) — don't hang the UI.
+            # Still surface known running tasks from other engines.
+            tasks = self._merge_engine_tasks(prim, []) if is_tasks else []
+            return JSONResponse({"engine": "loading", "stale": True,
+                                 "tasks": tasks, "queue": []})
+
+    async def is_admin(self, request: Request) -> bool:
+        """Authorize a front-handled admin action by validating the caller's session
+        against the primary engine (which owns sessions). 200 → authorized."""
+        prim = self.registry.primary()
+        if prim is None:
+            return False
+        try:
+            headers = self._filter_headers(request.headers, _DROP_REQ)
+            r = await self._short.get(prim.url + "/admin/api/status", headers=headers)
+            return r.status_code == 200
+        except Exception:
+            return False
+
+    def engines_list(self) -> list:
+        out = []
+        for e in self.registry.all():
+            try:
+                pid = e.proc.pid if e.proc else None
+            except Exception:
+                pid = None
+            out.append({"id": e.id, "name": e.name, "backend": e.backend,
+                        "gpu": e.gpu, "healthy": e.healthy, "primary": e.primary,
+                        "vram": e.vram, "cooling": bool(e.cooling),
+                        "loaded_models": sorted(e.loaded_models), "pid": pid})
+        return out
+
+    def _cooling_engines(self) -> list:
+        """Which engines are in thermal cooldown right now (for the Tasks banner)."""
+        out = []
+        for e in self.registry.all():
+            if e.cooling:
+                out.append({"engine": e.name, "gpu": e.cooling.get("gpu"),
+                            "cpu": e.cooling.get("cpu"),
+                            "message": e.cooling.get("message")})
+        return out
+
+    def _merge_engine_tasks(self, primary, primary_tasks: list) -> list:
+        """Tasks from all engines, each tagged with the engine *name* it runs on."""
+        merged = []
+        seen = set()
+        # Primary's tasks (from its authed response) — tag with the primary name.
+        for t in primary_tasks:
+            if isinstance(t, dict):
+                t = dict(t)
+                t.setdefault("engine", primary.name if primary else None)
+                seen.add(t.get("id"))
+            merged.append(t)
+        # Tasks the supervisor saw on the other engines.
+        for e in self.registry.all():
+            if primary is not None and e.id == primary.id:
+                continue
+            for t in (e.tasks or []):
+                if not isinstance(t, dict) or t.get("id") in seen:
+                    continue
+                t = dict(t)
+                t["engine"] = e.name
+                merged.append(t)
+                seen.add(t.get("id"))
+        return merged
+
+    # -------------------------------------------------------------------- proxy
+    async def proxy(self, request: Request) -> Response:
+        """Reverse-proxy one request to the engine chosen by the router.
+
+        POSTs to inference paths are buffered so the ``model`` field can be read
+        for routing; every other request body is streamed through. The upstream
+        response is streamed back with its status code and filtered headers, and
+        is closed once the response has been sent.
+
+        Returns:
+            The streamed upstream response, a 503 JSON error when the router
+            picks no engine, or a 502 JSON error when sending to the chosen
+            engine raises.
+        """
+        path = request.url.path
+        method = request.method
+
+        # Inference JSON bodies are small: buffer so we can route by `model`, then
+        # forward the buffered bytes. Everything else streams through unbuffered.
+        body_bytes: Optional[bytes] = None
+        model = None
+        if method == "POST" and _router.is_inference_path(path):
+            body_bytes = await request.body()
+            model = self._peek_model(body_bytes, request.headers.get("content-type", ""))
+
+        engine = _router.pick_engine(
+            self.registry, path, method, model,
+            required_cap=self._required_cap(path, model),
+            default_engine=self.default_engine, pinned=self._pin_for(model))
+        if engine is None:
+            return JSONResponse(
+                {"error": "No engine is ready yet (still starting/loading)."},
+                status_code=503)
+
+        url = engine.url + path
+        headers = self._filter_headers(request.headers, _DROP_REQ)
+        # Buffered bytes when we peeked at the body, otherwise the live stream.
+        content = body_bytes if body_bytes is not None else request.stream()
+
+        rp_req = self._long.build_request(
+            method, url, headers=headers, params=request.query_params,
+            content=content)
+        try:
+            # stream=True: don't read the upstream body here; it is relayed
+            # chunk by chunk through the StreamingResponse below.
+            rp_resp = await self._long.send(rp_req, stream=True)
+        except Exception as exc:
+            return JSONResponse(
+                {"error": f"Engine#{engine.id} unreachable: {exc}"}, status_code=502)
+
+        resp_headers = self._filter_headers(rp_resp.headers, _DROP_RESP)
+        return StreamingResponse(
+            rp_resp.aiter_raw(),
+            status_code=rp_resp.status_code,
+            headers=dict(resp_headers),
+            media_type=rp_resp.headers.get("content-type"),
+            # Release the upstream connection after the response is delivered.
+            background=BackgroundTask(rp_resp.aclose),
+        )
+
+    # ----------------------------------------------------------------- status
+    async def status(self, request: Request) -> Response:
+        """Aggregate /admin/api/status across engines, with a last-good cache.
+
+        Proxies the user's authed request to the primary engine (sessions live
+        there), then overlays cross-engine VRAM/loaded-model totals from the
+        registry so the dashboard reflects every GPU. On engine timeout, serve the
+        cache plus an ``engine: loading|down`` marker — the UI never hangs.
+        """
+        prim = self.registry.primary()
+        if prim is None:
+            return self._cached_status("down")
+        try:
+            headers = self._filter_headers(request.headers, _DROP_REQ)
+            r = await self._short.get(prim.url + request.url.path, headers=headers,
+                                      params=request.query_params)
+            if r.status_code != 200:
+                # Pass through auth redirects/errors unchanged (e.g. login needed).
+                return Response(content=r.content, status_code=r.status_code,
+                                headers=dict(self._filter_headers(r.headers, _DROP_RESP)),
+                                media_type=r.headers.get("content-type"))
+            data = r.json()
+            data = self._overlay_engine_totals(data)
+            self._status_cache = data
+            self._status_cache_at = time.monotonic()
+            return JSONResponse(data)
+        except Exception:
+            return self._cached_status("loading")
+
+    def _overlay_engine_totals(self, data: dict) -> dict:
+        engines = self.registry.all()
+        per = []
+        used = free = total = 0.0
+        loaded = set()
+        have_vram = False
+        for e in engines:
+            per.append({"id": e.id, "name": e.name, "backend": e.backend,
+                        "gpu": e.gpu, "healthy": e.healthy, "vram": e.vram,
+                        "capabilities": sorted(e.capabilities),
+                        "loaded_models": sorted(e.loaded_models)})
+            loaded |= e.loaded_models
+            if e.vram:
+                have_vram = True
+                used += e.vram.get("used", 0.0)
+                free += e.vram.get("free", 0.0)
+                total += e.vram.get("total", 0.0)
+        data["x_engines"] = per
+        if len([e for e in engines if e.healthy]) > 1:
+            if have_vram:
+                data["vram"] = {"used": round(used, 2), "free": round(free, 2),
+                                "total": round(total, 2),
+                                "gpu": f"{len(engines)} engines"}
+            if loaded:
+                data["loaded_models"] = sorted(loaded)
+                data["models_loaded"] = len(loaded)
+        return data
+
+    def _cached_status(self, engine_state: str) -> Response:
+        body = dict(self._status_cache or {"models_loaded": 0, "loaded_models": [],
+                                           "vram": None})
+        body["engine"] = engine_state
+        body["stale"] = True
+        if self._status_cache_at:
+            body["stale_age_seconds"] = round(time.monotonic() - self._status_cache_at, 1)
+        return JSONResponse(body)
+
+
+class _PollNoiseFilter:
+    """Hide web-UI traffic from the front's access log unless --debug-web.
+
+    The admin dashboard constantly polls/reads (status, gpu-stats, tasks, settings,
+    downloads, model-loaded-status, models, …) plus loads static assets — all noise
+    for normal operation. So drop **read** requests (GET/HEAD/OPTIONS) to /admin,
+    /static, /, and /favicon. Real API calls (/v1/...) and admin **mutations**
+    (POST/PUT/PATCH/DELETE — model-configure, deletes, etc.) still log.
+    """
+    _READ = ("GET", "HEAD", "OPTIONS")
+    _WEB_PREFIXES = ("/admin", "/static", "/login", "/logout")
+    _WEB_EXACT = ("/", "/favicon.ico")
+
+    def filter(self, record):
+        try:
+            a = record.args
+            if isinstance(a, (tuple, list)) and len(a) >= 3:
+                method = str(a[1]).upper()
+                path = str(a[2]).split("?", 1)[0]
+                if method in self._READ and (
+                        path in self._WEB_EXACT
+                        or any(path == p or path.startswith(p + "/") or path == p
+                               for p in self._WEB_PREFIXES)):
+                    return False
+        except Exception:
+            pass
+        return True
+
+
+def _front_log_config(debug_web: bool):
+    """uvicorn log config that prefixes every front-process line with ``[front]``
+    (so it's never confused with an engine's ``[nvidia]``/``[radeon]`` lines) and
+    routes codai/broker logs through the same handler. Drops poll noise unless
+    --debug-web."""
+    import copy
+    import uvicorn
+    lc = copy.deepcopy(uvicorn.config.LOGGING_CONFIG)
+    for fmt in lc.get("formatters", {}).values():
+        if "fmt" in fmt and not fmt["fmt"].startswith("[front]"):
+            fmt["fmt"] = "[front] " + fmt["fmt"]
+    # Surface codai/broker logs (the broker now runs here) via uvicorn's handler.
+    lc.setdefault("loggers", {})
+    lc["loggers"]["codai"] = {"handlers": ["default"], "level": "INFO", "propagate": False}
+    if not debug_web:
+        lc.setdefault("filters", {})["pollnoise"] = {
+            "()": "codai.frontproxy.app._PollNoiseFilter"}
+        lc["handlers"].get("access", {}).setdefault("filters", []).append("pollnoise")
+    return lc
+
+
+def build_app(config, config_dir=None) -> FastAPI:
+    """Create the front FastAPI app around a single :class:`FrontProxy`.
+
+    Wires the startup/shutdown hooks (engine supervisor + broker), the
+    front-owned endpoints (health, status/poll, model list, engine management)
+    and, last, a catch-all route that reverse-proxies everything else.
+    The ``FrontProxy`` is exposed as ``app.state.front``.
+    """
+    front = FrontProxy(config, config_dir=config_dir)
+    # No docs/OpenAPI endpoints are served by the front itself.
+    app = FastAPI(title="CoderAI Front", docs_url=None, redoc_url=None,
+                  openapi_url=None)
+    app.state.front = front
+
+    @app.on_event("startup")
+    async def _startup():
+        # The supervisor is created here, i.e. only once the server starts.
+        front.supervisor = EngineSupervisor(config, None, front.registry,
+                                            models_path=front._models_path,
+                                            internal_token=front.internal_token,
+                                            debug=front.debug_engine)
+        front.supervisor.start()
+        front.start_broker()
+
+    @app.on_event("shutdown")
+    async def _shutdown():
+        await front.stop_broker()
+        if front.supervisor:
+            front.supervisor.stop_all()
+        await front.aclose()
+
+    @app.get("/healthz", include_in_schema=False)
+    async def _healthz():
+        # Always answers ok=True; engine readiness is reported separately.
+        prim = front.registry.primary()
+        return {"ok": True, "engine_ready": bool(prim and prim.healthy),
+                "engines": [{"id": e.id, "gpu": e.gpu, "healthy": e.healthy}
+                            for e in front.registry.all()]}
+
+    # Status/UI poll endpoints get the cached, cross-engine-aggregated handler so a
+    # busy engine can never hang the dashboard.
+    @app.get("/admin/api/status", include_in_schema=False)
+    async def _status(request: Request):
+        return await front.status(request)
+
+    @app.get("/admin/api/tasks", include_in_schema=False)
+    async def _tasks(request: Request):
+        return await front.poll(request)
+
+    @app.get("/admin/api/system-stats", include_in_schema=False)
+    async def _system_stats(request: Request):
+        return await front.poll(request)
+
+    # /v1/models is the union across engines (each engine registers only the models
+    # the front assigned to it). Registered before the catch-all so it's aggregated.
+    @app.get("/v1/models", include_in_schema=False)
+    async def _models(request: Request):
+        headers = front._filter_headers(request.headers, _DROP_REQ)
+        kind, val = await front.collect_models(headers)
+        # "passthrough": val is an upstream response to relay as-is; otherwise
+        # val is the aggregated JSON payload.
+        if kind == "passthrough":
+            return Response(content=val.content, status_code=val.status_code,
+                            headers=dict(front._filter_headers(val.headers, _DROP_RESP)),
+                            media_type=val.headers.get("content-type"))
+        return JSONResponse(val)
+
+    # Engine management (front-owned: it runs the supervisor). Registered before
+    # the catch-all so they aren't proxied to an engine.
+    @app.get("/admin/api/engines", include_in_schema=False)
+    async def _engines(request: Request):
+        if not await front.is_admin(request):
+            return JSONResponse({"detail": "Unauthorized"}, status_code=401)
+        return JSONResponse({"engines": front.engines_list()})
+
+    @app.post("/admin/api/engines/{eid}/restart", include_in_schema=False)
+    async def _engine_restart(eid: int, request: Request):
+        if not await front.is_admin(request):
+            return JSONResponse({"detail": "Unauthorized"}, status_code=401)
+        # A falsy restart result (or no supervisor yet) is reported as 404.
+        ok = bool(front.supervisor and front.supervisor.restart_engine(eid))
+        return JSONResponse({"success": ok}, status_code=200 if ok else 404)
+
+    # Catch-all reverse proxy for everything else (admin UI, /v1 inference, files…).
+    @app.api_route("/{path:path}", include_in_schema=False,
+                   methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"])
+    async def _proxy(path: str, request: Request):
+        return await front.proxy(request)
+
+    return app
+
+
+def _serve_front(app, **uvicorn_kwargs) -> None:
+    """Serve the front with uvicorn, but own the SIGINT/SIGTERM handling so a
+    Ctrl-C ALWAYS tears the engines down — even if uvicorn's graceful shutdown
+    hangs draining an in-flight proxy stream to a stuck (e.g. mid-CUDA) engine.
+
+    On the first signal we ask uvicorn to exit AND arm a watchdog that force-stops
+    the engines (escalating to SIGKILL of their process groups) after a short
+    grace, regardless of whether the drain ever completes. A second Ctrl-C stops
+    them immediately. As a backstop, engines are also stopped after serve returns.
+    """
+    import signal
+    import threading
+    import uvicorn
+
+    supervisor = getattr(app.state.front, "supervisor", None)
+    server = uvicorn.Server(uvicorn.Config(app, **uvicorn_kwargs))
+    server.install_signal_handlers = lambda: None   # we manage signals ourselves
+
+    state = {"hits": 0}
+
+    def _handle(signum, _frame):
+        state["hits"] += 1
+        server.should_exit = True
+        if state["hits"] >= 2:
+            server.force_exit = True
+            if supervisor is not None:
+                supervisor.stop_all(grace=0.0)
+            return
+        print("\n[front] shutdown requested — stopping engines "
+              "(Ctrl-C again to force)…", flush=True)
+
+        def _watchdog():
+            # If the graceful drain hasn't finished promptly, force engines down
+            # so a stuck upstream stream can't keep them (and us) alive.
+            time.sleep(6.0)
+            if supervisor is not None:
+                supervisor.stop_all(grace=5.0)
+            server.force_exit = True
+        threading.Thread(target=_watchdog, daemon=True).start()
+
+    for _sig in (signal.SIGINT, signal.SIGTERM):
+        try:
+            signal.signal(_sig, _handle)
+        except Exception:
+            pass
+
+    try:
+        server.run()
+    finally:
+        # Backstop: whatever path we exited by, make sure no engine survives us.
+        if supervisor is not None:
+            supervisor.stop_all(grace=5.0)
+
+
+def run_front(config, args) -> None:
+    """Build the front app, start engine supervision, and serve on the public port."""
+    config_dir = getattr(args, "config", None) if args is not None else None
+    app = build_app(config, config_dir=config_dir)
+    app.state.front.debug_engine = getattr(args, "debug_engine", False)
+    host = config.server.host
+    port = config.server.port
+    print(f"\n[front] CoderAI front proxy on http://{host}:{port}")
+    print(f"[front] Admin UI: http://{host}:{port}/admin")
+
+    _log_config = _front_log_config(getattr(args, "debug_web", False))
+    if config.server.https:
+        import ssl
+        keyfile = config.server.https_key_path
+        certfile = config.server.https_cert_path
+        if not (keyfile and certfile):
+            print("[front] HTTPS requested but no cert/key configured; using HTTP.")
+            _serve_front(app, host=host, port=port, log_config=_log_config)
+            return
+        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+        ctx.load_cert_chain(certfile, keyfile)
+        # uvicorn.Server reads ssl via Config(ssl_*), so pass cert/key paths.
+        _serve_front(app, host=host, port=port, log_config=_log_config,
+                     ssl_keyfile=keyfile, ssl_certfile=certfile)
+    else:
+        _serve_front(app, host=host, port=port, log_config=_log_config)
--- a/codai/frontproxy/assignment.py
+++ b/codai/frontproxy/assignment.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Assign each configured model to exactly one engine.
+
+With multiple engines, every engine would otherwise read the shared models.json and
+register *every* model — so a model would appear on several engines at once. The
+front instead computes a single **owner** per model and tells each engine which
+models it owns; the engine then registers only those.
+
+Owner precedence (per model):
+  1. The per-model ``engine`` pin (models.json), if that engine can run the model.
+  2. The configured default engine, if it can run the model.
+  3. Round-robin across the capability-compatible engines (balanced, deterministic),
+     so unpinned models spread out instead of all landing on one engine.
+
+A model whose format no engine can serve is left unassigned (it can't run anyway).
+"""
+
+import json
+
+# models.json categories that hold servable model entries.
+_CATEGORIES = (
+    "text_models", "gguf_models", "vision_models", "image_models",
+    "audio_models", "tts_models", "video_models", "audio_gen_models",
+    "embedding_models", "spatial_models",
+)
+
+
+def _entry_path(entry):
+    """The model's path/id — used for capability detection (e.g. is it a .gguf)."""
+    if isinstance(entry, str):
+        return entry
+    if isinstance(entry, dict):
+        return entry.get("path") or entry.get("id")
+    return None
+
+
+def _route_key(entry):
+    """The identifier clients address this entry by (alias > path > id).
+
+    Keying on the alias lets two *configs* of the same model — with distinct
+    aliases — be assigned to different engines; configs sharing a path with no
+    distinct alias collapse to one owner (they're not separately addressable)."""
+    if isinstance(entry, str):
+        return entry
+    if isinstance(entry, dict):
+        return entry.get("alias") or entry.get("path") or entry.get("id")
+    return None
+
+
+def _required_cap(entry, ds4_cfg):
+    from codai.frontproxy.router import required_capability
+    path = _entry_path(entry) or ""
+    backend = entry.get("backend") if isinstance(entry, dict) else None
+    return required_capability(
+        path, backend=backend,
+        ds4_model_id=getattr(ds4_cfg, "model_id", None) if ds4_cfg else None,
+        ds4_enabled=bool(getattr(ds4_cfg, "enabled", False)) if ds4_cfg else False)
+
+
+def compute_assignment(engines, models_path, default_engine=None, ds4_cfg=None):
+    """Return {engine_name: [model_identifiers]} — each model owned by one engine."""
+    assignment = {e.name: [] for e in engines}
+    if not engines or not models_path:
+        return assignment
+    try:
+        with open(models_path) as f:
+            data = json.load(f)
+    except Exception:
+        return assignment
+
+    default_engine = (default_engine or "").strip().lower()
+    rr = {}   # round-robin cursor per candidate-set signature
+    seen = set()
+
+    for cat in _CATEGORIES:
+        for entry in data.get(cat, []):
+            ident = _route_key(entry)
+            if not ident or ident in seen:
+                continue
+            cap = _required_cap(entry, ds4_cfg)
+            candidates = [e for e in engines if e.can_serve(cap)]
+            if not candidates:
+                continue   # nothing can run it — leave unassigned
+
+            owner = None
+            pin = ((entry.get("engine") if isinstance(entry, dict) else "") or "").strip().lower()
+            if pin:
+                owner = next((e for e in candidates
+                              if e.name.lower() == pin or (e.backend or "").lower() == pin), None)
+            if owner is None and default_engine:
+                owner = next((e for e in candidates
+                              if e.name.lower() == default_engine
+                              or (e.backend or "").lower() == default_engine), None)
+            if owner is None:
+                key = tuple(sorted(e.name for e in candidates))
+                i = rr.get(key, 0)
+                owner = candidates[i % len(candidates)]
+                rr[key] = i + 1
+
+            assignment[owner.name].append(ident)
+            seen.add(ident)
+
+    return assignment
--- a/codai/frontproxy/engine_supervisor.py
+++ b/codai/frontproxy/engine_supervisor.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Spawn and supervise engine subprocesses for the front proxy.
+
+One engine per GPU vendor found on the machine (or one per configured
+``engine_specs`` entry). Each engine is this same codebase relaunched with
+``--engine-only --internal-port P`` and ``CUDA_VISIBLE_DEVICES`` pinned to its
+GPU, so inside the engine its GPU is always ``cuda:0`` and the existing
+per-process VRAM/eviction logic is untouched.
+
+The supervisor polls each engine's auth-free ``/internal/engine-state`` to keep the
+:class:`EngineRegistry` current (health, resident models, VRAM) and respawns an
+engine that dies or stops answering — which is also how a CUDA-poisoned engine
+recovers (the front and sibling engines survive).
+"""
+
+import atexit
+import collections
+import json
+import os
+import shutil
+import signal
+import socket
+import subprocess
+import sys
+import threading
+import time
+
+import httpx
+
+from codai.frontproxy.registry import Engine, EngineRegistry
+
+
+def _engine_preexec():
+    """Run in the child just before exec: put the engine in its OWN process group
+    (so the terminal's Ctrl-C reaches only the front, which then stops engines
+    deterministically) and ask the kernel to SIGKILL the engine if the front dies
+    unexpectedly — even by SIGKILL, where our atexit/handlers can't run. Linux-only;
+    best-effort elsewhere."""
+    try:
+        os.setsid()
+    except Exception:
+        pass
+    try:
+        import ctypes
+        # prctl(PR_SET_PDEATHSIG, SIGKILL) — parent-death signal.
+        ctypes.CDLL("libc.so.6", use_errno=True).prctl(1, 9, 0, 0, 0)
+    except Exception:
+        pass
+
+
+def _port_is_free(port: int, host: str = "127.0.0.1") -> bool:
+    """True if ``port`` can be bound right now on ``host``."""
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        s.bind((host, port))
+        return True
+    except OSError:
+        return False
+    finally:
+        s.close()
+
+
+def detect_gpus() -> list:
+    """Return CUDA GPU indices via nvidia-smi (no torch). Empty when none found."""
+    smi = shutil.which("nvidia-smi")
+    if not smi:
+        return []
+    try:
+        out = subprocess.run(
+            [smi, "--query-gpu=index", "--format=csv,noheader"],
+            capture_output=True, text=True, timeout=10,
+        )
+        if out.returncode != 0:
+            return []
+        return [int(line.strip()) for line in out.stdout.splitlines() if line.strip()]
+    except Exception:
+        return []
+
+
+def _gpu_selectors(spec: dict, env: dict) -> list:
+    """Which physical cards an engine owns, as selectors thermal can match against:
+    NVIDIA UUIDs (precise) and/or vendor keywords ("nvidia"/"amd"/"intel").
+
+    Derived from the engine's CUDA_VISIBLE_DEVICES (UUIDs), its ``gpus`` vendor
+    keyword, its Vulkan ICD, and its backend."""
+    sels = []
+    for tok in (env.get("CUDA_VISIBLE_DEVICES") or "").split(","):
+        tok = tok.strip()
+        if tok.startswith("GPU-"):
+            sels.append(tok)          # precise NVIDIA UUID
+        elif tok.isdigit():
+            sels.append("nvidia")     # index → vendor fallback
+    vmap = {"radeon": "amd", "amd": "amd", "intel": "intel", "nvidia": "nvidia"}
+    gpus_kw = (spec.get("gpus") or "").strip().lower()
+    if gpus_kw in vmap:
+        sels.append(vmap[gpus_kw])
+    icd = (env.get("VK_ICD_FILENAMES") or "").lower()
+    if "radeon" in icd or "amd" in icd:
+        sels.append("amd")
+    elif "intel" in icd:
+        sels.append("intel")
+    elif "nvidia" in icd:
+        sels.append("nvidia")
+    if (spec.get("backend") or "").lower() == "nvidia" and not sels:
+        sels.append("nvidia")
+    sels = list(dict.fromkeys(sels))
+    # When we have precise NVIDIA UUIDs, drop the broad "nvidia" vendor so two
+    # separate NVIDIA engines don't each match every NVIDIA card.
+    if any(s.startswith("GPU-") for s in sels):
+        sels = [s for s in sels if s != "nvidia"]
+    return sels
+
+
+class EngineSupervisor:
+    def __init__(self, config, args, registry: EngineRegistry, models_path=None,
+                 internal_token=None, debug=False):
+        self.config = config
+        self.args = args
+        self.registry = registry
+        self.models_path = models_path   # for computing per-engine model assignment
+        self.internal_token = internal_token  # shared secret stamped on engine calls
+        self.debug = debug               # --debug-engine: verbose engine lifecycle
+        self._health = {}                # engine_id -> last healthy bool (for debug)
+        self._stopped = threading.Event()
+        self._poll_thread = None
+        self._logs = {}   # engine_id -> deque tail
+        self._restart_lock = threading.RLock()
+
+    def _assign_models(self, engines) -> None:
+        """Give each engine the set of models it owns (via CODERAI_ENGINE_MODELS), so
+        a model is registered on exactly one engine. With a single engine there's
+        nothing to partition — it owns everything."""
+        if not self.models_path or len(engines) < 2:
+            return
+        try:
+            from codai.frontproxy.assignment import compute_assignment
+            default_engine = getattr(self.config.server, "default_engine", None)
+            ds4 = getattr(self.config, "ds4", None)
+            assignment = compute_assignment(engines, self.models_path,
+                                            default_engine, ds4)
+            for e in engines:
+                owned = assignment.get(e.name, [])
+                e.assigned_models = set(owned)   # the front's router enforces this
+                # Also hand the set to the engine so it only registers/pre-loads its
+                # assigned models (avoids e.g. whisper-server starting on every
+                # engine). models.json itself stays full for the admin view.
+                e.env["CODERAI_ENGINE_MODELS"] = json.dumps(owned)
+                print(f"[front] engine '{e.name}' assigned {len(owned)} model(s): "
+                      f"{', '.join(owned) if owned else '(none)'}", flush=True)
+        except Exception as exc:
+            print(f"[front] model assignment skipped: {exc}", flush=True)
+
+    def _alloc_port(self) -> int:
+        """Next free internal port at/above internal_port_base, skipping the front's
+        own port and any port already in use, so engines never collide with the
+        front or each other (or a stale process on the base port)."""
+        p = self._port_cursor
+        front_port = int(getattr(self.config.server, "port", 0) or 0)
+        while p == front_port or not _port_is_free(p):
+            p += 1
+        self._port_cursor = p + 1
+        return p
+
+    # ----------------------------------------------------------------- planning
+    def _build_engines(self) -> list:
+        """Return the list of Engine objects to launch.
+
+        Explicit ``engine_specs`` (heterogeneous: per-engine backend + env, e.g. an
+        NVIDIA card and a Radeon card) take precedence. Otherwise auto-detect the
+        LOCAL hardware and create one engine per GPU vendor actually present —
+        NVIDIA (CUDA), AMD/Radeon (Vulkan), Intel (Vulkan) — so e.g. a box with an
+        NVIDIA + a Radeon gets both engines without any config. A machine with no
+        GPU gets a single CPU engine.
+
+        In every path the first engine in the list is marked primary, and each
+        engine's port comes from ``_alloc_port`` (the cursor is reset to
+        ``internal_port_base`` on each call).
+        """
+        srv = self.config.server
+        # Restart port allocation from the configured base for this plan.
+        self._port_cursor = srv.internal_port_base
+        specs = getattr(srv, "engine_specs", None)
+        engines = []
+
+        if specs:
+            from codai.frontproxy.gpu_detect import vendor_env
+            for idx, spec in enumerate(specs):
+                backend = (spec.get("backend") or "auto").strip()
+                # Vendor keyword → all of that vendor's cards on this machine. A
+                # plain nvidia backend defaults to "nvidia" (unambiguous); Vulkan
+                # vendors must be named ("radeon"/"amd"/"intel"). Explicit env wins.
+                gpus_kw = (spec.get("gpus") or "").strip().lower()
+                if not gpus_kw and not spec.get("env") and backend == "nvidia":
+                    gpus_kw = "nvidia"
+                detected = vendor_env(gpus_kw) if gpus_kw else {}
+                # Env keys and values are coerced to strings.
+                explicit = {str(k): str(v) for k, v in (spec.get("env") or {}).items()}
+                env = {**detected, **explicit}     # explicit overrides detected
+                # Tell the engine which physical cards it owns, so thermal
+                # protection scopes GPU cooldowns to this engine (CPU stays global).
+                sels = _gpu_selectors(spec, env)
+                if sels and "CODERAI_ENGINE_GPUS" not in env:
+                    env["CODERAI_ENGINE_GPUS"] = ",".join(sels)
+                caps = set(spec.get("capabilities") or [])
+                engines.append(Engine(
+                    id=idx, gpu=None, port=self._alloc_port(), primary=(idx == 0),
+                    name=spec.get("name") or f"engine#{idx}",
+                    backend=backend, env=env, capabilities=caps,
+                ))
+            return engines
+
+        # Auto: one engine per GPU vendor actually present on this machine. Vendors
+        # come from Vulkan enumeration AND the sysfs PCI-vendor fallback, so AMD/Intel
+        # are detected even without vulkaninfo installed.
+        from codai.frontproxy.gpu_detect import nvidia_gpus, gpu_vendors, vendor_env
+        vendors = gpu_vendors()
+        # (engine name, vendor keyword, backend). NVIDIA first so it's the primary
+        # (it owns admin/sessions and has the broadest capabilities). NVIDIA needs
+        # CUDA, so it's gated on nvidia-smi rather than the Vulkan/sysfs presence.
+        plan = []
+        if nvidia_gpus():
+            plan.append(("nvidia", "nvidia", "nvidia"))
+        if "amd" in vendors:
+            plan.append(("radeon", "amd", "vulkan"))
+        if "intel" in vendors:
+            plan.append(("intel", "intel", "vulkan"))
+
+        if not plan:
+            # No usable GPU found: a single CPU engine, which is also the primary.
+            engines.append(Engine(id=0, gpu=None, port=self._alloc_port(),
+                                  primary=True, name="cpu", backend="auto", env={}))
+            return engines
+
+        for idx, (name, vkw, backend) in enumerate(plan):
+            env = vendor_env(vkw)
+            sels = _gpu_selectors({"backend": backend, "gpus": vkw}, env)
+            if sels:
+                env["CODERAI_ENGINE_GPUS"] = ",".join(sels)
+            engines.append(Engine(id=idx, gpu=None, port=self._alloc_port(),
+                                  primary=(idx == 0), name=name,
+                                  backend=backend, env=env))
+        return engines
+
+    # ------------------------------------------------------------------ spawning
+    def _engine_cmd(self, port: int) -> list:
+        """Build the command that relaunches this codebase as an engine.
+
+        ``sys.argv[0]`` is the launcher script (``coderai``). All original
+        arguments (config dir, model selection, …) are preserved, except the
+        flags that would re-trigger front mode or fix a different port:
+        ``--single-process``, ``--engine-only`` and ``--internal-port`` in both
+        its ``--internal-port N`` and ``--internal-port=N`` spellings.
+
+        Args:
+            port: internal port the engine must listen on.
+
+        Returns:
+            The argv list: interpreter, launcher script, passthrough arguments,
+            then ``--engine-only --internal-port <port>``.
+        """
+        passthrough = []
+        skip_next = False
+        for arg in sys.argv[1:]:
+            if skip_next:
+                # This is the value of a stripped "--internal-port N" pair.
+                skip_next = False
+                continue
+            if arg in ("--single-process", "--engine-only"):
+                continue
+            if arg == "--internal-port":
+                skip_next = True
+                continue
+            if arg.startswith("--internal-port="):
+                # Single-token spelling: flag and value travel together.
+                continue
+            passthrough.append(arg)
+        return [sys.executable, sys.argv[0], *passthrough,
+                "--engine-only", "--internal-port", str(port)]
+
+    def _spawn(self, engine: Engine) -> None:
+        """Launch (or relaunch) one engine subprocess and start pumping its logs.
+
+        The child environment is layered in this order: a copy of the front's
+        environment, ``PYTHONUNBUFFERED``, the engine's own ``env`` block, then the
+        ``CODERAI_*`` control variables set below — so a control variable always
+        wins over a same-named key in ``engine.env``.
+
+        Side effects: sets ``engine.proc`` to the new ``Popen`` and starts a daemon
+        thread running ``_pump_logs`` that feeds the engine's log tail in
+        ``self._logs[engine.id]`` (last 30 lines).
+        """
+        env = dict(os.environ)
+        # Engine stdout is a pipe (not a TTY), so CPython block-buffers print()
+        # output — debug lines (e.g. --debug-requests) would stall in the buffer
+        # until it fills or the process exits, unlike tqdm which flushes stderr
+        # itself. Force unbuffered so engine logs reach the front terminal live.
+        env["PYTHONUNBUFFERED"] = "1"
+        # Per-engine env block (device pinning, Vulkan ICD, etc.). Empty-string
+        # values are honoured (e.g. CUDA_VISIBLE_DEVICES="" hides all CUDA cards).
+        for k, v in (engine.env or {}).items():
+            env[str(k)] = str(v)
+        # The global host-RAM cap (offload.max_ram_gb) is SHARED across all engines,
+        # not split: tell each engine the front's PID so it measures the whole
+        # fleet's RAM (front + every engine + workers) against the one cap.
+        env["CODERAI_FRONT_PID"] = str(os.getpid())
+        # Only the primary engine talks to the AISBF broker, so N engines don't
+        # register N times under the same provider id.
+        if engine.primary:
+            env["CODERAI_ENGINE_PRIMARY"] = "1"
+        # Shared secret: the engine rejects any HTTP request that doesn't carry it,
+        # so only the front (which has it) can reach the engine on localhost.
+        if self.internal_token:
+            env["CODERAI_INTERNAL_TOKEN"] = self.internal_token
+        # Resolve this engine's concurrency limits (global default, or a per-engine
+        # override keyed by engine name) and hand them down so a bigger card can run
+        # more in parallel than a smaller one.
+        srv = self.config.server
+        mdl = self.config.models
+        par = (srv.max_parallel_requests_overrides or {}).get(engine.name,
+                                                              srv.max_parallel_requests)
+        # The models section is read defensively (getattr) — the attributes may
+        # be absent; instances then default to 1.
+        inst = (getattr(mdl, "max_model_instances_overrides", None) or {}).get(
+            engine.name, getattr(mdl, "max_model_instances", 1))
+        if par is not None:
+            env["CODERAI_MAX_PARALLEL"] = str(int(par))
+        if inst is not None:
+            env["CODERAI_MAX_MODEL_INSTANCES"] = str(int(inst))
+        # Force this engine's backend (the engine reads this in --engine-only mode
+        # and overrides config.backend.type) so a Vulkan/Radeon engine doesn't
+        # auto-pick CUDA, and vice-versa.
+        if engine.backend and engine.backend != "auto":
+            env["CODERAI_ENGINE_BACKEND"] = engine.backend
+        cmd = self._engine_cmd(engine.port)
+        tag = engine.name + (f"(gpu{engine.gpu})" if engine.gpu is not None else "")
+        print(f"[front] launching {tag} on port {engine.port}: {' '.join(cmd)}", flush=True)
+        # stderr is folded into stdout and the pipe is line-buffered text, so
+        # _pump_logs reads one interleaved stream. _engine_preexec is defined
+        # elsewhere — presumably it puts the engine in its own session (stop_all
+        # relies on that); confirm there. It is only passed on POSIX.
+        proc = subprocess.Popen(
+            cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            text=True, bufsize=1,
+            preexec_fn=_engine_preexec if os.name == "posix" else None,
+        )
+        engine.proc = proc
+        # Reuse the existing tail on respawn so the pre-crash lines stay available.
+        tail = self._logs.setdefault(engine.id, collections.deque(maxlen=30))
+        threading.Thread(target=self._pump_logs, args=(tag, proc, tail),
+                         daemon=True).start()
+
+    @staticmethod
+    def _pump_logs(tag, proc, tail):
+        """Relay an engine's output to the front's stdout, one line at a time.
+
+        Args:
+            tag: label printed in front of every relayed line.
+            proc: object whose ``stdout`` yields the engine's output lines.
+            tail: container that receives each non-empty line via ``append``.
+
+        Returns when ``proc.stdout`` is exhausted. Lines that are empty after
+        stripping trailing whitespace are dropped.
+        """
+        for raw in proc.stdout:
+            text = raw.rstrip()
+            if not text:
+                continue
+            tail.append(text)
+            print(f"[{tag}] {text}", flush=True)
+
+    # -------------------------------------------------------------------- lifecycle
+    def _set_primary(self, engines) -> None:
+        """Move the primary flag to the engine named by ``server.default_engine``.
+
+        The primary engine owns admin/sessions/config. The configured value is
+        compared case-insensitively against each engine's name and backend; the
+        first match becomes the sole primary. With fewer than two engines, no
+        configured value, or no match, the build-order primary is left untouched.
+        """
+        wanted = getattr(self.config.server, "default_engine", None) or ""
+        wanted = wanted.strip().lower()
+        if not wanted or len(engines) < 2:
+            return
+        chosen = None
+        for candidate in engines:
+            if candidate.name.lower() == wanted \
+                    or (candidate.backend or "").lower() == wanted:
+                chosen = candidate
+                break
+        if chosen is None:
+            # The configured engine isn't present — keep the default primary.
+            return
+        for candidate in engines:
+            candidate.primary = candidate is chosen
+        print(f"[front] primary engine: '{chosen.name}' (from settings.default_engine)",
+              flush=True)
+
+    def start(self) -> None:
+        """Build, register and launch every engine, then begin health polling.
+
+        Also registers ``stop_all`` with ``atexit`` so the engines are stopped
+        when the interpreter exits.
+        """
+        fleet = self._build_engines()
+        # Both steps must run before any spawn: the configured engine owns
+        # admin/sessions, and CODERAI_ENGINE_MODELS is set before spawning.
+        self._set_primary(fleet)
+        self._assign_models(fleet)
+        for member in fleet:
+            self.registry.add(member)
+            self._spawn(member)
+        poller = threading.Thread(target=self._poll_loop, daemon=True)
+        self._poll_thread = poller
+        poller.start()
+        atexit.register(self.stop_all)
+
+    def _poll_loop(self) -> None:
+        """Health-poll every engine until ``self._stopped`` is set.
+
+        Each cycle, for every registered engine:
+
+        * if its process has exited, hand it to ``_maybe_restart``;
+        * otherwise GET ``/internal/engine-state`` (sending the shared
+          ``x-coderai-internal`` token when one is configured) and copy the
+          reported state into the registry, or mark the engine unhealthy on a
+          non-200 answer or any error.
+
+        The wait between cycles and the HTTP timeout are both
+        ``server.proxy_status_timeout``. The HTTP client is always closed on the
+        way out, and a failed respawn is logged and retried on the next cycle
+        instead of ending the poll thread.
+        """
+        _auth = ({"x-coderai-internal": self.internal_token}
+                 if self.internal_token else {})
+        client = httpx.Client(timeout=self.config.server.proxy_status_timeout,
+                              headers=_auth)
+        try:
+            while not self._stopped.is_set():
+                for engine in self.registry.all():
+                    # Respawn engines whose process has exited.
+                    if engine.proc is not None and engine.proc.poll() is not None:
+                        try:
+                            self._maybe_restart(engine)
+                        except Exception as exc:
+                            # One failed spawn must not end supervision of
+                            # every engine; the next cycle tries again.
+                            print(f"[front] engine '{engine.name}' respawn "
+                                  f"failed: {exc}", flush=True)
+                        continue
+                    healthy = False
+                    try:
+                        r = client.get(engine.url + "/internal/engine-state")
+                        if r.status_code == 200:
+                            d = r.json()
+                            self.registry.update_state(
+                                engine.id, healthy=True,
+                                loaded_models=d.get("loaded_models") or [],
+                                vram=d.get("vram"),
+                                tasks=d.get("tasks") or [],
+                                cooling=d.get("cooling"),
+                            )
+                            # Only report "ready" once the state was accepted.
+                            healthy = True
+                        else:
+                            self.registry.update_state(engine.id, healthy=False)
+                    except Exception:
+                        # Connection refused / timeout / malformed body:
+                        # still-loading or dead. Mark unhealthy; the process-exit
+                        # check above handles true death.
+                        healthy = False
+                        self.registry.update_state(engine.id, healthy=False)
+                    # --debug-engine: report health transitions (ready / lost).
+                    if self.debug and self._health.get(engine.id) != healthy:
+                        self._health[engine.id] = healthy
+                        print(f"[front] engine '{engine.name}' "
+                              f"{'ready' if healthy else 'not responding'}", flush=True)
+                self._stopped.wait(self.config.server.proxy_status_timeout)
+        finally:
+            client.close()
+
+    def _maybe_restart(self, engine: Engine) -> None:
+        """Respawn an engine whose process has exited.
+
+        Runs under the restart lock. After acquiring it the engine is checked
+        again: if ``restart_engine`` respawned it while this call was waiting for
+        the lock, the current process is alive and nothing is done — otherwise a
+        second copy would be launched on the same port. Nothing is done either
+        once the supervisor is stopping.
+
+        Args:
+            engine: the engine whose process the poll loop saw as exited.
+        """
+        with self._restart_lock:
+            if self._stopped.is_set():
+                return
+            proc = engine.proc
+            if proc is not None and proc.poll() is None:
+                # Already respawned by someone else while we waited for the lock.
+                return
+            code = proc.poll() if proc else None
+            # Last three captured log lines, to show why the engine died.
+            tail = " | ".join(list(self._logs.get(engine.id, []))[-3:])
+            print(f"[front] engine#{engine.id} exited (code {code}); respawning. {tail}",
+                  flush=True)
+            self.registry.update_state(engine.id, healthy=False)
+            time.sleep(1.0)   # avoid a tight crash loop
+            self._spawn(engine)
+
+    def restart_engine(self, engine_id: int) -> bool:
+        """Forcibly kill and respawn one engine (e.g. it's stuck in a loop).
+
+        The whole kill-and-respawn sequence runs under the restart lock, the same
+        lock the poll loop's own respawn path takes.
+
+        Args:
+            engine_id: registry id of the engine to restart.
+
+        Returns:
+            ``False`` when no engine has that id, ``True`` once it was respawned.
+        """
+        target = self.registry.get(engine_id)
+        if target is None:
+            return False
+        with self._restart_lock:
+            proc = target.proc
+            if proc is not None:
+                # Escalate: polite terminate first, then kill — each step only
+                # if the process is still running when its turn comes.
+                for stopper, patience in ((proc.terminate, 8), (proc.kill, 3)):
+                    if proc.poll() is not None:
+                        break
+                    try:
+                        stopper()
+                        proc.wait(timeout=patience)
+                    except Exception:
+                        pass
+            self.registry.update_state(engine_id, healthy=False)
+            print(f"[front] restarting engine#{engine_id} ({target.name}) on request",
+                  flush=True)
+            self._spawn(target)
+        return True
+
+    def wait_ready(self, timeout: float = 1800.0) -> bool:
+        """Block until the engine returned by ``registry.primary()`` is healthy.
+
+        Polls once per second until that happens, ``timeout`` seconds elapse, or
+        the supervisor is stopped (best effort).
+
+        Returns:
+            ``True`` as soon as a healthy engine is available. On timeout or stop
+            the result is ``bool(self.registry.primary())``.
+
+        NOTE(review): that fallback is truthy whenever the registry still has a
+        primary-flagged engine, healthy or not — confirm callers do not read a
+        ``True`` here as "ready".
+        """
+        give_up_at = time.time() + timeout
+        while True:
+            if time.time() >= give_up_at or self._stopped.is_set():
+                break
+            candidate = self.registry.primary()
+            if candidate and candidate.healthy:
+                return True
+            time.sleep(1.0)
+        return bool(self.registry.primary())
+
+    def stop_all(self, grace: float = 8.0) -> None:
+        """Stop every running engine, escalating from SIGTERM to SIGKILL.
+
+        Each signal goes to the engine's whole process group, so a stuck (e.g.
+        mid-CUDA) engine and any children it spawned (whisper-server, ds4) are
+        taken down together. Engines that have not exited ``grace`` seconds
+        after the SIGTERM round are killed. Idempotent and safe to call from a
+        signal handler.
+
+        Args:
+            grace: total seconds, shared by all engines, to wait after SIGTERM.
+        """
+        self._stopped.set()
+
+        def _signal_group(proc, sig):
+            # Engines are started in their own session (setsid), so signalling
+            # the process group reaches the engine + its grandchildren in one
+            # shot. If the group cannot be signalled, fall back to the process.
+            try:
+                group = os.getpgid(proc.pid)
+                os.killpg(group, sig)
+            except Exception:
+                try:
+                    proc.send_signal(sig)
+                except Exception:
+                    pass
+
+        # Snapshot the engines that are still running right now.
+        running = []
+        for member in self.registry.all():
+            if member.proc is not None and member.proc.poll() is None:
+                running.append(member.proc)
+
+        # Phase 1: polite SIGTERM to each group.
+        for handle in running:
+            _signal_group(handle, signal.SIGTERM)
+
+        # Phase 2: wait against one shared deadline of `grace` seconds.
+        cutoff = time.time() + grace
+        for handle in running:
+            budget = max(0.0, cutoff - time.time())
+            try:
+                handle.wait(timeout=budget)
+            except Exception:
+                pass
+
+        # Phase 3: SIGKILL whatever is still alive.
+        for handle in running:
+            if handle.poll() is not None:
+                continue
+            _signal_group(handle, signal.SIGKILL)
+            try:
+                handle.wait(timeout=3)
+            except Exception:
+                pass
+        print("[front] all engines stopped", flush=True)
--- a/codai/frontproxy/gpu_detect.py
+++ b/codai/frontproxy/gpu_detect.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Torch-free GPU detection for the front proxy.
+
+Turns a *vendor keyword* (``"nvidia"`` / ``"radeon"`` / ``"intel"``) into the env
+that pins an engine to **all** of that vendor's cards on the local machine — so the
+same ``engine_specs`` work on a 1-, 2-, or 10-card box without hand-writing UUIDs or
+device indices:
+
+* **CUDA** is pinned by UUID (stable across reboots/reordering), listing every
+  NVIDIA card for the NVIDIA engine and ``""`` (hidden) for the others.
+* **Vulkan** is isolated by pointing ``VK_ICD_FILENAMES`` at only that vendor's ICD,
+  so the engine sees exactly that vendor's cards (no index fragility, no llvmpipe /
+  cross-vendor interference).
+
+Everything shells out to ``nvidia-smi`` / ``vulkaninfo`` and reads the Vulkan ICD
+directory; nothing imports torch.
+"""
+
+import glob
+import os
+import shutil
+import subprocess
+
+# Vulkan ICD search dirs (loader defaults) and per-vendor filename patterns. Names
+# vary by distro/driver, so we match several and skip disabled ones. Directories
+# and patterns are tried in the order listed; _ICD_PATTERNS is keyed by the
+# canonical vendor keyword (what _norm_vendor returns).
+_ICD_DIRS = ("/usr/share/vulkan/icd.d", "/etc/vulkan/icd.d",
+             "/usr/local/share/vulkan/icd.d")
+_ICD_PATTERNS = {
+    "nvidia": ("nvidia_icd*.json",),
+    "amd": ("radeon_icd*.json", "amd_icd*.json"),      # RADV (mesa) or AMDVLK
+    "intel": ("intel_icd*.json", "intel_hasvk_icd*.json"),
+}
+# Vulkan vendor IDs (PCI) for reporting. Integer keys: vulkan_devices() parses the
+# vendorID field to an int before looking it up here.
+VENDOR_IDS = {0x10de: "nvidia", 0x1002: "amd", 0x8086: "intel"}
+# User-facing spellings → canonical vendor keyword ("radeon" means "amd").
+_ALIASES = {"radeon": "amd", "amd": "amd", "nvidia": "nvidia", "nv": "nvidia",
+            "intel": "intel"}
+
+
+def _norm_vendor(vendor: str) -> str:
+    """Return the canonical vendor keyword for ``vendor``.
+
+    The input is stripped and lowercased (``None`` counts as ``""``) and then
+    mapped through ``_ALIASES``; spellings not listed there are returned in
+    their stripped, lowercased form.
+    """
+    key = (vendor or "").strip().lower()
+    return _ALIASES.get(key, key)
+
+
+def nvidia_gpus() -> list:
+    """List the NVIDIA GPUs reported by ``nvidia-smi``.
+
+    Returns:
+        One ``{'uuid', 'name', 'pci'}`` dict per GPU, in ``nvidia-smi`` output
+        order. Empty when ``nvidia-smi`` is not on PATH, exits non-zero, or
+        anything goes wrong while running or parsing it.
+    """
+    tool = shutil.which("nvidia-smi")
+    if not tool:
+        return []
+    query = [tool, "--query-gpu=uuid,name,pci.bus_id", "--format=csv,noheader"]
+    try:
+        result = subprocess.run(query, capture_output=True, text=True, timeout=10)
+        if result.returncode != 0:
+            return []
+        found = []
+        for row in result.stdout.splitlines():
+            cols = [col.strip() for col in row.split(",")]
+            if not (cols and cols[0]):
+                # No UUID in the first column: not a GPU row.
+                continue
+            name = cols[1] if len(cols) > 1 else ""
+            pci = cols[2] if len(cols) > 2 else ""
+            found.append({"uuid": cols[0], "name": name, "pci": pci})
+        return found
+    except Exception:
+        return []
+
+
+def vulkan_devices() -> list:
+    """Parse ``vulkaninfo --summary`` into one dict per Vulkan device.
+
+    Returns:
+        ``[{'vendor', 'vendor_id', 'name'}]`` in the order the devices are
+        printed (which matches Vulkan device indexing); a key is present only
+        if its field was found. A vendor ID outside ``VENDOR_IDS`` (or one that
+        does not parse) is reported as vendor ``"other"``. Best-effort: empty
+        if vulkaninfo is missing, fails to run, or prints nothing parseable.
+    """
+    tool = shutil.which("vulkaninfo")
+    if not tool:
+        return []
+    try:
+        report = subprocess.run([tool, "--summary"], capture_output=True,
+                                text=True, timeout=15).stdout
+    except Exception:
+        return []
+
+    found = []
+    entry = {}
+    for raw in report.splitlines():
+        line = raw.strip()
+        if line.startswith("GPU") and line.endswith(":"):
+            # A "GPUn:" header closes the previous device and opens a new one.
+            if entry:
+                found.append(entry)
+            entry = {}
+            continue
+        if "=" not in line:
+            continue
+        key, _, value = line.partition("=")
+        key = key.strip().lower()
+        value = value.strip()
+        if key == "vendorid":
+            # Printed either as hex ("0x10de") or as a plain decimal number.
+            try:
+                vid = int(value, 16) if value.lower().startswith("0x") else int(value)
+            except ValueError:
+                vid = None
+            entry["vendor_id"] = vid
+            entry["vendor"] = VENDOR_IDS.get(vid, "other")
+        elif key == "devicename":
+            entry["name"] = value
+    if entry:
+        found.append(entry)
+    return found
+
+
+# PCI vendor IDs as the strings sysfs_gpu_vendors() compares against: the sysfs
+# ``vendor`` file content, stripped and lowercased (e.g. "0x10de"). Same three
+# vendors as VENDOR_IDS, which is keyed by int instead.
+_PCI_VENDOR = {"0x10de": "nvidia", "0x1002": "amd", "0x8086": "intel"}
+
+
+def sysfs_gpu_vendors() -> set:
+    """Collect GPU vendor keywords from ``/sys/class/drm/card*/device/vendor``.
+
+    A fallback for AMD/Intel detection that does not need ``vulkaninfo``. Only
+    entries named exactly ``card<N>`` are read; unreadable vendor files and
+    vendor IDs missing from ``_PCI_VENDOR`` are skipped.
+
+    Returns:
+        A subset of ``{"amd", "nvidia", "intel"}``.
+    """
+    import re
+    card_name = re.compile(r"^card\d+$")
+    found = set()
+    for node in glob.glob("/sys/class/drm/card*"):
+        if not card_name.match(os.path.basename(node)):
+            continue
+        try:
+            with open(os.path.join(node, "device", "vendor")) as fh:
+                raw_id = fh.read().strip().lower()
+        except OSError:
+            continue
+        keyword = _PCI_VENDOR.get(raw_id)
+        if keyword:
+            found.add(keyword)
+    return found
+
+
+def gpu_vendors() -> set:
+    """Return every GPU vendor keyword detected on this machine.
+
+    Merges the vendors seen by Vulkan enumeration (``vulkan_devices``) with the
+    sysfs PCI-vendor fallback (``sysfs_gpu_vendors``), so detection does not
+    depend on vulkaninfo alone. The ``"other"`` bucket is removed.
+    """
+    found = set()
+    for dev in vulkan_devices():
+        tag = dev.get("vendor")
+        if tag:
+            found.add(tag)
+    found.update(sysfs_gpu_vendors())
+    found.discard("other")   # llvmpipe / software rasterizers
+    return found
+
+
+def find_vulkan_icd(vendor: str) -> str:
+    """Return the path to a Vulkan ICD JSON for ``vendor``, or '' if not found.
+
+    Directories in ``_ICD_DIRS`` and the vendor's patterns in ``_ICD_PATTERNS``
+    are tried in order, and matches within one glob are taken alphabetically.
+    Manifests whose file name contains ``.disabled`` are ignored.
+
+    Manifests tagged for the other word size (e.g. ``radeon_icd.i686.json`` or
+    ``amd_icd32.json`` when this process is 64-bit) are only used as a last
+    resort. Alphabetical order alone would pick the ``i686`` manifest ahead of
+    the ``x86_64`` one on a multilib install, pinning the engine to a driver
+    library it cannot load.
+
+    Args:
+        vendor: vendor keyword or alias (``"radeon"``, ``"amd"``, ``"nvidia"``, …).
+
+    Returns:
+        Path of the first suitable manifest; failing that, the first
+        other-word-size manifest; failing that, ``''``.
+    """
+    import sys
+
+    vendor = _norm_vendor(vendor)
+    patterns = _ICD_PATTERNS.get(vendor, ())
+    # File-name tags that mark a manifest built for the *other* word size.
+    if sys.maxsize > 2 ** 32:
+        foreign_tags = ("i686", "i586", "i386", "icd32", "armhf")
+    else:
+        foreign_tags = ("x86_64", "amd64", "icd64", "aarch64")
+    fallback = ""
+    for d in _ICD_DIRS:
+        for pat in patterns:
+            for path in sorted(glob.glob(os.path.join(d, pat))):
+                base = os.path.basename(path)
+                if ".disabled" in base or not os.path.isfile(path):
+                    continue
+                if any(tag in base for tag in foreign_tags):
+                    # Remember the first one, but keep looking for a native one.
+                    fallback = fallback or path
+                    continue
+                return path
+    return fallback
+
+
+def vendor_env(vendor: str) -> dict:
+    """Build the env that pins an engine to **all** of ``vendor``'s cards.
+
+    * NVIDIA: ``CUDA_VISIBLE_DEVICES`` lists every NVIDIA UUID, with
+      ``CUDA_DEVICE_ORDER=PCI_BUS_ID``; both are omitted when no UUID is found.
+    * Any other vendor: ``CUDA_VISIBLE_DEVICES=""`` hides all CUDA cards.
+    * Any vendor but AMD: ``RADEON_VISIBLE_DEVICES=""`` hides Radeon cards.
+    * ``VK_ICD_FILENAMES`` points at the vendor's ICD when one is found.
+    * ``GGML_VK_VISIBLE_DEVICES`` is always set, to indices ``0..n-1`` for the
+      ``n`` Vulkan devices of this vendor (``"0"`` when the count is unknown).
+
+    Missing tools degrade gracefully: the affected key is simply left out.
+    """
+    key = _norm_vendor(vendor)
+    pinned = {}
+    if key != "nvidia":
+        # Non-NVIDIA engine: hide all CUDA cards so torch/llama-CUDA can't grab
+        # them.
+        pinned["CUDA_VISIBLE_DEVICES"] = ""
+    else:
+        ids = [card["uuid"] for card in nvidia_gpus() if card.get("uuid")]
+        if ids:
+            pinned["CUDA_VISIBLE_DEVICES"] = ",".join(ids)
+            pinned["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    # Mirror of the CUDA hiding: a non-AMD engine must not pick up a Radeon.
+    if key != "amd":
+        pinned["RADEON_VISIBLE_DEVICES"] = ""
+    manifest = find_vulkan_icd(key)
+    if manifest:
+        pinned["VK_ICD_FILENAMES"] = manifest
+    # After ICD isolation only THIS vendor's card(s) are visible to Vulkan, as
+    # indices 0..n-1. Pinning them here overrides an inherited value (e.g. a
+    # launcher exporting GGML_VK_VISIBLE_DEVICES=1 from the old multi-vendor
+    # enumeration) that would select an invalid index and silently fall back to
+    # CPU.
+    count = 0
+    for dev in vulkan_devices():
+        if dev.get("vendor") == key:
+            count += 1
+    indices = range(max(1, count))
+    pinned["GGML_VK_VISIBLE_DEVICES"] = ",".join(map(str, indices))
+    return pinned
+
+
+def _nvidia_stats() -> list:
+    """Per-GPU live stats from nvidia-smi (reports ALL cards regardless of
+    CUDA_VISIBLE_DEVICES).
+
+    Returns:
+        One dict per card with ``vendor``, ``index``, ``name``, ``util``,
+        ``mem_used``, ``mem_total``, ``temp`` and ``uuid``. Memory is the
+        nvidia-smi figure divided by 1024, rounded to two decimals; a field
+        that is not numeric becomes ``None`` (``0`` for memory). Empty when
+        nvidia-smi is missing, fails, or exits non-zero.
+    """
+    tool = shutil.which("nvidia-smi")
+    if not tool:
+        return []
+
+    def _num(text):
+        # nvidia-smi prints placeholders for unsupported fields; map to None.
+        try:
+            return float(text)
+        except ValueError:
+            return None
+
+    try:
+        result = subprocess.run(
+            [tool, "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,"
+                   "temperature.gpu,uuid", "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=10)
+    except Exception:
+        return []
+    if result.returncode != 0:
+        return []
+
+    cards = []
+    for row in result.stdout.splitlines():
+        cols = [col.strip() for col in row.split(",")]
+        if len(cols) < 6 or not cols[0].isdigit():
+            continue
+        used = _num(cols[3]) or 0
+        total = _num(cols[4]) or 0
+        cards.append({
+            "vendor": "nvidia",
+            "index": int(cols[0]),
+            "name": cols[1],
+            "util": _num(cols[2]),
+            "mem_used": round(used / 1024, 2),
+            "mem_total": round(total / 1024, 2),
+            "temp": _num(cols[5]),
+            "uuid": cols[6] if len(cols) > 6 else None,
+        })
+    return cards
+
+
+def _amd_stats() -> list:
+    """Per-GPU live stats for AMD cards from sysfs (amdgpu).
+
+    Scans ``/sys/class/drm/card<N>`` and keeps the cards whose PCI vendor is
+    ``0x1002``. Every value is best-effort: a file that is missing, unreadable
+    or does not hold a number yields ``None`` for that field instead of raising.
+
+    Returns:
+        One dict per AMD card with ``vendor``, ``index`` (the ``N`` of
+        ``card<N>``), ``name``, ``util``, ``mem_used``, ``mem_total`` and
+        ``temp``. Memory is the sysfs value divided by 1e9, rounded to two
+        decimals; ``temp`` is the first readable ``temp1_input`` divided by 1000.
+
+    NOTE(review): memory here is scaled by 1e9 while ``_nvidia_stats`` divides
+    by 1024, so the two vendors' "GB" figures are not on the same scale —
+    confirm which one consumers expect.
+    """
+    import re
+    cards = []
+    for card in sorted(glob.glob("/sys/class/drm/card*")):
+        base = os.path.basename(card)
+        if not re.match(r"^card\d+$", base):
+            continue
+        dev = os.path.join(card, "device")
+
+        def _read(rel):
+            try:
+                with open(os.path.join(dev, rel)) as f:
+                    return f.read().strip()
+            except OSError:
+                return None
+        vendor = (_read("vendor") or "").lower()
+        if vendor != "0x1002":          # AMD only (NVIDIA handled via nvidia-smi)
+            continue
+        busy = _read("gpu_busy_percent")
+        used = _read("mem_info_vram_used")
+        total = _read("mem_info_vram_total")
+        temp = None
+        for hw in glob.glob(os.path.join(dev, "hwmon", "hwmon*", "temp1_input")):
+            try:
+                with open(hw) as f:
+                    temp = int(f.read().strip()) / 1000.0
+            except (OSError, ValueError):
+                # Unreadable or non-numeric sensor: try the next hwmon node
+                # rather than letting one bad file break all GPU stats.
+                continue
+            break
+        cards.append({
+            "vendor": "amd", "index": int(base[4:]),
+            "name": f"AMD GPU ({base})",
+            "util": float(busy) if busy and busy.isdigit() else None,
+            "mem_used": round(int(used) / 1e9, 2) if used and used.isdigit() else None,
+            "mem_total": round(int(total) / 1e9, 2) if total and total.isdigit() else None,
+            "temp": temp})
+    return cards
+
+
+def gpu_stats() -> list:
+    """Live per-card stats for every NVIDIA and AMD GPU found on this machine.
+
+    Each entry has the shape
+    ``{vendor, index, name, util%, mem_used GB, mem_total GB, temp °C, uuid}``
+    (AMD entries carry no ``uuid`` key). NVIDIA cards come first, then AMD.
+
+    Independent of engine ownership — nvidia-smi and sysfs report all cards
+    regardless of CUDA_VISIBLE_DEVICES — so this shows the whole machine.
+    """
+    cards = _nvidia_stats()
+    return cards + _amd_stats()
+
+
+def engine_gpu_stats() -> list:
+    """Like :func:`gpu_stats` but scoped to the cards THIS engine owns.
+
+    Ownership comes from the ``CODERAI_ENGINE_GPUS`` env the front sets: a
+    comma-separated list of vendor keywords and/or NVIDIA UUIDs. Vendor
+    keywords are normalized with ``_norm_vendor``, so aliases and case variants
+    (``radeon``, ``AMD``, ``nv``) select the same cards as the canonical
+    keyword; UUIDs are matched exactly. Unset → all cards (single-process /
+    legacy mode).
+
+    Used by thermal protection so a hot GPU only pauses the engine(s) using it,
+    while a hot CPU (read globally) still pauses everything.
+    """
+    cards = gpu_stats()
+    raw = (os.environ.get("CODERAI_ENGINE_GPUS") or "").strip()
+    if not raw:
+        return cards
+    sels = {s.strip() for s in raw.split(",") if s.strip()}
+    # Card dicts carry canonical vendor keywords, so compare against the
+    # canonical form of every selector.
+    vendors = {_norm_vendor(s) for s in sels}
+    return [c for c in cards
+            if c.get("vendor") in vendors or (c.get("uuid") and c["uuid"] in sels)]
+
+
+def summary() -> dict:
+    """Snapshot of the detected hardware, for a 'detect engines' UI / debugging.
+
+    Returns:
+        ``{"nvidia": nvidia_gpus(), "vulkan": vulkan_devices(),
+        "icd": {vendor: ICD path or ''}}`` for nvidia, amd and intel.
+    """
+    nvidia = nvidia_gpus()
+    vulkan = vulkan_devices()
+    icd_paths = {}
+    for keyword in ("nvidia", "amd", "intel"):
+        icd_paths[keyword] = find_vulkan_icd(keyword)
+    return {"nvidia": nvidia, "vulkan": vulkan, "icd": icd_paths}
--- a/codai/frontproxy/registry.py
+++ b/codai/frontproxy/registry.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Front-side registry of engine subprocesses.
+
+The front never imports torch; it knows about engines only through the small
+``/internal/engine-state`` endpoint each engine exposes on localhost (polled with
+the shared internal token when one is configured).
+This module holds the shared, thread-safe view the supervisor writes and the
+router/aggregator read.
+"""
+
+import threading
+import time
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Set
+
+
+# Default model-format capabilities implied by an engine's backend:
+#   transformers — safetensors/HF models (CUDA only here)
+#   gguf         — llama.cpp models (CUDA or Vulkan)
+#   whisper      — whisper.cpp STT (CUDA or Vulkan)
+#   ds4          — DeepSeek V4 via the native ds4 engine (CUDA-only build)
+# An NVIDIA engine can do all of them; a Vulkan (e.g. Radeon) engine does GGUF and
+# whisper, but not transformers and not ds4.
+# Only consulted when an Engine is created without explicit capabilities; a
+# backend missing from this table falls back to {"transformers", "gguf"} there.
+_DEFAULT_CAPS = {
+    "nvidia": {"transformers", "gguf", "whisper", "ds4"},
+    "cuda": {"transformers", "gguf", "whisper", "ds4"},
+    "vulkan": {"gguf", "whisper"},
+    "opencl": {"gguf", "whisper"},
+    "auto": {"transformers", "gguf", "whisper", "ds4"},
+}
+
+
+@dataclass
+class Engine:
+    """One engine subprocess as seen by the front: identity, spawn settings, and
+    the state last reported by polling."""
+
+    # --- identity / spawn settings -------------------------------------------
+    id: int
+    gpu: Optional[int]             # log-only device hint (CUDA/Vulkan index; None = n/a)
+    port: int
+    primary: bool = False          # owner of admin/auth/config traffic
+    name: str = ""                 # label used in logs; defaults to "engine#<id>"
+    backend: str = "auto"          # nvidia | vulkan | … (forced for this engine)
+    env: dict = field(default_factory=dict)        # extra env applied at spawn
+    capabilities: Set[str] = field(default_factory=set)  # model formats it can serve
+    assigned_models: Set[str] = field(default_factory=set)  # routable ids it owns
+    url: str = ""                  # defaults to http://127.0.0.1:<port>
+    # --- polled state ---------------------------------------------------------
+    healthy: bool = False
+    loaded_models: Set[str] = field(default_factory=set)
+    vram: Optional[dict] = None
+    tasks: list = field(default_factory=list)   # running/queued tasks on this engine
+    cooling: Optional[dict] = None  # thermal cooldown state; None when not cooling
+    last_ok: float = 0.0           # monotonic time of last successful poll
+    proc: object = None            # subprocess.Popen (set by the supervisor)
+
+    def __post_init__(self):
+        """Fill in the url, name and capabilities that were left empty."""
+        self.url = self.url or f"http://127.0.0.1:{self.port}"
+        self.name = self.name or f"engine#{self.id}"
+        if not self.capabilities:
+            # Backends unknown to _DEFAULT_CAPS get this conservative pair.
+            fallback = {"transformers", "gguf"}
+            self.capabilities = set(_DEFAULT_CAPS.get(self.backend, fallback))
+
+    def can_serve(self, required_cap: Optional[str]) -> bool:
+        """Return True when no capability is required or this engine has it."""
+        if not required_cap:
+            return True
+        return required_cap in self.capabilities
+
+
+class EngineRegistry:
+    def __init__(self):
+        self._engines: Dict[int, Engine] = {}
+        self._lock = threading.RLock()
+
+    def add(self, engine: Engine) -> None:
+        with self._lock:
+            self._engines[engine.id] = engine
+
+    def get(self, engine_id: int) -> Optional[Engine]:
+        with self._lock:
+            return self._engines.get(engine_id)
+
+    def all(self) -> List[Engine]:
+        with self._lock:
+            return list(self._engines.values())
+
+    def healthy(self) -> List[Engine]:
+        with self._lock:
+            return [e for e in self._engines.values() if e.healthy]
+
+    def primary(self) -> Optional[Engine]:
+        """The engine that owns admin/session/config — falls back to first healthy."""
+        with self._lock:
+            prim = next((e for e in self._engines.values() if e.primary), None)
+            if prim and prim.healthy:
+                return prim
+            return next((e for e in self._engines.values() if e.healthy), prim)
+
+    def by_name(self, name: Optional[str]) -> Optional[Engine]:
+        """Resolve an engine by its declared name (or, failing that, its backend).
+
+        Used for the configured default engine and per-model pins. Prefers a healthy
+        match but returns an unhealthy one too, so callers can decide."""
+        if not name:
+            return None
+        name = name.strip().lower()
+        with self._lock:
+            engines = list(self._engines.values())
+        match = None
+        for e in engines:
+            if (e.name or "").lower() == name or (e.backend or "").lower() == name:
+                if e.healthy:
+                    return e
+                match = match or e
+        return match
+
+    def update_state(self, engine_id: int, *, healthy: bool,
+                     loaded_models=None, vram=None, tasks=None,
+                     cooling=False) -> None:
+        with self._lock:
+            e = self._engines.get(engine_id)
+            if not e:
+                return
+            e.healthy = healthy
+            if healthy:
+                e.last_ok = time.monotonic()
+            if loaded_models is not None:
+                e.loaded_models = set(loaded_models)
+            if vram is not None:
+                e.vram = vram
+            if tasks is not None:
+                e.tasks = list(tasks)
+            elif not healthy:
+                e.tasks = []
+            if cooling is not False:        # explicit None clears it
+                e.cooling = cooling
+            elif not healthy:
+                e.cooling = None
+
+    def engine_for_model(self, model_key: str, required_cap: Optional[str] = None) -> Optional[Engine]:
+        """Find a healthy engine that can serve ``required_cap`` and already has
+        ``model_key`` among its loaded models.
+
+        A loaded key matches when it equals ``model_key``, shares its last
+        ``/``-separated component, ends with ``model_key``, or when ``model_key``
+        ends with the part of the loaded key after its last ``:``. Returns the
+        first matching engine, or None (also for an empty ``model_key``).
+        """
+        if not model_key:
+            return None
+        wanted_short = model_key.split("/")[-1]
+
+        def _same_model(loaded_key) -> bool:
+            return (loaded_key == model_key
+                    or loaded_key.split("/")[-1] == wanted_short
+                    or loaded_key.endswith(model_key)
+                    or model_key.endswith(loaded_key.split(":")[-1]))
+
+        with self._lock:
+            for engine in self._engines.values():
+                if not (engine.healthy and engine.can_serve(required_cap)):
+                    continue
+                if any(_same_model(loaded_key) for loaded_key in engine.loaded_models):
+                    return engine
+        return None
+
+    def engine_for_assigned(self, model_key: str) -> Optional[Engine]:
+        """Return the healthy engine whose ``assigned_models`` contains ``model_key``.
+
+        The assignment is the authoritative routing decision (it already encodes
+        pins, the default engine, and balanced auto-selection). Matching is
+        lenient so a short name or alias resolves to the owner: an assigned key
+        matches when it equals ``model_key``, shares its last ``/``-separated
+        component, ends with ``model_key``, or when ``model_key`` ends with the
+        assigned key's last component. Returns None when nothing matches or
+        ``model_key`` is empty.
+        """
+        if not model_key:
+            return None
+        wanted_short = model_key.split("/")[-1]
+
+        def _owns(assigned_key) -> bool:
+            assigned_short = assigned_key.split("/")[-1]
+            return (assigned_key == model_key
+                    or assigned_short == wanted_short
+                    or assigned_key.endswith(model_key)
+                    or model_key.endswith(assigned_short))
+
+        with self._lock:
+            for engine in self._engines.values():
+                if engine.healthy and any(_owns(k) for k in engine.assigned_models):
+                    return engine
+        return None
+
+    def least_loaded(self, required_cap: Optional[str] = None) -> Optional[Engine]:
+        """Choose the engine a new model should be loaded on.
+
+        Only healthy engines that can serve ``required_cap`` are considered. The
+        winner has the fewest loaded models; ties go to the engine reporting the
+        most free VRAM (``vram["free"]``, counted as 0.0 when absent). Returns
+        None when no engine qualifies.
+        """
+        with self._lock:
+            eligible = [engine for engine in self._engines.values()
+                        if engine.healthy and engine.can_serve(required_cap)]
+        if not eligible:
+            return None
+
+        def _load_rank(engine: Engine):
+            free_vram = engine.vram.get("free", 0.0) if engine.vram else 0.0
+            return (len(engine.loaded_models), -free_vram)
+
+        # min() keeps the first of equally-ranked engines, like a stable sort.
+        return min(eligible, key=_load_rank)
--- a/codai/frontproxy/router.py
+++ b/codai/frontproxy/router.py
+# CoderAI - OpenAI-compatible API server
+# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+"""Decide which engine handles a proxied request.
+
+Policy (Plan B + multi-engine):
+
+* **Admin / auth / config / UI / status / tasks** → the **primary** engine. These
+  own per-process session and config state, so pinning them to one engine keeps
+  sessions consistent without a shared session store (that's Plan C).
+* **Inference** (``/v1/...`` POST carrying a ``model``) → in order: the engine the
+  front assigned the model to, a per-model pin, the engine that already has the
+  model resident, the configured default engine; otherwise the least-loaded
+  engine (which loads it on demand). This is what lets one model load on engine
+  A while engine B keeps generating.
+* **Everything else** (e.g. ``GET /v1/models``, file downloads) → primary.
+"""
+
+from typing import Optional
+
+from codai.frontproxy.registry import Engine, EngineRegistry
+
+# POST endpoints that carry a `model` and should be load-balanced across engines.
+_INFERENCE_PATHS = {
+    "/v1/chat/completions",
+    "/v1/completions",
+    "/v1/embeddings",
+    "/v1/images/generations",
+    "/v1/images/edits",
+    "/v1/audio/speech",
+    "/v1/audio/transcriptions",
+    "/v1/videos/generations",
+}
+
+
+def is_inference_path(path: str) -> bool:
+    """Return True when ``path`` names one of the load-balanced inference endpoints.
+
+    The query string and any trailing slashes are ignored for the comparison.
+    """
+    route = path.partition("?")[0]
+    return route.rstrip("/") in _INFERENCE_PATHS
+
+
+def is_admin_path(path: str) -> bool:
+    """Return True for the root page and for paths under the admin, login,
+    logout or static prefixes. The query string is ignored."""
+    route = path.partition("?")[0]
+    if route == "/":
+        return True
+    return route.startswith(("/admin", "/login", "/logout", "/static"))
+
+
+_warned_pins: set = set()
+
+
+def _warn_bad_pin(model, pinned, cap, engine) -> None:
+    """Print a warning explaining why the pin of ``model`` to ``pinned`` is ignored.
+
+    Each ``(model, pinned)`` pair is reported at most once per process; repeats
+    return silently. ``engine`` is the pinned engine, or None when it was not
+    found; ``cap`` is the capability the request needs.
+    """
+    dedupe_key = (model, pinned)
+    if dedupe_key in _warned_pins:
+        return
+    _warned_pins.add(dedupe_key)
+
+    def _why() -> str:
+        # Most specific explanation first: missing, then unhealthy, then unable.
+        if engine is None:
+            return f"no engine named/backed '{pinned}' is declared"
+        if not engine.healthy:
+            return f"engine '{pinned}' is not healthy"
+        return (f"engine '{pinned}' (backend '{engine.backend}') can't serve a "
+                f"'{cap}' model (capabilities: {sorted(engine.capabilities)})")
+
+    reason = _why()
+    print(f"[front] WARNING: model '{model}' is pinned to '{pinned}' but {reason}; "
+          f"falling back to a compatible engine.", flush=True)
+
+
+def required_capability(model: Optional[str], path: Optional[str] = None,
+                        backend: Optional[str] = None,
+                        ds4_model_id: Optional[str] = None,
+                        ds4_enabled: bool = False) -> Optional[str]:
+    """Name the capability an engine needs in order to serve this request.
+
+    * ``whisper``      — whisper.cpp STT (transcription endpoint or a
+                         ``whisper-server`` model). Runs on CUDA or Vulkan.
+    * ``ds4``          — DeepSeek V4 via the native ds4 engine. CUDA-only.
+    * ``gguf``         — llama.cpp model. Runs on CUDA or Vulkan.
+    * ``transformers`` — safetensors/HF model. CUDA-only.
+
+    The request path and the model's configured ``backend`` are checked before
+    the model name, so whisper is detected even when no model id is available
+    (multipart upload). Without a whisper signal and without a model the result
+    is None.
+    """
+    route = (path or "").partition("?")[0].rstrip("/")
+    if route == "/v1/audio/transcriptions" or (backend or "") == "whisper-server":
+        return "whisper"
+    if not model:
+        return None
+
+    name = model.lower()
+    if ds4_enabled:
+        ds4_id = (ds4_model_id or "").lower()
+        # The configured id may be given with or without the "org/" prefix.
+        matches_configured_id = bool(ds4_id) and ds4_id in (name, name.split("/")[-1])
+        if matches_configured_id or "deepseek-v4" in name:
+            return "ds4"
+    if "gguf" in name:
+        return "gguf"
+    return "transformers"
+
+
+def pick_engine(registry: EngineRegistry, path: str, method: str,
+                model: Optional[str], required_cap: Optional[str] = None,
+                default_engine: Optional[str] = None,
+                pinned: Optional[str] = None) -> Optional[Engine]:
+    """Return the engine to proxy this request to, or None if none are ready.
+
+    Precedence for inference (a POST to one of the inference paths):
+
+    0. the engine the front assigned ``model`` to, if it can serve ``required_cap``;
+    1. the per-model pin (``pinned``), if that engine is healthy and compatible —
+       otherwise a warning is printed and routing continues;
+    2. a compatible engine that already has the model loaded;
+    3. the configured ``default_engine``, if healthy and compatible;
+    4. the least-loaded compatible engine, then the least-loaded engine of any
+       capability, then the primary engine (a last resort instead of a 503, so
+       these two final fallbacks are not guaranteed to be compatible).
+
+    Steps 0 and 2 are skipped when ``model`` is not known (e.g. a multipart
+    transcription upload); routing is then driven by capability alone.
+
+    Every other request goes to the primary engine, falling back to the
+    least-loaded engine when there is no primary.
+    """
+    if method.upper() == "POST" and is_inference_path(path):
+        cap = required_cap
+
+        # 0. The front's precomputed assignment is authoritative — it already folds
+        # in the pin, the default engine, and balanced auto-selection, and is what
+        # keeps a model on exactly one engine. Honour it first when it's compatible.
+        if model:
+            owner = registry.engine_for_assigned(model)
+            if owner is not None and owner.can_serve(cap):
+                return owner
+
+        # 1. Per-model pin (models.json "engine") — only honoured if compatible.
+        if pinned:
+            e = registry.by_name(pinned)
+            if e and e.healthy and e.can_serve(cap):
+                return e
+            # Pin can't be honoured — say why (once per model+engine) instead of
+            # silently falling back, so a misconfiguration is visible in the logs.
+            _warn_bad_pin(model, pinned, cap, e)
+
+        # 2. Engine that already has the model resident.
+        if model:
+            e = registry.engine_for_model(model, cap)
+            if e:
+                return e
+
+        # 3. Configured default engine, when it can serve this request.
+        if default_engine:
+            e = registry.by_name(default_engine)
+            if e and e.healthy and e.can_serve(cap):
+                return e
+
+        # 4. Least-loaded compatible engine; then any engine rather than 503.
+        return (registry.least_loaded(cap)
+                or registry.least_loaded(None)
+                or registry.primary())
+
+    # Admin/auth/config/UI and everything else → primary (consistent sessions).
+    return registry.primary() or registry.least_loaded()
--- a/codai/main.py
+++ b/codai/main.py
@@ -20,6 +20,12 @@ import os
 import logging
 import threading as _t

+# Reduce CUDA allocator fragmentation: expandable segments let large transient
+# allocations (KV cache, attention activations) grow into reserved-but-unallocated
+# memory instead of OOMing on a borderline shortfall. Must be set before torch
+# initialises CUDA; honour an explicit override if the operator already set it.
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
 # Import configuration from codai modules
 from codai.cli import parse_args
 from codai.config import ConfigManager
@@ -225,6 +231,10 @@ def apply_model_entry_live(entry, model_types) -> int:
        old_cfg = multi_model_manager.config.get(key)
        cfg = build_runtime_model_cfg(entry, type_str)
        multi_model_manager.config[key] = cfg
+        try:
+            multi_model_manager._remember_registered_type(mid, type_str)
+        except Exception:
+            pass
        updated += 1
        # Acceleration (Lightning/Lightx2v/LCM distill LoRA + scheduler) is FUSED
        # into the pipeline at load time, so it can't be toggled on an already
@@ -248,6 +258,45 @@ def apply_model_entry_live(entry, model_types) -> int:
    return updated


+def _repair_stale_model_paths(config_mgr) -> int:
+    """Point stale ``.gguf`` entries in models.json at the copy in the GGUF cache.
+
+    An entry is stale when its ``path`` / ``model_path`` ends in ``.gguf`` and does
+    not exist on disk. It is rewritten when a file with the same basename is found
+    in the cache directory (top level first, then any subdirectory).
+
+    Args:
+        config_mgr: object exposing ``models_data`` (dict of category -> entries)
+            and ``save_models()``.
+
+    Returns:
+        The number of path fields rewritten. ``save_models()`` is called only
+        when that number is non-zero.
+    """
+    import glob
+    from codai.models.cache import get_model_cache_dir
+    cache = get_model_cache_dir()
+    if not cache or not os.path.isdir(cache):
+        return 0
+    cats = ("text_models", "gguf_models", "vision_models", "image_models",
+            "audio_models", "tts_models", "video_models", "audio_gen_models",
+            "embedding_models", "spatial_models")
+
+    def _resolve(p):
+        """Return the cache path replacing stale ``p``, or None to leave it alone."""
+        if not p or not str(p).endswith(".gguf") or os.path.exists(p):
+            return None
+        base = os.path.basename(p)
+        cand = os.path.join(cache, base)
+        if os.path.exists(cand):
+            return cand
+        # Escape both parts: '[', ']', '*' and '?' in a directory or file name are
+        # literal characters here, not glob wildcards.
+        pattern = os.path.join(glob.escape(cache), "**", glob.escape(base))
+        hits = glob.glob(pattern, recursive=True)
+        return hits[0] if hits else None
+
+    fixed = 0
+    for cat in cats:
+        entries = config_mgr.models_data.get(cat)
+        if not isinstance(entries, list):
+            # Missing, null or malformed category: nothing to repair here, and it
+            # must not abort the repair of the remaining categories.
+            continue
+        for m in entries:
+            if not isinstance(m, dict):
+                continue
+            for key in ("path", "model_path"):
+                new = _resolve(m.get(key))
+                if new:
+                    print(f"  Repaired model path: {m[key]} -> {new}")
+                    m[key] = new
+                    fixed += 1
+    if fixed:
+        config_mgr.save_models()
+    return fixed
+
+
 def main():
    """Main entry point for the codai server."""
    # Suppress unraisable exceptions from LlamaModel.__del__
@@ -309,6 +358,19 @@ def main():
    if config.models.gguf_cache_dir:
        os.environ['CODERAI_CACHE_DIR'] = config.models.gguf_cache_dir

+    # Repair stale .gguf paths in models.json: an entry may point at an HF-hub
+    # snapshot path that no longer exists while the file actually lives in the GGUF
+    # cache (downloads route there). Rewrite the entry to the real file so the config
+    # is correct (the GGUF loader has the same fallback, but this fixes it on disk).
+    # The front runs this before spawning engines, so they read the corrected file;
+    # it is idempotent (only writes when something changed).
+    try:
+        _repaired = _repair_stale_model_paths(config_mgr)
+        if _repaired:
+            print(f"Repaired {_repaired} stale model path(s) in models.json")
+    except Exception as _e:
+        logging.getLogger(__name__).debug("model-path repair skipped: %s", _e)
+
    # Configure generation archive
    _arc_dir = config.archive.directory
    if not _arc_dir:
@@ -424,6 +486,43 @@ def main():
            print(f"Error listing devices: {e}")
        sys.exit(0)

+    # ─── Frontend/engine split ───────────────────────────────────────────────
+    # Default boot: run the always-responsive front proxy on the public port and
+    # let it supervise engine subprocess(es) that do all GPU/model work. This
+    # process becomes the front and returns here — none of the heavy engine init
+    # below runs in it (so its event loop is never blocked by model work).
+    #   --engine-only     → this process IS an engine: bind an internal localhost
+    #                       port and run the full app below (the front spawns these).
+    #   --single-process  → legacy: one process, full app on the public port.
+    _engine_only = getattr(args, "engine_only", False)
+    _single_process = getattr(args, "single_process", False) or config.server.single_process
+    if not _engine_only and not _single_process:
+        from codai.frontproxy import run_front
+        run_front(config, args)
+        return
+    if _engine_only:
+        # Engines bind plain localhost HTTP; the front owns the public host + TLS.
+        # NOTE: don't mutate config.server here — the settings API reads it, and it
+        # must keep reporting the user's CONFIGURED public host/port/https. The
+        # actual bind target is computed separately at serve time.
+        # The front pins this engine's backend (so a Radeon engine uses Vulkan and
+        # an NVIDIA engine uses CUDA) via CODERAI_ENGINE_BACKEND; honour it over the
+        # shared config.backend.type. Device selection is done by the env block the
+        # front set (CUDA_VISIBLE_DEVICES / GGML_VK_VISIBLE_DEVICES / VK_ICD_FILENAMES).
+        _forced_backend = os.environ.get("CODERAI_ENGINE_BACKEND")
+        if _forced_backend:
+            config.backend.type = _forced_backend
+            print(f"[engine] backend forced to '{_forced_backend}' by the front")
+        # The front owns the AISBF broker (always-responsive, one registration for
+        # the whole node, routes to engines). So no engine runs its own broker
+        # client — that would double-register and stall when the engine loads.
+        if config.broker.enabled:
+            config.broker.enabled = False
+            print("[engine] broker disabled (the front manages the broker)")
+        # Note: model→engine assignment is enforced by the FRONT's router (each model
+        # is routed to its single owner engine), not by pruning models.json here —
+        # so the admin model list (served from the primary) stays complete.
+
    # Migrate any GGUF files that ended up in the HF cache to the GGUF cache
    _t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()

@@ -519,6 +618,14 @@ def main():
    set_load_mode(load_mode)
    multi_model_manager.set_load_mode(load_mode)
    multi_model_manager._global_max_instances = config.models.max_model_instances
+    # Per-engine override of the default instances-per-model, set by the front.
+    _mi = os.environ.get("CODERAI_MAX_MODEL_INSTANCES")
+    if _mi:
+        try:
+            multi_model_manager._global_max_instances = int(_mi)
+            config.models.max_model_instances = int(_mi)
+        except ValueError:
+            pass

    print(f"\nLoad mode: {load_mode}")
    if load_mode == "ondemand":
@@ -558,6 +665,38 @@ def main():
    print(f"\n=== Loading Models from Config ===")

    models_config = config_mgr.models_data
+    # In an engine the front assigns a SUBSET of models to this engine; register and
+    # pre-load only those (so e.g. whisper-server doesn't start on every engine).
+    # config_mgr.models_data stays full, so the admin model list — served from the
+    # primary engine — remains complete.
+    _assigned_env = os.environ.get("CODERAI_ENGINE_MODELS")
+    if _assigned_env is not None:
+        try:
+            import json as _json
+            _keep = set(_json.loads(_assigned_env))
+
+            def _route_key(m):
+                """Routing key of a models.json entry: alias, else path, else id."""
+                if isinstance(m, str):
+                    return m
+                if isinstance(m, dict):
+                    return m.get("alias") or m.get("path") or m.get("id")
+                return None
+            _model_cats = ("text_models", "gguf_models", "vision_models", "image_models",
+                           "audio_models", "tts_models", "video_models", "audio_gen_models",
+                           "embedding_models", "spatial_models")
+            # Build the subset aside and swap it in only after every step succeeded,
+            # so the fallback below really does register all models.
+            _filtered = {
+                k: ([m for m in v if _route_key(m) in _keep]
+                    if k in _model_cats and isinstance(v, list) else v)
+                for k, v in config_mgr.models_data.items()
+            }
+            _n = sum(len(_filtered.get(c) or []) for c in _model_cats)
+            # Also restrict /v1/models (list_models) to the assigned subset, so the
+            # per-engine model list matches what it actually serves — config_mgr's
+            # full models_data is untouched (the admin model list stays complete).
+            # Pass `_keep`: the bare name `keep` was never defined, so this call
+            # used to raise NameError and the restriction was silently skipped.
+            multi_model_manager.set_assigned_models(_keep)
+            models_config = _filtered
+            print(f"[engine] registering {_n} model(s) assigned by the front")
+        except Exception as _e:
+            print(f"[engine] assignment filter failed ({_e}); registering all models")

    # Helper to find model config
    def get_model_cfg(model_type, model_id):
@@ -815,6 +954,9 @@ def main():
    global_args.max_ram_gb = config.offload.max_ram_gb
    global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
    global_args.ram_leak_watch = config.offload.ram_leak_watch
+    global_args.ram_watch_poll_seconds = config.offload.ram_watch_poll_seconds
+    global_args.ram_watch_soft_fraction = config.offload.ram_watch_soft_fraction
+    global_args.ram_watch_cuda = config.offload.ram_watch_cuda
    # Thermal protection settings (read live by codai.models.thermal).
    global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
    global_args.thermal_gpu_enabled = config.thermal.gpu_enabled
@@ -822,6 +964,7 @@ def main():
    global_args.thermal_cpu_resume = config.thermal.cpu_resume
    global_args.thermal_gpu_high = config.thermal.gpu_high
    global_args.thermal_gpu_resume = config.thermal.gpu_resume
+    global_args.thermal_gpu_overrides = config.thermal.gpu_overrides
    global_args.thermal_poll_seconds = config.thermal.poll_seconds
    global_args.thermal_soft_throttle_enabled = config.thermal.soft_throttle_enabled
    global_args.thermal_soft_throttle_temp = config.thermal.soft_throttle_temp
@@ -850,6 +993,7 @@ def main():
    global_args.debug_web = getattr(args, 'debug_web', False)
    global_args.debug_thermal = getattr(args, 'debug_thermal', False)
    global_args.debug_lora = getattr(args, 'debug_lora', False)
+    global_args.debug_requests = getattr(args, 'debug_requests', False)
    global_args.dump = global_dump
    global_args.file_path = config.file_path
    global_args.parser = config.parser
@@ -998,6 +1142,15 @@ def main():
    from codai.queue.manager import queue_manager
    queue_manager.max_size = config.server.queue_max_size
    queue_manager.max_parallel_requests = config.server.max_parallel_requests
+    # In an engine the front may override this engine's concurrency (per-engine
+    # limit) via env, so a bigger card runs more in parallel than a smaller one.
+    _mp = os.environ.get("CODERAI_MAX_PARALLEL")
+    if _mp:
+        try:
+            queue_manager.max_parallel_requests = int(_mp)
+            config.server.max_parallel_requests = int(_mp)
+        except ValueError:
+            pass

    # Configure Python logging so broker/API log calls reach the terminal.
    # uvicorn is started with log_config=None to keep our config in place.
@@ -1068,9 +1221,26 @@ def main():

    # Start the server
    import uvicorn
-    print(f"\nStarting server on http://{config.server.host}:{config.server.port}")
-    print(f"API docs: http://{config.server.host}:{config.server.port}/docs")
-    print(f"Admin UI: http://{config.server.host}:{config.server.port}/admin")
+    # The bind target: an engine binds 127.0.0.1:<internal-port> with plain HTTP
+    # (the front owns the public host + TLS); single-process uses the configured
+    # public host/port/https. config.server keeps the CONFIGURED values either way
+    # so the settings API reports them correctly.
+    if getattr(args, 'engine_only', False):
+        bind_host = "127.0.0.1"
+        bind_port = int(getattr(args, "internal_port", None)
+                        or config.server.internal_port_base)
+        bind_https = False
+        # Engines are internal workers behind the front — the public API docs / Admin
+        # UI live on the front, so don't advertise them here (it's just confusing).
+        print(f"[engine] serving on http://{bind_host}:{bind_port} "
+              f"(internal — reach it via the front)")
+    else:
+        bind_host = config.server.host
+        bind_port = config.server.port
+        bind_https = config.server.https
+        print(f"\nStarting server on http://{bind_host}:{bind_port}")
+        print(f"API docs: http://{bind_host}:{bind_port}/docs")
+        print(f"Admin UI: http://{bind_host}:{bind_port}/admin")

    if model_manager.backend is not None:
        actual_backend = model_manager.backend_type
@@ -1080,7 +1250,20 @@ def main():

    _uvi_log_level = "debug" if global_debug else "info"

-    if config.server.https:
+    # An engine only ever receives internal front→engine traffic (localhost-only +
+    # token-gated), so its whole access log is internal chatter. Silence it unless
+    # --debug-engine by handing uvicorn a log config with uvicorn.access at WARNING
+    # — done via the config (not a post-hoc setLevel) because uvicorn re-applies its
+    # logging config on run and would otherwise reset the level back to INFO. When
+    # the config is used we pass log_level=None so uvicorn doesn't re-override it.
+    _uvi_log_config = None
+    if getattr(args, 'engine_only', False) and not getattr(args, 'debug_engine_web', False):
+        import copy as _copy
+        _uvi_log_config = _copy.deepcopy(uvicorn.config.LOGGING_CONFIG)
+        _uvi_log_config["loggers"]["uvicorn.access"]["level"] = "WARNING"
+    _uvi_ll = None if _uvi_log_config is not None else _uvi_log_level
+
+    if bind_https:
        import ssl
        ssl_keyfile = config.server.https_key_path
        ssl_certfile = config.server.https_cert_path
@@ -1102,17 +1285,17 @@ def main():
            except Exception as e:
                print(f"Warning: Could not generate certificate: {e}")
                print("Falling back to HTTP...")
-                uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
-                            log_level=_uvi_log_level, log_config=None)
+                uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
+                            log_level=_uvi_ll, log_config=_uvi_log_config)
                return

        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        ssl_context.load_cert_chain(ssl_certfile, ssl_keyfile)
-        uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
-                    ssl_context=ssl_context, log_level=_uvi_log_level, log_config=None)
+        uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
+                    ssl_context=ssl_context, log_level=_uvi_ll, log_config=_uvi_log_config)
    else:
-        uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
-                    log_level=_uvi_log_level, log_config=None)
+        uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
+                    log_level=_uvi_ll, log_config=_uvi_log_config)


 if __name__ == "__main__":

--- a/codai/models/capabilities.py
+++ b/codai/models/capabilities.py
@@ -21,6 +21,7 @@ from threading import Lock
 from typing import List, Optional
 import json
 import os
+import re
 import time


@@ -179,11 +180,15 @@ def detect_model_capabilities(model_name: str) -> ModelCapabilities:
        return caps

    # ── Image: upscaling (checked before general SD rule to catch SD-family upscalers) ──
-    if any(x in n for x in ['real-esrgan', 'esrgan', 'swinir', 'edsr',
-                              'bsrgan', 'hat-', 'dat-',
+    # 'hat-'/'dat-' are short, ambiguous tokens (e.g. they appear inside
+    # "chat-", "update-"); require a word boundary before them so a text "chat"
+    # model isn't mistaken for the HAT/DAT super-resolution checkpoints.
+    if (any(x in n for x in ['real-esrgan', 'esrgan', 'swinir', 'edsr',
+                              'bsrgan',
                              'x2-upscaler', 'x4-upscaler', 'x2_upscaler', 'x4_upscaler',
                              'latent-upscaler', 'latent_upscaler',
-                              'ldm-super-resolution', 'rcan-', 'sr3-']):
+                              'ldm-super-resolution', 'rcan-', 'sr3-'])
+            or re.search(r'\b[hd]at-', n)):
        caps.image_upscaling = True
        caps.image_to_image = True
        return caps

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -16,7 +16,7 @@

 """Model manager module - contains ModelManager, WhisperServerManager, and MultiModelManager classes."""

-from typing import Optional, Dict, Any, List
+from typing import Optional, Dict, Any, List, Set
 import os
 import random
 import subprocess
@@ -36,6 +36,36 @@ from codai.models.utils import FuzzyToolBreaker
 from codai.pydantic.textrequest import ModelInfo


+def get_active_ds4_config():
+    """Return the ``ds4`` section of the active server config, or None.
+
+    None is returned when the admin config manager or its config is missing, and
+    also when looking it up raises (best-effort lookup, never propagates).
+    """
+    try:
+        from codai.admin.routes import config_manager
+        server_cfg = None if config_manager is None else config_manager.config
+        return None if server_cfg is None else server_cfg.ds4
+    except Exception:
+        return None
+
+
+def ds4_should_handle(model_name: str) -> bool:
+    """Tell whether ``model_name`` should be served by ds4-server.
+
+    False when the name is empty, when no ds4 config is available, or when ds4 is
+    not enabled. Otherwise True if the name (or its last ``/``-separated part)
+    equals the configured ``model_id`` case-insensitively, or if the name
+    contains ``deepseek-v4`` — so the stock alias works without extra config.
+    """
+    if not model_name:
+        return False
+    cfg = get_active_ds4_config()
+    if cfg is None or not getattr(cfg, "enabled", False):
+        return False
+
+    lowered = model_name.lower()
+    configured_id = (getattr(cfg, "model_id", "") or "").lower()
+    if configured_id and configured_id in (lowered, lowered.split("/")[-1]):
+        return True
+    return "deepseek-v4" in lowered
+
+
 def _trim_cpu_ram() -> None:
    """Return freed CPU heap memory to the OS (and let the kernel reclaim swap).

@@ -128,6 +158,17 @@ class ModelManager:
        
    def load_model(self, model_name: str, backend_type: str = "auto", **kwargs):
        """Load the model with the specified backend."""
+        # DeepSeek V4 via ds4: when enabled, route matching models to the managed
+        # ds4-server proxy instead of the in-process nvidia/vulkan backends.
+        if ds4_should_handle(model_name):
+            from codai.backends.ds4 import Ds4Backend
+            print(f"Routing '{model_name}' to ds4 (DeepSeek V4) backend")
+            self.backend_type = "ds4"
+            self.backend = Ds4Backend(get_active_ds4_config())
+            self.backend.load_model(model_name, **kwargs)
+            self.tool_parser = ModelParserAdapter(model_name=model_name)
+            return
+
        available = detect_available_backends()

        # Check if model is a GGUF file
@@ -543,6 +584,11 @@ class MultiModelManager:
        self.embedding_models: List[str] = []   # text / multimodal embeddings
        self.spatial_models: List[str] = []     # depth estimation, segmentation, object detection
        self.config: Dict[str, Dict] = {}  # Store model configurations
+        # In the front/engine split, the front assigns a subset of models.json to
+        # this engine. When set, list_models() reports only these (so /v1/models per
+        # engine reflects what it actually serves); None = report all (single-process).
+        self._assigned_model_keys: Optional[set] = None
+        self.model_registered_types: Dict[str, Set[str]] = {}
        self.tool_parser = ModelParserAdapter()
        self.current_model_key: Optional[str] = None
        self.load_mode: str = "ondemand"
@@ -571,6 +617,7 @@ class MultiModelManager:
        self._pending_new_instance: set = set()  # keys awaiting a second+ instance load
        self._global_max_instances: int = 1  # set from config at startup
        self._measured_vram_gb: Dict[str, float] = {}  # actual measured VRAM delta per model key
+        self._last_load_errors: Dict[str, str] = {}  # model_key -> last failed load message
        # Callbacks that free VRAM held *outside* the model manager (e.g. the
        # LoRA trainer caches its SD/SDXL base model between jobs). Each returns
        # the GB it freed (or None). Invoked as a last resort during eviction.
@@ -736,6 +783,7 @@ class MultiModelManager:
        self.default_model = model_name
        self.config[model_name] = config or {}
        self.model_backend_types[model_name] = backend_type
+        self._remember_registered_type(model_name, "text")

        # Download/cache the model at startup if it's a URL or HF ID
        resolved_model = self.load_model(model_name)
@@ -834,6 +882,7 @@ class MultiModelManager:
                from codai.tasks import loading_task
                with loading_task(self.default_model, model_type="text"):
                    model_manager.load_model(self.default_model, backend_type=backend_type, **kwargs)
+                self._last_load_errors.pop(self.default_model, None)
                self.add_model(self.default_model, model_manager)
                self.record_vram_delta(self.default_model, _snap)
                self.current_model_key = self.default_model
@@ -842,6 +891,7 @@ class MultiModelManager:
                return model_manager
            except Exception as e:
                print(f"Error loading model {self.default_model}: {e}")
+                self._last_load_errors[self.default_model] = str(e)
                self._mark_cuda_poisoned_if_fatal(e)
                self._model_ready_event.set()
                return None
@@ -935,6 +985,7 @@ class MultiModelManager:
                from codai.tasks import loading_task
                with loading_task(model_name, model_type="text"):
                    model_manager.load_model(model_name, backend_type=backend_type, **kwargs)
+                self._last_load_errors.pop(model_name, None)
                self.add_model(model_name, model_manager)
                self.record_vram_delta(model_name, _snap)
                self.current_model_key = model_name
@@ -944,6 +995,7 @@ class MultiModelManager:
                return model_manager
            except Exception as e:
                print(f"Error loading model {model_name}: {e}")
+                self._last_load_errors[model_name] = str(e)
                self._mark_cuda_poisoned_if_fatal(e)
                self._model_ready_event.set()   # signal: ready (even on failure)
                return None
@@ -953,6 +1005,7 @@ class MultiModelManager:
        if model_name not in self.audio_models:
            self.audio_models.append(model_name)
        self.config[f"audio:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "audio")

        if isinstance(config, dict) and config.get("backend") == "whisper-server":
            print(f"Registered whisper-server audio model: {model_name}")
@@ -984,6 +1037,7 @@ class MultiModelManager:
        if model_id not in self.audio_models:
            self.audio_models.append(model_id)
        self.config[f"audio:{model_id}"] = cfg
+        self._remember_registered_type(model_id, "audio")
        # Register alias for round-robin routing
        if alias:
            wsm._alias = alias
@@ -1019,6 +1073,7 @@ class MultiModelManager:
        """Set the text-to-speech model and download/cache it if needed."""
        self.tts_model = model_name
        self.config[f"tts:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "tts")

        # Download/cache the model at startup if it's a URL or HF ID
        resolved_model = self.load_model(model_name)
@@ -1033,6 +1088,7 @@ class MultiModelManager:
        if model_name not in self.image_models:
            self.image_models.append(model_name)
        self.config[f"image:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "image")

        # For image models, we don't download at startup since they may be large
        # and handled by different backends (diffusers vs sd.cpp)
@@ -1044,6 +1100,7 @@ class MultiModelManager:
        if model_name not in self.vision_models:
            self.vision_models.append(model_name)
        self.config[f"vision:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "vision")

        resolved_model = self.load_model(model_name)
        if resolved_model != model_name:
@@ -1057,6 +1114,7 @@ class MultiModelManager:
        if model_name not in self.video_models:
            self.video_models.append(model_name)
        self.config[f"video:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "video")
        print(f"Registered video model: {model_name}")

    def set_audio_gen_model(self, model_name: str, config: Dict = None):
@@ -1064,6 +1122,7 @@ class MultiModelManager:
        if model_name not in self.audio_gen_models:
            self.audio_gen_models.append(model_name)
        self.config[f"audio_gen:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "audio_gen")
        print(f"Registered audio-gen model: {model_name}")

    def set_embedding_model(self, model_name: str, config: Dict = None):
@@ -1071,6 +1130,7 @@ class MultiModelManager:
        if model_name not in self.embedding_models:
            self.embedding_models.append(model_name)
        self.config[f"embedding:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "embedding")
        print(f"Registered embedding model: {model_name}")

    def set_spatial_model(self, model_name: str, config: Dict = None):
@@ -1078,11 +1138,31 @@ class MultiModelManager:
        if model_name not in self.spatial_models:
            self.spatial_models.append(model_name)
        self.config[f"spatial:{model_name}"] = config or {}
+        self._remember_registered_type(model_name, "spatial")
        print(f"Registered spatial model: {model_name}")

    def set_model_alias(self, alias: str, model_name: str):
        """Register an alias for a model."""
        self.model_aliases[alias] = model_name
+        for model_type in self._registered_types_for(model_name):
+            self._remember_registered_type(alias, model_type)
+    
+    def set_assigned_models(self, keys) -> None:
+        """Limit list_models() to the route-keys (alias / path / id) the front
+        assigned to this engine; ``None`` lifts the restriction."""
+        self._assigned_model_keys = None if keys is None else set(keys)
+
+    def _entry_assigned(self, m) -> bool:
+        """Whether models.json entry ``m`` is in this engine's assigned subset."""
+        allowed = self._assigned_model_keys
+        if allowed is None:
+            return True  # no restriction configured
+        if isinstance(m, dict):
+            # Route-key precedence: alias, then path, then id.
+            return (m.get("alias") or m.get("path") or m.get("id")) in allowed
+        if isinstance(m, str):
+            return m in allowed
+        return True  # unknown entry shape: never filtered out

    def get_all_allowed_identifiers(self) -> set:
        """
@@ -1204,6 +1284,12 @@ class MultiModelManager:
            r_short = registered.split("/")[-1] if "/" in registered else registered
            return n_short == r_short

+        requested_type = self._requested_type_from_registered_types(name)
+        if requested_type:
+            return requested_type
+        if self._registered_types_for(name):
+            return None
+
        if self.default_model and _matches(self.default_model):
            return "text"
        for m in self.image_models:
@@ -1231,6 +1317,115 @@ class MultiModelManager:
                return "spatial"
        return None

+    def _remember_registered_type(self, name: str, model_type: str) -> None:
+        """Record ``model_type`` under both the full identifier and its short name."""
+        if not (name and model_type):
+            return
+        for ident in {name, name.rsplit("/", 1)[-1]}:
+            self.model_registered_types.setdefault(ident, set()).add(model_type)
+
+    def _registered_types_for(self, name: str) -> Set[str]:
+        """Collect every type configured for ``name``, matching on the full
+        identifier or on the short (last path segment) name."""
+        if not name:
+            return set()
+        short = name.rsplit("/", 1)[-1]
+        types: Set[str] = set()
+        # One pass covers exact-key hits and short-name hits alike.
+        for key, vals in self.model_registered_types.items():
+            if key == name or key.rsplit("/", 1)[-1] == short:
+                types.update(vals)
+        # Live admin saves update models.json/config_manager immediately, but an
+        # already-running manager may not have had every category re-registered.
+        # Treat the saved config as authoritative so entries with model_types like
+        # text+image don't get rejected just because the image registration won.
+        types.update(self._registered_types_from_config(name))
+        return types
+
+    def _registered_types_from_config(self, name: str) -> Set[str]:
+        """Infer all configured types for a model from config_manager.models_data.
+
+        Scans every known category of the saved models config and collects the
+        types of each entry whose path/id/alias equals ``name`` or shares its
+        short (last path segment) name. String entries take the type of their
+        category; dict entries take ``model_types`` / ``model_type`` (category
+        keys), defaulting to the category they are listed under.
+        Best-effort: returns an empty set when the admin config cannot be
+        imported or is not a mapping; malformed entries are skipped.
+        """
+        # Category key -> model type; also translates an entry's model_type(s).
+        cat_type = {
+            "text_models": "text",
+            "gguf_models": "text",
+            "vision_models": "vision",
+            "image_models": "image",
+            "audio_models": "audio",
+            "tts_models": "tts",
+            "video_models": "video",
+            "audio_gen_models": "audio_gen",
+            "embedding_models": "embedding",
+            "spatial_models": "spatial",
+        }
+        found: Set[str] = set()
+        short = name.split("/")[-1] if "/" in name else name
+        try:
+            from codai.admin.routes import config_manager
+            md = config_manager.models_data if config_manager is not None else {}
+        except Exception:
+            return found
+        if not hasattr(md, "items"):
+            return found  # models_data missing (None) or not a mapping
+        for cat, entries in md.items():
+            default_type = cat_type.get(cat)
+            if not default_type:
+                continue
+            for entry in entries or []:
+                if isinstance(entry, str):
+                    vals = [entry]
+                    entry_types = [default_type]
+                elif isinstance(entry, dict):
+                    vals = [entry.get("path") or entry.get("id") or "", entry.get("alias") or ""]
+                    raw_types = entry.get("model_types") or [entry.get("model_type") or cat]
+                    entry_types = [cat_type.get(t, default_type) for t in raw_types]
+                else:
+                    continue  # unknown entry shape (e.g. None): skip, don't crash
+                for val in vals:
+                    if not val or not isinstance(val, str):
+                        continue
+                    val_short = val.split("/")[-1] if "/" in val else val
+                    if val == name or val_short == short:
+                        found.update(entry_types)
+        return found
+
+    def _requested_type_from_registered_types(self, name: str) -> Optional[str]:
+        """Return the model's sole registered type; None if it has zero or several."""
+        candidates = tuple(self._registered_types_for(name))
+        return candidates[0] if len(candidates) == 1 else None
+
+    def model_supports_type(self, name: str, model_type: Optional[str]) -> bool:
+        """Whether ``name`` may serve ``model_type`` given its registered types.
+        An empty ``model_type`` always passes."""
+        if not model_type:
+            return True
+        known = self._registered_types_for(name)
+        # A "vision" registration is also accepted for "text" requests.
+        return model_type in known or (model_type == "text" and "vision" in known)
+
+    def _config_for_model_key(self, model_key: str) -> Dict[str, Any]:
+        """Look up config for ``model_key``, trying compatible fallback keys.
+
+        An exact non-empty hit wins; a "type:name" key falls back to the bare name;
+        a bare key falls back to the first non-empty "<type>:<key>" entry."""
+        exact = self.config.get(model_key, {})
+        if exact:
+            return exact
+        if ":" in model_key:
+            return self.config.get(model_key.split(":", 1)[1], {})
+        # Bare key: probe the typed registrations in a fixed order.
+        prefixes = ("vision", "image", "audio", "tts", "video", "audio_gen", "embedding", "spatial")
+        hits = (self.config.get(f"{p}:{model_key}", {}) for p in prefixes)
+        return next((cfg for cfg in hits if cfg), {})
+
    def is_allowed_model(self, requested_or_resolved: str, model_type: str = None) -> bool:
        """
        Check if a model name (raw request value *or* resolved name) is one of
@@ -1249,9 +1444,17 @@ class MultiModelManager:
        if not requested_or_resolved:
            return False

+        # ds4-served DeepSeek V4 has no models.json entry; accept it for text when
+        # the ds4 worker is enabled and the name matches.
+        if model_type in (None, "text") and ds4_should_handle(requested_or_resolved):
+            return True
+
        # If a model_type is specified, reject models registered under a
        # different type (e.g. an image GGUF requested via /v1/chat/completions).
        if model_type:
+            registered_types = self._registered_types_for(requested_or_resolved)
+            if registered_types and not self.model_supports_type(requested_or_resolved, model_type):
+                return False
            registered_type = self.get_registered_model_type(requested_or_resolved)
            if registered_type is not None and registered_type != model_type:
                # "vision" models are acceptable for "text" endpoints (multimodal)
@@ -1687,7 +1890,7 @@ class MultiModelManager:
        # the runtime reserve (KV cache / activations / VAE-decode spike) so the
        # value we cache and persist reflects the model's PEAK runtime need — not
        # just its loaded weights — and future eviction frees enough headroom.
-        cfg = self.config.get(model_key, {})
+        cfg = self._config_for_model_key(model_key)
        reserve_gb = self._runtime_reserve_gb(
            cfg if isinstance(cfg, dict) else {}, model_key, delta_gb)
        measured = round(delta_gb + reserve_gb, 3)
@@ -2335,6 +2538,31 @@ class MultiModelManager:
        _load_bpe = self._load_bytes_per_elem(cfg)
        prec_factor = (_load_bpe / _storage_bpe) if _storage_bpe > 0 else 1.0

+        # GGUF files are ALREADY quantized on disk and llama.cpp loads the baked-in
+        # quantization — it ignores load_in_4bit/load_in_8bit entirely. The stored
+        # used_vram_gb and file-size baselines already reflect that quantized
+        # footprint, so applying the 4/8-bit quant multiplier (or a storage→load
+        # precision normalization) on top would 2–3× UNDER-estimate the real
+        # resident size and let the loader try to fit a model that doesn't fit.
+        _gguf_path = str(cfg.get("path") or resolved_name or model_key or "")
+        _is_gguf = (_gguf_path.endswith(".gguf") or "gguf" in _gguf_path.lower()
+                    or cfg.get("model_type") == "gguf_models")
+        if _is_gguf:
+            quant_mult = 1.0
+            prec_factor = 1.0
+            # n_gpu_layers controls how much of a GGUF actually lands in VRAM.
+            # With 0 layers on the GPU the weights live in CPU RAM / are mmap'd
+            # from disk, so the GPU only needs compute/KV buffers — don't reserve
+            # the whole model (which would force needless eviction of other
+            # models on every load attempt). A partial positive count is left
+            # conservative since the total layer count isn't known here.
+            try:
+                _ngl = int(cfg.get("n_gpu_layers")) if cfg.get("n_gpu_layers") is not None else -1
+            except (TypeError, ValueError):
+                _ngl = -1
+            if _ngl == 0:
+                quant_mult = 0.0
+
        def _dbg_est(source: str, value: float) -> float:
            try:
                from codai.api.state import get_global_debug
@@ -2592,7 +2820,41 @@ class MultiModelManager:
        """Resident-set size of the server process TREE, in GB (0.0 on failure).

        Offloaded weights and worker subprocesses count against the global cap, so
-        sum the parent plus all children (mirrors thermal.read_process_tree_cpu)."""
+        sum the root process plus all children (mirrors thermal.read_process_tree_cpu).
+
+        Under the front/engine split the host-RAM cap is SHARED, not split: when the
+        front spawned this engine it set CODERAI_FRONT_PID, so the root is the
+        *front* — every engine then measures the same fleet-wide total (front + all
+        engines + their workers) and enforces the single cap against it. In
+        single-process mode the root is just this process, as before."""
+        try:
+            import os
+            import psutil
+            root_pid = os.environ.get("CODERAI_FRONT_PID")
+            proc = None
+            if root_pid:
+                try:
+                    proc = psutil.Process(int(root_pid))
+                except Exception:
+                    proc = None
+            if proc is None:
+                proc = psutil.Process()
+            total = proc.memory_info().rss
+            for child in proc.children(recursive=True):
+                try:
+                    total += child.memory_info().rss
+                except Exception:
+                    pass
+            return total / 1e9
+        except Exception:
+            return 0.0
+
+    @staticmethod
+    def _get_own_ram_gb() -> float:
+        """RSS of THIS engine's own process tree only (ignores the shared-fleet
+        root), in GB. Used for per-engine *leak* detection so unbounded growth is
+        attributed to the engine that actually has it — unlike the shared cap, which
+        uses the whole fleet (:meth:`_get_process_ram_gb`)."""
        try:
            import psutil
            proc = psutil.Process()
@@ -2978,7 +3240,7 @@ class MultiModelManager:
        # Per-model "load" = pre-loaded (treat as loadall for this model).
        # Per-model "on-request" = load when needed with VRAM management.
        # =====================================================================
-        per_model_cfg = self.config.get(model_key, {})
+        per_model_cfg = self._config_for_model_key(model_key)
        per_model_load_mode = per_model_cfg.get("load_mode")  # "load" | "on-request" | None

        if per_model_load_mode == "on-request":
@@ -3467,6 +3729,10 @@ class MultiModelManager:
                            "spatial_models"):
                    mtype = CAT_TYPE.get(cat, "text")
                    for m in md.get(cat, []):
+                        # Only list models the front assigned to THIS engine (so a
+                        # per-engine /v1/models reflects what it actually serves).
+                        if not self._entry_assigned(m):
+                            continue
                        if isinstance(m, str):
                            mid = m
                        else:
@@ -3550,6 +3816,12 @@ class MultiModelManager:
        for alias in self.model_aliases:
            _add(alias)

+        # --- DeepSeek V4 via ds4 (no models.json entry; surfaced when enabled) ---
+        ds4_cfg = get_active_ds4_config()
+        if ds4_cfg is not None and getattr(ds4_cfg, "enabled", False):
+            mid = getattr(ds4_cfg, "model_id", "deepseek-v4") or "deepseek-v4"
+            _add(mid, "text", {"backend": "ds4"})
+
        return models



--- a/codai/models/parser.py
+++ b/codai/models/parser.py
@@ -937,12 +937,143 @@ class CommandRParser(BaseParser):
        return results


+def _parse_gemma_loose_value(s: str, i: int):
+    """Parse one value from gemma's loose object notation starting at index i.
+    Returns (python_value, next_index). Handles "strings", numbers, true/false/
+    null, nested {objects} and [arrays], and bareword fallbacks. Malformed or
+    truncated input cannot loop: an array ends at the first position where no
+    element can be read (a stray '}' or the end of the text)."""
+    n = len(s)
+    while i < n and s[i] in ' \t\r\n':
+        i += 1
+    if i >= n:
+        return None, i
+    c = s[i]
+    if c == '"':
+        # JSON-style string with escapes.
+        j = i + 1
+        buf = []
+        while j < n:
+            if s[j] == '\\' and j + 1 < n:
+                esc = s[j + 1]
+                buf.append({'n': '\n', 't': '\t', 'r': '\r'}.get(esc, esc))
+                j += 2
+                continue
+            if s[j] == '"':
+                j += 1
+                break
+            buf.append(s[j])
+            j += 1
+        return ''.join(buf), j
+    if c == '{':
+        return _parse_gemma_loose_object(s, i)
+    if c == '[':
+        arr = []
+        j = i + 1
+        while j < n:
+            while j < n and s[j] in ' \t\r\n,':
+                j += 1
+            if j < n and s[j] == ']':
+                j += 1
+                break
+            val, nxt = _parse_gemma_loose_value(s, j)
+            if nxt == j:
+                break  # nothing consumed (stray '}' / end of text): don't spin
+            arr.append(val)
+            j = nxt
+        return arr, j
+    # Bareword / number / bool / null: read until a delimiter.
+    j = i
+    while j < n and s[j] not in ',}]':
+        j += 1
+    tok = s[i:j].strip()
+    low = tok.lower()
+    if low in ('true', 'false'):
+        return low == 'true', j
+    if low in ('null', 'none'):
+        return None, j
+    for cast in (int, float):
+        try:
+            return cast(tok), j
+        except ValueError:
+            pass
+    return tok, j
+
+
+def _parse_gemma_loose_object(s: str, i: int):
+    """Parse a {key:value,…} object (unquoted keys) starting at the '{' at i.
+    Returns (dict, next_index). Truncated input yields the pairs read so far;
+    raises ValueError when s[i] is not '{'."""
+    if s[i:i + 1] != '{':
+        raise ValueError(f"expected '{{' at index {i}")
+    n, obj, j = len(s), {}, i + 1
+    while j < n:
+        while j < n and s[j] in ' \t\r\n,':
+            j += 1
+        if j >= n or s[j] == '}':
+            j = min(j + 1, n)  # step past '}'; on truncated input stay at n
+            break
+        # Read key (bareword or "quoted").
+        if s[j] == '"':
+            key, j = _parse_gemma_loose_value(s, j)
+        else:
+            k = j
+            while j < n and s[j] not in ':}':
+                j += 1
+            key = s[k:j].strip()
+        while j < n and s[j] in ' \t\r\n':
+            j += 1
+        if j < n and s[j] == ':':
+            j += 1
+        val, j = _parse_gemma_loose_value(s, j)
+        if key:
+            obj[key] = val
+    return obj, j
+
+
+def parse_gemma_native_tool_calls(text: str, tool_names=None):
+    """Parse gemma-4's native tool-call format — ``call:NAME{args}`` (optionally
+    wrapped in the ``<|tool_call>…<tool_call|>`` special tokens) — into a list of
+    ``(name, args_dict)``. ``tool_names`` (when given) restricts matches to real
+    tool names so prose containing ``call:`` isn't misread. Exact-duplicate calls
+    are collapsed (a degenerate model loop emits the same call repeatedly), and
+    ``call:`` text inside an already-parsed call's arguments is not re-parsed."""
+    if not text or 'call:' not in text:
+        return []
+    out, seen, consumed = [], set(), 0
+    for m in re.finditer(r'call:\s*([A-Za-z_]\w*)\s*\{', text):
+        name = m.group(1)
+        # Skip matches lying inside an earlier call's argument span.
+        if m.start() < consumed or (tool_names and name not in tool_names):
+            continue
+        try:
+            args, consumed = _parse_gemma_loose_object(text, m.end() - 1)
+        except Exception:
+            continue
+        key = (name, json.dumps(args, sort_keys=True, default=str))
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append((name, args))
+    return out
+
+
 # 7. GEMMA PARSER
 class GemmaParser(BaseParser):
    @validate_tool_output
    def parse(self, text: str) -> List[Dict]:
        results = []

+        # gemma-4 native format: call:NAME{args} (the <|tool_call>…<tool_call|>
+        # markers are stripped by skip_special_tokens during decode). Restrict to
+        # declared tool names when we know them, to avoid matching prose.
+        native = parse_gemma_native_tool_calls(
+            text, set(self.tools.keys()) if self.tools else None)
+        for name, args in native:
+            results.append(self._to_oa(name, args))
+        if results:
+            return results
+
        match = re.search(r'{\s*"name":\s*".*?"\s*,\s*"parameters":\s*\{.*?\}\s*\}', text, re.DOTALL)
        if match:
            try:
@@ -2103,6 +2234,21 @@ class ModelParserAdapter:
        if not text:
            return text
        
+        # gemma-4 native: drop every `call:NAME{…}` span (balanced braces) and the
+        # `thought` channel residue left after skip_special_tokens strips the
+        # <|tool_call>/<|channel> markers.
+        if 'call:' in text:
+            while True:
+                m = re.search(r'call:\s*[A-Za-z_]\w*\s*\{', text)
+                if not m:
+                    break
+                try:
+                    _, end = _parse_gemma_loose_object(text, m.end() - 1)
+                except Exception:
+                    end = m.end()
+                text = text[:m.start()] + text[end:]
+        text = re.sub(r'(?m)^\s*thought\s*$\n?', '', text)
+
        # Custom XML format: <tool><action>...</action><object>...</object><properties>...</properties></tool>
        text = re.sub(r'<tool>\s*<action>.*?</action>\s*<object>.*?</object>\s*<properties>.*?</properties>\s*</tool>', '', text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r'<tool=[^>]+>.*?</tool_call>', '', text, flags=re.DOTALL)

--- a/codai/models/ram_monitor.py
+++ b/codai/models/ram_monitor.py
@@ -67,6 +67,29 @@ def _watch_enabled() -> bool:
        return True


+def _cfg(name: str, default):
+    """Live ``global_args.<name>`` knob; ``default`` if unset, None or unreadable."""
+    try:
+        from codai.api.state import get_global_args
+        args = get_global_args()
+        override = None if not args else getattr(args, name, None)
+    except Exception:
+        return default
+    return default if override is None else override
+
+
+def _poll_seconds() -> float:  # sleep between RAM-watch polls; ram_watch_poll_seconds overrides _POLL_SECONDS
+    return float(_cfg("ram_watch_poll_seconds", _POLL_SECONDS))
+
+
+def _soft_fraction() -> float:  # fraction of the cap used as the soft threshold; ram_watch_soft_fraction overrides
+    return float(_cfg("ram_watch_soft_fraction", _SOFT_FRACTION))
+
+
+def _cuda_mitigation_enabled() -> bool:  # ram_watch_cuda knob, on by default; gates the CUDA step in _mitigate
+    return bool(_cfg("ram_watch_cuda", True))
+
+
 def _scheduler_idle() -> bool:
    """True when no request is being served (so RSS growth isn't a live job)."""
    try:
@@ -76,7 +99,22 @@ def _scheduler_idle() -> bool:
        return True


+def _load_in_progress() -> bool:
+    """Report whether a model load/switch is under way (manager not yet ready).
+
+    Loading streams multi-GB of weights into host RAM for many seconds, so RSS
+    rises steadily and would trip the leak heuristic. No *request* lease is active
+    during a load, so the idle check misses it; the manager's ready event is used."""
+    try:
+        from codai.models.manager import multi_model_manager as mgr
+        ready = mgr._model_ready_event.is_set()
+    except Exception:
+        return False
+    return not ready
+
+
 def _process_ram_gb() -> float:
+    """Whole-fleet RSS (front + all engines) — what the SHARED cap is enforced on."""
    try:
        from codai.models.manager import multi_model_manager
        return multi_model_manager._get_process_ram_gb()
@@ -88,13 +126,29 @@ def _process_ram_gb() -> float:
            return 0.0


-def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
+def _own_ram_gb() -> float:
+    """RSS of this engine's OWN process tree — the series *leak* detection trends
+    on, so a leak is pinned on (and mitigated by) the engine that has it rather
+    than every engine watching the shared total rise. Falls back to the fleet RSS."""
+    try:
+        from codai.models.manager import multi_model_manager as mgr
+        return mgr._get_own_ram_gb()
+    except Exception:
+        return _process_ram_gb()
+
+
+def _mitigate(rss_gb: float, cap_gb: float, leak: bool, loading: bool = False) -> str:
    """Run the mitigation ladder; return a short description of what was done."""
    import gc
    actions = []
    for _ in range(3):
        gc.collect()
    actions.append("gc")
+    # Skip CUDA empty_cache while a load is in flight (accelerate is actively
+    # allocating on the GPU from the main thread, and calling into the CUDA
+    # allocator from this background thread mid-load is needless interference),
+    # or when the operator has disabled CUDA mitigation via ram_watch_cuda.
+    if not loading and _cuda_mitigation_enabled():
        try:
            import torch
            if torch.cuda.is_available():
@@ -120,7 +174,7 @@ def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
    # Still over and eviction is enabled → unload idle LRU models.
    try:
        from codai.models.manager import multi_model_manager as _mm
-        if (_mm._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
+        if (_mm._get_process_ram_gb() > cap_gb * _soft_fraction()
                and _mm._evict_idle_on_ram_enabled()):
            _mm._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
            actions.append("evict_idle")
@@ -134,18 +188,22 @@ def _loop():
    global _recent
    while True:
        try:
-            time.sleep(_POLL_SECONDS)
+            time.sleep(_poll_seconds())
            if not _watch_enabled():
                continue
            cap = _cap_gb()
-            rss = _process_ram_gb()
-            idle = _scheduler_idle()
-
-            # Leak heuristic: only trust growth measured while idle (a live job
-            # legitimately inflates RSS). Keep a short rolling window of idle samples.
+            rss = _process_ram_gb()    # whole fleet — the shared cap is enforced on this
+            own = _own_ram_gb()        # this engine only — leak is trended on this
+            loading = _load_in_progress()
+            idle = _scheduler_idle() and not loading
+
+            # Leak heuristic: only trust growth measured while THIS engine is idle (a
+            # live job — or a model load streaming weights into RAM — legitimately
+            # inflates RSS). Trend OWN RSS so a sibling engine's job/load can't look
+            # like a leak here. Keep a short rolling window of idle samples.
            leak = False
            if idle:
-                _recent.append(rss)
+                _recent.append(own)
                _recent = _recent[-(_LEAK_SAMPLES + 1):]
                if len(_recent) > _LEAK_SAMPLES:
                    rising = all(
@@ -154,18 +212,20 @@ def _loop():
                    )
                    leak = rising
            else:
-                _recent = []  # reset trend while a job runs
+                _recent = []  # reset trend while a job runs or a model loads

            with _state_lock:
                _state["rss_gb"] = round(rss, 2)
+                _state["own_rss_gb"] = round(own, 2)
                _state["cap_gb"] = cap
                _state["percent"] = round(100.0 * rss / cap, 1) if cap else None
                _state["leak_suspected"] = leak
                _state["samples"] += 1

-            # Engage the ladder when over the soft threshold or a leak is suspected.
-            if cap and (rss >= cap * _SOFT_FRACTION or leak):
-                desc = _mitigate(rss, cap, leak)
+            # Engage the ladder when the FLEET is over the soft threshold, or THIS
+            # engine is leaking (mitigation acts locally + evicts idle as needed).
+            if cap and (rss >= cap * _soft_fraction() or leak):
+                desc = _mitigate(rss, cap, leak, loading)
                new_rss = _process_ram_gb()
                _log.warning(
                    "RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",

--- a/codai/models/thermal.py
+++ b/codai/models/thermal.py
@@ -144,7 +144,20 @@ def _run(cmd, timeout=4.0) -> Optional[str]:


 def _read_gpu_temp_uncached() -> Optional[float]:
-    """Hottest GPU temperature in °C, or None if unreadable."""
+    """Hottest GPU temperature in °C across ALL installed cards, or None.
+
+    Spans every vendor (NVIDIA via nvidia-smi, AMD via sysfs) and is scoped to the
+    cards THIS engine owns (``CODERAI_ENGINE_GPUS``) — so a hot GPU pauses only the
+    engine using it, while a hot CPU (read globally) pauses everything. In
+    single-process mode it covers all cards. Falls back to the per-vendor probes
+    below if the unified reader fails."""
+    try:
+        from codai.frontproxy.gpu_detect import engine_gpu_stats
+        temps = [c["temp"] for c in engine_gpu_stats() if c.get("temp") is not None]
+        if temps:
+            return max(temps)
+    except Exception:
+        pass
    # NVIDIA — the inference GPU on CUDA backends.
    if _NVIDIA_SMI:
        out = _run([
@@ -282,7 +295,14 @@ _gpu_util_cache: Tuple[float, Optional[float]] = (0.0, None)


 def _read_gpu_util_uncached() -> Optional[float]:
-    """Hottest GPU utilization in %, or None if unreadable."""
+    """Busiest GPU utilization in % across ALL installed cards, or None."""
+    try:
+        from codai.frontproxy.gpu_detect import engine_gpu_stats
+        utils = [c["util"] for c in engine_gpu_stats() if c.get("util") is not None]
+        if utils:
+            return max(utils)
+    except Exception:
+        pass
    if _NVIDIA_SMI:
        out = _run([
            _NVIDIA_SMI,
@@ -441,6 +461,7 @@ class ThermalSettings:
    __slots__ = (
        "cpu_enabled", "gpu_enabled",
        "cpu_high", "cpu_resume", "gpu_high", "gpu_resume",
+        "gpu_overrides",
        "poll_seconds",
        "soft_enabled", "soft_temp", "soft_max_sleep",
    )
@@ -449,18 +470,28 @@ class ThermalSettings:
                 cpu_high=90.0, cpu_resume=87.0,
                 gpu_high=90.0, gpu_resume=87.0,
                 poll_seconds=5.0,
-                 soft_enabled=False, soft_temp=80.0, soft_max_sleep=3.0):
+                 soft_enabled=False, soft_temp=80.0, soft_max_sleep=3.0,
+                 gpu_overrides=None):
        self.cpu_enabled = bool(cpu_enabled)
        self.gpu_enabled = bool(gpu_enabled)
        self.cpu_high = float(cpu_high)
        self.cpu_resume = float(cpu_resume)
        self.gpu_high = float(gpu_high)
        self.gpu_resume = float(gpu_resume)
+        self.gpu_overrides = dict(gpu_overrides or {})
        self.poll_seconds = max(1.0, float(poll_seconds))
        self.soft_enabled = bool(soft_enabled)
        self.soft_temp = float(soft_temp)
        self.soft_max_sleep = max(0.0, float(soft_max_sleep))

+    def gpu_thresholds(self, vendor):
+        """Return the (high, resume) limits for ``vendor``; a per-vendor override wins."""
+        override = (self.gpu_overrides or {}).get((vendor or "").lower())
+        if not isinstance(override, dict):
+            return self.gpu_high, self.gpu_resume
+        limits = (("high", self.gpu_high), ("resume", self.gpu_resume))
+        return tuple(float(override.get(k, fallback)) for k, fallback in limits)
+

 def _settings_from_global_args() -> ThermalSettings:
    """Build settings from the live global_args, falling back to defaults."""
@@ -479,6 +510,7 @@ def _settings_from_global_args() -> ThermalSettings:
        cpu_resume=g("thermal_cpu_resume", 87.0),
        gpu_high=g("thermal_gpu_high", 90.0),
        gpu_resume=g("thermal_gpu_resume", 87.0),
+        gpu_overrides=g("thermal_gpu_overrides", None),
        poll_seconds=g("thermal_poll_seconds", 5.0),
        soft_enabled=g("thermal_soft_throttle_enabled", False),
        soft_temp=g("thermal_soft_throttle_temp", 80.0),
@@ -524,6 +556,39 @@ def checkpoint(context: str = "", throttle_seconds: float = 0.0) -> None:
    wait_until_safe(context=context)


+def gpu_eval(settings: ThermalSettings):
+    """Per-card GPU thermal check, scoped to THIS engine's cards.
+
+    Returns ``(over_high, over_resume, worst)``: whether any card is ``>=`` its high,
+    whether any card is ``>`` its resume, and ``worst`` = ``{name,temp,high,resume,vendor}``
+    for the card with the largest ``temp - high`` margin (always ranked against high, even
+    when no card is over it), or ``None`` if no card temp is readable or stats fail.
+    Thresholds come from ``settings.gpu_thresholds(vendor)``, i.e. per-vendor overrides."""
+    try:
+        from codai.frontproxy.gpu_detect import engine_gpu_stats
+        cards = engine_gpu_stats()
+    except Exception:  # best-effort: import/probe failure is treated as "no cards"
+        cards = []
+    over_high = over_resume = False
+    worst = None
+    worst_margin = None
+    for c in cards:
+        t = c.get("temp")
+        if t is None:
+            continue
+        high, resume = settings.gpu_thresholds(c.get("vendor"))
+        if t >= high:  # pause threshold is inclusive
+            over_high = True
+        if t > resume:  # resume threshold is strict: at/below resume counts as cooled
+            over_resume = True
+        margin = t - high
+        if worst is None or margin > worst_margin:  # strict '>' keeps the first card on ties
+            worst_margin = margin
+            worst = {"name": c.get("name"), "temp": t, "high": high,
+                     "resume": resume, "vendor": c.get("vendor")}
+    return over_high, over_resume, worst
+
+
 def wait_until_safe(settings: Optional[ThermalSettings] = None,
                    debug: bool = False,
                    context: str = "") -> None:
@@ -543,19 +608,23 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
    desc0 = f" [{context}]" if context else ""

    # Read current temps once (cached) and log the full picture in debug mode.
-    gpu_t = read_gpu_temp() if settings.gpu_enabled else None
+    # GPU is evaluated per-card (each card vs its own vendor threshold); gpu_t is
+    # the worst offender's temperature, used for messaging/soft-throttle/debug.
+    gpu_over, gpu_over_resume, gpu_worst = (
+        gpu_eval(settings) if settings.gpu_enabled else (False, False, None))
+    gpu_t = gpu_worst["temp"] if gpu_worst else None
    cpu_t = read_cpu_temp() if settings.cpu_enabled else None
    _dbg(
        f"check{desc0}: "
        f"GPU {_fmt(gpu_t)} (enabled={settings.gpu_enabled}, "
-        f"pause>={settings.gpu_high:.0f} resume<={settings.gpu_resume:.0f}) | "
+        f"over_high={gpu_over} over_resume={gpu_over_resume}) | "
        f"CPU {_fmt(cpu_t)} (enabled={settings.cpu_enabled}, "
        f"pause>={settings.cpu_high:.0f} resume<={settings.cpu_resume:.0f})"
    )

    hot = []
-    if settings.gpu_enabled and gpu_t is not None and gpu_t >= settings.gpu_high:
-        hot.append(("GPU", gpu_t, settings.gpu_resume))
+    if settings.gpu_enabled and gpu_over:
+        hot.append(("GPU", gpu_worst["temp"], gpu_worst["resume"]))
    if settings.cpu_enabled and cpu_t is not None and cpu_t >= settings.cpu_high:
        hot.append(("CPU", cpu_t, settings.cpu_resume))

@@ -567,7 +636,7 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
    # the resume line and a cooldown is already in progress.
    joined = False
    if not hot and _cooldown_active():
-        if (settings.gpu_enabled and gpu_t is not None and gpu_t > settings.gpu_resume) or \
+        if (settings.gpu_enabled and gpu_over_resume) or \
           (settings.cpu_enabled and cpu_t is not None and cpu_t > settings.cpu_resume):
            joined = True
    if not hot and not joined:
@@ -588,11 +657,12 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
    # Enter cooldown: wait until *every* triggered sensor is at/below resume.
    desc = f" ({context})" if context else ""
    if hot:
-        trig = ", ".join(f"{lbl} {t:.0f}°C>={settings.gpu_high if lbl=='GPU' else settings.cpu_high:.0f}°C"
-                         for lbl, t, _ in hot)
-        print(f"[thermal] Hardware too hot{desc}: {trig} — pausing requests "
-              f"until cooldown (GPU<={settings.gpu_resume:.0f}°C / "
-              f"CPU<={settings.cpu_resume:.0f}°C)")
+        # Each triggered sensor carries its own resume threshold (per-card for GPU).
+        trig = ", ".join(f"{lbl} {t:.0f}°C (resume<={r:.0f}°C)" for lbl, t, r in hot)
+        gpu_note = (f" [{gpu_worst['name']}]" if gpu_worst and any(h[0] == 'GPU' for h in hot)
+                    else "")
+        print(f"[thermal] Hardware too hot{desc}: {trig}{gpu_note} — pausing requests "
+              f"until cooldown")
    else:
        # Joined an already-active cooldown started by another parallel worker.
        print(f"[thermal] Joining active cooldown{desc} — another generation is "
@@ -605,13 +675,15 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
            # Re-evaluate against resume thresholds (lower than trigger → hysteresis).
            # CPU temps are noisy, so average a few samples for the resume decision
            # (the pause check above stays single-read to react fast to spikes).
-            gt = read_gpu_temp() if settings.gpu_enabled else None
+            _, gpu_still, gpu_w2 = (gpu_eval(settings) if settings.gpu_enabled
+                                    else (False, False, None))
            ct = read_cpu_temp_avg() if settings.cpu_enabled else None
            still = []
-            if gt is not None and gt > settings.gpu_resume:
-                still.append(("GPU", gt, settings.gpu_resume))
+            if settings.gpu_enabled and gpu_still:
+                still.append(("GPU", gpu_w2["temp"], gpu_w2["resume"]))
            if ct is not None and ct > settings.cpu_resume:
                still.append(("CPU", ct, settings.cpu_resume))
+            gt = gpu_w2["temp"] if gpu_w2 else None
            _dbg(f"cooldown{desc} {int(waited)}s: GPU {_fmt(gt)} CPU {_fmt(ct)} (avg-3) "
                 f"(still hot: {[s[0] for s in still] or 'none'})")
            if not still:

--- a/commands
+++ b/commands
+python tools/video_editor.py --no-browser --host 0.0.0.0 --media-dir tools/coderai_media --session
+tools/gen_township_fighters.py -c township_output/township_config.json
+
--- a/docs/deepseek-ds4.md
+++ b/docs/deepseek-ds4.md
+# DeepSeek V4 via ds4
+
+CoderAI can serve **DeepSeek V4** (Flash / PRO) through antirez's
+[ds4 / DwarfStar](https://github.com/antirez/ds4) — a native (C/CUDA/Metal)
+inference engine built specifically for DeepSeek V4 that ships its own
+OpenAI-compatible HTTP server (`ds4-server`).
+
+Because ds4 is a standalone binary (not a Python package), coderai owns its whole
+lifecycle as an *external worker* — the same pattern used for Parler-TTS
+(`codai/api/parler_worker.py`). When enabled, coderai builds ds4, downloads the
+model weights, launches `ds4-server` as a managed subprocess, and proxies text
+requests to it. Everything else in coderai (tool parsing, streaming, the chat UI)
+keeps working unchanged.
+
+> **Hardware:** DeepSeek V4 is large. Per upstream you want **96 GB+ RAM**
+> (256 GB+ for the Q4 variant, 512 GB for PRO). First use also clones the repo,
+> compiles a native binary, and downloads several GB of weights — it is slow.
+
+## Enabling
+
+Admin → **Settings → DeepSeek V4 (ds4)**:
+
+- **Enable ds4** — turn the integration on.
+- **Model id / alias** (default `deepseek-v4`) — any chat request whose model name
+  equals this id, or contains `deepseek-v4` (case-insensitive), is routed to ds4
+  instead of the normal NVIDIA/Vulkan backends. All other models are unaffected.
+- **Weight variant** — passed to ds4's `download_model.sh`
+  (`q2-imatrix`, `q2-q4-imatrix`, `q4-imatrix`, `pro-q2-imatrix`).
+- **Build target** — `auto` detects CUDA (`cuda-generic`) / macOS (`metal`) /
+  `cpu`; override for DGX Spark (`cuda-spark`).
+- **Install dir** — where ds4 is cloned/built (default `~/.coderai/ds4`, or
+  `$CODERAI_DS4_DIR`).
+- **Auto build** — clone + `make` the `ds4-server` binary if it's missing.
+- **Bind host / Port / Context** — `ds4-server --host/--port/--ctx`
+  (port `0` auto-picks a free port).
+- **Extra args** — passed verbatim to `ds4-server`, e.g.
+  `--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192`.
+
+Then send a normal request:
+
+```sh
+curl localhost:8776/v1/chat/completions -H 'Content-Type: application/json' -d '{
+  "model": "deepseek-v4",
+  "messages": [{"role":"user","content":"Hello"}]
+}'
+```
+
+The first such request triggers build → download → serve (with generous timeouts);
+build and download logs are streamed with a `[ds4]` prefix. The subprocess is torn
+down by the model manager's normal eviction and on server shutdown.
+
+## Building ahead of time / packaging
+
+Runtime auto-build works, but for reproducible installs (and Docker) you can build
+ds4 during setup:
+
+```sh
+./build.sh all --ds4        # clones + builds ds4-server into ~/.coderai/ds4
+```
+
+The OCI image builder (`packaging/linux/build_oci_image.sh`) auto-discovers and
+bundles the prebuilt `ds4-server` binary (and its shared libraries) the same way it
+bundles `whisper-server`. Model **weights are not bundled** — they are downloaded
+on first use inside the container. If only the binary is shipped (no repo scripts),
+coderai shallow-clones the repo at first use to obtain `download_model.sh`.
+
+## Implementation
+
+- `codai/config.py` — `Ds4Config`.
+- `codai/api/ds4_worker.py` — clone/build, weight download, `ds4-server` lifecycle.
+- `codai/backends/ds4.py` — `Ds4Backend`, an OpenAI-API proxy implementing the
+  `ModelBackend` interface.
+- `codai/models/manager.py` — `ds4_should_handle()` routes matching models to
+  `Ds4Backend`; `is_allowed_model()` accepts the ds4 model id.
--- a/docs/expressive-tts.md
+++ b/docs/expressive-tts.md
+# Expressive TTS (emotion / delivery)
+
+The video editor shows **Emotion** and **Delivery** dropdowns whenever the
+configured TTS model advertises them (`codai/api/tts_backends.py`:
+`family_emotions` / `family_styles`). Two engines support expressive control.
+
+## Bark — in-stack, no extra deps
+
+Works with the server's current `transformers`. Configure a Bark model as the
+TTS model, e.g. `--tts-model suno/bark` (or `suno/bark-small`).
+
+- **Delivery**: `normal`, `whispering` (`[whispers] …`), `singing` (`♪ … ♪`),
+  `emphasis` (UPPERCASE).
+- **Emotion**: inserts a matching non-verbal cue — `laughter`→`[laughs]`,
+  `sigh`→`[sighs]`, `gasp`→`[gasps]`.
+- **Voice**: a Bark preset like `v2/en_speaker_6`. The editor's Kokoro voice ids
+  don't apply and fall back to the default preset (set `voice_preset` in the
+  model config to change it). Speed isn't controllable in Bark.
+
+## Parler — fully managed by coderai (no setup)
+
+`parler-tts` pins an old `transformers`/`tokenizers`/`huggingface-hub` that
+**conflict with this server** — never `pip install` it into the coderai venv.
+coderai handles this for you: just use a Parler model as the TTS model
+(e.g. `parler-tts/parler-tts-mini-multilingual`). The worker is launched lazily —
+only when a request for that model actually arrives — and shut down when the
+model is evicted, exactly like loading/unloading any other model. On first use it:
+
+1. creates a dedicated venv at `~/.coderai/parler_venv`
+   (override with `CODERAI_PARLER_VENV`), built with `--system-site-packages` so the
+   base torch/numpy are reused and only the conflicting packages land in it;
+2. `pip install`s parler-tts there;
+3. launches `tools/parler_tts_service.py` in that venv on a local port, pointing
+   `HF_HUB_CACHE` at coderai's own cache and forcing **offline mode**
+   (`HF_HUB_OFFLINE=1`) so it loads strictly the model you **already downloaded
+   via the model interface** — the worker never downloads anything itself;
+4. health-checks it and routes synthesis to it.
+
+The worker is owned by `codai/api/parler_worker.py`; the backend's `cleanup()`
+calls `stop_service()`, so the model manager's normal eviction tears the process
+down. The first request blocks while the venv builds, then it's cached.
+
+If the model isn't in coderai's cache, the worker fails fast with a clear error
+("download '<model>' from the model interface first") instead of fetching it.
+Download the Parler model through the normal HF download UI first.
+
+The editor's **Emotion**/**Delivery** dropdowns drive it: coderai POSTs
+`{text, voice, speed, emotion, style}` to the worker, which maps them into a
+natural-language delivery description (whisper / shout / monotone / expressive +
+emotion + pace). A fixed `description` in the model config overrides the
+auto-built one. An explicit `service_url` in the config bypasses management and
+talks to an externally-run service instead.
+
+> The model must still be in the server's allowed-models registry to be
+> selectable — that's the only configuration; the worker itself needs none.
--- a/docs/frontend-engine-split.md
+++ b/docs/frontend-engine-split.md
+# Frontend/engine split (responsive UI + multi-engine)
+
+CoderAI boots as two layers so heavy model work never freezes the web interface:
+
+- **front** — a thin reverse proxy on the public host/port. It imports no
+  torch/transformers/diffusers, so its event loop is always free. It streams
+  requests/responses (including SSE) to the engines and serves an aggregated,
+  cached status/tasks view.
+- **engine(s)** — the real CoderAI app (the current server), bound to internal
+  localhost ports, doing all GPU/model work. One engine per GPU by default; each is
+  pinned with `CUDA_VISIBLE_DEVICES` so inside it the GPU is always `cuda:0` and the
+  existing per-process VRAM/eviction logic is unchanged.
+
+```
+client ─HTTP/SSE─▶ front (public) ─┬─ engine#0  (CUDA_VISIBLE_DEVICES=0, :8780)
+                   • no torch       ├─ engine#1  (CUDA_VISIBLE_DEVICES=1, :8781)
+                   • always live    └─ …
+```
+
+See `docs/process-isolation-plans.md` for the design rationale (this is Plan B +
+multi-engine).
+
+## Modes
+
+| Launch | Result |
+|---|---|
+| `coderai` (default) | Front on the public port; auto-spawns one engine per GPU |
+| `coderai --single-process` | Legacy: one process, full app on the public port |
+| `coderai --engine-only --internal-port N` | One engine on `127.0.0.1:N` (the front launches these for you) |
+
+`--engine-only` is not meant to be run by hand; the front's supervisor manages it.
+
+## Config (`config.json` → `server`)
+
+| Key | Default | Meaning |
+|---|---|---|
+| `single_process` | `false` | Force legacy one-process mode |
+| `internal_port_base` | `8780` | First engine's internal port (+1 per extra engine) |
+| `engines` | `0` | Number of engines; `0` = auto (one per GPU, min 1) |
+| `engine_gpus` | `null` | Explicit GPU indices, e.g. `[0, 1]`; `null` = auto-detect (NVIDIA) |
+| `engine_specs` | `null` | Explicit heterogeneous engines (see below). Overrides `engines`/`engine_gpus` |
+| `proxy_status_timeout` | `2.0` | Short timeout (s) for status/UI proxying |
+| `proxy_max_inflight` | `64` | Max concurrent proxied requests through the front |
+
+### Heterogeneous engines (e.g. NVIDIA + Radeon)
+
+Auto-detection only finds NVIDIA cards and assumes one backend, and CUDA vs Vulkan
+device **enumeration is inconsistent** — so for a mixed setup, declare each engine
+with its own backend and env block via `engine_specs`. Each engine is its own
+process: the front applies the env at spawn, forces the backend
+(`CODERAI_ENGINE_BACKEND`), and routes models only to capability-compatible engines.
+
+- **Capabilities** (default from backend): `nvidia` → `["transformers","gguf"]`
+  (CUDA for transformers, GGUF via llama.cpp — which itself may use CUDA or Vulkan);
+  `vulkan` → `["gguf"]`. Override per engine with `"capabilities": [...]`.
+- **Routing:** a transformers/safetensors model goes only to a `transformers`-capable
+  (NVIDIA) engine; a GGUF goes to whichever compatible engine already holds it, else
+  the least-loaded GGUF-capable engine (NVIDIA *or* Radeon).
+
+Example `config.json` → `server.engine_specs` for an NVIDIA (`cuda:0`) + Radeon
+(Vulkan device 1) box, where the NVIDIA engine also serves GGUF via the NVIDIA
+Vulkan ICD:
+
+```json
+"engine_specs": [
+  {
+    "name": "nvidia",
+    "backend": "nvidia",
+    "env": {
+      "CUDA_VISIBLE_DEVICES": "0",
+      "RADEON_VISIBLE_DEVICES": "",
+      "VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json",
+      "GGML_VK_VISIBLE_DEVICES": "0"
+    }
+  },
+  {
+    "name": "radeon",
+    "backend": "vulkan",
+    "env": {
+      "CUDA_VISIBLE_DEVICES": "",
+      "GGML_VK_VISIBLE_DEVICES": "1"
+    }
+  }
+]
+```
+
+The first spec is the **primary** engine (owns admin/auth/config). Empty-string env
+values are honoured (`CUDA_VISIBLE_DEVICES=""` hides all CUDA cards from the Radeon
+engine). `internal_port_base` assigns ports in order (8780, 8781, …).
+
+#### An engine can own several GPUs
+
+"One engine per GPU" is only the auto-detect default. An engine owns whatever its
+`env` exposes, so to run a single large model **across two NVIDIA cards**, give one
+engine both — list both CUDA UUIDs — and the NVIDIA backend shards the model over
+them automatically (`device_map`/accelerate `max_memory` across every visible CUDA
+device; tune per-model with `max_gpu_percent` / `balanced_gpu_percent` / `max_vram`).
+
+Example: 2× NVIDIA (one sharding engine) + 1× Radeon:
+
+```json
+"engine_specs": [
+  {
+    "name": "nvidia-dual",
+    "backend": "nvidia",
+    "env": {
+      "CUDA_VISIBLE_DEVICES": "GPU-<uuidA>,GPU-<uuidB>",
+      "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
+      "VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json",
+      "GGML_VK_VISIBLE_DEVICES": "0"
+    }
+  },
+  {
+    "name": "radeon",
+    "backend": "vulkan",
+    "env": {
+      "CUDA_VISIBLE_DEVICES": "",
+      "VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/radeon_icd.json",
+      "GGML_VK_VISIBLE_DEVICES": "0"
+    }
+  }
+]
+```
+
+Use **GPU UUIDs** (from `nvidia-smi --query-gpu=uuid --format=csv`) rather than
+indices so the assignment survives reboots/reordering. The front reports such an
+engine's VRAM as the **sum across its GPUs** (with a per-device breakdown in
+`/internal/engine-state` and `x_engines`).
+
+## Choosing which card runs a model
+
+When a model is compatible with more than one engine (e.g. a GGUF that runs on both
+the NVIDIA and Radeon engines), the card is chosen by this precedence:
+
+1. **Per-model pin** — set `engine` on the model (Models page → *Engine / card*, or
+   the `"engine"` field in `models.json`) to a declared engine name. Honoured only
+   if that engine can serve the model's format.
+2. **Already resident** — the engine that already has the model loaded (avoids a
+   reload).
+3. **Default engine** — `server.default_engine` (Settings → *Default engine*), used
+   when the model is compatible with several engines.
+4. **Least-loaded** compatible engine.
+
+`default_engine` and the per-model *Engine / card* control only appear in the UI
+when 2+ engines are declared.
+
+**Bad pins are reported, not silently ignored.** Saving a per-model engine (or the
+default engine) that is unknown, or that can't run the model's format (e.g. a
+transformers model pinned to a Vulkan/Radeon engine), returns a warning in the admin
+UI. At request time the front also logs a one-line warning (deduped per
+model+engine) before falling back to a compatible engine.
+
+## Routing
+
+- **Inference** (`POST /v1/...` carrying a `model`) → chosen per the precedence
+  above, restricted to capability-compatible engines. This is what lets one model
+  load on engine A while engine B keeps generating.
+- **Admin / auth / config / UI / status / tasks** → the **primary** engine
+  (engine#0). Sessions and `models.json` writes are per-process today, so pinning
+  these keeps sessions consistent without a shared store.
+- **Status / tasks pollers** use a short timeout with a cached/empty fallback, so a
+  momentarily-blocked engine loop can never hang the dashboard. The front overlays
+  cross-engine VRAM totals (`vram`) and running tasks (tagged with their `engine`).
+
+## Thermal protection
+
+Thermal cooldowns are scoped to match how work is distributed:
+
+- **CPU too hot → everything pauses.** CPU temperature is read globally, and every
+  engine gates on it, so all tasks back off until the CPU cools.
+- **A GPU too hot → only that GPU's engine pauses.** Each engine reads only the
+  cards it owns (the front sets `CODERAI_ENGINE_GPUS` — NVIDIA UUIDs and/or a vendor
+  keyword), so a hot NVIDIA card pauses the NVIDIA engine while the Radeon engine
+  keeps generating, and vice-versa. Each engine is its own process with its own
+  cooldown state, so they're naturally independent.
+
+Granularity is per-engine: if one engine owns several GPUs, a single hot card pauses
+that engine's work on all of its cards (they share one process). In single-process
+mode the GPU check covers all cards.
+
+**Per-card thresholds.** Each card is judged against its own vendor's limit:
+`thermal.gpu_high`/`gpu_resume` are the defaults, and `thermal.gpu_overrides`
+(`{"amd": {"high": 95, "resume": 92}}`) raises/lowers them per vendor — so a Radeon
+can run hotter than an NVIDIA card. Settings → Thermal renders one override row per
+GPU vendor **detected on the machine** (never a hardcoded list).
+
+**Which engine is cooling** is shown on the Tasks page banner (each engine reports
+its cooldown via `/internal/engine-state`; the front names the cooling engine and
+whether it's a GPU or CPU pause).
+
+## Concurrency (per-engine)
+
+Each engine is its own process with its own request queue, so concurrency limits
+apply **per-engine** and total throughput is the sum across engines:
+
+- **Max parallel requests** (`server.max_parallel_requests`) — how many requests an
+  engine runs at once.
+- **Max instances per model** (`models.max_model_instances`) — concurrent copies of
+  one model (needed to run several requests against the *same* model at once).
+
+Both take **per-engine overrides** (`*_overrides`, keyed by engine name, e.g.
+`{"nvidia": 4, "radeon": 1}`) so a bigger card runs more in parallel than a smaller
+one. Settings → Concurrency shows the defaults plus one override row per running
+engine. The front resolves each engine's value and passes it down at spawn
+(`CODERAI_MAX_PARALLEL` / `CODERAI_MAX_MODEL_INSTANCES`).
+
+## Managing engines
+
+The Tasks page shows an **Engines** panel (front mode only) with each engine's
+health, VRAM and loaded-model count, and a **Restart** button — use it to kill an
+engine that's wedged/looping; the supervisor respawns it immediately while the front
+and other engines keep serving. Backed by `GET /admin/api/engines` and
+`POST /admin/api/engines/{id}/restart` on the front (authorized against the primary
+engine's session).
+
+## Shared host-RAM cap
+
+`offload.max_ram_gb` is a single **server-wide** ceiling shared by all engines, not
+split into per-engine slices. The front sets `CODERAI_FRONT_PID` on each engine, so
+every engine measures the same fleet-wide RSS (front + all engines + their workers)
+and enforces the one cap against that total. When the combined usage crosses the
+cap, each engine runs its normal mitigation/eviction (dropping its idle LRU models),
+so whichever engine holds idle models frees them for the shared budget; busy models
+aren't evicted. An idle engine uses ~0 of the budget; a busy one can use most of it.
+
+VRAM is naturally per-card (each engine sees only its own GPUs via
+`CUDA_VISIBLE_DEVICES`), and model eviction on swap is unchanged *within* an engine.
+
+## Broker (runs in the front)
+
+The AISBF broker client runs **in the front**, not in a model engine — it's
+coordination/protocol work, so binding it to a GPU process would stall it whenever
+that engine loads a model. Benefits:
+
+- Never stalls during a model load (the front's loop is always free).
+- One registration for the whole node, regardless of engine count.
+- Advertises **aggregate** hardware: `build_hardware_summary` is torch-free in the
+  front (via `gpu_stats()`), so it reports the total VRAM across *every* card.
+- Brokered requests dispatch through the **same router/proxy** as HTTP — a brokered
+  GGUF request can land on the Radeon engine, a transformers one on NVIDIA.
+
+Engines run no broker client (`main.py` disables it under `--engine-only`); only
+single-process mode keeps the broker in-process. Implementation:
+`FrontProxy.start_broker` / `broker_execute` (`codai/frontproxy/app.py`) +
+`execute_broker_request(..., executor=...)` (`codai/broker/dispatcher.py`).
+
+## Model assignment (one owner per model)
+
+With multiple engines, the front assigns each configured model to exactly **one**
+owner engine and routes accordingly, so a model is never served from two engines:
+
+- **Owner precedence:** per-model `engine` pin → default engine → balanced
+  round-robin across capability-compatible engines.
+- Routing honours the assignment first (`registry.engine_for_assigned`); unassigned
+  / ad-hoc models fall back to capability routing.
+- `/v1/models` (and the broker's model list) is the **union** across engines, deduped
+  — the full catalogue with no duplicates.
+- Engines aren't pruned, so the admin Models page (served from the primary) still
+  shows the complete configuration.
+- **Two configs of one model** can run on different engines if they have distinct
+  aliases (the assignment keys on the routable id: the alias if set, otherwise the
+  path); configs sharing a path with no distinct alias collapse to one owner.
+
+## Security: engines are localhost-only + token-gated
+
+Engines bind **127.0.0.1 only** (forced regardless of the configured host, which is
+the front's public bind), and the front reaches them via `http://127.0.0.1:<port>`.
+On top of that, the front generates a per-run secret, passes it to each engine via
+`CODERAI_INTERNAL_TOKEN`, and stamps every engine request with an
+`X-Coderai-Internal` header; an engine rejects (403) any request lacking it (and the
+front strips client-supplied copies so the token can't be spoofed). So nothing else
+on localhost can talk to an engine and bypass the front's auth/routing. Single-process
+mode sets no token and is unaffected.
+
+## Fault isolation
+
+The supervisor polls each engine's auth-free, localhost-only
+`/internal/engine-state`. If an engine exits (including a CUDA device-side assert),
+it is **respawned**; the front and sibling engines keep serving. The front's own
+`/healthz` reports per-engine readiness.
+
+## Known limitations (follow-ups)
+
+- Admin/config/session state is pinned to the primary engine (not yet replicated —
+  that's "Plan C" in the design doc). Cross-engine **task visibility** works
+  (merged read-only); cross-engine **session sharing** does not — all admin traffic
+  intentionally lands on the primary.
+- Placement is static (model→least-loaded compatible engine); there is no live
+  cross-engine rebalancing/migration yet.
+- Capability routing keys off the model **name** (a `.gguf`/`gguf` name → GGUF, else
+  transformers), matching the engine's own `is_gguf` heuristic. A transformers model
+  whose name happens to contain "gguf" would be mis-routed — rename or declare an
+  alias if that ever bites.
--- a/docs/process-isolation-plans.md
+++ b/docs/process-isolation-plans.md
+# Process-isolation plans: keeping the web UI responsive during model load/inference
+
+> **Status (implemented):** Plan B + Multi-engine shipped. The front proxy lives in
+> `codai/frontproxy/` (`app.py`, `engine_supervisor.py`, `registry.py`, `router.py`),
+> engines run via `coderai --engine-only --internal-port N`, and default boot starts
+> the front + one engine per GPU. Operator guide: `docs/frontend-engine-split.md`.
+> Plan C (replicating session/config/queue ownership into the front) remains a
+> follow-up — **except the broker, which has already moved into the front** (it
+> registers once for the whole node, advertises aggregate VRAM torch-free, and
+> dispatches brokered requests to engines through the router). Sessions/config/queue
+> are the remaining Plan-C pieces.
+
+## Problem statement
+
+While a model loads (and, for some backends, while it generates), the web
+interface and API become unresponsive.
+
+Root cause: the server is a single process. GIL-heavy Python work blocks the
+asyncio event loop that serves the UI/API. Specifically:
+
+- **Transformers text** (`codai/backends/cuda.py`, `NvidiaBackend`) — both the
+  `from_pretrained` **load** and token-by-token `model.generate` hold the GIL.
+  Dispatching them via `asyncio.to_thread` does **not** free the loop, because
+  `to_thread` only helps when the worker releases the GIL.
+- **Diffusers** (image/video/audio, `codai/api/images.py`, `video.py`,
+  `audio_gen.py`) — the `from_pretrained` **load** is GIL-heavy and freezes the
+  UI. The denoise loop itself is mostly torch CUDA ops that *do* release the
+  GIL, so the freeze is almost entirely the load.
+- **Vulkan / GGUF** (`codai/backends/vulkan.py`, llama.cpp) — the native load
+  **releases the GIL**, so this path does *not* freeze the UI. (This is why the
+  existing defensive comments assume "the load releases the GIL during its C
+  call" — true for llama.cpp, false for the transformers/diffusers paths.)
+
+The fix is to ensure the process serving the UI/API is not the process whose GIL
+is held by model work. Three architectures achieve this with very different
+cost/benefit. This document captures all three so we can choose deliberately.
+
+> Note: an unrelated, already-shipped fix lives in `cuda.py` — Gemma-class models
+> whose attention head dimension exceeds FlashAttention-2's limit of 256 now fall
+> back to SDPA (`_model_head_dim`), which fixed "requests silently stop" for those
+> models. That is orthogonal to the process-isolation work below.
+
+---
+
+## Summary comparison
+
+| | A: model worker (out-of-process models) | B: thin resilient proxy | C: full frontend/engine split |
+|---|---|---|---|
+| Process boundary | Python pipeline-call layer | HTTP layer | HTTP layer + state ownership |
+| Serialization burden | **High** (torch generators, callbacks, tensors, PIL) | **Low** (already HTTP) | **Low** (already HTTP) |
+| Engine/model code changes | Large (text clean, diffusers invasive) | **None** (engine ≈ current app) | Moderate (engine becomes pure executor) |
+| Fixes which model types | One modality at a time | **All at once** | **All at once** |
+| New moving parts | Worker harness + per-modality IPC | Reverse proxy + status cache + supervisor | Proxy + relocated coordination state + supervisor |
+| Crash/CUDA-poison isolation | Per-model worker | **Engine restart, front survives** | **Engine restart, front survives** |
+| Effort | Text: medium. Diffusers: very large. | **Small–medium** | Large |
+| Recommended role | Fallback / not preferred | **First cut (do this)** | Eventual evolution of B |
+
+**Recommendation:** ship **B**, evolve toward **C** if/when coordination state
+needs to be authoritative in the front; keep **A** only as a documented
+alternative (it is the worst fit for diffusers).
+
+---
+
+## Shared context (applies to B and C)
+
+- Public surface is **plain HTTP + SSE**. No inbound websockets, no mounted
+  sub-apps (verified). This makes a reverse-proxy split clean.
+- `codai/broker/asgi_bridge.py` already drives the ASGI app from an external
+  transport, so the app is already transport-decoupled in spirit.
+- The front process must import **no** `torch` / `transformers` / `diffusers`,
+  so its GIL is never held by model code and its event loop is always free.
+- VRAM/GPU stats can be read by the front **without torch** via `nvidia-smi`
+  and sysfs/`lspci` (the existing `api_status` already reads sysfs/`lspci` for
+  the non-CUDA path).
+
+---
+
+## Plan A — Out-of-process model worker (models leave the API process)
+
+The original approach: keep the API/UI in the main process, push the GIL-heavy
+model into a child process behind a proxy backend.
+
+### A.1 Generic worker harness
+- `codai/backends/worker_client.py` — parent-side proxy implementing the
+  `ModelBackend` interface; spawns the child, waits on `/health`, forwards calls.
+- `codai/backends/text_worker.py` — child entrypoint
+  (`python -m codai.backends.text_worker --port 0`) running a tiny local uvicorn
+  that instantiates the **real** `NvidiaBackend` and exposes `/load`,
+  `/generate`, `/generate_chat`, `/generate_stream` (SSE), `/generate_chat_stream`
+  (SSE), `/context_size`, `/usage`, `/tokenize`, `/health`, `/shutdown`.
+- Wire into `ModelManager.load_model` (`codai/models/manager.py:158`): when
+  `backend_type == "nvidia"`, instantiate `WorkerTextBackend()` instead of
+  `NvidiaBackend()`, behind a default-on config flag. Instance pools, eviction,
+  VRAM delta accounting (`torch.cuda.mem_get_info` in the parent still sees the
+  child's allocations) are untouched — each instance owns a subprocess.
+
+### A.2 Text worker (clean)
+- I/O is tiny (text / SSE tokens). Streaming maps directly to SSE.
+- `cleanup()` terminates the subprocess → frees VRAM.
+- Bonus: a device-side CUDA assert kills only the child; parent maps the error to
+  the existing `cuda_context_poisoned` logic and respawns.
+
+### A.3 Diffusers worker (very large — the blocker)
+Diffusers cannot be a thin wrapper. Evidence in `codai/api/images.py`/`video.py`:
+- Pipelines are stored **as live objects** in the shared registry
+  (`multi_model_manager.models[model_key] = pipe`) and called inline at ~dozens
+  of sites (txt2img, img2img, inpaint, upscale, depth, segmentation, video
+  modes, audio_gen).
+- Pipelines are **mutated in-process**: `apply_accel_to_pipeline(pipeline, accel)`
+  (`images.py:345`), LoRA application, IP-Adapter wiring, scheduler swaps.
+- `pipe(...)` call args are **not serializable**: `generator` (a
+  `torch.Generator` bound to a device), `callback_on_step_end=_step_cb` (a live
+  closure updating the in-process `_gen_progress`), `embed_kwargs` (prompt
+  embedding tensors), IP-Adapter/character/environment **PIL reference images**
+  (`images.py:877-899`).
+
+Consequence: putting diffusers in a worker means **moving the entire generation
+lifecycle into the worker** (load + accel + LoRA + IP-adapter + the call + output
+extraction) and converting every call site to a **high-level** IPC request
+(prompt, seed, steps, image bytes), serializing every input (PIL/tensors/masks/
+control images), every output (images/frames/audio), and **relaying step
+progress** back over IPC. Large, regression-prone rewrite of the media API.
+
+### A.4 Assessment
+- Text: medium effort, clean win.
+- Diffusers: very large, fragile; payoff limited (denoise releases the GIL).
+- **Not recommended** as the diffusers solution. Superseded by B.
+
+---
+
+## Plan B — Thin resilient reverse proxy (RECOMMENDED FIRST CUT)
+
+Split at the HTTP boundary. The **engine** is the current app, essentially
+unchanged, on an internal port. The **front** is a small async reverse proxy on
+the public port whose event loop never freezes (no torch in its address space).
+
+### B.1 Architecture
+```
+client ──HTTP/SSE──▶  front (public port)  ──HTTP/SSE──▶  engine (internal port, all models)
+                      • no torch                          • current app, unchanged
+                      • always-responsive                 • may freeze on GIL-heavy load
+                      • status cache + timeouts           • does all GPU work
+                      • supervises engine subprocess
+```
+
+### B.2 The one rule that makes it work
+The front must answer **UI / status / admin** without synchronously
+hard-depending on a possibly-frozen engine:
+- UI / status / admin → short timeout on the engine call; on timeout serve a
+  **last-known status cache** plus an "engine busy loading model X" flag.
+- Generation (chat / image / video / SSE) → proxied with **long timeout**. That
+  single request legitimately waits for the load; the rest of the UI stays live.
+
+### B.3 New files
+- `codai/frontproxy/__init__.py`
+- `codai/frontproxy/app.py` — FastAPI app for the front:
+  - Catch-all reverse-proxy route: streams request body (chunked uploads),
+    forwards method/path/query/headers (incl. auth, rewriting `Host`), streams
+    the response back (SSE and large binary), preserves status codes.
+  - Status handler: proxies `/admin/api/status` with a short timeout; caches the
+    last success; on timeout/refusal returns cache + `{ "engine": "loading"|"down" }`.
+  - `/healthz` for the front itself.
+- `codai/frontproxy/engine_supervisor.py` — spawn the engine subprocess
+  (`python -m codai.main --internal-port …`), poll `/healthz` on the engine,
+  restart on crash/exit (this is where CUDA-poison recovery becomes "respawn").
+- HTTP client: `httpx.AsyncClient` with streaming, or `aiohttp`. Separate short-
+  and long-timeout clients.
+
+### B.4 Engine-side changes (minimal)
+- `codai/main.py` / `codai/cli.py`: add `--internal-port` / `--engine-only` so
+  the engine binds to localhost and the front owns the public port. Default boot
+  launches front + engine; a flag preserves the legacy single-process mode.
+- Add a cheap `/healthz` on the engine (no torch, returns immediately) so the
+  supervisor can distinguish "loading" (slow) from "dead".
+
+### B.5 Proxy correctness checklist (the real work)
+- **SSE / streaming**: forward `text/event-stream` without buffering; flush per
+  chunk; propagate client disconnect to cancel the upstream request.
+- **Large uploads**: stream `model-upload` / image inputs (don't buffer whole
+  body in memory).
+- **Large downloads**: stream image/video/audio byte responses.
+- **Auth / headers**: pass `Authorization`, cookies; rewrite `Host`; preserve
+  `Content-Type`, `Content-Length`/chunked, `Content-Disposition`.
+- **Timeouts**: short for status/UI; long (or none) for generation; map engine
+  timeout to cached status, never to a hung front request.
+- **Backpressure / limits**: bound concurrent in-flight proxied requests.
+- **Redirects / error passthrough**: preserve 3xx/4xx/5xx and bodies.
+
+### B.6 Limitations
+- Does not speed up the one in-flight request waiting on a load; keeps the rest
+  of the UI responsive.
+- True concurrency across models needs multiple engines (see "Multi-engine").
+
+### B.7 Effort
+**Small–medium.** Engine code is essentially untouched; the risk is concentrated
+in the proxy, which is testable in isolation.
+
+---
+
+## Plan C — Full frontend/engine split (eventual evolution of B)
+
+Make the front authoritative for all pure-Python coordination state so it never
+needs the engine even for status; the engine becomes a pure executor.
+
+### C.1 What moves to the front
+Relocate non-GPU, pure-Python concerns out of the engine into the front (each is
+serialization-trap-free):
+- **Sessions / auth / API tokens** (`codai/admin` session manager).
+- **Config / models.json management** (the admin "models" CRUD, `config_manager`).
+- **Request queue + metrics** (`codai/queue/manager.py`).
+- **Progress + model-registry view**: the engine pushes events (loaded/unloaded,
+  `_gen_progress` step updates, VRAM deltas) to the front over a control channel;
+  the front holds the authoritative cache and serves status with zero engine
+  dependency.
+
+### C.2 Engine becomes pure executor
+- Exposes only: load/unload, generate (all modalities), health, event stream.
+- No session/config/queue logic; receives resolved requests from the front.
+
+### C.3 Control channel
+- A persistent engine→front event stream (SSE or a small socket) for progress,
+  load state, VRAM, and crash notifications. Front reconciles its cache; on
+  engine restart, front re-syncs.
+
+### C.4 Benefits
+- Status/admin are instant and always correct, even mid-load.
+- Clean seam for **multi-engine** orchestration.
+- Strong fault isolation: engine crash never loses UI/session/queue state.
+
+### C.5 Effort
+**Large**, but every moved piece is plain Python (no pipeline
+serialization). Best approached incrementally on top of a shipped B.
+
+---
+
+## Multi-engine (future, enabled by B/C)
+
+One engine per GPU (or per hot model). The front routes a request to the engine
+that holds the target model (or asks an idle engine to load it). One engine
+loading no longer blocks generation on another engine. Requires:
+- Engine registry in the front (which engine holds which model, health, VRAM).
+- A placement/eviction policy across engines (extends the current per-process
+  VRAM logic to a fleet view).
+
+---
+
+## Decision log / open questions
+
+- Confirm: default boot launches **front + engine** with a flag to retain the
+  legacy single-process mode? (Recommended yes.)
+- Confirm: HTTP client — `httpx` (likely already a dependency) vs `aiohttp`.
+- Confirm: status staleness budget when the engine is mid-load (e.g. serve cache
+  up to N seconds old, then show "engine loading").
+- B → C migration order: sessions/tokens first (low risk), then config, then
+  queue, then progress/registry (needs the control channel).
+
+## Recommended sequencing
+
+1. **B** — front proxy + engine supervisor + status cache. Fixes the freeze for
+   all model types with no engine changes beyond `--internal-port`/`/healthz`.
+2. (Optional, separate) **A.1/A.2** text worker — only if we want per-model fault
+   isolation *within* an engine; otherwise B already solves the UI freeze.
+3. **C** — incrementally move coordination state to the front.
+4. **Multi-engine** — once C's registry exists.
--- a/docs/reverse-proxy-nginx.md
+++ b/docs/reverse-proxy-nginx.md
+# Running CoderAI and the `tools/` web UIs behind nginx
+
+Everything here works behind an nginx (or any) reverse proxy. There are two
+ways to mount each service; pick per service:
+
+* **Subdomain / root location** — the service owns `/` of a `server_name`
+  (e.g. `coderai.example.com`). Works for *every* service with no app changes.
+* **Sub-path** — the service lives under a path (e.g. `example.com/coderai/`).
+  Supported by **CoderAI** and **`tools/video_editor.py`**. The other
+  `tools/` UIs currently need the subdomain/root form (see the table).
+
+| Service                         | Root / subdomain | Sub-path (`/foo/`) |
+|---------------------------------|:----------------:|:------------------:|
+| CoderAI server (`codai`)        | ✅               | ✅                 |
+| `tools/video_editor.py`         | ✅               | ✅                 |
+| `tools/videogen.py`             | ✅               | ⚠️ needs work      |
+| `tools/review_outputs.py`       | ✅               | ⚠️ needs work      |
+| `tools/gen_township_fighters.py`| ✅               | ⚠️ needs work      |
+
+## Headers every proxy block needs
+
+CoderAI builds public URLs (image/video/audio output links, redirects, admin
+links) from these headers via `codai/api/urlutils.py`, and `video_editor.py`
+honours `X-Forwarded-Prefix` for sub-path mounting:
+
+```nginx
+proxy_set_header Host              $host;
+proxy_set_header X-Real-IP         $remote_addr;
+proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
+proxy_set_header X-Forwarded-Proto $scheme;
+proxy_set_header X-Forwarded-Host  $host;
+# Sub-path mounts only — tells the app its public prefix:
+# proxy_set_header X-Forwarded-Prefix /coderai;
+```
+
+Also important for AI workloads:
+
+```nginx
+client_max_body_size 1024m;   # large image/audio/video uploads
+proxy_read_timeout   3600s;   # long generations / renders
+proxy_send_timeout   3600s;
+proxy_buffering      off;     # required for SSE streaming (chat, progress)
+```
+
+## CoderAI — subdomain (root)
+
+```nginx
+server {
+    listen 443 ssl;
+    server_name coderai.example.com;
+    # ssl_certificate ... ; ssl_certificate_key ... ;
+
+    client_max_body_size 1024m;
+
+    location / {
+        proxy_pass http://127.0.0.1:8000;
+        proxy_http_version 1.1;
+        proxy_set_header Host              $host;
+        proxy_set_header X-Real-IP         $remote_addr;
+        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header X-Forwarded-Host  $host;
+        proxy_read_timeout 3600s;
+        proxy_send_timeout 3600s;
+        proxy_buffering off;            # SSE: streamed chat + task progress
+    }
+}
+```
+
+Optionally pin the public URL instead of trusting headers: start CoderAI with
+`--url https://coderai.example.com`.
+
+## CoderAI — sub-path (`https://example.com/coderai/`)
+
+```nginx
+location /coderai/ {
+    proxy_pass http://127.0.0.1:8000/;   # trailing slash strips the prefix
+    proxy_http_version 1.1;
+    proxy_set_header Host              $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_set_header X-Forwarded-Host  $host;
+    proxy_set_header X-Forwarded-Prefix /coderai;   # <-- the key line
+    proxy_read_timeout 3600s;
+    proxy_buffering off;
+}
+```
+
+CoderAI reads `X-Forwarded-Prefix` into the ASGI `root_path`, so `request.url`,
+redirects, `{{ root_path }}` template links, the `ROOT_PATH` JS global, and all
+generated file URLs become `/coderai/...` automatically.
+
+## `tools/video_editor.py`
+
+Start it bound to localhost (default) and proxy to it. It works at root and at
+a sub-path. For a sub-path, set `X-Forwarded-Prefix`; the page injects a
+matching `<base href>` and all its API/media/render URLs are relative, so they
+resolve correctly under any mount. It also strips the prefix server-side, so it
+works whether or not nginx strips it.
+
+```nginx
+# Sub-path: https://example.com/editor/
+location /editor/ {
+    proxy_pass http://127.0.0.1:8420/;
+    proxy_http_version 1.1;
+    proxy_set_header Host              $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_set_header X-Forwarded-Prefix /editor;
+    proxy_read_timeout 3600s;     # long ffmpeg renders
+    proxy_send_timeout 3600s;
+    proxy_request_buffering off;  # stream large uploads straight through
+    client_max_body_size 4096m;   # video/music uploads from the browser machine
+}
+```
+
+Run with `--no-browser` on a server. The video editor talks to CoderAI over
+`--base-url` server-side (not from the browser), so the browser only ever needs
+to reach the editor's own origin. Source files can be picked from the server's
+media directory or uploaded from the browser machine (hence the larger
+`client_max_body_size` / `proxy_request_buffering off` above).
+
+## `tools/videogen.py`, `review_outputs.py`, `gen_township_fighters.py`
+
+Mount each at the root of its own `server_name` (or a dedicated port). These
+UIs use absolute (`/...`) asset and API paths plus SSE, so they expect to own
+`/`:
+
+```nginx
+server {
+    listen 443 ssl;
+    server_name videogen.example.com;
+    location / {
+        proxy_pass http://127.0.0.1:7860;   # the tool's --port
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 3600s;
+        proxy_buffering off;                # these stream progress over SSE
+    }
+}
+```
+
+Sub-path mounting for these three needs their client URLs made relative (the
+same change already applied to `video_editor.py`).
--- a/packaging/linux/build_oci_image.sh
+++ b/packaging/linux/build_oci_image.sh
@@ -182,6 +182,8 @@ discover_local_binaries() {
    "$HOME/whisper.cpp/build/bin/whisper-cli"
    "$HOME/whisper.cpp/build/bin/main"
    "$HOME/whisper.cpp/build/bin/server"
+    "/usr/local/bin/ds4-server"
+    "${CODERAI_DS4_DIR:-$HOME/.coderai/ds4}/ds4-server"
  )
  local path
  for path in "${candidates[@]}"; do

--- a/tools/parler_tts_service.py
+++ b/tools/parler_tts_service.py
+#!/usr/bin/env python3
+"""Standalone Parler-TTS HTTP microservice — run in its OWN venv.
+
+parler-tts hard-pins an old transformers/tokenizers/huggingface-hub that conflict
+with the coderai server's stack (transformers 5.x). So instead of polluting that
+environment, Parler runs here behind a tiny stdlib HTTP shim, and coderai talks to
+it as a remote TTS backend (``_RemoteParlerBackend``, selected when a model's
+config carries a ``service_url``).
+
+Setup (separate venv!):
+
+    python3 -m venv ~/.venvs/parler
+    source ~/.venvs/parler/bin/activate
+    pip install "git+https://github.com/huggingface/parler-tts.git" soundfile
+    python tools/parler_tts_service.py \
+        --model parler-tts/parler-tts-mini-multilingual --port 8123
+
+Then point a coderai TTS model's config at it, e.g. in models.json:
+
+    "tts:parler-tts/parler-tts-mini-multilingual": {"service_url": "http://127.0.0.1:8123"}
+
+Endpoints:
+    GET  /health  -> {"ok": true, "model": ..., "sampling_rate": N}
+    POST /speak   -> audio/wav   (body: {text, voice, speed, emotion, style, description?})
+"""
+
+import argparse
+import io
+import json
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+
+import numpy as np
+import soundfile as sf
+
+
+# These mirror the editor's gated controls. coderai surfaces the same lists via
+# codai.api.tts_backends._FAMILY_{EMOTIONS,STYLES}["parler"].
+# NOTE(review): nothing in this file validates requests against these lists —
+# build_description() passes any other emotion/style string through verbatim.
+EMOTIONS = ["neutral", "happy", "sad", "angry", "excited", "calm", "fearful"]
+STYLES = ["normal", "whispering", "shouting", "monotone", "expressive"]
+
+
+def build_description(voice: str, speed, emotion: str, style: str, speaker: str = "") -> str:
+    """Map the UI controls into a Parler natural-language delivery description."""
+    spk = (voice or "").strip()
+    if spk and ("/" in spk or spk.lower().startswith(("af_", "am_", "bf_", "bm_"))):
+        spk = ""  # a path or a Kokoro id is not a Parler speaker name
+    who = spk or speaker or "A speaker"
+    bits = [f"{who} speaks"]
+    if emotion and emotion != "neutral":
+        bits.append(f"in a {emotion} tone")
+    smap = {"whispering": "whispering softly", "shouting": "shouting loudly",
+            "monotone": "in a flat monotone", "expressive": "in a very expressive, animated way"}
+    if style and style not in ("", "normal"):
+        bits.append(smap.get(style, style))
+    try:
+        sp = float(speed or 1.0)
+    except (TypeError, ValueError):
+        sp = 1.0
+    bits.append(f"at a {'slow' if sp < 0.9 else 'fast' if sp > 1.15 else 'moderate'} pace")
+    return (" ".join(bits) +
+            ". The recording is very high quality, the voice clear and close up "
+            "with no background noise.")
+
+
+class _Engine:
+    """Loads the Parler model once and synthesizes to a float waveform."""
+
+    def __init__(self, model_name: str):
+        from parler_tts import ParlerTTSForConditionalGeneration
+        from transformers import AutoTokenizer
+        import torch
+        self.model_name = model_name
+        self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(self._device)
+        self._tok = AutoTokenizer.from_pretrained(model_name)
+        self.sr = int(self._model.config.sampling_rate)
+
+    def speak(self, text: str, description: str) -> np.ndarray:
+        ids = self._tok(description, return_tensors="pt").input_ids.to(self._device)
+        prompt = self._tok(text, return_tensors="pt").input_ids.to(self._device)
+        gen = self._model.generate(input_ids=ids, prompt_input_ids=prompt)
+        return np.asarray(gen.cpu().numpy().squeeze(), dtype=np.float32)
+
+
+ENGINE: "_Engine | None" = None  # set in main(); None until the model has been loaded
+
+
+class Handler(BaseHTTPRequestHandler):
+    def _send(self, code, body=b"", ctype="application/json"):
+        self.send_response(code)
+        self.send_header("Content-Type", ctype)
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        if body:
+            self.wfile.write(body)
+
+    def log_message(self, fmt, *args):  # quieter logs
+        pass
+
+    def do_GET(self):
+        if self.path.split("?")[0] == "/health":
+            self._send(200, json.dumps(
+                {"ok": True, "model": ENGINE.model_name, "sampling_rate": ENGINE.sr}).encode())
+        else:
+            self._send(404, b'{"error":"not found"}')
+
+    def do_POST(self):
+        if self.path.split("?")[0] != "/speak":
+            self._send(404, b'{"error":"not found"}')
+            return
+        try:
+            n = int(self.headers.get("Content-Length", 0))
+            req = json.loads(self.rfile.read(n) or b"{}")
+            text = (req.get("text") or "").strip()
+            if not text:
+                self._send(400, b'{"error":"empty text"}')
+                return
+            desc = req.get("description") or build_description(
+                req.get("voice", ""), req.get("speed", 1.0),
+                req.get("emotion", ""), req.get("style", ""))
+            audio = ENGINE.speak(text, desc)
+            buf = io.BytesIO()
+            sf.write(buf, audio, ENGINE.sr, format="WAV")
+            self._send(200, buf.getvalue(), ctype="audio/wav")
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            self._send(500, json.dumps({"error": str(e)}).encode())
+
+
+def main(argv=None):
+    ap = argparse.ArgumentParser(description="Standalone Parler-TTS HTTP service")
+    ap.add_argument("--model", default="parler-tts/parler-tts-mini-multilingual")
+    ap.add_argument("--host", default="127.0.0.1")
+    ap.add_argument("--port", type=int, default=8123)
+    args = ap.parse_args(argv)
+
+    global ENGINE
+    print(f"Loading {args.model} …")
+    ENGINE = _Engine(args.model)
+    print(f"Ready: {args.model} @ {ENGINE.sr} Hz — serving on http://{args.host}:{args.port}")
+    ThreadingHTTPServer((args.host, args.port), Handler).serve_forever()
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/video_editor.py
+++ b/tools/video_editor.py
--- a/video_editor.config.json
+++ b/video_editor.config.json
+{
+  "media_dir": "/storage/coderai/tools/coderai_media",
+  "output_dir": "/storage/coderai/video_editor_output",
+  "base_url": "http://127.0.0.1:8000",
+  "api_key": "sk-coderai-1b8b559808f9fb9927cabef33e0c1bf7ca7943f3281cbf7b7b661fc37aa9fbe0",
+  "voice": "feminine",
+  "voice_name": "af_sarah",
+  "tts_model": "suno/bark",
+  "stt_model": null,
+  "audio_model": null,
+  "video": null
+}