front/engine split, ds4 + media tooling, gemma-4 native tools; ignore runtime artifacts

- frontproxy: torch-free front proxy + per-vendor engine supervisor with auth,
  localhost binding, model routing; Ctrl-C now force-kills engines (own session +
  PDEATHSIG, SIGKILL of engine process groups, watchdog on hung drain)
- gemma-4 tool calling: prompt via native tools= template, parse call:NAME{...}
  into tool_calls, honour generation_config EOS so it stops instead of looping
- ds4 external worker, parler/expressive TTS backends, video editor tooling
- --debug-requests: full client<->API request/response logging + live snapshots
- stop tracking runtime artifacts (video_editor/sessions/, tools/coderai_media/)
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent 2fb085f4
......@@ -33,3 +33,7 @@ township_output/
# Packaging build cache + runtime temp (large artifacts)
.packaging-cache/
tmp/
# Video editor sessions + generated media (runtime artifacts)
video_editor/sessions/
tools/coderai_media/
......@@ -35,6 +35,7 @@ BACKEND="${1:-all}"
FLASH=false
CUSTOM_VENV=""
PACKAGE=false
DS4=false
# Parse arguments
i=1
......@@ -50,6 +51,9 @@ for arg in "$@"; do
--package)
PACKAGE=true
;;
--ds4)
DS4=true
;;
esac
i=$((i + 1))
done
......@@ -68,6 +72,7 @@ if [[ "$BACKEND" != "nvidia" && "$BACKEND" != "vulkan" && "$BACKEND" != "vulkan-
echo ""
echo "Options:"
echo " --flash - Install Flash Attention 2 for faster inference (NVIDIA only)"
echo " --ds4 - Clone + build the ds4 (DeepSeek V4) native engine"
exit 1
fi
......@@ -755,6 +760,35 @@ package_app() {
echo -e "${YELLOW}Note: The target machine must still provide compatible system GPU/runtime libraries.${NC}"
}
# Optionally clone + build ds4 (DeepSeek V4 native engine). Opt-in via --ds4.
# coderai can also auto-build this at runtime on first use, but doing it here lets
# the OCI/Docker packaging bundle the prebuilt ds4-server binary.
build_ds4() {
    # Clone (when missing) and compile the ds4 engine into $CODERAI_DS4_DIR
    # (default: $HOME/.coderai/ds4). Clone and build failures are downgraded
    # to warnings and the function still returns 0.
    local DS4_DIR="${CODERAI_DS4_DIR:-$HOME/.coderai/ds4}"
    echo -e "${YELLOW}Building ds4 (DeepSeek V4 engine) → $DS4_DIR ...${NC}"

    # An existing Makefile is taken to mean the checkout is already there.
    if [ ! -e "$DS4_DIR/Makefile" ]; then
        mkdir -p "$(dirname "$DS4_DIR")"
        git clone --depth 1 https://github.com/antirez/ds4 "$DS4_DIR" || {
            echo -e "${YELLOW}Warning: could not clone ds4; skipping.${NC}"
            return 0
        }
    fi

    # Choose the make target: CUDA when a toolkit is visible, the default
    # (empty) target on macOS, plain CPU otherwise.
    local TARGET="cpu"
    if command -v nvcc &> /dev/null || [ -d "/usr/local/cuda" ]; then
        TARGET="cuda-generic"
    elif [ "$(uname -s)" = "Darwin" ]; then
        TARGET=""   # bare `make` builds the macOS Metal backend
    fi

    # ${TARGET:+"$TARGET"} yields no argument at all when TARGET is empty and
    # exactly one quoted word otherwise, so no unquoted expansion is needed.
    ( cd "$DS4_DIR" && make ${TARGET:+"$TARGET"} ) || {
        echo -e "${YELLOW}Warning: ds4 build failed; it can still be built at runtime.${NC}"
        return 0
    }

    if [ -x "$DS4_DIR/ds4-server" ]; then
        echo -e "${GREEN}✓ ds4-server built at $DS4_DIR/ds4-server${NC}"
        echo -e "${YELLOW}Note: DeepSeek V4 weights are downloaded on first use (multi-GB).${NC}"
    else
        # make reported success but the expected binary is absent: say so
        # instead of finishing silently.
        echo -e "${YELLOW}Warning: ds4 build finished but $DS4_DIR/ds4-server was not found; it can still be built at runtime.${NC}"
    fi
    return 0
}
if [ "$DS4" = true ]; then
build_ds4
fi
# Create .backend file to track which backend was used
echo "$BACKEND" > .backend
......
......@@ -16,6 +16,7 @@
"""Admin dashboard routes."""
from pathlib import Path
import asyncio
import re
import shutil
from typing import Optional
......@@ -830,14 +831,35 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
# JSON lines on stdout, which we relay onto this session's SSE queue.
import subprocess as _sp
import sys as _sys
import collections as _collections
import pathlib as _pathlib
# The worker runs as `python -m codai.admin.download_worker`; when coderai is
# run from source (not pip-installed) the child won't find the `codai`
# package unless the repo root is on its path. routes.py lives at
# <repo>/codai/admin/routes.py, so parents[2] is the repo root.
_repo_root = str(_pathlib.Path(__file__).resolve().parents[2])
def _attempt(disable_xet: bool):
"""Spawn the worker once; relay its events. Returns (terminal, rc, tail)."""
env = dict(os.environ)
env["PYTHONPATH"] = _repo_root + (os.pathsep + env["PYTHONPATH"]
if env.get("PYTHONPATH") else "")
# hf_xet (the accelerated transfer) bypasses our tqdm progress hook — so
# the bar freezes near 100% while a big file silently downloads — and can
# hard-crash the worker (segfault / signal kill) with no traceback. The
# plain HTTPS path reports byte-accurate progress and is reliable, so we
# default to it unless the operator explicitly opted in (set
# HF_HUB_DISABLE_XET=0). A crash retry always disables it.
if disable_xet or os.environ.get("HF_HUB_DISABLE_XET") is None:
env["HF_HUB_DISABLE_XET"] = "1"
proc = _sp.Popen(
[_sys.executable, "-m", "codai.admin.download_worker", model_id, file_pattern or ""],
stdout=_sp.PIPE, stderr=_sp.STDOUT, text=True, bufsize=1,
stdout=_sp.PIPE, stderr=_sp.STDOUT, text=True, bufsize=1, env=env, cwd=_repo_root,
)
_download_procs[session_id] = proc
terminal = None # set to "done"/"error" once the child reports a final event
terminal = None
recent = _collections.deque(maxlen=12)
try:
for line in proc.stdout:
line = line.strip()
......@@ -846,7 +868,9 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
try:
evt = _j.loads(line)
except Exception:
# Non-JSON output (warnings / tracebacks) → surface as info.
# Non-JSON output (warnings / tracebacks) → surface as info
# and keep a tail so a hard crash can report what it printed.
recent.append(line)
push({"type": "info", "message": line})
continue
etype = evt.get("type")
......@@ -855,12 +879,11 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
terminal = etype
except Exception as exc:
push({"type": "error", "message": str(exc)})
terminal = "error"
finally:
# Ensure the child is gone (cancel, crash, or normal exit).
if proc.poll() is None:
try:
proc.terminate()
proc.wait(timeout=10)
proc.terminate(); proc.wait(timeout=10)
except Exception:
pass
if proc.poll() is None:
......@@ -869,15 +892,32 @@ def _run_download_thread(session_id: str, model_id: str, file_pattern: str, pq):
except Exception:
pass
_download_procs.pop(session_id, None)
return terminal, proc.poll(), " | ".join(list(recent)[-4:]).strip()
try:
terminal, rc, tail = _attempt(disable_xet=False)
# A hard crash (no done/error event, not a user cancel) is the classic
# hf_xet failure — retry once with Xet disabled before giving up.
crashed = (terminal is None and session_id not in _download_cancelled)
if crashed and "HF_HUB_DISABLE_XET" not in os.environ:
push({"type": "info",
"message": "Transfer crashed; retrying without the Xet accelerator…"})
_download_status.get(session_id, {}).update({"status": "downloading", "percent": 0})
terminal, rc, tail = _attempt(disable_xet=True)
if terminal is None:
# Child ended without a done/error event → cancelled or died.
# Still no final event → cancelled or died for good.
if session_id in _download_cancelled:
pq.put({"type": "cancelled", "message": "Download cancelled by user"})
_download_status.get(session_id, {}).update({"status": "cancelled"})
else:
push({"type": "error", "message": "Download process exited unexpectedly"})
detail = f"Download process exited unexpectedly (exit code {rc})"
if rc is not None and rc < 0:
detail += f" — killed by signal {-rc} (often out-of-memory)"
if tail:
detail += f". Last output: {tail}"
push({"type": "error", "message": detail})
finally:
_download_cancelled.discard(session_id)
def _gc():
......@@ -1115,7 +1155,28 @@ def _scan_caches() -> dict:
continue
if p not in configured_settings:
configured_settings[p] = (s, cat)
all_configs.setdefault(p, []).append({"settings": s, "cat": cat})
# A single logical config can be registered under multiple
# categories via model_types (for example text+vision). It is
# stored once per category in models.json with the same
# config_id, but the UI should show it as one editable config,
# not duplicate pills that appear to delete each other.
_cfg_list = all_configs.setdefault(p, [])
_cid = s.get("config_id") if isinstance(s, dict) else None
_existing = None
if _cid:
for _cfg in _cfg_list:
_settings = _cfg.get("settings") or {}
if isinstance(_settings, dict) and _settings.get("config_id") == _cid:
_existing = _cfg
break
if _existing is not None:
_cats = _existing.setdefault("cats", [])
if cat not in _cats:
_cats.append(cat)
if not _existing.get("cat"):
_existing["cat"] = cat
else:
_cfg_list.append({"settings": s, "cat": cat, "cats": [cat]})
# Secondary index: basename → (settings_tuple, original_path)
# Used to reconnect a config to a re-downloaded file that landed at a different path.
......@@ -1678,7 +1739,6 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
multi_model_manager.add_model(model_key, pipeline)
multi_model_manager.record_vram_delta(model_key, _snap)
elif model_type == "video":
import asyncio
from codai.api.video import _load_video_pipeline, _derive_device
model_key = f"video:{path}"
device = _derive_device()
......@@ -1693,7 +1753,6 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
multi_model_manager.models_in_vram.add(model_key)
multi_model_manager.record_vram_delta(model_key, _snap)
elif model_type == "audio_gen":
import asyncio
from codai.api.audio_gen import _load_musicgen, _load_audioldm, _detect_audio_gen_type, _derive_device
model_key = f"audio_gen:{path}"
device = _derive_device()
......@@ -1711,32 +1770,26 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
multi_model_manager.models_in_vram.add(model_key)
multi_model_manager.record_vram_delta(model_key, _snap)
elif model_type == "tts":
import asyncio
model_key = f"tts:{path}"
_snap = multi_model_manager.vram_before_load()
# Use the same backend factory as a real request so every engine is
# handled identically — in particular a Parler model boots its managed
# worker here, so "loading" it from the interface starts the service.
cfg = (multi_model_manager.config.get(model_key)
or multi_model_manager.config.get(f"tts:{path}")
or model_cfg or {})
def _load_tts():
try:
from kokoro import Kokoro
return Kokoro(path)
except ImportError:
pass
try:
from bark import preload_models
preload_models()
return {"bark": True}
except ImportError:
pass
return None
from codai.api import tts_backends
return tts_backends.load_backend(path, path, cfg)
tts_obj = await asyncio.to_thread(_load_tts)
if tts_obj is None:
raise RuntimeError("No supported TTS backend found (kokoro / bark)")
raise RuntimeError("TTS model failed to load")
multi_model_manager.models[model_key] = tts_obj
multi_model_manager.current_model_key = model_key
multi_model_manager.active_in_vram = model_key
multi_model_manager.models_in_vram.add(model_key)
multi_model_manager.record_vram_delta(model_key, _snap)
elif model_type in ("embedding", "spatial", "vision"):
import asyncio
from codai.api.images import _load_diffusers_pipeline
from codai.api.state import get_global_args
model_key = f"{model_type}:{path}"
......@@ -1797,6 +1850,78 @@ async def api_model_unload(request: Request, username: str = Depends(require_adm
return {"success": True, "was_loaded": True}
def _sanitize_engine_int_overrides(raw) -> dict:
"""Clean a {engine_name: int} override map: keep positive ints, drop the rest."""
out = {}
if isinstance(raw, dict):
for name, val in raw.items():
if val in (None, ""):
continue
try:
iv = int(val)
except (TypeError, ValueError):
continue
if iv >= 1:
out[str(name)] = iv
return out
def _resolve_engine_spec(engine_name: str, engine_specs):
"""Find the declared engine matching ``engine_name`` (by name or backend)."""
for s in (engine_specs or []):
if not isinstance(s, dict):
continue
if (s.get("name") or "").lower() == engine_name.lower() \
or (s.get("backend") or "").lower() == engine_name.lower():
return s
return None
def validate_engine_pin(engine_name: str, model_path: str, engine_specs,
                        model_backend: str = None, ds4_cfg=None) -> list:
    """Check whether pinning ``model_path`` to ``engine_name`` makes sense.

    Returns a list of human-readable warnings: one entry when the engine is
    not among the declared specs, or when the engine's capability set lacks
    the capability this model requires. An empty list means no problem was
    detected. The result is meant to be shown to the admin rather than
    letting a bad pin go unnoticed.

    Args:
        engine_name: Engine the model is pinned to; blank/None means "no pin".
        model_path: Model path handed to ``required_capability``.
        engine_specs: Declared engine spec dicts; empty/None means the
            engines are auto-detected.
        model_backend: Optional backend recorded on the model entry.
        ds4_cfg: Optional ds4 config object (``model_id`` / ``enabled`` are
            read from it when present).
    """
    engine_name = (engine_name or "").strip()
    if not engine_name:
        return []

    from codai.frontproxy.registry import _DEFAULT_CAPS
    from codai.frontproxy.router import required_capability

    declared = engine_specs or []
    if not declared:
        # Auto-detection: there are no specs to resolve against, so derive the
        # backend from the vendor-style name and use its default capabilities.
        lowered = engine_name.lower()
        vendor_to_backend = {"radeon": "vulkan", "amd": "vulkan", "intel": "vulkan",
                             "cuda": "nvidia"}
        backend = vendor_to_backend.get(lowered, lowered)
        default_caps = _DEFAULT_CAPS.get(backend)
        if default_caps is None:
            # Unknown name and nothing to compare with: accept silently.
            return []
        caps = set(default_caps)
    else:
        spec = _resolve_engine_spec(engine_name, declared)
        if spec is None:
            known = [item.get("name") for item in declared
                     if isinstance(item, dict) and item.get("name")]
            return [f"Engine '{engine_name}' is not declared. Known engines: "
                    f"{', '.join(known) or '(none)'}."]
        backend = (spec.get("backend") or "auto").lower()
        # Explicit capabilities win; otherwise fall back to the backend default.
        caps = set(spec.get("capabilities")
                   or _DEFAULT_CAPS.get(backend, {"transformers", "gguf"}))

    ds4_model_id = getattr(ds4_cfg, "model_id", None) if ds4_cfg else None
    ds4_enabled = bool(getattr(ds4_cfg, "enabled", False)) if ds4_cfg else False
    req = required_capability(model_path, backend=model_backend,
                              ds4_model_id=ds4_model_id, ds4_enabled=ds4_enabled)

    if not req or req in caps:
        return []
    return [f"Engine '{engine_name}' (backend '{backend}') can't run this model: "
            f"it needs '{req}' capability but the engine only provides "
            f"{sorted(caps)}. The request would fall back to a compatible engine — "
            f"pick a different engine or adjust the engine's capabilities."]
@router.post("/admin/api/model-configure", summary="Update a model's configuration")
async def api_model_configure(request: Request, username: str = Depends(require_admin)):
"""Save per-model configuration and register/update in models.json."""
......@@ -1897,6 +2022,11 @@ async def api_model_configure(request: Request, username: str = Depends(require_
if existing_cid and not is_new_config:
# Targeted removal: only the entry that shares this config_id
return existing_cid == config_id
if existing_cid and is_new_config:
# Adding a new configuration for the same model must preserve modern
# sibling configs. Only legacy entries without config_id fall through
# to path-based replacement because they cannot be targeted safely.
return False
# Path-based removal (no config_id on either side, or new entry replacing old)
key = m_entry.get("path", m_entry.get("id", ""))
return key in paths_to_remove or (fnames_to_remove and _os.path.basename(key) in fnames_to_remove)
......@@ -1938,7 +2068,7 @@ async def api_model_configure(request: Request, username: str = Depends(require_
"max_vram", "sdcpp_flash_attn", "sdcpp_diffusion_flash_attn", "vae_tiling",
"component_quantization", "output_crf", "force_vram_update",
"balanced_gpu_percent", "acceleration",
"cache_type_k", "cache_type_v", "turboquant"):
"cache_type_k", "cache_type_v", "turboquant", "engine"):
if key in data:
entry[key] = data[key]
......@@ -1966,7 +2096,15 @@ async def api_model_configure(request: Request, username: str = Depends(require_
applied = apply_model_entry_live(entry, model_types)
except Exception as e:
print(f" [admin] live config apply failed (restart to apply): {e}")
return {"success": True, "applied_live": applied}
warnings = []
if entry.get("engine"):
warnings = validate_engine_pin(
entry["engine"], path, config_manager.config.server.engine_specs,
model_backend=entry.get("backend"),
ds4_cfg=getattr(config_manager.config, "ds4", None))
for w in warnings:
print(f" [admin] engine-pin warning: {w}")
return {"success": True, "applied_live": applied, "warnings": warnings}
@router.get("/admin/api/accel-presets", summary="List acceleration / distillation presets")
......@@ -2244,6 +2382,21 @@ def _read_vram_info() -> Optional[dict]:
return None
@router.get("/admin/api/gpu-stats", summary="Per-card GPU utilization, VRAM and temperature")
def api_gpu_stats(username: str = Depends(require_auth)):
    """Report live stats for every physical GPU, whichever engine owns it.

    Per the module's own notes the data comes from nvidia-smi (NVIDIA) and
    sysfs (AMD) and feeds the Tasks page's per-card VRAM/utilization view.
    The call is best-effort: on any failure the response carries an empty
    ``cards`` list plus an ``error`` string instead of raising.

    Declared as a plain ``def`` on purpose: the probe shells out / reads
    sysfs, so it should run in the threadpool rather than on the event loop.
    """
    payload = {"cards": []}
    try:
        from codai.frontproxy.gpu_detect import gpu_stats
        payload["cards"] = gpu_stats()
    except Exception as exc:
        payload["error"] = str(exc)
    return payload
@router.get("/admin/api/system-stats", summary="Live CPU / GPU / RAM / VRAM usage and temperatures")
def api_system_stats(username: str = Depends(require_admin)):
"""Lightweight hardware telemetry for the Tasks page header: CPU & GPU
......@@ -2406,6 +2559,12 @@ async def api_get_settings(username: str = Depends(require_admin)):
"https_cert_path": c.server.https_cert_path,
"queue_max_size": c.server.queue_max_size,
"max_parallel_requests": c.server.max_parallel_requests,
"max_parallel_requests_overrides": c.server.max_parallel_requests_overrides,
"internal_port_base": c.server.internal_port_base,
"default_engine": c.server.default_engine,
# Engine names available to pick as the default (for the settings UI).
"engine_names": [s.get("name") for s in (c.server.engine_specs or [])
if isinstance(s, dict) and s.get("name")],
},
"backend": {
"type": c.backend.type,
......@@ -2417,6 +2576,8 @@ async def api_get_settings(username: str = Depends(require_admin)):
"default_load_mode": c.models.default_load_mode,
"hf_cache_dir": c.models.hf_cache_dir,
"gguf_cache_dir": c.models.gguf_cache_dir,
"max_model_instances": c.models.max_model_instances,
"max_model_instances_overrides": c.models.max_model_instances_overrides,
},
"offload": {
"directory": c.offload.directory,
......@@ -2430,6 +2591,9 @@ async def api_get_settings(username: str = Depends(require_admin)):
"max_ram_gb": c.offload.max_ram_gb,
"evict_idle_on_ram": c.offload.evict_idle_on_ram,
"ram_leak_watch": c.offload.ram_leak_watch,
"ram_watch_poll_seconds": c.offload.ram_watch_poll_seconds,
"ram_watch_soft_fraction": c.offload.ram_watch_soft_fraction,
"ram_watch_cuda": c.offload.ram_watch_cuda,
},
"vulkan": {
"n_gpu_layers": c.vulkan.n_gpu_layers,
......@@ -2449,6 +2613,7 @@ async def api_get_settings(username: str = Depends(require_admin)):
"cpu_resume": c.thermal.cpu_resume,
"gpu_high": c.thermal.gpu_high,
"gpu_resume": c.thermal.gpu_resume,
"gpu_overrides": c.thermal.gpu_overrides,
"poll_seconds": c.thermal.poll_seconds,
"soft_throttle_enabled": c.thermal.soft_throttle_enabled,
"soft_throttle_temp": c.thermal.soft_throttle_temp,
......@@ -2461,6 +2626,19 @@ async def api_get_settings(username: str = Depends(require_admin)):
"allow_ffmpeg": c.enhance.allow_ffmpeg,
"allow_rife_ncnn": c.enhance.allow_rife_ncnn,
},
"ds4": {
"enabled": c.ds4.enabled,
"repo_url": c.ds4.repo_url,
"install_dir": c.ds4.install_dir,
"build_target": c.ds4.build_target,
"model_variant": c.ds4.model_variant,
"model_id": c.ds4.model_id,
"host": c.ds4.host,
"port": c.ds4.port,
"ctx": c.ds4.ctx,
"extra_args": c.ds4.extra_args,
"auto_build": c.ds4.auto_build,
},
"broker": {
"enabled": c.broker.enabled,
"base_url": c.broker.base_url,
......@@ -2495,6 +2673,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
data = await request.json()
c = config_manager.config
_settings_warnings: list = []
if "server" in data:
srv = data["server"]
......@@ -2511,6 +2690,28 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
c.server.max_parallel_requests = int(srv["max_parallel_requests"])
from codai.queue.manager import queue_manager
queue_manager.max_parallel_requests = c.server.max_parallel_requests
if "max_parallel_requests_overrides" in srv:
c.server.max_parallel_requests_overrides = _sanitize_engine_int_overrides(
srv["max_parallel_requests_overrides"])
if "internal_port_base" in srv:
try:
c.server.internal_port_base = max(1, min(65535, int(srv["internal_port_base"])))
except (TypeError, ValueError):
pass
if "default_engine" in srv:
c.server.default_engine = (srv.get("default_engine") or "").strip() or None
# Only validate against engine_specs when they're explicitly declared.
# With auto-detection engine_specs is empty and the engines (nvidia/
# radeon/…) are only known to the front, so don't false-warn there — the
# front validates the name at routing time and logs if it can't honour it.
if (c.server.default_engine and c.server.engine_specs
and _resolve_engine_spec(c.server.default_engine,
c.server.engine_specs) is None):
names = [s.get("name") for s in (c.server.engine_specs or [])
if isinstance(s, dict) and s.get("name")]
_settings_warnings.append(
f"Default engine '{c.server.default_engine}' is not declared. "
f"Known engines: {', '.join(names) or '(none)'}.")
if "backend" in data:
bk = data["backend"]
......@@ -2526,6 +2727,14 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
c.models.hf_cache_dir = mdl["hf_cache_dir"] or None
if "gguf_cache_dir" in mdl:
c.models.gguf_cache_dir = mdl["gguf_cache_dir"] or None
if "max_model_instances" in mdl:
try:
c.models.max_model_instances = max(1, int(mdl["max_model_instances"]))
except (TypeError, ValueError):
pass
if "max_model_instances_overrides" in mdl:
c.models.max_model_instances_overrides = _sanitize_engine_int_overrides(
mdl["max_model_instances_overrides"])
if "offload" in data:
off = data["offload"]
......@@ -2543,6 +2752,11 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
c.offload.max_ram_gb = off["max_ram_gb"] or None
c.offload.evict_idle_on_ram = bool(off.get("evict_idle_on_ram", c.offload.evict_idle_on_ram))
c.offload.ram_leak_watch = bool(off.get("ram_leak_watch", c.offload.ram_leak_watch))
if "ram_watch_poll_seconds" in off:
c.offload.ram_watch_poll_seconds = float(off["ram_watch_poll_seconds"] or c.offload.ram_watch_poll_seconds)
if "ram_watch_soft_fraction" in off:
c.offload.ram_watch_soft_fraction = float(off["ram_watch_soft_fraction"] or c.offload.ram_watch_soft_fraction)
c.offload.ram_watch_cuda = bool(off.get("ram_watch_cuda", c.offload.ram_watch_cuda))
# Push the RAM-cap settings to live global_args so the watcher, per-load
# budget clamp and eviction honour them without a restart.
try:
......@@ -2552,6 +2766,9 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
ga.max_ram_gb = c.offload.max_ram_gb
ga.evict_idle_on_ram = c.offload.evict_idle_on_ram
ga.ram_leak_watch = c.offload.ram_leak_watch
ga.ram_watch_poll_seconds = c.offload.ram_watch_poll_seconds
ga.ram_watch_soft_fraction = c.offload.ram_watch_soft_fraction
ga.ram_watch_cuda = c.offload.ram_watch_cuda
except Exception:
pass
......@@ -2607,6 +2824,22 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
c.thermal.cpu_resume = float(th.get("cpu_resume", c.thermal.cpu_resume))
c.thermal.gpu_high = float(th.get("gpu_high", c.thermal.gpu_high))
c.thermal.gpu_resume = float(th.get("gpu_resume", c.thermal.gpu_resume))
if "gpu_overrides" in th and isinstance(th["gpu_overrides"], dict):
# Sanitize: {vendor: {high, resume}} with numeric values only.
clean = {}
for vendor, ov in th["gpu_overrides"].items():
if not isinstance(ov, dict):
continue
entry = {}
for k in ("high", "resume"):
if ov.get(k) not in (None, ""):
try:
entry[k] = float(ov[k])
except (TypeError, ValueError):
pass
if entry:
clean[str(vendor).lower()] = entry
c.thermal.gpu_overrides = clean
c.thermal.poll_seconds = max(1.0, float(th.get("poll_seconds", c.thermal.poll_seconds)))
c.thermal.soft_throttle_enabled = bool(th.get("soft_throttle_enabled", c.thermal.soft_throttle_enabled))
c.thermal.soft_throttle_temp = float(th.get("soft_throttle_temp", c.thermal.soft_throttle_temp))
......@@ -2622,6 +2855,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
ga.thermal_cpu_resume = c.thermal.cpu_resume
ga.thermal_gpu_high = c.thermal.gpu_high
ga.thermal_gpu_resume = c.thermal.gpu_resume
ga.thermal_gpu_overrides = c.thermal.gpu_overrides
ga.thermal_poll_seconds = c.thermal.poll_seconds
ga.thermal_soft_throttle_enabled = c.thermal.soft_throttle_enabled
ga.thermal_soft_throttle_temp = c.thermal.soft_throttle_temp
......@@ -2656,6 +2890,30 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
except Exception:
pass
if "ds4" in data:
d = data["ds4"]
c.ds4.enabled = bool(d.get("enabled", c.ds4.enabled))
if "repo_url" in d:
c.ds4.repo_url = (d.get("repo_url") or c.ds4.repo_url or "").strip()
if "install_dir" in d:
c.ds4.install_dir = (d.get("install_dir") or "").strip() or None
if "build_target" in d:
c.ds4.build_target = (d.get("build_target") or "auto").strip()
if "model_variant" in d:
c.ds4.model_variant = (d.get("model_variant") or c.ds4.model_variant).strip()
if "model_id" in d:
c.ds4.model_id = (d.get("model_id") or c.ds4.model_id or "deepseek-v4").strip()
if "host" in d:
c.ds4.host = (d.get("host") or "127.0.0.1").strip()
if "port" in d:
c.ds4.port = int(d.get("port") or 0)
if "ctx" in d:
c.ds4.ctx = max(1024, int(d.get("ctx") or c.ds4.ctx))
if "extra_args" in d:
c.ds4.extra_args = (d.get("extra_args") or "").strip()
if "auto_build" in d:
c.ds4.auto_build = bool(d["auto_build"])
if "broker" in data:
bro = data["broker"]
c.broker.enabled = bool(bro.get("enabled", c.broker.enabled))
......@@ -2688,7 +2946,7 @@ async def api_save_settings(request: Request, username: str = Depends(require_ad
raise HTTPException(status_code=400, detail=str(error)) from error
config_manager.save_config()
return {"success": True}
return {"success": True, "warnings": _settings_warnings}
# =============================================================================
......
......@@ -2372,7 +2372,7 @@ const STUDIO_CAPABILITIES = {
optional:[],
notes:[
'Requires <code>insightface</code> and <code>onnxruntime</code>: <code>pip install insightface onnxruntime</code>.',
'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="' + (window.ROOT_PATH||'') + '/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
'No AI model selection needed — this feature uses its own dedicated backend.',
],
backendPath: ROOT_PATH + '/v1/images/faceswap',
......@@ -2386,7 +2386,7 @@ const STUDIO_CAPABILITIES = {
optional:[],
notes:[
'Requires <code>insightface</code> and <code>onnxruntime</code>: <code>pip install insightface onnxruntime</code>.',
'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
'The <b>inswapper_128.onnx</b> model is <b>auto-downloaded</b> from HuggingFace on first use (<a href="' + (window.ROOT_PATH||'') + '/admin/models?tab=search&q=inswapper&pipeline=&gguf=no-gguf" class="cap-find-link">deepinsight/inswapper<span class="cap-find-icon">↗</span></a>).',
'No AI model selection needed — this feature uses its own dedicated backend.',
],
backendPath: ROOT_PATH + '/v1/images/faceswap',
/**
 * Build the admin "model search" URL for a capability.
 *
 * Looks the capability up in CAP_TO_HF_SEARCH and returns null when it has
 * no search preset. The link is prefixed with window.ROOT_PATH (empty string
 * when unset) so it still resolves when the app is served under a sub-path.
 */
function capSearchUrl(cap) {
  const preset = CAP_TO_HF_SEARCH[cap];
  if (!preset) return null;
  const params = new URLSearchParams({ tab:'search', q: preset.q, pipeline: preset.pipeline, gguf: preset.gguf });
  return (window.ROOT_PATH || '') + '/admin/models?' + params.toString();
}
function capMissingHtml(caps, label) {
if (!caps.length) return '';
const links = caps.map(cap => {
const chip = `<span class="cap-chip dim">${cap.replace(/_/g,' ')}</span>`;
if (_localCapSet.has(cap)) {
const url = `/admin/models?local_cap=${encodeURIComponent(cap)}`;
const url = `${window.ROOT_PATH || ''}/admin/models?local_cap=${encodeURIComponent(cap)}`;
return `<a href="${url}" class="cap-find-link" title="You have a local model with ${cap.replace(/_/g,' ')} — click to configure it">${chip}<span class="cap-find-icon" style="color:#6ecf7e">↑ configure</span></a>`;
}
const url = capSearchUrl(cap);
......
......@@ -577,6 +577,13 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
</select>
</div>
</div>
<div class="form-row" id="cfg-engine-row" style="margin-top:.75rem;display:none">
<label class="form-label">Engine / card</label>
<select id="cfg-engine" class="form-input">
<option value="">Default (auto — by capability)</option>
</select>
<span class="form-hint" style="font-size:11px">Pin this model to a specific engine/card. Overrides the default engine. Only shown when multiple engines are running.</span>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.75rem">
<div class="form-row" style="margin:0">
<label class="form-label">Used VRAM <span class="muted">(GB)</span></label>
......@@ -1441,8 +1448,7 @@ function handleProgressEvent(evt){
showDownloadError(evt.message);
}else if(evt.type==='cancelled'){
_dlDone=true;
if(_dlEs){_dlEs.close();_dlEs=null;}
showDownloadError('Download cancelled');
showDownloadCancelled();
}
// keepalive: ignore
}
......@@ -1483,7 +1489,7 @@ async function reopenDownload(session_id){
if(s.rate) document.getElementById('dl-speed').textContent=fmtRate(s.rate);
if(s.eta!=null) document.getElementById('dl-eta').textContent=fmtEta(s.eta);
if(s.status==='done'){handleProgressEvent({type:'done'});return;}
if(s.status==='cancelled'){showDownloadError('Download cancelled');return;}
if(s.status==='cancelled'){_dlDone=true;showDownloadCancelled();return;}
if(s.status==='error'){showDownloadError(s.error||'Download failed');return;}
}
}
......@@ -1501,15 +1507,27 @@ async function reopenDownload(session_id){
};
}
// Return the download panel to its idle state after a cancel: close the
// open event stream (if any), show the form again and hide the progress view.
function showDownloadCancelled(){
  if(_dlEs){
    _dlEs.close();
    _dlEs = null;
  }
  const formEl = document.getElementById('dl-form');
  formEl.style.display = 'block';
  const progressEl = document.getElementById('dl-progress');
  progressEl.style.display = 'none';
}
async function stopDownload(session_id){
if(!confirm('Cancel this download?')) return;
try{
await fetch(ROOT_PATH + '/admin/api/download-cancel/'+session_id, {method:'POST'});
const r = await fetch(ROOT_PATH + '/admin/api/download-cancel/'+session_id, {method:'POST'});
if(!r.ok){
let detail = r.status+' '+r.statusText;
try{ const j = await r.json(); if(j&&j.detail) detail = j.detail; }catch{}
alert('Could not cancel download: '+detail);
return;
}
if(_dlSessionId===session_id){
if(_dlEs){_dlEs.close();_dlEs=null;}
_dlDone=true;
showDownloadError('Download cancelled');
showDownloadCancelled();
}
pollDownloads(); // refresh the active-downloads strip immediately
}catch(e){
alert('Could not cancel download: '+e.message);
}
......@@ -1798,12 +1816,53 @@ let _localModels = [];
let _ggufFiles = [];
let _hfModels = [];
// Engine/card hardware info (fetched once); used to tag models with the card they
// run on when more than one engine is configured.
let _engineNames = [];
let _defaultEngine = '';
// Populate _engineNames and _defaultEngine. Both requests are best-effort:
// any failure is swallowed and the previous values are left untouched.
async function _loadEngineInfo(){
  // Step 1: live engine names from the engines endpoint (this also covers
  // auto-detected engines, not only those declared in engine_specs).
  try {
    const enginesResp = await fetch(ROOT_PATH + '/admin/api/engines');
    if (enginesResp.ok) {
      const body = await enginesResp.json();
      _engineNames = (body.engines || []).map(engine => engine.name);
    }
  } catch (_ignored) {}

  // Step 2: settings supply the default engine, and the engine names too
  // when step 1 produced none.
  try {
    const settingsResp = await fetch(ROOT_PATH + '/admin/api/settings');
    const settings = await settingsResp.json();
    if (!_engineNames.length) {
      _engineNames = (settings.server && settings.server.engine_names) || [];
    }
    _defaultEngine = (settings.server && settings.server.default_engine) || '';
  } catch (_ignored) {}
}
/**
 * Build the compact "which card does this run on" badge for one model config.
 *
 * - Returns '' when fewer than two engines are known, so single-card setups
 *   never get a wider row.
 * - A non-blank `s.engine` is shown as-is and marked with 📌 (pinned).
 * - Otherwise the engine is guessed from the model: anything whose path/id/
 *   filename contains "deepseek-v4", or that is neither GGUF nor served by the
 *   whisper-server backend, is tagged "nvidia"; GGUF/whisper models get the
 *   default engine, or "any" when no default is set.
 *
 * @param {object} m  model entry (reads `path`, `id`, `filename`)
 * @param {object} s  config settings (reads `engine`, `backend`)
 * @returns {string}  HTML for the badge, or '' when hidden
 */
function _engineTagHtml(m, s){
  if (!_engineNames || _engineNames.length < 2) return '';

  const pinnedName = ((s && s.engine) || '').trim();
  const isPinned = pinnedName !== '';
  let engine = pinnedName;

  if (!isPinned) {
    const source = (m && (m.path || m.id || m.filename)) || '';
    const lowered = (source + '').toLowerCase();
    // A ".gguf" suffix is already covered by the substring test.
    const looksGguf = lowered.includes('gguf');
    const viaWhisper = ((s && s.backend) || '') === 'whisper-server';
    const looksDs4 = lowered.includes('deepseek-v4');
    const usesDefault = !looksDs4 && (looksGguf || viaWhisper);
    engine = usesDefault ? (_defaultEngine || 'any')   // gguf/whisper → default
                         : 'nvidia';                   // ds4/transformers → nvidia
  }

  // Vendor tint is keyed off substrings of the engine name.
  const key = engine.toLowerCase();
  let tint = 'var(--text-3)';
  if (key.includes('nv')) {
    tint = '#76b900';
  } else if (key.includes('rad') || key.includes('amd')) {
    tint = '#ed1c24';
  }

  let tooltip;
  if (isPinned) {
    tooltip = 'Pinned to engine: ' + engine;
  } else {
    tooltip = 'Runs on: ' + engine + ' (auto)';
  }
  return `<span class="badge" title="${esc(tooltip)}" style="font-size:9px;padding:.05rem .3rem;margin:.1rem .1rem 0 0;vertical-align:middle;border:1px solid ${tint};color:${tint};background:transparent">${esc(engine)}${isPinned?' 📌':''}</span>`;
}
function _renderConfigPills(idx, m) {
const configs = m.configs || [];
if (!configs.length) return '';
const pills = configs.map((c, cfgIdx) => {
const label = (c.settings && (c.settings.config_name || c.settings.alias)) || `Config ${cfgIdx + 1}`;
return `<span class="badge badge-user" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem .1rem 0 0" onclick="openCfgModal(${idx},${cfgIdx})" title="Edit this configuration">${esc(label)}</span>`;
return `<span class="badge badge-user" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem .1rem 0 0" onclick="openCfgModal(${idx},${cfgIdx})" title="Edit this configuration">${esc(label)}</span>${_engineTagHtml(m, c.settings)}`;
}).join('');
const addPill = `<span class="badge" style="font-size:10px;cursor:pointer;vertical-align:middle;margin:.1rem 0 0 0;background:var(--raised);border:1px dashed var(--border);color:var(--text-2)" onclick="openCfgModalNew(${idx})" title="Add another configuration for this model">+ Config</span>`;
return `<br style="line-height:.5rem">${pills}${addPill}`;
......@@ -2338,6 +2397,9 @@ async function refreshLocal(){
}
loadGlobalSettings();
// Load engine/card info first so the per-model card tags render on the first paint,
// then re-render once it's available (covers the fetch resolving after the list).
_loadEngineInfo().then(() => loadCachedModels());
refreshLocal();
// Toggle the acceleration / TurboQuant sections as model types are checked/unchecked.
......@@ -2731,6 +2793,7 @@ function openCfgModal(idx, cfgIdx){
document.getElementById('cfg-noram').checked = !!s.no_ram;
document.getElementById('cfg-offload-strategy').value = s.offload_strategy || 'auto';
document.getElementById('cfg-offload-dir').value = s.offload_dir || _defaultOffloadDir;
_populateEnginePin(s.engine || '');
document.getElementById('cfg-sysprompt').value = s.system_prompt || '';
document.getElementById('cfg-parser').value = s.parser || (!m.in_config ? _autoDetectParser(m.path) : 'auto');
document.getElementById('cfg-tools').checked = !!s.tools_closer_prompt;
......@@ -3027,6 +3090,21 @@ async function removeThisConfig(){
} catch(e) { alert('Error: ' + e.message); }
}
/**
 * Engine-pin field of the config modal.
 *
 * Rebuilds the option list of `#cfg-engine` from `_engineNames` (loading them
 * on demand when none are known yet), restores the selection, and shows the
 * surrounding `#cfg-engine-row` only when more than one engine exists. On any
 * error the row is hidden.
 *
 * @param {string} [desired]  engine to select; when omitted the control's
 *                            current value is preserved
 */
async function _populateEnginePin(desired){
  const row = document.getElementById('cfg-engine-row');
  const sel = document.getElementById('cfg-engine');
  try {
    const haveNames = _engineNames && _engineNames.length;
    if (!haveNames) await _loadEngineInfo();

    const want = (desired === undefined) ? sel.value : desired;

    // Drop every option except the blank placeholder, then re-add one per engine.
    for (const stale of sel.querySelectorAll('option:not([value=""])')) {
      stale.remove();
    }
    for (const engineName of _engineNames) {
      const opt = document.createElement('option');
      opt.value = engineName;
      opt.textContent = engineName;
      sel.appendChild(opt);
    }

    // Assign only after the options exist, otherwise the selection does not stick.
    sel.value = want || '';
    if (_engineNames.length > 1) {
      row.style.display = '';
    } else {
      row.style.display = 'none';
    }
  } catch (err) {
    row.style.display = 'none';
  }
}
async function saveModelConfig(){
const path = document.getElementById('cfg-path').value;
const maxGpu = parseFloat(document.getElementById('cfg-max-gpu').value);
......@@ -3063,6 +3141,7 @@ async function saveModelConfig(){
no_ram: document.getElementById('cfg-noram').checked,
offload_strategy: document.getElementById('cfg-offload-strategy').value,
offload_dir: document.getElementById('cfg-offload-dir').value.trim() || './offload',
engine: document.getElementById('cfg-engine').value.trim() || null,
system_prompt: document.getElementById('cfg-sysprompt').value.trim() || null,
parser: document.getElementById('cfg-parser').value,
tools_closer_prompt: document.getElementById('cfg-tools').checked,
......@@ -3094,7 +3173,12 @@ async function saveModelConfig(){
body: JSON.stringify(data)
});
const d = await r.json();
if(d.success){ closeModal('cfg-modal'); loadCachedModels(); }
if(d.success){
if (d.warnings && d.warnings.length) {
alert('Saved, but check this:\n\n• ' + d.warnings.join('\n• '));
}
closeModal('cfg-modal'); loadCachedModels();
}
else alert('Error: '+(d.detail||'Unknown'));
}catch(e){ alert('Error: '+e.message); }
}
......
......@@ -50,6 +50,40 @@
<input type="number" id="s-queue-max" class="form-input" placeholder="6" min="1" max="1000" style="max-width:160px">
<span class="form-hint">Maximum number of concurrent queued requests. Authenticated requests arriving when the queue is full receive a 429 response.</span>
</div>
<div class="form-row" style="margin-top:1rem;margin-bottom:0">
<label class="form-label">Engine base port</label>
<input type="number" id="s-internal-port-base" class="form-input" placeholder="8780" min="1" max="65535" style="max-width:160px">
<span class="form-hint">First internal port for engine subprocesses (the front assigns this and the next free ports). Keep it different from the listen port above. Restart to apply.</span>
</div>
<div class="form-row" id="default-engine-row" style="margin-top:1rem;margin-bottom:0;display:none">
<label class="form-label">Default engine</label>
<select id="s-default-engine" class="form-input" style="max-width:260px">
<option value="">Auto (least-loaded compatible)</option>
</select>
<span class="form-hint">When a model can run on more than one card (e.g. a GGUF on either the NVIDIA or Radeon engine), prefer this one. A per-model <em>engine</em> set on the Models page overrides this. Only shown when multiple engines are declared.</span>
</div>
</div>
<!-- Concurrency -->
<div class="card mb-0" style="margin-top:1rem">
<div class="card-title">Concurrency</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem">
<div class="form-row" style="margin:0">
<label class="form-label">Max parallel requests <span class="muted">(per engine)</span></label>
<input type="number" id="s-max-parallel" class="form-input" min="1" max="64" placeholder="2" style="max-width:160px">
<span class="form-hint">How many requests each engine runs at once. Each engine is its own process, so this applies per-engine — total concurrency is the sum across engines.</span>
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Max instances per model <span class="muted">(per engine)</span></label>
<input type="number" id="s-max-instances" class="form-input" min="1" max="16" placeholder="1" style="max-width:160px">
<span class="form-hint">Default number of concurrent copies of one model. Override per model on the Models page.</span>
</div>
</div>
<div id="concurrency-overrides" style="margin-top:.7rem;display:none">
<label class="form-label" style="margin-bottom:.3rem">Per-engine overrides <span class="muted">(blank = use the defaults above)</span></label>
<span class="form-hint" style="margin-bottom:.4rem">Give a bigger card more concurrency than a smaller one — e.g. more parallel requests on a 24 GB NVIDIA than an 8 GB Radeon.</span>
<div id="concurrency-override-rows" style="display:grid;grid-template-columns:110px 1fr 1fr;gap:.5rem;align-items:center"></div>
</div>
</div>
<!-- Storage -->
......@@ -87,6 +121,23 @@
</label>
<span class="form-hint">Background watcher samples RSS; when it keeps climbing while idle or nears the cap it runs gc / CUDA cache release / heap trim and (if enabled) evicts idle models.</span>
</div>
<div class="form-row" style="margin:0">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-ram-watch-cuda">
<span>Allow CUDA cache release in mitigation</span>
</label>
<span class="form-hint">When the watcher mitigates, let it call <code>torch.cuda.empty_cache()</code>. Turn this <b>off</b> on a GPU that drops off the bus (Xid 79) to stop the background thread from touching CUDA — host-RAM mitigation (gc / heap trim / idle eviction) still runs. Loads are always skipped regardless.</span>
</div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">RAM watch poll interval (s) <span class="muted">(default: 15)</span></label>
<input type="number" id="s-ram-watch-poll" class="form-input" min="1" step="1" placeholder="15">
<span class="form-hint">How often the watcher samples process-tree RSS.</span>
</div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">RAM watch soft threshold <span class="muted">(0–1, default: 0.90)</span></label>
<input type="number" id="s-ram-watch-soft" class="form-input" min="0" max="1" step="0.05" placeholder="0.90">
<span class="form-hint">Fraction of the RAM cap at which the mitigation ladder engages.</span>
</div>
<div class="form-row" style="margin-top:.75rem">
<label class="form-label">Temporary working directory <span class="muted">(default: system /tmp)</span></label>
<input type="text" id="s-tmp-dir" class="form-input" placeholder="e.g. /data/tmp">
......@@ -172,6 +223,12 @@
<input type="number" id="s-therm-gpu-resume" class="form-input" min="30" max="120" step="1" placeholder="87">
</div>
</div>
<div id="therm-gpu-overrides" class="form-row" style="margin-top:.5rem;display:none">
<label class="form-label" style="margin-bottom:.3rem">Per-card overrides <span class="muted">(optional — blank uses the limits above)</span></label>
<span class="form-hint" style="margin-bottom:.4rem">Set a different pause/resume limit per GPU vendor detected on this machine — e.g. let a Radeon run hotter than an NVIDIA card. Each card is judged against its own vendor's limit.</span>
<!-- One row per GPU vendor actually present, built from detected hardware. -->
<div id="therm-gpu-overrides-rows" style="display:grid;grid-template-columns:90px 1fr 1fr;gap:.5rem;align-items:center"></div>
</div>
<div class="form-row" style="margin-top:.5rem">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
......@@ -321,10 +378,148 @@
</div>
</div>
</div>
<!-- DeepSeek V4 (ds4) -->
<div class="card mb-0" style="margin-top:1rem">
<div class="card-title">DeepSeek V4 (ds4)</div>
<p class="form-hint" style="margin-bottom:.6rem">Run DeepSeek V4 through antirez's native <a href="https://github.com/antirez/ds4" target="_blank" rel="noopener">ds4 / DwarfStar</a> engine as a managed subprocess. When enabled, requests for a matching model name are proxied to ds4-server. First use clones + builds ds4 and downloads several GB of weights — this is slow and needs lots of RAM (96 GB+).</p>
<div class="form-row">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-ds4-enabled" onchange="toggleDs4Fields()">
<span style="font-size:13px;font-weight:500">Enable ds4 (DeepSeek V4)</span>
</label>
</div>
<div id="ds4-fields" style="display:none">
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;align-items:start">
<div class="form-row" style="margin:0">
<label class="form-label">Model id / alias</label>
<input type="text" id="s-ds4-model-id" class="form-input" placeholder="deepseek-v4">
<span class="form-hint">Requests for this id (or any name containing "deepseek-v4") route to ds4.</span>
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Weight variant</label>
<select id="s-ds4-variant" class="form-input">
<option value="q2-imatrix">q2-imatrix (96/128 GB)</option>
<option value="q2-q4-imatrix">q2-q4-imatrix (96/128 GB)</option>
<option value="q4-imatrix">q4-imatrix (256 GB+)</option>
<option value="pro-q2-imatrix">pro-q2-imatrix (512 GB)</option>
</select>
</div>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;align-items:start">
<div class="form-row" style="margin:0">
<label class="form-label">Build target</label>
<select id="s-ds4-build-target" class="form-input">
<option value="auto">auto-detect</option>
<option value="cuda-generic">cuda-generic</option>
<option value="cuda-spark">cuda-spark (DGX Spark/GB10)</option>
<option value="metal">metal (macOS)</option>
<option value="cpu">cpu</option>
</select>
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Install dir</label>
<input type="text" id="s-ds4-install-dir" class="form-input" placeholder="~/.coderai/ds4">
</div>
</div>
<div class="form-row">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer">
<input type="checkbox" id="s-ds4-auto-build">
<span>Auto clone + build the ds4-server binary if missing</span>
</label>
</div>
<div style="display:grid;grid-template-columns:1fr 120px 140px;gap:1rem;align-items:start">
<div class="form-row" style="margin:0">
<label class="form-label">Bind host</label>
<input type="text" id="s-ds4-host" class="form-input" placeholder="127.0.0.1">
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Port</label>
<input type="number" id="s-ds4-port" class="form-input" min="0" placeholder="0 = auto">
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Context (--ctx)</label>
<input type="number" id="s-ds4-ctx" class="form-input" min="1024" placeholder="100000">
</div>
</div>
<div class="form-row">
<label class="form-label">Repo URL</label>
<input type="text" id="s-ds4-repo-url" class="form-input" placeholder="https://github.com/antirez/ds4">
</div>
<div class="form-row">
<label class="form-label">Extra ds4-server args</label>
<input type="text" id="s-ds4-extra-args" class="form-input" placeholder="--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192">
</div>
</div>
</div>
{% endblock %}
{% block scripts %}
<script>
// Build one per-vendor threshold-override row for each GPU vendor actually present
// on this machine (detected via /admin/api/gpu-stats) — never hardcode the mix.
const _VENDOR_LABEL = {nvidia:'NVIDIA', amd:'Radeon', intel:'Intel'};
const _VENDOR_COLOR = {nvidia:'#76b900', amd:'#ed1c24', intel:'#0071c5'};
/**
 * Render one "pause / resume" threshold row per GPU vendor into
 * `#therm-gpu-overrides-rows`.
 *
 * Vendors come from `/admin/api/gpu-stats` (the `vendor` field of each card),
 * plus any vendor that already has a saved override even if it is not detected
 * right now. The whole `#therm-gpu-overrides` section is hidden when there is
 * no vendor at all.
 *
 * Rows are built as DOM nodes (textContent / dataset / setAttribute) rather
 * than an HTML string: vendor keys originate from server data and saved
 * settings, and interpolating them raw into innerHTML would let a stray quote
 * or angle bracket break the markup or inject content.
 *
 * @param {Object<string,{high?:number,resume?:number}>} overrides
 *        saved per-vendor limits, keyed by vendor id
 */
async function _buildGpuOverrideRows(overrides){
  const wrap = document.getElementById('therm-gpu-overrides');
  const rows = document.getElementById('therm-gpu-overrides-rows');
  let vendors = [];
  try {
    const resp = await fetch(ROOT_PATH + '/admin/api/gpu-stats');
    const cards = (await resp.json()).cards || [];
    vendors = [...new Set(cards.map(c => c.vendor).filter(Boolean))];
  } catch(e) { /* detection is best-effort; saved overrides below still render */ }
  // Also include any vendor that already has a saved override, even if not detected now.
  Object.keys(overrides || {}).forEach(v => { if(!vendors.includes(v)) vendors.push(v); });
  if(!vendors.length){ wrap.style.display = 'none'; rows.innerHTML = ''; return; }
  wrap.style.display = '';

  // One numeric threshold input; `key` is "high" or "resume".
  const makeInput = (vendor, key, min, placeholder, value) => {
    const inp = document.createElement('input');
    inp.type = 'number';
    inp.className = 'form-input therm-ov';
    inp.dataset.vendor = vendor;
    inp.dataset.k = key;
    inp.min = min;
    inp.max = '120';
    inp.step = '1';
    inp.placeholder = placeholder;
    inp.setAttribute('value', String(value ?? ''));
    return inp;
  };

  const frag = document.createDocumentFragment();
  vendors.forEach(v => {
    const ov = (overrides && overrides[v]) || {};
    const label = document.createElement('span');
    label.style.fontSize = '12px';
    label.style.color = _VENDOR_COLOR[v] || 'var(--text-2)';
    label.textContent = _VENDOR_LABEL[v] || v;
    frag.append(
      label,
      makeInput(v, 'high', '40', 'pause °C', ov.high),
      makeInput(v, 'resume', '30', 'resume °C', ov.resume)
    );
  });
  rows.innerHTML = '';
  rows.appendChild(frag);
}
/**
 * Read the per-vendor thermal override inputs back into a plain object.
 *
 * Every `.therm-ov` input under `#therm-gpu-overrides-rows` whose value parses
 * as a number contributes `result[data-vendor][data-k] = number`; blank or
 * non-numeric inputs are skipped, so a vendor with no values is omitted.
 *
 * @returns {Object<string,Object<string,number>>}
 */
function _collectGpuOverrides(){
  const result = {};
  const inputs = document.querySelectorAll('#therm-gpu-overrides-rows .therm-ov');
  for (const input of inputs) {
    const parsed = parseFloat(input.value);
    if (isNaN(parsed)) continue;
    const vendor = input.dataset.vendor;
    const bucket = result[vendor] || {};
    result[vendor] = bucket;
    bucket[input.dataset.k] = parsed;
  }
  return result;
}
/**
 * Per-engine concurrency overrides — one row per engine (by name).
 *
 * Renders a three-column grid (engine name, max parallel, max instances) into
 * `#concurrency-override-rows`, pre-filled from the saved override maps. The
 * whole `#concurrency-overrides` section is hidden when fewer than two engines
 * are given, since overrides are meaningless with a single engine.
 *
 * Rows are built as DOM nodes instead of an HTML string: engine names are
 * configuration-supplied, and interpolating them raw into innerHTML (both as
 * text and inside `data-engine="…"`) would let a quote or angle bracket break
 * the markup or inject content.
 *
 * @param {string[]} engNames                engine names to render
 * @param {Object<string,number>} [parOv]    saved max-parallel overrides by engine
 * @param {Object<string,number>} [instOv]   saved max-instances overrides by engine
 */
function _buildConcurrencyOverrides(engNames, parOv, instOv){
  const wrap = document.getElementById('concurrency-overrides');
  const rows = document.getElementById('concurrency-override-rows');
  if(!engNames || engNames.length < 2){ wrap.style.display='none'; rows.innerHTML=''; return; }
  // Tolerate missing maps so a partial settings payload cannot throw here.
  const par = parOv || {};
  const inst = instOv || {};
  wrap.style.display = '';

  const headerCell = text => {
    const cell = document.createElement('span');
    cell.className = 'muted';
    cell.style.fontSize = '11px';
    cell.textContent = text;
    return cell;
  };
  const numberInput = (cls, engine, max, value) => {
    const inp = document.createElement('input');
    inp.type = 'number';
    inp.className = 'form-input ' + cls;
    inp.dataset.engine = engine;
    inp.min = '1';
    inp.max = max;
    inp.placeholder = 'default';
    inp.setAttribute('value', String(value ?? ''));
    return inp;
  };

  const frag = document.createDocumentFragment();
  frag.append(headerCell('Engine'), headerCell('Max parallel'), headerCell('Max instances'));
  engNames.forEach(n => {
    const nameCell = document.createElement('span');
    nameCell.style.fontSize = '12px';
    nameCell.textContent = n;
    frag.append(
      nameCell,
      numberInput('conc-par', n, '64', par[n]),
      numberInput('conc-inst', n, '16', inst[n])
    );
  });
  rows.innerHTML = '';
  rows.appendChild(frag);
}
/**
 * Collect one column of the per-engine concurrency overrides.
 *
 * @param {string} cls  class of the inputs to read ("conc-par" or "conc-inst")
 * @returns {Object<string,number>}  engine name → integer override; inputs that
 *          are blank, non-numeric or below 1 are left out
 */
function _collectConcOverrides(cls){
  const collected = {};
  const inputs = document.querySelectorAll('#concurrency-override-rows .' + cls);
  for (const input of inputs) {
    const count = parseInt(input.value);
    if (isNaN(count) || count < 1) continue;
    collected[input.dataset.engine] = count;
  }
  return collected;
}
/** Show the ds4 detail fields only while the "Enable ds4" checkbox is ticked. */
function toggleDs4Fields(){
  const panel = document.getElementById('ds4-fields');
  const enableBox = document.getElementById('s-ds4-enabled');
  if (enableBox.checked) {
    panel.style.display = 'block';
  } else {
    panel.style.display = 'none';
  }
}
function toggleHttps(){
document.getElementById('https-fields').style.display =
document.getElementById('s-https').checked ? 'block' : 'none';
......@@ -371,12 +566,41 @@ async function loadSettings(){
document.getElementById('s-key').value = d.server?.https_key_path ?? '';
document.getElementById('s-cert').value = d.server?.https_cert_path ?? '';
document.getElementById('s-queue-max').value = d.server?.queue_max_size ?? 6;
document.getElementById('s-internal-port-base').value = d.server?.internal_port_base ?? 8780;
// Default engine — surfaced when 2+ engines are running. Sourced from the
// front's live engine list so it also covers AUTO-DETECTED engines (no
// engine_specs needed); falls back to declared engine_specs names.
let engNames = [];
try {
const er = await fetch(ROOT_PATH + '/admin/api/engines');
if (er.ok) engNames = ((await er.json()).engines || []).map(e => e.name);
} catch(e) {}
if (!engNames.length) engNames = d.server?.engine_names || [];
const engRow = document.getElementById('default-engine-row');
const engSel = document.getElementById('s-default-engine');
if (engNames.length > 1) {
engSel.querySelectorAll('option:not([value=""])').forEach(o => o.remove());
engNames.forEach(n => { const o=document.createElement('option'); o.value=n; o.textContent=n; engSel.appendChild(o); });
engSel.value = d.server?.default_engine || '';
engRow.style.display = '';
} else {
engRow.style.display = 'none';
}
// Concurrency: defaults + per-engine override rows.
document.getElementById('s-max-parallel').value = d.server?.max_parallel_requests ?? 2;
document.getElementById('s-max-instances').value = d.models?.max_model_instances ?? 1;
_buildConcurrencyOverrides(engNames,
d.server?.max_parallel_requests_overrides || {},
d.models?.max_model_instances_overrides || {});
document.getElementById('s-hf-cache').value = d.models?.hf_cache_dir ?? '';
document.getElementById('s-gguf-cache').value = d.models?.gguf_cache_dir ?? '';
document.getElementById('s-offload-dir').value = d.offload?.directory ?? './offload';
document.getElementById('s-max-ram').value = d.offload?.max_ram_gb ?? '';
document.getElementById('s-evict-idle-ram').checked = d.offload?.evict_idle_on_ram !== false;
document.getElementById('s-ram-leak-watch').checked = d.offload?.ram_leak_watch !== false;
document.getElementById('s-ram-watch-cuda').checked = d.offload?.ram_watch_cuda !== false;
document.getElementById('s-ram-watch-poll').value = d.offload?.ram_watch_poll_seconds ?? '';
document.getElementById('s-ram-watch-soft').value = d.offload?.ram_watch_soft_fraction ?? '';
document.getElementById('s-tmp-dir').value = d.tmp_dir ?? '';
document.getElementById('s-allow-ffmpeg').checked = !!(d.enhance && d.enhance.allow_ffmpeg);
document.getElementById('s-allow-rife-ncnn').checked = !!(d.enhance && d.enhance.allow_rife_ncnn);
......@@ -416,6 +640,7 @@ async function loadSettings(){
document.getElementById('s-therm-cpu-enabled').checked = therm.cpu_enabled !== false;
document.getElementById('s-therm-gpu-high').value = therm.gpu_high ?? 90;
document.getElementById('s-therm-gpu-resume').value = therm.gpu_resume ?? 87;
await _buildGpuOverrideRows(therm.gpu_overrides || {});
document.getElementById('s-therm-cpu-high').value = therm.cpu_high ?? 90;
document.getElementById('s-therm-cpu-resume').value = therm.cpu_resume ?? 87;
document.getElementById('s-therm-poll').value = therm.poll_seconds ?? 5;
......@@ -426,6 +651,20 @@ async function loadSettings(){
// Background jobs
const jobs = d.jobs || {};
document.getElementById('s-jobs-resume').checked = jobs.resume_on_restart !== false;
// DeepSeek V4 (ds4)
const ds4 = d.ds4 || {};
document.getElementById('s-ds4-enabled').checked = !!ds4.enabled;
document.getElementById('s-ds4-model-id').value = ds4.model_id ?? 'deepseek-v4';
document.getElementById('s-ds4-variant').value = ds4.model_variant ?? 'q4-imatrix';
document.getElementById('s-ds4-build-target').value = ds4.build_target ?? 'auto';
document.getElementById('s-ds4-install-dir').value = ds4.install_dir ?? '';
document.getElementById('s-ds4-auto-build').checked = ds4.auto_build !== false;
document.getElementById('s-ds4-host').value = ds4.host ?? '127.0.0.1';
document.getElementById('s-ds4-port').value = ds4.port ?? 0;
document.getElementById('s-ds4-ctx').value = ds4.ctx ?? 100000;
document.getElementById('s-ds4-repo-url').value = ds4.repo_url ?? 'https://github.com/antirez/ds4';
document.getElementById('s-ds4-extra-args').value = ds4.extra_args ?? '';
toggleDs4Fields();
}catch(e){ showAlert('error','Failed to load settings: '+e.message); }
}
......@@ -439,16 +678,25 @@ async function saveSettings(){
https_key_path: strOrNull('s-key'),
https_cert_path: strOrNull('s-cert'),
queue_max_size: parseInt(document.getElementById('s-queue-max').value) || 6,
internal_port_base: parseInt(document.getElementById('s-internal-port-base').value) || 8780,
max_parallel_requests: parseInt(document.getElementById('s-max-parallel').value) || 2,
max_parallel_requests_overrides: _collectConcOverrides('conc-par'),
default_engine: document.getElementById('s-default-engine').value || null,
},
models:{
hf_cache_dir: strOrNull('s-hf-cache'),
gguf_cache_dir: strOrNull('s-gguf-cache'),
max_model_instances: parseInt(document.getElementById('s-max-instances').value) || 1,
max_model_instances_overrides: _collectConcOverrides('conc-inst'),
},
offload:{
directory: document.getElementById('s-offload-dir').value.trim() || './offload',
max_ram_gb: (parseFloat(document.getElementById('s-max-ram').value) || null),
evict_idle_on_ram: document.getElementById('s-evict-idle-ram').checked,
ram_leak_watch: document.getElementById('s-ram-leak-watch').checked,
ram_watch_cuda: document.getElementById('s-ram-watch-cuda').checked,
ram_watch_poll_seconds: (parseFloat(document.getElementById('s-ram-watch-poll').value) || null),
ram_watch_soft_fraction: (parseFloat(document.getElementById('s-ram-watch-soft').value) || null),
},
tmp_dir: strOrNull('s-tmp-dir'),
enhance:{
......@@ -465,6 +713,7 @@ async function saveSettings(){
cpu_enabled: document.getElementById('s-therm-cpu-enabled').checked,
gpu_high: parseFloat(document.getElementById('s-therm-gpu-high').value) || 90,
gpu_resume: parseFloat(document.getElementById('s-therm-gpu-resume').value) || 87,
gpu_overrides: _collectGpuOverrides(),
cpu_high: parseFloat(document.getElementById('s-therm-cpu-high').value) || 90,
cpu_resume: parseFloat(document.getElementById('s-therm-cpu-resume').value) || 87,
poll_seconds: parseFloat(document.getElementById('s-therm-poll').value) || 5,
......@@ -475,6 +724,19 @@ async function saveSettings(){
jobs:{
resume_on_restart: document.getElementById('s-jobs-resume').checked,
},
ds4:{
enabled: document.getElementById('s-ds4-enabled').checked,
model_id: document.getElementById('s-ds4-model-id').value.trim() || 'deepseek-v4',
model_variant: document.getElementById('s-ds4-variant').value,
build_target: document.getElementById('s-ds4-build-target').value,
install_dir: document.getElementById('s-ds4-install-dir').value.trim(),
auto_build: document.getElementById('s-ds4-auto-build').checked,
host: document.getElementById('s-ds4-host').value.trim() || '127.0.0.1',
port: parseInt(document.getElementById('s-ds4-port').value) || 0,
ctx: parseInt(document.getElementById('s-ds4-ctx').value) || 100000,
repo_url: document.getElementById('s-ds4-repo-url').value.trim() || 'https://github.com/antirez/ds4',
extra_args: document.getElementById('s-ds4-extra-args').value.trim(),
},
broker:{
enabled: document.getElementById('s-broker-enabled').checked,
base_url: document.getElementById('s-broker-base-url').value.trim(),
......@@ -499,7 +761,14 @@ async function saveSettings(){
method:'POST', headers:{'Content-Type':'application/json'},
body: JSON.stringify(data)
});
if(r.ok) showAlert('info','Settings saved. Archive and thermal-protection changes take effect immediately; restart CoderAI for other changes.');
if(r.ok){
const d = await r.json().catch(()=>({}));
if(d.warnings && d.warnings.length){
showAlert('error', 'Saved, but: ' + d.warnings.join(' '));
} else {
showAlert('info','Settings saved. Archive and thermal-protection changes take effect immediately; restart CoderAI for other changes.');
}
}
else{ const e=await r.json(); showAlert('error', e.detail||'Save failed'); }
}catch(e){ showAlert('error','Error: '+e.message); }
}
......
......@@ -30,11 +30,23 @@
<div id="sys-stats" style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));
gap:.75rem;margin:0 0 1.25rem">
<div class="sys-tile" id="tile-cpu"></div>
<div class="sys-tile" id="tile-gpu"></div>
<div class="sys-tile" id="tile-ram"></div>
<!-- Per-card GPU tiles (util + VRAM) injected here when cards are detected. -->
<div id="tile-cards" style="display:contents"></div>
<!-- Fallback single tiles when per-card stats are unavailable. -->
<div class="sys-tile" id="tile-gpu"></div>
<div class="sys-tile" id="tile-vram"></div>
</div>
<!-- Engines (only shown in front/multi-engine mode) -->
<div id="engines-card" style="display:none;margin:0 0 1.25rem">
<div style="display:flex;align-items:baseline;gap:.5rem;margin-bottom:.5rem">
<h2 style="font-size:14px;margin:0">Engines</h2>
<span class="dim small">restart a stuck engine — the supervisor respawns it</span>
</div>
<div id="engines-body" style="display:grid;grid-template-columns:repeat(auto-fit,minmax(240px,1fr));gap:.6rem"></div>
</div>
<style>
.sys-tile{border:1px solid var(--border,#2a2a2a);border-radius:10px;padding:.7rem .85rem;
background:var(--card-bg,rgba(255,255,255,.02))}
......@@ -76,7 +88,7 @@ function fmtTime(s) {
} catch { return ''; }
}
const KIND_LABEL = {training:'Training', image:'Image', video:'Video', upscale:'Upscale', interpolate:'Interpolate', audio:'Audio', text:'Text', pipeline:'Pipeline', request:'Request', loading:'Loading'};
const KIND_LABEL = {training:'Training', image:'Image', video:'Video', upscale:'Upscale', interpolate:'Interpolate', audio:'Audio', text:'Text', tts:'Speech (TTS)', transcription:'Transcription', embedding:'Embedding', spatial:'3D / Spatial', pipeline:'Pipeline', request:'Request', loading:'Loading'};
const STATUS_BADGE = {
running:'badge-admin', queued:'badge-user', done:'badge-ok', error:'badge-err',
cancelled:'badge-user', interrupted:'badge-warn'
......@@ -140,18 +152,89 @@ function _memTile(name, used, total, pct){
return `<div class="sys-head"><span class="sys-name">${name}</span><span class="sys-val">${valTxt}</span></div>`
+ _bar(p) + `<div class="sys-sub"><span>${p == null ? '' : Math.round(p)+'% used'}</span><span></span></div>`;
}
/**
 * Render one tile for a physical card: utilisation (plus temperature when
 * known) with its bar, then VRAM used/total with its own bar.
 *
 * @param {object} c  card stats; reads `vendor`, `name`, `util`, `temp`,
 *                    `mem_used`, `mem_total`
 * @returns {string}  HTML for the tile
 */
function _cardTile(c){
  // Vendor accent colour for the card name.
  let accent = 'var(--text-3)';
  if (c.vendor === 'nvidia') {
    accent = '#76b900';
  } else if (c.vendor === 'amd') {
    accent = '#ed1c24';
  }

  // VRAM percentage is only meaningful when a non-zero total is reported.
  const vramPct = c.mem_total ? (c.mem_used / c.mem_total * 100) : null;
  const tempTxt = (c.temp != null) ? ' · ' + Math.round(c.temp) + '°C' : '';
  const utilTxt = (c.util != null) ? Math.round(c.util) + '%' : '—';
  const usedTxt = (c.mem_used != null) ? c.mem_used.toFixed(1) : '—';
  const totalTxt = (c.mem_total != null) ? c.mem_total.toFixed(0) : '—';
  const pctTxt = (vramPct != null) ? Math.round(vramPct) + '% used' : '';

  return `<div class="sys-tile">
<div class="sys-head"><span class="sys-name" style="color:${accent}">${esc(c.name)}</span>
<span class="sys-val">${utilTxt}${tempTxt}</span></div>
${_bar(c.util)}
<div class="sys-sub"><span>VRAM ${usedTxt}/${totalTxt} GB</span>
<span>${pctTxt}</span></div>
${_bar(vramPct)}
</div>`;
}
// Refresh the system tiles (CPU, RAM, GPU/VRAM) from /admin/api/system-stats,
// and replace the single GPU/VRAM tiles with one tile per card when
// /admin/api/gpu-stats reports any cards. Any failure of the system-stats
// request leaves the previous render untouched.
async function loadSystemStats(){
try {
const s = await fetch(ROOT_PATH + '/admin/api/system-stats').then(r => r.json());
// Missing sections default to {} so the tile helpers receive undefined fields.
const cpu = s.cpu || {}, gpu = s.gpu || {}, ram = s.ram || {}, vram = s.vram || {};
// Third argument is derived from the core count (cores × 100, 1 core assumed when missing).
document.getElementById('tile-cpu').innerHTML = _utilTile('CPU', cpu.util, cpu.temp, (cpu.cores || 1) * 100);
// NOTE(review): tile-gpu / tile-vram are written here and again in the fallback
// branch below; this looks like pre-change lines kept alongside the new code — confirm.
document.getElementById('tile-gpu').innerHTML = _utilTile('GPU', gpu.util, gpu.temp);
document.getElementById('tile-ram').innerHTML = _memTile('RAM', ram.used, ram.total, ram.percent);
document.getElementById('tile-vram').innerHTML =
_memTile('VRAM', vram.used, vram.total, vram.percent);
// Per-card GPU+VRAM tiles for every physical card; fall back to single tiles.
let cards = [];
// A failed gpu-stats request is swallowed on its own, so it only triggers the fallback.
try { cards = ((await fetch(ROOT_PATH + '/admin/api/gpu-stats').then(r => r.json())).cards) || []; } catch(e){}
const cardsEl = document.getElementById('tile-cards');
const gpuEl = document.getElementById('tile-gpu');
const vramEl = document.getElementById('tile-vram');
if (cards.length) {
// Per-card view: show the card tiles and hide the aggregate GPU/VRAM tiles.
cardsEl.innerHTML = cards.map(_cardTile).join('');
gpuEl.style.display = 'none'; vramEl.style.display = 'none';
} else {
// No per-card data: clear the card tiles and show the aggregate tiles instead.
cardsEl.innerHTML = '';
gpuEl.style.display = ''; vramEl.style.display = '';
gpuEl.innerHTML = _utilTile('GPU', gpu.util, gpu.temp);
vramEl.innerHTML = _memTile('VRAM', vram.used, vram.total, vram.percent);
}
} catch(e){ /* keep last render on transient errors */ }
}
/**
 * Engines panel — only present in front/multi-engine mode (404 in single-process).
 *
 * Fetches `/admin/api/engines` and renders one tile per engine with its health
 * dot, VRAM usage, loaded-model count, "primary"/"cooling" badges and a Restart
 * button. The whole card is hidden when the endpoint fails, returns a non-OK
 * status, or reports no engines.
 *
 * Notes on the markup:
 * - VRAM numbers are formatted defensively: a missing `used`/`total` renders
 *   as "—" instead of throwing on `.toFixed` of null/undefined.
 * - The Restart button carries the engine id and name in `data-*` attributes
 *   and reads them back via `this.dataset`. Splicing them into the inline
 *   handler source would not be safe: HTML entities are decoded before the
 *   handler is compiled, so a quote in the name would end the JS string, and a
 *   non-numeric id would be parsed as a bare identifier.
 *   (Assumes `esc` is attribute-safe, as its other uses in attributes imply.)
 */
async function loadEngines(){
  const card = document.getElementById('engines-card');
  const hideCard = () => { card.style.display = 'none'; };

  let engines = null;
  try {
    const r = await fetch(ROOT_PATH + '/admin/api/engines');
    if (!r.ok) { hideCard(); return; }
    engines = (await r.json()).engines || [];
  } catch(e) { hideCard(); return; }
  if (!engines.length) { hideCard(); return; }
  card.style.display = '';

  // Numbers get one decimal; other present values pass through; missing → dash.
  const fmtUsed = v => (typeof v === 'number' ? v.toFixed(1) : (v ?? '—'));

  document.getElementById('engines-body').innerHTML = engines.map(e => {
    const dot = e.healthy ? '#3fb950' : '#e5534b';
    const state = e.healthy ? 'healthy' : 'down / starting';
    const vram = e.vram ? `${fmtUsed(e.vram.used)}/${e.vram.total ?? '—'} GB` : '';
    const cool = e.cooling ? ` <span class="badge badge-warn" style="font-size:9px">❄ cooling</span>` : '';
    const prim = e.primary ? ` <span class="badge badge-user" style="font-size:9px">primary</span>` : '';
    const models = (e.loaded_models||[]).length;
    return `<div class="sys-tile">
  <div class="sys-head">
    <span class="sys-name">${esc(e.name)} <span class="dim" style="text-transform:none">(${esc(e.backend)})</span>${prim}${cool}</span>
    <span style="width:9px;height:9px;border-radius:50%;background:${dot};display:inline-block" title="${state}"></span>
  </div>
  <div class="sys-sub"><span>${esc(state)}${vram?' · '+esc(vram):''}</span><span>${models} model${models!==1?'s':''}</span></div>
  <div style="margin-top:.5rem;text-align:right">
    <button class="btn btn-ghost" style="font-size:11px;padding:.15rem .5rem;color:var(--error,#e55)"
      data-engine-id="${esc(String(e.id))}" data-engine-name="${esc(e.name)}"
      onclick="restartEngine(this.dataset.engineId, this.dataset.engineName)" title="Kill and respawn this engine">↻ Restart</button>
  </div>
</div>`;
  }).join('');
}
async function restartEngine(id, name){
  // Ask first: restarting drops whatever the engine is currently serving.
  const question = `Restart engine "${name}"? In-flight requests on it will fail; the supervisor respawns it immediately.`;
  if (!confirm(question)) return;
  try {
    const resp = await fetch(ROOT_PATH + '/admin/api/engines/' + id + '/restart', {method: 'POST'});
    if (!resp.ok) {
      // The error body may not be JSON; fall back to a generic message.
      const err = await resp.json().catch(() => ({}));
      alert(err.detail || 'Restart failed');
    }
    // Refresh the panel shortly afterwards, whether or not the call succeeded.
    setTimeout(loadEngines, 800);
  } catch (err) {
    alert(err.message);
  }
}
let _refreshing = false;
async function loadTasks() {
if (_refreshing) return;
......@@ -165,7 +248,19 @@ async function loadTasks() {
const therm = data.thermal || {};
const banner = document.getElementById('thermal-banner');
if (therm.active) {
// Multi-engine: name which engine(s) are cooling and on what (GPU vs CPU).
const cooling = data.cooling_engines || [];
if (cooling.length) {
const parts = cooling.map(c => {
const what = (c.gpu != null && c.cpu == null) ? `GPU ${Math.round(c.gpu)}°C`
: (c.cpu != null && c.gpu == null) ? `CPU ${Math.round(c.cpu)}°C`
: (c.message || 'cooling');
return `${esc(c.engine)} (${esc(what)})`;
});
document.getElementById('thermal-banner-msg').textContent =
' Cooling down: ' + parts.join(', ');
banner.style.display = '';
} else if (therm.active) {
document.getElementById('thermal-banner-msg').textContent = ' ' + (therm.message || '');
banner.style.display = '';
} else {
......@@ -207,7 +302,7 @@ async function loadTasks() {
}
return `<tr>
<td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
<td><div class="td-name">${esc(title)}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
<td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
<td>${statusCell}</td>
<td>${progressBar(t)}</td>
<td class="dim small">${fmtTime(t.started_at)}</td>
......@@ -248,7 +343,9 @@ async function removeTask(id) {
// Initial render of each panel on page load.
loadTasks();
loadSystemStats();
loadEngines();
// Keep polling: tasks and system stats every 2 s, the engines panel every 5 s.
setInterval(loadTasks, 2000);
setInterval(loadSystemStats, 2000);
setInterval(loadEngines, 5000);
</script>
{% endblock %}
......@@ -160,6 +160,32 @@ except ImportError:
pass
class _InternalAuthMiddleware:
    """Reject any HTTP request that doesn't carry the front's internal token.

    Active only when CODERAI_INTERNAL_TOKEN is set (i.e. this process is an engine
    spawned by the front). It binds 127.0.0.1, but this also blocks anything else on
    localhost from talking to the engine directly and bypassing the front. In
    single-process mode the token is unset and this is a no-op.

    The token is expected in the ``x-coderai-internal`` request header. Non-HTTP
    scopes are always passed through untouched.
    """

    def __init__(self, app):
        self._app = app
        # Read once at construction; None/empty disables the gate entirely.
        self._token = os.environ.get("CODERAI_INTERNAL_TOKEN")

    async def __call__(self, scope, receive, send):
        if self._token and scope.get("type") == "http":
            import hmac

            headers = dict(scope.get("headers", []))
            got = headers.get(b"x-coderai-internal", b"").decode("latin-1")
            # Constant-time comparison so response timing does not reveal how much
            # of a guessed token was correct. Comparing the UTF-8 bytes is
            # equivalent to string equality and, unlike compare_digest on str,
            # accepts non-ASCII input.
            supplied = got.encode("utf-8", "surrogatepass")
            expected = self._token.encode("utf-8", "surrogatepass")
            if not hmac.compare_digest(supplied, expected):
                await send({"type": "http.response.start", "status": 403,
                            "headers": [(b"content-type", b"application/json")]})
                await send({"type": "http.response.body",
                            "body": b'{"error":"forbidden: engines are reachable only '
                                    b'through the front proxy"}'})
                return
        await self._app(scope, receive, send)
class _ForwardedPrefixMiddleware:
"""Populate ASGI root_path from X-Forwarded-Prefix / X-Script-Name headers."""
......@@ -180,6 +206,9 @@ class _ForwardedPrefixMiddleware:
app.add_middleware(_ForwardedPrefixMiddleware)
# Added last → outermost: the internal-token gate runs before anything else, so a
# request without the front's token never reaches a route.
app.add_middleware(_InternalAuthMiddleware)
# Mount static files for admin dashboard
from fastapi.staticfiles import StaticFiles
......@@ -193,6 +222,77 @@ from fastapi.responses import FileResponse, Response as _FaviconResponse
_favicon_path = admin_static_dir / "favicon.ico"
@app.get("/healthz", include_in_schema=False)
async def healthz():
    """Liveness probe: reports ``ok`` and this process's pid, nothing else.

    The front proxy's engine supervisor polls this to tell a *slow* engine (busy
    loading a model, so the answer may be late) from a *dead* one (connection
    refused). Keep it free of torch/model state so it answers as soon as the event
    loop is free.
    """
    import os as _os

    payload = {"ok": True, "pid": _os.getpid()}
    return payload
@app.get("/internal/engine-state", include_in_schema=False)
async def internal_engine_state():
    """Engine introspection for the front proxy's router/aggregator.

    The handler does no authentication of its own. It returns which models are
    resident (for model→engine routing), this engine's summed CUDA memory (for
    cross-engine status aggregation), its active tasks and its thermal cooldown
    state. Every probe is best-effort: if one fails, that field falls back to its
    empty value instead of failing the whole request.
    """
    import os as _os

    # Resident models.
    try:
        resident = list(multi_model_manager.models.keys())
    except Exception:
        resident = []

    # VRAM, summed across every CUDA device this engine can see — an engine may own
    # more than one GPU, so reporting only device 0 would under-count.
    vram = None
    try:
        import torch
        if torch.cuda.is_available():
            device_count = torch.cuda.device_count()
            used_bytes = free_bytes = total_bytes = 0
            per_device = []
            for idx in range(device_count):
                dev_free, dev_total = torch.cuda.mem_get_info(idx)
                used_bytes += (dev_total - dev_free)
                free_bytes += dev_free
                total_bytes += dev_total
                per_device.append({
                    "index": idx,
                    "name": torch.cuda.get_device_name(idx),
                    "free": round(dev_free / 1e9, 2),
                    "total": round(dev_total / 1e9, 2),
                })
            if device_count == 1:
                label = torch.cuda.get_device_name(0)
            else:
                label = f"{device_count}× CUDA"
            vram = {
                "used": round(used_bytes / 1e9, 2),
                "free": round(free_bytes / 1e9, 2),
                "total": round(total_bytes / 1e9, 2),
                "gpu": label,
                "devices": per_device,
                "device_count": device_count,
            }
    except Exception:
        vram = None

    # Active tasks, so the front can show cross-engine activity without needing a
    # session on this engine.
    try:
        from codai.tasks import task_registry
        active_states = ("running", "queued", "paused")
        tasks = [entry for entry in task_registry.list()
                 if entry.get("status") in active_states]
    except Exception:
        tasks = []

    # Thermal cooldown state, reported only while a cooldown is active.
    cooling = None
    try:
        from codai.models import thermal
        state = thermal.get_cooldown_state()
        if state.get("active"):
            cooling = {
                "gpu": state.get("gpu"),
                "cpu": state.get("cpu"),
                "message": state.get("message"),
            }
    except Exception:
        cooling = None

    return {
        "ok": True,
        "pid": _os.getpid(),
        "loaded_models": resident,
        "vram": vram,
        "tasks": tasks,
        "cooling": cooling,
    }
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
if _favicon_path.exists():
......
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Fully-managed ds4 (DeepSeek V4) worker.
ds4 / DwarfStar (https://github.com/antirez/ds4) is a native inference engine for
DeepSeek V4 that ships its own OpenAI-compatible HTTP server (``ds4-server``). It is
not a Python package — it's a C/CUDA/Metal binary built with ``make`` — so coderai
owns the whole lifecycle here, mirroring :mod:`codai.api.parler_worker`:
* :func:`ensure_built` clones the repo and runs the right ``make`` target so the
``ds4-server`` binary exists (idempotent).
* :func:`ensure_model` runs the project's ``download_model.sh`` for the configured
weight variant if the model isn't already present.
* :func:`ensure_service` launches ``ds4-server`` on a free port, health-checks its
``/v1/models`` endpoint, and returns the base URL.
The matching ``Ds4Backend.cleanup()`` calls :func:`stop_service`, so the model
manager's normal eviction tears the process down.
"""
import os
import platform
import shutil
import socket
import subprocess
import sys
import threading
import time
import collections
from pathlib import Path
from typing import Optional
_lock = threading.RLock()
# Single managed server (ds4 serves one DeepSeek V4 model). Keyed by model_id so a
# config change to a different id restarts cleanly.
_services: dict[str, dict] = {} # model_id -> {"proc","port","url"}
_built = False
def default_install_dir() -> Path:
    """Return the ds4 install directory: ``$CODERAI_DS4_DIR`` if set and non-empty,
    otherwise ``~/.coderai/ds4`` with the home directory expanded."""
    override = os.environ.get("CODERAI_DS4_DIR")
    if override:
        return Path(override)
    return Path(os.path.expanduser("~/.coderai/ds4"))
def _install_dir(cfg) -> Path:
    """Return ``cfg.install_dir`` (user-expanded) when it is set and non-empty,
    otherwise the default install directory."""
    configured = getattr(cfg, "install_dir", None)
    if not configured:
        return default_install_dir()
    return Path(configured).expanduser()
def _server_bin(install_dir: Path) -> Path:
    """Return the path of the ``ds4-server`` binary inside ``install_dir``."""
    return install_dir.joinpath("ds4-server")
def _detect_build_target() -> str:
    """Pick a build target name from the host.

    macOS → ``metal``; a host with ``nvcc`` on PATH or a ``/usr/local/cuda``
    directory → ``cuda-generic``; anything else → ``cpu``.
    """
    on_macos = platform.system() == "Darwin"
    if on_macos:
        return "metal"
    has_cuda = bool(shutil.which("nvcc")) or os.path.isdir("/usr/local/cuda")
    return "cuda-generic" if has_cuda else "cpu"
def _resolve_target(cfg) -> str:
    """Return the ``make`` target to build, or ``""`` for the bare ``make``.

    ``cfg.build_target`` of ``"auto"`` (or empty/missing) is resolved from the host
    via :func:`_detect_build_target`; any other value is used as given.
    """
    target = (getattr(cfg, "build_target", "auto") or "auto").strip()
    if target in ("", "auto"):
        target = _detect_build_target()
    # ds4's macOS Metal target is the bare ``make`` (no suffix). Map it AFTER
    # auto-detection so an auto-detected "metal" is treated the same as an
    # explicitly configured one instead of becoming ``make metal``.
    return "" if target == "metal" else target
def _run_logged(cmd, cwd, label, tail, **kw):
    """Run a subprocess, streaming its output with a ``[ds4]`` prefix into ``tail``.

    Args:
        cmd: Argument list to execute.
        cwd: Working directory for the child.
        label: Human-readable name used in the failure message.
        tail: Append-able collection receiving each non-empty output line.
        **kw: Extra keyword arguments forwarded to ``subprocess.Popen``.

    Raises:
        RuntimeError: if the command exits non-zero; the message carries the last
            five lines held in ``tail``.
    """
    print(f"[ds4] $ {' '.join(str(c) for c in cmd)}", flush=True)
    # The context manager closes the pipe and waits for the child even if
    # streaming raises, so neither the descriptor nor the process is leaked.
    with subprocess.Popen(cmd, cwd=str(cwd), stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT, text=True, bufsize=1,
                          **kw) as proc:
        for line in proc.stdout:
            line = line.rstrip()
            if line:
                tail.append(line)
                print(f"[ds4] {line}", flush=True)
    if proc.returncode != 0:
        joined = " | ".join(list(tail)[-5:])
        raise RuntimeError(f"{label} failed (exit {proc.returncode}). {joined}")
def ensure_built(cfg) -> Path:
    """Make sure the ``ds4-server`` binary exists, cloning and building if needed.

    Returns the binary's path. Raises RuntimeError when the binary is missing and
    ``cfg.auto_build`` is false, when a clone/build step fails, or when the build
    finishes without producing the binary.
    """
    global _built
    install_dir = _install_dir(cfg)
    binary = _server_bin(install_dir)

    # Fast path: already built.
    if binary.exists():
        _built = True
        return binary

    auto_build = getattr(cfg, "auto_build", True)
    if not auto_build:
        raise RuntimeError(
            f"ds4-server not found at {binary} and auto_build is disabled. Build ds4 "
            f"manually (git clone {cfg.repo_url}; make <target>) or enable auto_build.")

    tail = collections.deque(maxlen=40)
    install_dir.parent.mkdir(parents=True, exist_ok=True)

    # Clone only when there is no sign of an existing checkout.
    have_checkout = (install_dir / ".git").exists() or (install_dir / "Makefile").exists()
    if not have_checkout:
        print(f"[ds4] cloning {cfg.repo_url} → {install_dir} …", flush=True)
        clone_cmd = ["git", "clone", "--depth", "1", cfg.repo_url, str(install_dir)]
        _run_logged(clone_cmd, cwd=install_dir.parent, label="git clone", tail=tail)

    # An empty target means the bare ``make``.
    target = _resolve_target(cfg)
    make_cmd = ["make"]
    if target:
        make_cmd.append(target)
    print(f"[ds4] building ds4 (make {target or 'metal'}) — this can take a while …",
          flush=True)
    _run_logged(make_cmd, cwd=install_dir, label="make", tail=tail)

    if not binary.exists():
        recent = " | ".join(list(tail)[-5:])
        raise RuntimeError(
            f"ds4 build completed but {binary} is missing. Last output: " + recent)

    _built = True
    print(f"[ds4] built {binary}", flush=True)
    return binary
def ensure_model(cfg) -> None:
    """Download the configured GGUF weight variant if it isn't present.

    ds4's ``download_model.sh`` writes into ``<install_dir>/gguf/`` and updates the
    ``ds4flash.gguf`` symlink that ``ds4-server`` loads by default.

    Raises:
        RuntimeError: if the download script cannot be obtained, or if a git or
            download step fails.
    """
    install_dir = _install_dir(cfg)
    # If the default-loaded model file already resolves, nothing to do.
    default_model = install_dir / "ds4flash.gguf"
    if default_model.exists():
        return
    script = install_dir / "download_model.sh"
    if not script.exists():
        # The binary may have been bundled (e.g. into a Docker image) without the
        # repo scripts. Fetch the repo to get download_model.sh — this does not
        # rebuild the already-present binary.
        tail0 = collections.deque(maxlen=20)
        if (install_dir / ".git").exists() or (install_dir / "Makefile").exists():
            _run_logged(["git", "-C", str(install_dir), "pull", "--ff-only"],
                        cwd=install_dir, label="git pull", tail=tail0)
        else:
            # Both the clone's working directory and the copy destination must exist.
            install_dir.mkdir(parents=True, exist_ok=True)
            tmp = install_dir.parent / (install_dir.name + ".repo")
            shutil.rmtree(tmp, ignore_errors=True)
            try:
                _run_logged(["git", "clone", "--depth", "1", cfg.repo_url, str(tmp)],
                            cwd=install_dir.parent, label="git clone (scripts)",
                            tail=tail0)
                # Copy repo files into install_dir without clobbering the built binary.
                for item in tmp.iterdir():
                    dest = install_dir / item.name
                    if dest.exists():
                        continue
                    if item.is_dir():
                        shutil.copytree(item, dest)
                    else:
                        shutil.copy2(item, dest)
            finally:
                # Always remove the scratch clone, even when the clone or a copy
                # fails part-way, so a failed attempt leaves nothing behind.
                shutil.rmtree(tmp, ignore_errors=True)
    if not script.exists():
        raise RuntimeError(f"ds4 download script not found at {script}")
    variant = (getattr(cfg, "model_variant", "") or "q4-imatrix").strip()
    tail = collections.deque(maxlen=40)
    print(f"[ds4] downloading DeepSeek V4 weights (variant '{variant}', multi-GB, "
          f"resumable) …", flush=True)
    _run_logged(["bash", str(script), variant], cwd=install_dir,
                label="download_model.sh", tail=tail)
def _free_port() -> int:
    """Return a TCP port on 127.0.0.1 that was free at the time of the call.

    The port is chosen by binding to port 0 and released again before returning,
    so another process could take it before the caller binds it.
    """
    # ``with`` closes the socket even if bind() raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]
def _pump_logs(proc, tail):
    """Copy every non-empty line of ``proc.stdout`` into ``tail`` and echo it with
    a ``[ds4]`` prefix; returns when the stream is exhausted."""
    for raw in proc.stdout:
        text = raw.rstrip()
        if not text:
            continue
        tail.append(text)
        print(f"[ds4] {text}", flush=True)
def _health_ok(url: str) -> bool:
    """Return True when ``GET <url>/v1/models`` answers with a success status within
    3 seconds; any request failure counts as not healthy."""
    import requests
    try:
        return requests.get(url + "/v1/models", timeout=3).ok
    except Exception:
        return False
def ensure_service(cfg, ready_timeout: float = 3600.0) -> str:
    """Build + download (as needed), then start (or reuse) ds4-server.

    Returns the base URL. First call clones, builds, and downloads several GB, so the
    timeout is generous.

    If a server for this model id is already running but not answering yet (for
    example because another caller launched it and it is still loading), this call
    waits on that same process instead of spawning a second one.

    Raises:
        RuntimeError: if the process exits before becoming ready, or does not become
            ready within ``ready_timeout`` seconds (it is then stopped).
    """
    model_id = getattr(cfg, "model_id", "deepseek-v4") or "deepseek-v4"
    with _lock:
        svc = _services.get(model_id)
        if svc and svc["proc"].poll() is not None:
            _services.pop(model_id, None)  # died — restart below
            svc = None
        if svc and _health_ok(svc["url"]):
            return svc["url"]
        if svc:
            # Alive but not ready: adopt the existing process. Spawning another
            # would orphan this one (its registry entry gets overwritten) and load
            # the model twice.
            proc, url = svc["proc"], svc["url"]
            tail = svc.get("tail")
            if tail is None:
                tail = collections.deque(maxlen=15)
        else:
            binary = ensure_built(cfg)
            ensure_model(cfg)
            install_dir = _install_dir(cfg)
            host = getattr(cfg, "host", "127.0.0.1") or "127.0.0.1"
            port = int(getattr(cfg, "port", 0) or 0) or _free_port()
            # ds4-server binds the requested host; build the URL from a loopback
            # address for our own health checks / proxying when it's bound to 0.0.0.0.
            connect_host = "127.0.0.1" if host in ("0.0.0.0", "::") else host
            url = f"http://{connect_host}:{port}"
            cmd = [str(binary), "--host", host, "--port", str(port),
                   "--ctx", str(int(getattr(cfg, "ctx", 100000) or 100000)),
                   "--chdir", str(install_dir)]
            extra = (getattr(cfg, "extra_args", "") or "").strip()
            if extra:
                import shlex
                cmd += shlex.split(extra)
            proc = subprocess.Popen(
                cmd, cwd=str(install_dir), stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT, text=True, bufsize=1,
            )
            tail = collections.deque(maxlen=15)
            threading.Thread(target=_pump_logs, args=(proc, tail), daemon=True).start()
            # Keep the log tail with the entry so a later caller that adopts this
            # process can still report its last output.
            _services[model_id] = {"proc": proc, "port": port, "url": url,
                                   "tail": tail}

    def _tail_msg():
        joined = " | ".join(list(tail)[-5:]).strip()
        return f". Last output: {joined}" if joined else ""

    # Wait (outside the lock) until the server answers, exits, or we time out.
    deadline = time.time() + ready_timeout
    while time.time() < deadline:
        if proc.poll() is not None:
            raise RuntimeError(
                f"ds4-server exited (code {proc.returncode}) before becoming ready"
                + _tail_msg())
        if _health_ok(url):
            print(f"[ds4] service ready for {model_id} at {url}", flush=True)
            return url
        time.sleep(2)
    stop_service(model_id)
    raise RuntimeError(f"ds4-server for {model_id} did not become ready in time"
                       + _tail_msg())
def stop_service(model_id: str) -> None:
    """Stop and forget the managed server for ``model_id`` (no-op if none).

    Sends terminate, waits up to 10 s, then kills. Never raises: this also runs
    from the interpreter's exit hook, so shutdown is best-effort.
    """
    # Only the registry update needs the lock; once popped the entry is private.
    with _lock:
        svc = _services.pop(model_id, None)
    if not svc:
        return
    proc = svc["proc"]
    if proc.poll() is None:
        try:
            proc.terminate()
            proc.wait(timeout=10)
        except (OSError, subprocess.TimeoutExpired):
            pass  # fall through to the hard kill below
    if proc.poll() is None:
        try:
            proc.kill()
            # Reap the child so it does not linger as a zombie.
            proc.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            pass
    print(f"[ds4] service for {model_id} stopped", flush=True)
def stop_all() -> None:
    """Stop every registered service. Iterates a snapshot of the ids because
    stopping a service removes it from the registry."""
    for model_id in tuple(_services):
        stop_service(model_id)
import atexit as _atexit
_atexit.register(stop_all)
......@@ -106,6 +106,27 @@ async def create_embeddings(request: EmbeddingsRequest, http_request: Request =
"""
OpenAI-compatible embeddings endpoint.
"""
# Register a task so embeddings appear in the unified task list, like every
# other model type. Finished on success or error below.
from codai.tasks import task_registry
_title = request.input if isinstance(request.input, str) else "embeddings"
_tid = task_registry.register(
"embedding", title=str(_title)[:80], model=(request.model or "embedding"))
task_registry.start(_tid)
try:
_resp = await _run_embeddings(request, http_request)
task_registry.finish(_tid, "done")
return _resp
except HTTPException:
task_registry.finish(_tid, "error")
raise
except Exception as e:
task_registry.finish(_tid, "error", str(e)[:200])
raise
async def _run_embeddings(request: EmbeddingsRequest, http_request: Request = None):
"""Core embeddings logic; registered as a task by create_embeddings()."""
model_info = await asyncio.to_thread(
multi_model_manager.request_model, request.model, model_type="embedding")
model_name = model_info.get('model_name')
......
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Fully-managed Parler-TTS worker.
parler-tts pins an old transformers/tokenizers/huggingface-hub that conflict with
the coderai server's stack, so it can't share this venv. Instead coderai owns the
whole lifecycle here: on first use it bootstraps a dedicated venv (installing
parler-tts), launches ``tools/parler_tts_service.py`` in it as a local HTTP
service, health-checks it, and hands back the URL. The matching
``_RemoteParlerBackend.cleanup()`` calls :func:`stop_service`, so the model
manager's normal eviction tears the process down — no manual setup or config.
"""
import os
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SERVICE_SCRIPT = _REPO_ROOT / "tools" / "parler_tts_service.py"
# Dedicated venv for the (incompatible) parler-tts stack. It is created fully
# isolated (no system site-packages), so it gets its own copy of everything,
# torch included. Override the location with CODERAI_PARLER_VENV.
_VENV_DIR = Path(os.environ.get("CODERAI_PARLER_VENV")
or os.path.expanduser("~/.coderai/parler_venv"))
_lock = threading.RLock()
_services: dict[str, dict] = {} # model_name -> {"proc","port","url"}
_bootstrapped = False
def _venv_python() -> Path:
    """Return the path of the venv's Python interpreter (``Scripts\\python.exe`` on
    Windows, ``bin/python`` elsewhere)."""
    if os.name == "nt":
        return _VENV_DIR / "Scripts" / "python.exe"
    return _VENV_DIR / "bin" / "python"
def _pip_ok(py: Path) -> bool:
    """Return True when interpreter ``py`` can import both ``parler_tts`` and
    ``soundfile``; any failure to run it counts as False."""
    probe = [str(py), "-c", "import parler_tts, soundfile"]
    try:
        completed = subprocess.run(probe, capture_output=True)
        return completed.returncode == 0
    except Exception:
        return False
def _venv_is_system_site() -> bool:
    """True if the venv's ``pyvenv.cfg`` enables system site-packages (such a venv
    can't isolate). An unreadable or missing config counts as False."""
    try:
        cfg_text = (_VENV_DIR / "pyvenv.cfg").read_text().lower()
    except Exception:
        return False
    return "include-system-site-packages = true" in cfg_text
def _bootstrap_venv() -> Path:
    """Ensure the isolated parler-tts venv exists and is usable; return its python.

    Idempotent. A venv that was created with system site-packages is deleted and
    rebuilt, because it would let the server's transformers leak in. parler-tts and
    soundfile are installed only when they are not already importable.

    Raises RuntimeError if the install finishes without an importable package, and
    ``subprocess.CalledProcessError`` if venv creation or pip fails.
    """
    global _bootstrapped
    py = _venv_python()
    if _bootstrapped and py.exists():
        return py

    # A previously-created shared-site venv can't isolate; throw it away.
    if py.exists() and _venv_is_system_site():
        import shutil
        print("[parler] rebuilding venv as fully isolated …", flush=True)
        shutil.rmtree(_VENV_DIR, ignore_errors=True)

    if not _venv_python().exists():
        print(f"[parler] creating isolated venv at {_VENV_DIR} …", flush=True)
        _VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
        create_cmd = [sys.executable, "-m", "venv", str(_VENV_DIR)]
        subprocess.run(create_cmd, check=True)

    py = _venv_python()
    if not _pip_ok(py):
        print("[parler] installing parler-tts + torch into the isolated venv "
              "(first run, downloads several GB, this can take a while) …", flush=True)
        install_cmd = [
            str(py), "-m", "pip", "install",
            "git+https://github.com/huggingface/parler-tts.git",
            "soundfile",
        ]
        subprocess.run(install_cmd, check=True)
        if not _pip_ok(py):
            raise RuntimeError("parler-tts install did not yield an importable package")

    _bootstrapped = True
    return py
def _free_port() -> int:
    """Return a TCP port on 127.0.0.1 that was free at the time of the call.

    The port is chosen by binding to port 0 and released again before returning,
    so another process could take it before the caller binds it.
    """
    # ``with`` closes the socket even if bind() raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]
def _pump_logs(proc: subprocess.Popen, tail):
    """Copy every non-empty line of ``proc.stdout`` into ``tail`` and echo it with
    a ``[parler]`` prefix; returns when the stream is exhausted."""
    for raw in proc.stdout:
        text = raw.rstrip()
        if not text:
            continue
        tail.append(text)
        print(f"[parler] {text}", flush=True)
def _health_ok(url: str) -> bool:
    """Return True when ``GET <url>/health`` succeeds within 3 seconds and its JSON
    body has a truthy ``ok`` field; any failure counts as not healthy."""
    import requests
    try:
        resp = requests.get(url + "/health", timeout=3)
        if not resp.ok:
            return resp.ok
        return bool(resp.json().get("ok"))
    except Exception:
        return False
def ensure_service(model_name: str, ready_timeout: float = 1800.0) -> str:
    """Start (or reuse) the worker for ``model_name`` and return its base URL.

    First call bootstraps the venv and loads the model, so the timeout is generous.

    If a worker for this model is already running but not answering yet (for
    example because another caller launched it and it is still loading), this call
    waits on that same process instead of spawning a second one.

    Raises:
        RuntimeError: if the worker exits before becoming ready, or does not become
            ready within ``ready_timeout`` seconds (it is then stopped).
    """
    import collections

    with _lock:
        svc = _services.get(model_name)
        if svc and svc["proc"].poll() is not None:
            _services.pop(model_name, None)  # died — restart below
            svc = None
        if svc and _health_ok(svc["url"]):
            return svc["url"]
        if svc:
            # Alive but not ready: adopt the existing process. Spawning another
            # would orphan this one (its registry entry gets overwritten) and load
            # the model twice.
            proc, url = svc["proc"], svc["url"]
            tail = svc.get("tail")
            if tail is None:
                tail = collections.deque(maxlen=15)
        else:
            py = _bootstrap_venv()
            port = _free_port()
            url = f"http://127.0.0.1:{port}"
            env = dict(os.environ)
            # The worker must use the model already pulled via coderai's HF download
            # interface — it never downloads anything itself. Point it at coderai's
            # cache and force offline mode, so a missing model fails fast instead of
            # silently fetching.
            try:
                from codai.models.cache import get_hf_hub_cache_dir
                hub = get_hf_hub_cache_dir()
                env["HF_HUB_CACHE"] = hub
                env["HUGGINGFACE_HUB_CACHE"] = hub
            except Exception:
                pass  # best-effort: keep the inherited cache settings
            env["HF_HUB_OFFLINE"] = "1"
            env["TRANSFORMERS_OFFLINE"] = "1"
            proc = subprocess.Popen(
                [str(py), str(_SERVICE_SCRIPT), "--model", model_name,
                 "--host", "127.0.0.1", "--port", str(port)],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
                bufsize=1, env=env, cwd=str(_REPO_ROOT),
            )
            tail = collections.deque(maxlen=15)
            threading.Thread(target=_pump_logs, args=(proc, tail), daemon=True).start()
            # Keep the log tail with the entry so a later caller that adopts this
            # process can still report its last output.
            _services[model_name] = {"proc": proc, "port": port, "url": url,
                                     "tail": tail}

    def _tail_msg():
        joined = " | ".join(list(tail)[-5:]).strip()
        low = joined.lower()
        # Offline-mode / not-found output means the model was never downloaded.
        if "offline" in low or ("not" in low and "found" in low):
            return (f". The model isn't in coderai's cache — download "
                    f"'{model_name}' from the model interface first. ({joined})")
        return f". Last output: {joined}" if joined else ""

    # Wait (outside the lock) for the service to load the model and answer.
    deadline = time.time() + ready_timeout
    while time.time() < deadline:
        if proc.poll() is not None:
            raise RuntimeError(
                f"Parler worker exited (code {proc.returncode}) before becoming ready"
                + _tail_msg())
        if _health_ok(url):
            print(f"[parler] service ready for {model_name} at {url}", flush=True)
            return url
        time.sleep(2)
    stop_service(model_name)
    raise RuntimeError(f"Parler worker for {model_name} did not become ready in time"
                       + _tail_msg())
def stop_service(model_name: str) -> None:
    """Stop and forget the managed worker for ``model_name`` (no-op if none).

    Sends terminate, waits up to 10 s, then kills. Never raises: this also runs
    from the interpreter's exit hook, so shutdown is best-effort.
    """
    # Only the registry update needs the lock; once popped the entry is private.
    with _lock:
        svc = _services.pop(model_name, None)
    if not svc:
        return
    proc = svc["proc"]
    if proc.poll() is None:
        try:
            proc.terminate()
            proc.wait(timeout=10)
        except (OSError, subprocess.TimeoutExpired):
            pass  # fall through to the hard kill below
    if proc.poll() is None:
        try:
            proc.kill()
            # Reap the child so it does not linger as a zombie.
            proc.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            pass
    print(f"[parler] service for {model_name} stopped", flush=True)
def stop_all() -> None:
    """Stop every registered worker. Iterates a snapshot of the names because
    stopping a worker removes it from the registry."""
    for model_name in tuple(_services):
        stop_service(model_name)
import atexit as _atexit
_atexit.register(stop_all)
......@@ -45,6 +45,31 @@ global_args = None
global_file_path = None
def _spatial_task(title: str):
    """Decorator: register a spatial/3D endpoint in the unified task list so
    every model type is visible there.

    A task of kind ``"spatial"`` is registered and started before the wrapped
    coroutine runs, and finished as ``"done"`` or ``"error"`` afterwards. The
    wrapped call's result or exception is passed through unchanged.
    """
    import functools

    def deco(fn):
        @functools.wraps(fn)
        async def wrap(*args, **kwargs):
            from codai.tasks import task_registry
            tid = task_registry.register("spatial", title=title, model="spatial")
            task_registry.start(tid)
            try:
                result = await fn(*args, **kwargs)
                task_registry.finish(tid, "done")
                return result
            except HTTPException:
                task_registry.finish(tid, "error")
                raise
            except Exception as e:
                task_registry.finish(tid, "error", str(e)[:200])
                raise
            except BaseException:
                # asyncio.CancelledError (e.g. the request being cancelled) is a
                # BaseException, not an Exception; without this branch the task
                # would never be finished and would stay listed as running.
                # NOTE(review): recorded as "error" because that is the only
                # failure status visible here — switch if the registry has a
                # dedicated cancelled state.
                task_registry.finish(tid, "error", "cancelled")
                raise
        return wrap
    return deco
def set_global_args(args):
global global_args
global_args = args
......@@ -500,6 +525,7 @@ class ImageTo3DRequest(BaseModel):
@router.post("/v1/images/to3d", summary="Image to 3D model")
@_spatial_task("Image → 3D")
async def image_to_3d(request: ImageTo3DRequest, http_request: Request = None):
"""Convert a 2D image to a 3D representation.
......@@ -568,6 +594,7 @@ class ImageFrom3DRequest(BaseModel):
@router.post("/v1/images/from3d", summary="Render a 3D model to an image")
@_spatial_task("3D → image")
async def image_from_3d(request: ImageFrom3DRequest, http_request: Request = None):
"""Render a 3D model (GLB/OBJ) to a 2D PNG image from a specified camera angle."""
raw = _decode_b64(request.model_data)
......@@ -601,6 +628,7 @@ class VideoTo3DRequest(BaseModel):
@router.post("/v1/video/to3d", summary="Video to 3D model")
@_spatial_task("Video → 3D")
async def video_to_3d(request: VideoTo3DRequest, http_request: Request = None):
"""Convert a 2D video to a 3D video frame-by-frame.
......@@ -642,6 +670,7 @@ class VideoFrom3DRequest(BaseModel):
@router.post("/v1/video/from3d", summary="Render a 3D model to a video")
@_spatial_task("3D → video")
async def video_from_3d(request: VideoFrom3DRequest, http_request: Request = None):
"""Render a 3D model as a 360° turntable video."""
raw = _decode_b64(request.model_data)
......@@ -675,6 +704,7 @@ class Generate3DRequest(BaseModel):
@router.post("/v1/3d/generate", summary="Generate a 3D model from a prompt")
@_spatial_task("Generate 3D")
async def generate_3d(request: Generate3DRequest, http_request: Request = None):
"""Generate a 3D model (GLB) from a text prompt and/or an image.
......
......@@ -86,6 +86,105 @@ def set_grammar_guided_gen(enabled: bool):
_set_grammar_guided_gen(enabled)
def _debug_requests_enabled() -> bool:
    """Report whether --debug-requests (full client<->API payload logging) is on.
    False when no global args have been set."""
    if not global_args:
        return False
    return bool(getattr(global_args, 'debug_requests', False))
def _summarize_tool_calls(tool_calls):
    """Render OpenAI tool_calls as a list of ``name(arguments)`` strings.

    Each call and its ``function`` may be a dict or an attribute-style object.
    Non-string arguments are JSON-encoded, falling back to ``str()`` when that
    fails. ``None`` or an empty input yields an empty list.
    """
    def _read(obj, key, default):
        # Uniform access for dicts and attribute-style objects.
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    summaries = []
    for call in (tool_calls or []):
        fn = _read(call, 'function', None) or {}
        name = _read(fn, 'name', '')
        args = _read(fn, 'arguments', '')
        if not isinstance(args, str):
            try:
                args = json.dumps(args)
            except Exception:
                args = str(args)
        summaries.append(f"{name}({args})")
    return summaries
def log_request_exchange(request):
    """Print the incoming chat request (messages + tools) when --debug-requests.

    One line per message with its role and content (truncated to 2000 chars), plus
    any tool_calls, tool_call_id and name, followed by the names of the offered
    tools. This shows what an agentic client sends each turn, so tool-call loops
    can be diagnosed from the wire. Logging failures are reported, never raised.
    """
    if not _debug_requests_enabled():
        return
    try:
        print(f"\n{'#'*80}\n# >>> REQUEST model={getattr(request, 'model', '?')} "
              f"stream={getattr(request, 'stream', False)} "
              f"tools={len(getattr(request, 'tools', None) or [])}\n{'#'*80}")
        for idx, msg in enumerate(getattr(request, 'messages', []) or []):
            role = getattr(msg, 'role', '?')
            body = getattr(msg, 'content', '') or ''
            if isinstance(body, list):
                # Multi-part content: dump as JSON.
                body = json.dumps(body)
            pieces = [f"[{idx}] {role}: {str(body)[:2000]}"]
            calls = getattr(msg, 'tool_calls', None)
            if calls:
                pieces.append(" tool_calls=" + json.dumps(_summarize_tool_calls(calls)))
            call_id = getattr(msg, 'tool_call_id', None)
            if call_id:
                pieces.append(f" tool_call_id={call_id}")
            msg_name = getattr(msg, 'name', None)
            if msg_name:
                pieces.append(f" name={msg_name}")
            print("".join(pieces))
        offered = getattr(request, 'tools', None) or []
        if offered:
            names = []
            for tool in offered:
                if isinstance(tool, dict):
                    fn = tool.get('function', {})
                else:
                    fn = getattr(tool, 'function', None)
                if isinstance(fn, dict):
                    names.append(fn.get('name'))
                else:
                    names.append(getattr(fn, 'name', '?'))
            print(f"# tools offered: {names}")
        print(f"{'#'*80}\n", flush=True)
    except Exception as e:
        print(f"[debug-requests] failed to log request: {e}", flush=True)
def log_response_exchange(content, tool_calls=None, finish_reason=None,
                          streamed=False, stage="pre-format"):
    """Print the assistant message coderai *extracted* when --debug-requests.

    Shows the content (truncated to 2000 chars) and one line per tool call, or
    ``(empty)`` when there is neither. ``stage`` and ``finish_reason`` appear in
    the header; ``streamed`` selects the STREAM / RESPONSE tag. Logging failures
    are reported, never raised.
    """
    if not _debug_requests_enabled():
        return
    try:
        if streamed:
            tag = "STREAM"
        else:
            tag = "RESPONSE"
        print(f"\n{'#'*80}\n# <<< {tag} [{stage}] finish_reason={finish_reason}\n{'#'*80}")
        if content:
            print(f"content: {str(content)[:2000]}")
        if tool_calls:
            for summary in _summarize_tool_calls(tool_calls):
                print(f"tool_call: {summary}")
        nothing_to_show = not content and not tool_calls
        if nothing_to_show:
            print("(empty)")
        print(f"{'#'*80}\n", flush=True)
    except Exception as e:
        print(f"[debug-requests] failed to log response: {e}", flush=True)
def log_response_payload(payload, streamed=False):
    """Print the payload sent to the client (post OpenAI-formatter) when
    --debug-requests.

    ``payload`` is dumped as indented JSON (non-serializable values via ``str``),
    truncated to 4000 characters. ``streamed`` selects the STREAM CHUNK /
    RESPONSE BODY tag. Logging failures are reported, never raised.
    """
    if not _debug_requests_enabled():
        return
    try:
        if streamed:
            tag = "STREAM CHUNK"
        else:
            tag = "RESPONSE BODY"
        print(f"\n{'#'*80}\n# <<< {tag} [post-format, sent to client]\n{'#'*80}")
        rendered = json.dumps(payload, indent=2, default=str)
        print(rendered[:4000])
        print(f"{'#'*80}\n", flush=True)
    except Exception as e:
        print(f"[debug-requests] failed to log payload: {e}", flush=True)
# =============================================================================
# Router and Endpoints
# =============================================================================
......@@ -96,6 +195,7 @@ router = APIRouter()
@router.post("/v1/chat/completions", summary="Chat completions")
async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
"""Chat completions endpoint with streaming and tool support."""
log_request_exchange(request)
# Check if we should use litellm backend
parser_type = getattr(global_args, 'parser', 'auto') if global_args else 'auto'
......@@ -344,6 +444,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
mm = _candidate
break
_load_err = None
if _model_key:
_load_err = getattr(multi_model_manager, '_last_load_errors', {}).get(_model_key)
if _load_err:
raise HTTPException(status_code=503, detail=(
f"Model '{requested_model}' failed to load: {_load_err}"))
print(f"Text model '{requested_model}' not ready, retrying in 5s "
f"(attempt {_attempt + 1}/{_MAX_WAIT_TRIES})…")
await asyncio.sleep(5)
......@@ -1188,6 +1295,7 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
}
from fastapi.responses import JSONResponse
log_response_payload(formatted_response, streamed=False)
return JSONResponse(content=formatted_response, headers=headers)
# Compute prefix key for prompt-aggregation scheduling
......@@ -1235,6 +1343,65 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
finally:
_release_instance()
import re as _re

_TOOL_SPAN_RE = _re.compile(r'<(tool|tool_call)\b[\s\S]*?</\1\s*>', _re.IGNORECASE)
_TOOL_OPEN_RE = _re.compile(r'<(?:tool|tool_call)\b', _re.IGNORECASE)
_TOOL_OPEN_TAGS = ('<tool>', '<tool_call>')
# gemma-4 native tool call: `call:NAME{…}` (the <|tool_call> markers are stripped
# by skip_special_tokens). Once it starts we withhold everything to the end of the
# stream — the call is surfaced as structured tool_calls after generation.
_GEMMA_CALL_OPEN_RE = _re.compile(r'call:\s*[A-Za-z_]\w*\s*\{')
# Trailing text that could still grow into a gemma call once more text arrives:
# `call:` plus an optional (partial) name and optional whitespace before the `{`,
# or a standalone prefix of the word `call` itself (the `:` may be in the next chunk).
_GEMMA_CALL_PARTIAL_RE = _re.compile(
    r'(?:call:\s*[A-Za-z_]?\w*\s*|(?<!\w)(?:call|cal|ca|c))$')


def _gate_tool_content(buffer: str, final: bool = False):
    """Split accumulated stream text into (content_to_emit, held_buffer).

    During tool-enabled streaming the model emits ``<tool>{json}</tool>`` spans
    inline. Those must NOT reach the client as visible ``content`` (they're
    surfaced separately as structured ``tool_calls``); otherwise the raw tags leak
    into the chat. This withholds any complete or in-progress tool span, plus a
    trailing partial ``<`` or partial ``call:`` that could still grow into a tool
    call, and emits only the safe text around them.

    Args:
        buffer: Text accumulated so far, including anything previously held.
        final: True at end of stream. Any leftover (possibly unclosed) tool span
            is then dropped, nothing is held, and the rest is emitted.

    Returns:
        ``(emit, held)``: text safe to send now, and text to keep for the next call.
    """
    emit = []
    # Strip complete tool spans, emitting the text around each.
    while True:
        m = _TOOL_SPAN_RE.search(buffer)
        if not m:
            break
        emit.append(buffer[:m.start()])
        buffer = buffer[m.end():]
    # An open tag with no close yet → hold from there (a call is in progress).
    m = _TOOL_OPEN_RE.search(buffer)
    if m:
        emit.append(buffer[:m.start()])
        held = '' if final else buffer[m.start():]
        return ''.join(emit), held
    # gemma-4 `call:NAME{…}` — withhold from the call onward (extracted at the end).
    gm = _GEMMA_CALL_OPEN_RE.search(buffer)
    if gm:
        emit.append(buffer[:gm.start()])
        held = '' if final else buffer[gm.start():]
        return ''.join(emit), held
    if not final:
        # Hold back a trailing '<…' that could still become a tool open tag.
        lt = buffer.rfind('<')
        if lt != -1:
            tail = buffer[lt:].lower()
            if any(t.startswith(tail) for t in _TOOL_OPEN_TAGS):
                emit.append(buffer[:lt])
                return ''.join(emit), buffer[lt:]
        # Hold a trailing partial gemma call. This covers a chunk boundary that
        # falls inside the word `call` or between the name and the `{`; emitting
        # that text would keep the call from being recognised when the rest
        # arrives, so it would leak as content.
        cm = _GEMMA_CALL_PARTIAL_RE.search(buffer)
        if cm:
            emit.append(buffer[:cm.start()])
            return ''.join(emit), buffer[cm.start():]
    emit.append(buffer)
    return ''.join(emit), ''
async def stream_chat_response(
messages: List[Dict],
model_name: str,
......@@ -1350,6 +1517,12 @@ async def stream_chat_response(
try:
chunk_count = 0
# Buffer for withholding in-progress tool tags from the content stream.
content_buffer = ""
# Exact content deltas actually streamed to the client (post-format,
# post tool-gating) — logged once at the end under --debug-requests so we
# see the real reply, not just what we extracted internally.
client_sent_content = ""
# Debug: Print what is being passed to the model
if get_global_debug():
......@@ -1399,6 +1572,30 @@ async def stream_chat_response(
# Pass through all content including whitespace - it's essential for message composition
generated_text += filtered_chunk
# Live progress under --debug-requests so a non-terminating / looping
# generation is visible AS IT HAPPENS — the end-of-stream response logs
# below never fire if the model never stops. The front pumps engine
# stdout line-by-line, so emit newline-terminated snapshots (every N
# chunks) of the accumulated tail; a loop shows up as the same text
# repeating across snapshots.
if _debug_requests_enabled():
if chunk_count == 1:
print(f"# <<< STREAMING [live] model={model_name} "
f"(snapshots every 64 tokens until stop)", flush=True)
if chunk_count % 64 == 0:
_tail = generated_text[-220:].replace("\n", "\\n")
print(f"# <<< [live @{chunk_count} tok] …{_tail}", flush=True)
# When tools are enabled, gate the content so in-progress <tool>…</tool>
# spans are never streamed as visible text (they're surfaced as
# structured tool_calls after the stream). Without tools, stream as-is.
if tools:
content_buffer += filtered_chunk
filtered_chunk, content_buffer = _gate_tool_content(content_buffer)
if not filtered_chunk:
await asyncio.sleep(0)
continue
data = {
"id": completion_id,
"object": "chat.completion.chunk",
......@@ -1410,10 +1607,30 @@ async def stream_chat_response(
"finish_reason": None,
}],
}
client_sent_content += filtered_chunk
yield f"data: {json.dumps(data)}\n\n"
# Explicitly flush to ensure data is sent immediately
await asyncio.sleep(0)
# Flush any safe trailing text held back by the tool-content gate
# (dropping leftover/unclosed tool tags — they become tool_calls below).
if tools and content_buffer:
tail_content, content_buffer = _gate_tool_content(content_buffer, final=True)
if tail_content:
data = {
"id": completion_id,
"object": "chat.completion.chunk",
"created": created,
"model": model_name,
"choices": [{
"index": 0,
"delta": {"content": tail_content},
"finish_reason": None,
}],
}
client_sent_content += tail_content
yield f"data: {json.dumps(data)}\n\n"
# In debug mode, dump the full generated text
if get_global_debug():
......@@ -1484,6 +1701,9 @@ async def stream_chat_response(
print(f"{'='*80}\n")
# Tool calls were extracted and stripped from content during streaming
# Just send the tool_calls chunk
log_response_exchange(generated_text, tool_calls=tool_calls,
finish_reason="tool_calls", streamed=True,
stage="pre-format extracted")
data = {
"id": completion_id,
"object": "chat.completion.chunk",
......@@ -1497,6 +1717,7 @@ async def stream_chat_response(
"native_finish_reason": "tool_calls",
}],
}
log_response_payload(data, streamed=True)
yield f"data: {json.dumps(data)}\n\n"
else:
# Calculate token counts for usage in final chunk
......@@ -1514,7 +1735,12 @@ async def stream_chat_response(
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
log_response_exchange(generated_text, finish_reason="stop",
streamed=True, stage="pre-format extracted")
log_response_exchange(client_sent_content, finish_reason="stop",
streamed=True, stage="post-format sent to client")
final_chunk = formatter.format_litellm_chunk("", is_final=True, usage=usage_details, context_size=context_size)
log_response_payload(final_chunk, streamed=True)
yield f"data: {json.dumps(final_chunk)}\n\n"
else:
# Calculate token counts for usage in final chunk
......@@ -1571,6 +1797,11 @@ async def stream_chat_response(
},
"system_fingerprint": None,
}
log_response_exchange(generated_text, finish_reason="stop",
streamed=True, stage="pre-format extracted")
log_response_exchange(client_sent_content, finish_reason="stop",
streamed=True, stage="post-format sent to client")
log_response_payload(final_chunk, streamed=True)
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"
......@@ -1740,6 +1971,10 @@ async def generate_chat_response(
context_size = current_manager.get_context_size()
# Use OpenAIFormatter for final sanitization
log_response_exchange(response_message.get("content", ""),
tool_calls=response_message.get("tool_calls"),
finish_reason=finish_reason, streamed=False,
stage="pre-format extracted")
formatter = OpenAIFormatter(model_name)
formatted_response = formatter.format_litellm_full(
text=response_message.get("content", ""),
......@@ -1789,6 +2024,7 @@ async def generate_chat_response(
print(json.dumps(formatted_response, indent=2))
print(f"{'='*80}\n")
log_response_payload(formatted_response, streamed=False)
return formatted_response
except Exception as e:
print(f"Error during generation: {e}")
......
......@@ -135,6 +135,32 @@ async def create_transcription(
if len(file_content) > _MAX_AUDIO_BYTES:
raise HTTPException(status_code=413, detail="Audio file too large (max 100 MB)")
# Register a task so transcription appears in the unified task list, like
# every other model type. Finished on success or error below.
from codai.tasks import task_registry
_tid = task_registry.register(
"transcription",
title=(file.filename or "audio")[:80],
model=model or "",
)
task_registry.start(_tid)
try:
_resp = await _run_transcription(
file_content, model, language, prompt, response_format, temperature, file)
task_registry.finish(_tid, "done")
return _resp
except HTTPException:
task_registry.finish(_tid, "error")
raise
except Exception as e:
task_registry.finish(_tid, "error", str(e)[:200])
raise
async def _run_transcription(
file_content: bytes, model: str, language, prompt, response_format, temperature, file
):
"""Core transcription logic; registered as a task by create_transcription()."""
# Check if the requested model maps to a configured whisper-server instance first.
# Try alias round-robin resolution before direct ID lookup.
whisper_model_id = multi_model_manager.resolve_whisper_alias_model_id(model)
......
......@@ -28,6 +28,7 @@ from pydantic import BaseModel, ConfigDict
# Import from codai modules
from codai.models.manager import multi_model_manager
from codai.api import tts_backends
# Global reference to be set by coderai
......@@ -40,6 +41,20 @@ def set_global_args(args):
global_args = args
# Name fragments identifying text / classifier / embedding models that were
# wrongly routed to TTS (e.g. an emotion classifier exposed under a stray
# ``tts:`` alias).
_NON_TTS_HINTS = (
    "go_emotions", "roberta", "bert", "embedding", "e5-", "minilm",
    "classifier", "toxic", "reranker", "sentence-transformers",
)


def _family_is_text_model(model_name: str) -> bool:
    """Return True when ``model_name`` contains a known non-TTS name fragment.

    The comparison is case-insensitive; a missing/empty name never matches.
    """
    lowered = model_name.lower() if model_name else ""
    for fragment in _NON_TTS_HINTS:
        if fragment in lowered:
            return True
    return False
# =============================================================================
# Router and Endpoints
# =============================================================================
......@@ -72,6 +87,16 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
Supports:
- Kokoro TTS models (when --tts-model is specified)
"""
# Register a task so TTS shows up in the unified task list / dashboard,
# like every other model type. Finished on success or error below.
from codai.tasks import task_registry, loading_task
_tid = task_registry.register(
"tts",
title=(request.input or "")[:80],
model=(request.model or request.voice_profile or "tts"),
)
task_registry.start(_tid)
try:
# If a voice profile is requested, delegate to voice cloning (F5-TTS)
if request.voice_profile:
from codai.api.voice_clone import _load_voice, _f5tts_clone
......@@ -96,6 +121,7 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
except Exception as e:
raise HTTPException(status_code=500, detail=f"Voice cloning failed: {e}")
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
task_registry.finish(_tid, "done")
return {"audio": audio_base64}
# Use the manager to resolve the model and manage VRAM
......@@ -111,7 +137,7 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
model_name = model_info['model_name']
model_key = model_info['model_key']
kokoro_model = model_info['model_object']
tts_backend = model_info['model_object']
# If no TTS model configured, return an error
if not model_name:
......@@ -120,35 +146,42 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
detail="TTS not configured. Use --tts-model to specify a model."
)
# Try to use kokoro if available
try:
from kokoro import Kokoro
# Reject text/classifier models that aren't actually speech synthesizers.
if _family_is_text_model(model_name):
raise HTTPException(
status_code=404,
detail=(f"Model '{model_name}' is a text model and cannot be used for "
"tts generation. Use a TTS model (e.g. a kokoro/XTTS/Bark model).")
)
if kokoro_model is None:
print(f"Loading Kokoro TTS model: {model_name}")
try:
from codai.api import tts_backends
# Check if model_name is a URL - download it (with caching)
model_path = None
if model_name.startswith('http://') or model_name.startswith('https://'):
print(f"Loading model from URL: {model_name}")
from codai.models.cache import load_model
model_path = load_model(model_name)
if not model_path:
raise Exception(f"Failed to load model from {model_name}")
else:
# Use local path or model name
if tts_backend is None:
print(f"Loading TTS model: {model_name}")
model_path = model_name
# Load the Kokoro model
kokoro_model = Kokoro(model_path if model_path else model_name)
multi_model_manager.add_model(model_key, kokoro_model)
if model_name.startswith(('http://', 'https://')):
from codai.models.cache import load_model
model_path = load_model(model_name) or model_name
cfg = multi_model_manager.config.get(model_key) or \
multi_model_manager.config.get(f"tts:{model_name}") or {}
with loading_task(model_name, model_type="tts"):
tts_backend = await asyncio.to_thread(
tts_backends.load_backend, model_name, model_path, cfg)
multi_model_manager.add_model(model_key, tts_backend)
multi_model_manager.current_model_key = model_key
# Generate speech
voice = request.voice or "af_sarah"
voice = request.voice or getattr(tts_backend, "default_voice", "")
speed = request.speed or 1.0
lang = getattr(request, "language", None) or "en-us"
emotion = getattr(request, "emotion", None) or ""
style = getattr(request, "style", None) or ""
fmt = request.response_format or "wav"
audio_bytes = kokoro_model.generate(request.input, voice=voice, speed=speed)
samples, sample_rate = await asyncio.to_thread(
tts_backend.synthesize, request.input, voice, speed, lang, emotion, style)
audio_bytes, out_fmt = await asyncio.to_thread(
tts_backends.encode_audio, samples, sample_rate, fmt)
try:
from codai.api.archive import archive_manager
......@@ -157,27 +190,29 @@ async def create_speech(request: TTSRequest, http_request: Request = None):
"tts", "/v1/audio/speech",
model_name,
request.input,
{"voice": voice, "speed": speed, "response_format": request.response_format},
[(audio_bytes, request.response_format or "mp3")],
{"voice": voice, "speed": speed, "response_format": out_fmt},
[(audio_bytes, out_fmt)],
))
except Exception:
pass
# Convert to base64
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
task_registry.finish(_tid, "done")
return {"audio": audio_base64}
return {
"audio": audio_base64
}
except ImportError as e:
# kokoro not installed
raise HTTPException(
status_code=501,
detail=f"TTS not available. Install kokoro: pip install kokoro. Error: {str(e)}"
)
except HTTPException:
raise
except tts_backends.MissingEngineError as e:
# Missing optional engine (e.g. coqui-tts) → actionable 501.
raise HTTPException(status_code=501, detail=str(e))
except Exception as e:
print(f"TTS error: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"TTS error: {str(e)}")
except HTTPException:
task_registry.finish(_tid, "error")
raise
except Exception as e:
task_registry.finish(_tid, "error", str(e)[:200])
raise
\ No newline at end of file
"""Pluggable text-to-speech backends for the /v1/audio/speech endpoint.
Dispatches a TTS request to the right engine based on the model family:
* **kokoro** → ``kokoro-onnx`` (ONNX runtime, no torch/spaCy needed). Requires a
``kokoro-*.onnx`` model file and a ``voices-*.bin`` file; both are auto-resolved
from a local dir / HF repo, or downloaded from the kokoro-onnx release.
* **coqui / XTTS** → ``coqui-TTS`` (``pip install coqui-tts``) when installed.
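* **parler** → Parler-TTS (expressive; voice/emotion/delivery/speed are steered
  through a natural-language description prompt), reached over HTTP: either an
  externally-run service (``service_url`` in the model config) or a worker that
  coderai launches itself.
* **bark** → transformers ``AutoProcessor`` + ``BarkModel`` (expressive via
  in-text cues and voice presets).
* **anything else** → transformers ``pipeline("text-to-speech")`` (SpeechT5,
  VITS / MMS-TTS, …).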
Every backend returns ``(samples: np.float32 [-1, 1], sample_rate: int)`` which is
then encoded to the requested container by :func:`encode_audio`.
"""
from __future__ import annotations
import io
import os
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
# Official kokoro-onnx model + voices release (used when files aren't local).
_KOKORO_MODEL_URL = (
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/"
"model-files-v1.0/kokoro-v1.0.onnx"
)
_KOKORO_VOICES_URL = (
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/"
"model-files-v1.0/voices-v1.0.bin"
)
class MissingEngineError(RuntimeError):
    """Raised when the optional engine for a TTS family isn't installed.

    Backends whose engine lives in an optional package (``_CoquiBackend``,
    ``_ParlerBackend``) raise this from their constructor when the import
    fails, chaining the original ``ImportError``. The message carries the
    install / setup guidance for that engine.
    """
def _family(model_name: str) -> str:
    """Map a TTS model name onto the backend family that serves it.

    Matching is a case-insensitive substring test, checked in priority order
    (kokoro, coqui/xtts, parler, bark); anything else is ``"transformers"``.
    """
    lowered = (model_name or "").lower()
    # (needles, family) pairs in priority order — the first hit wins.
    rules = (
        (("kokoro",), "kokoro"),
        (("xtts", "coqui"), "coqui"),
        (("parler",), "parler"),
        (("bark",), "bark"),
    )
    for needles, family in rules:
        if any(needle in lowered for needle in needles):
            return family
    return "transformers"
# Discrete emotion presets a family can steer at synthesis time. Empty unless an
# engine actually supports it — clients surface an emotion picker only when this
# is non-empty, so the control stays hidden for engines that can't honour it.
_FAMILY_EMOTIONS: dict[str, list[str]] = {
# Parler steers these through its natural-language description prompt.
"parler": ["neutral", "happy", "sad", "angry", "excited", "calm", "fearful"],
# Bark has no true emotion knob; it inserts matching non-verbal cues in text.
"bark": ["neutral", "laughter", "sigh", "gasp"],
}
# Delivery / vocal styles a family can steer (whisper, shout/scream, tone, …).
# Empty unless an engine actually honours it — kept separate from emotions so a
# client can offer "how it's said" independently of "what's felt".
_FAMILY_STYLES: dict[str, list[str]] = {
"parler": ["normal", "whispering", "shouting", "monotone", "expressive"],
"bark": ["normal", "whispering", "singing", "emphasis"],
}
def family_emotions(model_name: str) -> list[str]:
    """Return a fresh list of emotion presets for the model's family.

    Families without an entry in ``_FAMILY_EMOTIONS`` yield ``[]``.
    """
    presets = _FAMILY_EMOTIONS.get(_family(model_name))
    return list(presets) if presets else []
def family_styles(model_name: str) -> list[str]:
    """Return a fresh list of delivery styles for the model's family.

    Families without an entry in ``_FAMILY_STYLES`` yield ``[]``.
    """
    presets = _FAMILY_STYLES.get(_family(model_name))
    return list(presets) if presets else []
# --------------------------------------------------------------------------- #
# kokoro-onnx
# --------------------------------------------------------------------------- #
def _cache_dir() -> Path:
    """Return the TTS asset cache directory, creating it when missing.

    ``CODERAI_TTS_CACHE`` overrides the default ``~/.coderai/tts_cache``.
    """
    override = os.environ.get("CODERAI_TTS_CACHE")
    root = Path(override if override else os.path.expanduser("~/.coderai/tts_cache"))
    root.mkdir(parents=True, exist_ok=True)
    return root
def _download(url: str, dest: Path) -> Path:
    """Fetch ``url`` into ``dest`` unless a non-empty ``dest`` already exists.

    The payload is written to a sibling ``<dest>.part`` file and renamed over
    ``dest`` once the transfer finishes. Returns ``dest`` either way.
    """
    already_there = dest.exists() and dest.stat().st_size > 0
    if not already_there:
        from urllib.request import urlretrieve

        partial = dest.with_suffix(dest.suffix + ".part")
        print(f" Downloading TTS asset: {url}")
        urlretrieve(url, partial)
        partial.replace(dest)
    return dest
def _resolve_kokoro_files(model_path: str, config: dict) -> Tuple[str, str]:
    """Return (onnx_model_path, voices_path) for kokoro-onnx.

    Order of resolution: explicit config fields (``model_path`` /
    ``voices_path``) → files alongside a local dir / .onnx path → download the
    official release into the TTS cache.

    Args:
        model_path: Local directory, ``.onnx`` file, or any other model
            identifier (non-local values fall through to the download).
        config: Per-model config dict; may be ``None``.

    Returns:
        Two path strings, each pointing at a regular file.
    """
    cfg = config or {}
    voices_path = cfg.get("voices_path")
    onnx_path = cfg.get("model_path") or model_path

    cand = Path(onnx_path) if onnx_path else None
    if cand and cand.is_dir():
        # Alphabetically first match, so the pick is deterministic.
        onnx = min(cand.glob("*.onnx"), default=None)
        vb = min(cand.glob("voices*.bin"), default=None)
        if onnx:
            onnx_path = str(onnx)
        if vb and not voices_path:
            voices_path = str(vb)
    elif cand and cand.suffix == ".onnx" and cand.exists():
        onnx_path = str(cand)
        if not voices_path:
            sib = min(cand.parent.glob("voices*.bin"), default=None)
            if sib:
                voices_path = str(sib)

    # Fall back to the official release files in the cache. The check is
    # is_file(), not exists(): a local directory that contains no .onnx file
    # leaves onnx_path pointing at the directory itself, which must not be
    # handed to the engine as if it were the model file.
    if not (onnx_path and Path(onnx_path).is_file()):
        onnx_path = str(_download(_KOKORO_MODEL_URL, _cache_dir() / "kokoro-v1.0.onnx"))
    if not (voices_path and Path(voices_path).is_file()):
        voices_path = str(_download(_KOKORO_VOICES_URL, _cache_dir() / "voices-v1.0.bin"))
    return onnx_path, voices_path
class _KokoroBackend:
    """Kokoro synthesis through the ``kokoro_onnx`` package."""

    family = "kokoro"
    default_voice = "af_sarah"

    def __init__(self, model_path: str, config: dict):
        """Resolve the model / voices files and construct the engine."""
        from kokoro_onnx import Kokoro

        model_file, voices_file = _resolve_kokoro_files(model_path, config)
        print(f" kokoro-onnx model={model_file} voices={voices_file}")
        self._kokoro = Kokoro(model_file, voices_file)

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """Synthesize ``text``; ``emotion`` and ``style`` are accepted but unused.

        Falsy ``voice`` / ``speed`` / ``lang`` fall back to ``default_voice``,
        ``1.0`` and ``"en-us"``. Returns ``(float32 samples, sample_rate)``.
        """
        chosen_voice = voice if voice else self.default_voice
        rate = speed if speed else 1.0
        locale = lang if lang else "en-us"
        audio, sample_rate = self._kokoro.create(
            text, voice=chosen_voice, speed=rate, lang=locale)
        return np.asarray(audio, dtype=np.float32), int(sample_rate)

    def voices(self):
        """Return the engine's voice names sorted, or ``[]`` if that fails."""
        try:
            names = self._kokoro.get_voices()
            return sorted(names)
        except Exception:
            return []
# --------------------------------------------------------------------------- #
# transformers pipeline("text-to-speech")
# --------------------------------------------------------------------------- #
class _TransformersBackend:
    """Generic backend built on the transformers ``text-to-speech`` pipeline."""

    family = "transformers"
    default_voice = ""

    def __init__(self, model_name: str, config: dict):
        """Build the pipeline on CUDA device 0 when available, else CPU (-1)."""
        import torch
        from transformers import pipeline

        if torch.cuda.is_available():
            device = 0
        else:
            device = -1
        print(f" transformers TTS pipeline model={model_name} device={device}")
        self._pipe = pipeline("text-to-speech", model=model_name, device=device)

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """Run the pipeline on ``text``; every other argument is ignored.

        Returns ``(float32 samples, sample_rate)`` with size-1 axes squeezed
        out of multi-dimensional output.
        """
        result = self._pipe(text)
        waveform = np.asarray(result["audio"], dtype=np.float32)
        if waveform.ndim > 1:
            waveform = waveform.squeeze()
        rate = int(result["sampling_rate"])
        return waveform, rate

    def voices(self):
        """No selectable voices for this backend."""
        return []
# --------------------------------------------------------------------------- #
# coqui / XTTS (optional)
# --------------------------------------------------------------------------- #
class _CoquiBackend:
    """Coqui / XTTS backend (requires the optional ``coqui-tts`` package)."""

    family = "coqui"
    default_voice = ""

    def __init__(self, model_name: str, config: dict):
        """Load the model onto CUDA when available, else CPU.

        Raises:
            MissingEngineError: when ``TTS.api`` cannot be imported.
        """
        try:
            from TTS.api import TTS  # coqui-tts
        except ImportError as e:
            raise MissingEngineError(
                "Coqui/XTTS models need the coqui-tts package: "
                "pip install coqui-tts"
            ) from e
        import torch
        self._cfg = config or {}
        self._tts = TTS(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

    def _speaker_args(self, voice: str) -> dict:
        """Pick the speaker keyword for ``tts()`` from ``voice`` and the config.

        `voice` may be a wav path, a built-in speaker name, or neither (e.g. a
        Kokoro id like "af_sarah") — so it is only used as a clone source when
        it is an actual file. Priority: voice file → configured
        ``speaker_wav`` file → built-in speaker named ``voice`` → default
        speaker of a multi-speaker model → nothing.
        """
        known = list(getattr(self._tts, "speakers", None) or [])
        ref_wav = self._cfg.get("speaker_wav")
        if voice and os.path.isfile(voice):
            return {"speaker_wav": voice}
        if ref_wav and os.path.isfile(ref_wav):
            return {"speaker_wav": ref_wav}
        if voice and voice in known:
            return {"speaker": voice}
        if known:
            # Multi-speaker model (e.g. XTTS-v2) needs *a* speaker; pick a default.
            return {"speaker": self._cfg.get("speaker") or known[0]}
        return {}

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """Synthesize ``text``; ``emotion`` and ``style`` are accepted but unused.

        The language is cut to its first two characters (default ``"en"``).
        Returns ``(float32 samples, sample_rate)``; the rate falls back to
        24000 when the synthesizer does not report one.
        """
        call_args = {"text": text, "language": (lang or "en")[:2]}
        call_args.update(self._speaker_args(voice))
        try:
            call_args["speed"] = float(speed) if speed else 1.0
            wav = np.asarray(self._tts.tts(**call_args), dtype=np.float32)
        except TypeError:
            # some coqui models don't accept speed
            call_args.pop("speed", None)
            wav = np.asarray(self._tts.tts(**call_args), dtype=np.float32)
        rate = int(getattr(self._tts.synthesizer, "output_sample_rate", 24000))
        return wav, rate

    def voices(self):
        """Return the model's built-in speaker names (possibly empty)."""
        return list(getattr(self._tts, "speakers", None) or [])
# --------------------------------------------------------------------------- #
# Parler-TTS (optional) — expressive, description-prompt driven
# --------------------------------------------------------------------------- #
class _ParlerBackend:
    """In-process Parler-TTS backend, steered by a free-text description."""

    family = "parler"
    default_voice = ""

    def __init__(self, model_name: str, config: dict):
        """Load model and tokenizer onto CUDA when available, else CPU.

        Raises:
            MissingEngineError: when ``parler_tts`` cannot be imported.
        """
        try:
            from parler_tts import ParlerTTSForConditionalGeneration
            from transformers import AutoTokenizer
        except ImportError as e:
            raise MissingEngineError(
                "Parler-TTS isn't installed. NOTE: parler-tts pins an old "
                "transformers/tokenizers/huggingface-hub that conflict with this "
                "server — do NOT pip install it into this environment. Run it in a "
                "separate venv as its own service, or use an expressive engine that "
                "works with this stack (e.g. Bark via the transformers pipeline)."
            ) from e
        import torch
        self._cfg = config or {}
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        loaded = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
        self._model = loaded.to(self._device)
        self._tok = AutoTokenizer.from_pretrained(model_name)
        self._sr = int(self._model.config.sampling_rate)

    def _describe(self, voice: str, speed: float, emotion: str, style: str) -> str:
        """Turn the UI controls into the delivery description Parler expects."""
        name = (voice or "").strip()
        # A file path or a Kokoro id is not a Parler speaker name.
        kokoro_prefixes = ("af_", "am_", "bf_", "bm_")
        if name and (os.sep in name or name.lower().startswith(kokoro_prefixes)):
            name = ""
        subject = name or self._cfg.get("speaker") or "A speaker"

        parts = [f"{subject} speaks"]
        if emotion and emotion != "neutral":
            parts.append(f"in a {emotion} tone")

        style_phrases = {
            "whispering": "whispering softly",
            "shouting": "shouting loudly",
            "monotone": "in a flat monotone",
            "expressive": "in a very expressive, animated way",
        }
        if style and style not in ("", "normal"):
            # Unknown styles are passed through verbatim.
            parts.append(style_phrases.get(style, style))

        try:
            rate = float(speed or 1.0)
        except (TypeError, ValueError):
            rate = 1.0
        if rate < 0.9:
            pace = "slow"
        elif rate > 1.15:
            pace = "fast"
        else:
            pace = "moderate"
        parts.append(f"at a {pace} pace")

        closing = (". The recording is very high quality, the voice clear and close up "
                   "with no background noise.")
        return " ".join(parts) + closing

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """Generate speech for ``text``; ``lang`` is accepted but unused.

        A configured ``description`` takes precedence over the one built from
        voice / speed / emotion / style. Returns ``(float32 samples, rate)``.
        """
        description = self._cfg.get("description")
        if not description:
            description = self._describe(voice, speed, emotion, style)
        desc_ids = self._tok(description, return_tensors="pt").input_ids.to(self._device)
        text_ids = self._tok(text, return_tensors="pt").input_ids.to(self._device)
        generated = self._model.generate(input_ids=desc_ids, prompt_input_ids=text_ids)
        samples = np.asarray(generated.cpu().numpy().squeeze(), dtype=np.float32)
        return samples, self._sr

    def voices(self):
        """No enumerable voices; the speaker is part of the description."""
        return []
# --------------------------------------------------------------------------- #
# Bark (suno/bark) — expressive via text markup; works with current transformers
# --------------------------------------------------------------------------- #
class _BarkBackend:
    """Bark (suno/bark) backend: expressive via in-text cues and voice presets."""

    family = "bark"
    default_voice = "v2/en_speaker_6"
    # Curated English Bark presets by gender (speaker_6 is a clear male, speaker_9
    # is the commonly-used female). Override via config: "bark_voice_male" /
    # "bark_voice_female", or "bark_voices": {"male": ..., "female": ...}.
    _BARK_MALE = "v2/en_speaker_6"
    _BARK_FEMALE = "v2/en_speaker_9"

    def __init__(self, model_name: str, config: dict):
        """Load processor and model onto CUDA when available, else CPU."""
        # Uses the stable AutoProcessor + BarkModel API (not the pipeline) so
        # voice presets and generation params are passed reliably.
        from transformers import AutoProcessor, BarkModel
        import torch
        self._cfg = config or {}
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        self._proc = AutoProcessor.from_pretrained(model_name)
        self._model = BarkModel.from_pretrained(model_name).to(self._device)
        self._sr = int(getattr(self._model.generation_config, "sample_rate", 24000))

    def _markup(self, text: str, emotion: str, style: str) -> str:
        """Decorate ``text`` with the in-text cues Bark is steered by."""
        if style == "emphasis":
            text = text.upper()
        elif style == "singing":
            text = f"♪ {text} ♪"
        elif style == "whispering":
            text = f"[whispers] {text}"
        cue = {"laughter": "[laughs] ", "sigh": "[sighs] ", "gasp": "[gasps] "}.get(emotion, "")
        return cue + text

    def _resolve_preset(self, voice: str) -> str:
        """Map a requested voice onto a Bark voice preset.

        * An explicit Bark preset (contains ``"speaker"`` or starts with
          ``"v2/"``) passes straight through.
        * A Kokoro-style id (``af_…``, ``am_…``, ``bf_…``, ``bm_…``: second
          character is the gender, third is ``_``) maps to the configured or
          built-in preset for that gender.
        * Anything else uses the configured ``voice_preset`` or
          ``default_voice``.
        """
        v = (voice or "").strip()
        if v and ("speaker" in v or v.startswith("v2/")):
            return v
        lv = v.lower()
        gender = ""
        # Require the full "<x><f|m>_" shape. Looking at the second character
        # alone would classify arbitrary names ("emma", "Amy") as Kokoro ids
        # and silently pick the male preset for them.
        if len(lv) >= 3 and lv[2] == "_":
            gender = {"m": "male", "f": "female"}.get(lv[1], "")
        vmap = self._cfg.get("bark_voices") or {}
        if gender == "male":
            return self._cfg.get("bark_voice_male") or vmap.get("male") or self._BARK_MALE
        if gender == "female":
            return self._cfg.get("bark_voice_female") or vmap.get("female") or self._BARK_FEMALE
        return self._cfg.get("voice_preset") or self.default_voice

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """Generate speech for ``text``; ``speed`` and ``lang`` are unused.

        Returns ``(float32 samples, sample_rate)``.
        """
        import torch
        # Speed isn't controllable in Bark; the voice maps to a gendered preset.
        preset = self._resolve_preset(voice)
        prompt = self._markup(text, emotion, style)
        inputs = self._proc(prompt, voice_preset=preset)
        # Move tensor-like entries to the model's device; leave the rest as-is.
        inputs = {k: (v.to(self._device) if hasattr(v, "to") else v) for k, v in inputs.items()}
        with torch.no_grad():
            audio = self._model.generate(**inputs)
        arr = np.asarray(audio.cpu().numpy().squeeze(), dtype=np.float32)
        return arr, self._sr

    def voices(self):
        """Return the ten English ``v2/en_speaker_N`` preset names."""
        return [f"v2/en_speaker_{i}" for i in range(10)]
# --------------------------------------------------------------------------- #
# Parler over HTTP — the real engine runs in an isolated venv as a microservice
# (parler-tts pins an old transformers that conflicts with this server's stack).
# --------------------------------------------------------------------------- #
class _RemoteParlerBackend:
    """Parler backend that forwards synthesis to an HTTP service."""

    family = "parler"
    default_voice = ""

    def __init__(self, config: dict, managed_model: Optional[str] = None):
        """Store the service URL (``config["service_url"]``, trailing ``/`` removed).

        ``managed_model`` is set when coderai launched the worker itself, so
        that ``cleanup()`` (called on eviction) can shut it down.
        """
        self._cfg = config or {}
        base_url = str(self._cfg["service_url"])
        self._url = base_url.rstrip("/")
        self._managed_model = managed_model

    def synthesize(self, text: str, voice: str, speed: float, lang: str,
                   emotion: str = "", style: str = "") -> Tuple[np.ndarray, int]:
        """POST the request to ``<service_url>/speak`` and decode the audio reply.

        Multi-channel audio is averaged across axis 1. Returns
        ``(float32 samples, sample_rate)``; HTTP error statuses raise via
        ``raise_for_status()``.
        """
        import io
        import requests
        import soundfile as sf

        body = dict(text=text, voice=voice, speed=speed,
                    emotion=emotion, style=style, language=lang)
        fixed_description = self._cfg.get("description")
        if fixed_description:
            body["description"] = fixed_description
        reply = requests.post(self._url + "/speak", json=body, timeout=600)
        reply.raise_for_status()
        audio, rate = sf.read(io.BytesIO(reply.content), dtype="float32")
        if getattr(audio, "ndim", 1) > 1:
            audio = audio.mean(axis=1)
        return np.asarray(audio, dtype=np.float32), int(rate)

    def voices(self):
        """No enumerable voices for the remote service."""
        return []

    def cleanup(self):
        """Stop the worker this backend launched, if any (best effort)."""
        # Called by the model manager on eviction; stop the worker we launched.
        if not self._managed_model:
            return
        try:
            from codai.api import parler_worker
            parler_worker.stop_service(self._managed_model)
        except Exception:
            pass
def load_backend(model_name: str, model_path: Optional[str], config: Optional[dict]):
    """Build the backend object matching ``model_name``'s family.

    The caller is responsible for caching the returned instance.
    """
    cfg = config or {}
    family = _family(model_name)

    if family == "parler":
        # An explicit service_url points at an externally-run service. Otherwise
        # coderai fully manages the worker: bootstrap its venv, spawn it, and
        # route to it — no manual setup needed.
        if cfg.get("service_url"):
            return _RemoteParlerBackend(cfg)
        from codai.api import parler_worker
        worker_url = parler_worker.ensure_service(model_name)
        return _RemoteParlerBackend({**cfg, "service_url": worker_url},
                                    managed_model=model_name)

    if family == "kokoro":
        # Kokoro is the only family that takes the resolved local path.
        return _KokoroBackend(model_path or model_name, cfg)

    by_family = {"coqui": _CoquiBackend, "bark": _BarkBackend}
    backend_cls = by_family.get(family, _TransformersBackend)
    return backend_cls(model_name, cfg)
# --------------------------------------------------------------------------- #
# encoding
# --------------------------------------------------------------------------- #
def encode_audio(samples: np.ndarray, sample_rate: int, fmt: str) -> Tuple[bytes, str]:
    """Encode float samples into ``fmt`` and return ``(bytes, actual_format)``.

    Samples are clipped to [-1, 1]. WAV/FLAC/OGG are written directly with
    soundfile. Any other format (mp3, …) is produced by piping a WAV through
    ffmpeg; when ffmpeg is absent or the transcode fails, the WAV bytes are
    returned with format ``"wav"``.
    """
    import shutil
    import subprocess
    import soundfile as sf

    container = (fmt or "wav").lower()
    pcm = np.clip(np.asarray(samples, dtype=np.float32), -1.0, 1.0)

    def _write(sf_format: str) -> bytes:
        sink = io.BytesIO()
        sf.write(sink, pcm, sample_rate, format=sf_format)
        return sink.getvalue()

    native = {"wav": "WAV", "flac": "FLAC", "ogg": "OGG"}.get(container)
    if native is not None:
        return _write(native), container

    # mp3/other: write WAV then transcode with ffmpeg if present.
    wav_bytes = _write("WAV")
    if shutil.which("ffmpeg"):
        command = ["ffmpeg", "-hide_banner", "-loglevel", "error",
                   "-f", "wav", "-i", "pipe:0", "-f", container, "pipe:1"]
        try:
            finished = subprocess.run(
                command, input=wav_bytes, stdout=subprocess.PIPE, check=True)
            return finished.stdout, container
        except Exception as exc:
            print(f" ffmpeg transcode to {container} failed ({exc}); returning WAV")
    return wav_bytes, "wav"
......@@ -161,6 +161,272 @@ class NvidiaBackend(ModelBackend):
print(f"Warning: Could not estimate model size: {e}")
return None
def _model_head_dim(self, model_name: str) -> Optional[int]:
    """Look up the largest attention head dimension declared by a model config.

    The top-level config and, when present, its ``text_config`` and
    ``vision_config`` sub-configs are each inspected. For every one of them an
    explicit truthy ``head_dim`` wins; otherwise the value is derived as
    ``hidden_size // num_attention_heads`` when both are set. The maximum over
    all inspected configs is returned so a caller checking an upper limit sees
    the worst case.

    Returns None when the config cannot be loaded (a warning is printed) or
    when no config yields a head dimension.
    """
    from transformers import AutoConfig
    try:
        root = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    except Exception as e:
        print(f"Warning: Could not read head dimension from config: {e}")
        return None

    def _dim_of(section):
        # Explicit head_dim first, then the hidden_size / heads derivation.
        explicit = getattr(section, 'head_dim', None)
        if explicit:
            return int(explicit)
        width = getattr(section, 'hidden_size', None)
        n_heads = getattr(section, 'num_attention_heads', None)
        if width and n_heads:
            return int(width) // int(n_heads)
        return None

    sections = [root,
                getattr(root, 'text_config', None),
                getattr(root, 'vision_config', None)]
    found = []
    for section in sections:
        if section is None:
            continue
        dim = _dim_of(section)
        if dim is not None:
            found.append(dim)
    if not found:
        return None
    return max(found)
def _estimate_kv_cache_bytes(self, model_name: str, n_ctx) -> int:
    """Approximate the KV-cache footprint, in bytes, of an ``n_ctx``-token context.

    The estimate is ``2 (K and V) * cached_tokens * kv_heads * head_dim * 2``,
    where the trailing 2 is the assumed bytes per element (16-bit cache).
    ``cached_tokens`` is summed over layers:

    * with ``layer_types`` in the config: entries containing ``"linear"``
      contribute nothing, entries containing ``"sliding"`` contribute
      ``min(n_ctx, sliding_window)`` when a window is configured, and every
      other entry contributes ``n_ctx``;
    * without ``layer_types``: ``num_hidden_layers * n_ctx``.

    Dimensions are read from ``text_config`` when the config has one,
    otherwise from the top-level config. ``kv_heads`` falls back to
    ``num_attention_heads``; ``head_dim`` falls back to
    ``hidden_size // num_attention_heads``.

    Returns 0 when ``n_ctx`` is not a positive integer, ``model_name`` is
    empty, the needed fields are missing, or loading the config raises (in
    which case a warning is printed).
    """
    try:
        ctx_len = int(n_ctx)
    except (TypeError, ValueError):
        return 0
    if not model_name or ctx_len <= 0:
        return 0

    try:
        from transformers import AutoConfig
        root = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        text = getattr(root, 'text_config', None) or root

        n_layers = getattr(text, 'num_hidden_layers', None)
        n_kv = getattr(text, 'num_key_value_heads', None)
        if not n_kv:
            n_kv = getattr(text, 'num_attention_heads', None)

        dim = getattr(text, 'head_dim', None)
        if not dim:
            width = getattr(text, 'hidden_size', None)
            n_heads = getattr(text, 'num_attention_heads', None)
            if width and n_heads:
                dim = int(width) // int(n_heads)

        try:
            window = int(getattr(text, 'sliding_window', None) or 0) or None
        except (TypeError, ValueError):
            window = None

        def _tokens_for(kind) -> int:
            # Tokens one layer keeps cached, by its attention type label.
            label = str(kind).lower()
            if 'linear' in label:
                return 0  # recurrent state only
            if 'sliding' in label and window:
                return min(ctx_len, window)
            return ctx_len

        kinds = getattr(text, 'layer_types', None)
        if kinds:
            cached_tokens = sum(_tokens_for(kind) for kind in kinds)
        elif n_layers:
            cached_tokens = int(n_layers) * ctx_len
        else:
            return 0

        if not n_kv or not dim or cached_tokens <= 0:
            return 0
        bytes_per_value = 2  # 16-bit cache entries
        return 2 * int(cached_tokens) * int(n_kv) * int(dim) * bytes_per_value
    except Exception as e:
        print(f"Warning: could not estimate KV cache size: {e}")
        return 0
def _kv_quant_nbits(self):
    """Pick the KV-cache quantization width: 2, 4, or None for no quantization.

    Decision order:

    1. If ``_kv_quant_compatible()`` is false, return None regardless of any
       requested cache type.
    2. Read the requested type from ``_pending_cache_type_k`` (falling back to
       ``_pending_cache_type_v``), lower-cased. An explicit quantized request
       returns 2 when it starts with ``q2``, contains ``int2`` or equals
       ``"2"``, and 4 for anything else.
    3. For an empty / fp16-like / ``auto`` request, return 4 only when the
       estimated 16-bit KV cache for the pending model and context exceeds
       6 GiB; otherwise None.
    """
    if not self._kv_quant_compatible():
        return None

    requested = (
        getattr(self, '_pending_cache_type_k', None)
        or getattr(self, '_pending_cache_type_v', None)
        or ''
    )
    spec = str(requested).lower()

    unquantized_specs = ('', 'f16', 'fp16', 'bf16', 'f32', 'none', 'auto')
    if spec not in unquantized_specs:
        wants_two_bit = spec.startswith('q2') or 'int2' in spec or spec == '2'
        return 2 if wants_two_bit else 4

    # No explicit quantized request: decide from the estimated cache size.
    estimate = self._estimate_kv_cache_bytes(
        getattr(self, '_pending_model_name', None),
        getattr(self, '_pending_ctx', None),
    )
    if estimate > 6 * 1024 ** 3:
        return 4
    return None
def _kv_quant_compatible(self) -> bool:
    """Report whether a quantized KV cache may be used with this model.

    The config is taken from the loaded model when it has one, otherwise it is
    loaded by name from ``_pending_model_name``. The answer is False when:

    * no config is available and no pending model name is set;
    * any ``layer_types`` entry (on ``text_config`` if present, else the
      top-level config) contains ``"linear"``;
    * ``_is_sliding_window_model()`` is true;
    * anything raises while inspecting the config.

    Otherwise the answer is True.
    """
    try:
        config = getattr(self.model, 'config', None)
        if config is None:
            from transformers import AutoConfig
            pending = getattr(self, '_pending_model_name', None)
            if not pending:
                return False
            config = AutoConfig.from_pretrained(pending, trust_remote_code=True)

        text = getattr(config, 'text_config', None) or config
        for kind in getattr(text, 'layer_types', None) or []:
            if 'linear' in str(kind).lower():
                return False
        return not self._is_sliding_window_model()
    except Exception:
        return False
def _is_sliding_window_model(self) -> bool:
    """Report whether the model is treated as sliding-window / hybrid attention.

    The config comes from the loaded model when available, otherwise it is
    loaded by name from ``_pending_model_name`` (False if that is unset).
    Using ``text_config`` when present, the model counts as sliding-window if
    any of these holds:

    * ``model_type`` starts with ``gemma``;
    * ``sliding_window`` is set to anything other than None;
    * ``cache_implementation`` is ``hybrid`` or ``sliding_window``.

    Any exception while inspecting the config yields False.
    """
    try:
        config = getattr(self.model, 'config', None)
        if config is None:
            from transformers import AutoConfig
            pending = getattr(self, '_pending_model_name', None)
            if not pending:
                return False
            config = AutoConfig.from_pretrained(pending, trust_remote_code=True)

        text = getattr(config, 'text_config', None) or config
        family = (getattr(text, 'model_type', '') or '').lower()
        impl = (getattr(text, 'cache_implementation', '') or '').lower()

        if family.startswith('gemma'):
            return True
        if getattr(text, 'sliding_window', None) is not None:
            return True
        return impl in {'hybrid', 'sliding_window'}
    except Exception:
        return False
def _kv_cache_reserve_bytes(self) -> int:
    """Return how many bytes of VRAM to set aside for the KV cache.

    Starts from the 16-bit estimate for the pending model and context. When
    ``_kv_quant_nbits()`` selects a quantized cache, the estimate is scaled by
    ``nbits / 16`` and then multiplied by 1.5 as an overhead allowance;
    otherwise the 16-bit estimate is returned unchanged. Returns 0 when the
    estimate is unavailable (non-positive).
    """
    baseline = self._estimate_kv_cache_bytes(
        getattr(self, '_pending_model_name', None),
        getattr(self, '_pending_ctx', None),
    )
    if baseline <= 0:
        return 0

    width = self._kv_quant_nbits()
    if not width:
        return baseline
    return int(baseline * (width / 16.0) * 1.5)
def _kv_offload_threshold_bytes(self) -> int:
    """Return the KV-cache size (bytes) above which the cache should be offloaded.

    Each call reads the currently free CUDA memory and returns that amount
    minus a 2 GiB allowance for activations/compute, but never less than
    2 GiB. If the free memory cannot be read for any reason, a fixed 8 GiB
    is returned instead.
    """
    two_gib = 2 * 1024 ** 3
    try:
        import torch
        free_bytes, _total = torch.cuda.mem_get_info()
        return max(int(two_gib), int(free_bytes - two_gib))
    except Exception:
        return 8 * 1024 ** 3
def _offloaded_cache_impl(self) -> str:
    """Choose the ``cache_implementation`` name for an offloaded sliding/hybrid cache.

    Returns ``'offloaded_static'`` when the installed transformers lists it in
    ``ALL_CACHE_IMPLEMENTATIONS``; otherwise (including when that lookup fails
    to import) returns ``'offloaded_hybrid'``.

    NOTE(review): the original author states that newer transformers releases
    fold the hybrid offloaded cache into ``offloaded_static`` and deprecate
    ``offloaded_hybrid`` — confirm the exact versions against the
    transformers changelog.
    """
    preferred = 'offloaded_static'
    legacy = 'offloaded_hybrid'
    try:
        from transformers.generation.configuration_utils import (
            ALL_CACHE_IMPLEMENTATIONS as known_impls,
        )
        has_preferred = preferred in known_impls
    except Exception:
        has_preferred = False
    return preferred if has_preferred else legacy
def _cache_gen_kwargs(self, using_prefix: bool, plain: bool = False) -> dict:
    """Build the extra ``generate()`` kwargs that select a KV-cache strategy.

    Returns ``{}`` (default cache) when ``using_prefix`` or ``plain`` is set.
    Otherwise, in priority order:

    1. If ``_kv_quant_nbits()`` returns a width, request the ``quantized``
       cache with the quanto backend, group size 64 and residual length 128.
    2. Else, if the estimated KV size for the pending model/context is
       positive and exceeds ``_kv_offload_threshold_bytes()``, request an
       offloaded cache: the name from ``_offloaded_cache_impl()`` for
       sliding-window models, plain ``'offloaded'`` otherwise.
    3. Else ``{}``.

    The chosen strategy is printed only the first time one is selected;
    ``_cache_strategy_announced`` on the instance records that it was shown.
    """
    if plain or using_prefix:
        return {}

    def _announce_once(message: str) -> None:
        # Log the selected strategy a single time per instance.
        if not getattr(self, '_cache_strategy_announced', False):
            print(message)
            self._cache_strategy_announced = True

    # Strategy 1: quantized cache.
    width = self._kv_quant_nbits()
    if width:
        _announce_once(
            f"KV cache quantization enabled: quanto int{width} (residual_length=128)")
        cache_config = {
            'backend': 'quanto',
            'nbits': width,
            'q_group_size': 64,
            'residual_length': 128,
        }
        return {'cache_implementation': 'quantized', 'cache_config': cache_config}

    # Strategy 2: offloaded cache when the estimate exceeds the threshold.
    estimate = self._estimate_kv_cache_bytes(
        getattr(self, '_pending_model_name', None),
        getattr(self, '_pending_ctx', None),
    )
    if not (estimate > 0 and estimate > self._kv_offload_threshold_bytes()):
        return {}

    if self._is_sliding_window_model():
        chosen = self._offloaded_cache_impl()
    else:
        chosen = 'offloaded'
    _announce_once(
        f"KV cache offloaded to CPU: cache_implementation={chosen} "
        f"(est ~{estimate/1e9:.1f}GB exceeds free VRAM)")
    return {'cache_implementation': chosen}
def _get_gpu_memory_map(self) -> Dict:
"""Get max_memory dict for Accelerate."""
import torch
......@@ -193,6 +459,7 @@ class NvidiaBackend(ModelBackend):
from transformers import AutoModelForCausalLM
try:
load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
if device == "cpu" and load_kwargs.get('device_map') is None:
model = model.to(device)
......@@ -226,6 +493,61 @@ class NvidiaBackend(ModelBackend):
raise e
raise
def _prequant_method(self, model_name: str):
    """Return the ``quant_method`` embedded in a checkpoint's config, or None.

    Loads the model config and reads its ``quantization_config``, which may be
    a dict or an object. None is returned when there is no (or an empty)
    ``quantization_config``, when it carries no ``quant_method``, or when the
    config cannot be loaded at all.

    Callers use a non-None result as the signal that the checkpoint is
    already quantized and must load with its own quantization config; a
    partial ``quantization_config`` lacking ``quant_method`` is deliberately
    not treated as pre-quantized.
    """
    try:
        from transformers import AutoConfig
        cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        embedded = getattr(cfg, 'quantization_config', None)
        if not embedded:
            return None
        if isinstance(embedded, dict):
            method = embedded.get('quant_method')
        else:
            method = getattr(embedded, 'quant_method', None)
        return method
    except Exception:
        return None
def _strip_invalid_native_quant_config(self, model_name: str, load_kwargs: dict) -> dict:
    """Drop a malformed checkpoint ``quantization_config`` before loading.

    A checkpoint config whose ``quantization_config`` is a dict without a
    ``quant_method`` is considered malformed. In that case the attribute is
    removed from a freshly loaded config object, and a copy of
    ``load_kwargs`` with that config under ``'config'`` is returned, along
    with a printed notice.

    ``load_kwargs`` is returned unchanged (same object) when it already
    carries an explicit ``quantization_config``, when the checkpoint's
    quantization config is absent, not a dict, or has a ``quant_method``, or
    when the config cannot be loaded.
    """
    if 'quantization_config' in load_kwargs:
        return load_kwargs
    try:
        from transformers import AutoConfig
        cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        embedded = getattr(cfg, 'quantization_config', None)
        malformed = isinstance(embedded, dict) and not embedded.get('quant_method')
        if not malformed:
            return load_kwargs
        if hasattr(cfg, 'quantization_config'):
            delattr(cfg, 'quantization_config')
        # Never mutate the caller's kwargs; hand back a patched copy.
        patched = {**load_kwargs, 'config': cfg}
        print("Ignoring invalid checkpoint quantization_config without quant_method; "
              "using explicit loader quantization/settings instead.")
        return patched
    except Exception:
        return load_kwargs
def _make_bnb_config(self, model_name: str, load_in_4bit: bool, load_in_8bit: bool):
"""Build a transformers BitsAndBytesConfig (the modern quant API).
......@@ -236,6 +558,14 @@ class NvidiaBackend(ModelBackend):
Always go through quantization_config instead.
"""
ml = model_name.lower()
# Already-quantized checkpoints must load with their own config; bnb on top
# is rejected by transformers. Skip bnb and let from_pretrained use the
# embedded quantization_config.
pq = self._prequant_method(model_name)
if pq:
print(f"Model is pre-quantized ({pq}); skipping bitsandbytes and loading "
f"with its native quantization config.")
return None
if 'qwen3.5' in ml and ('a3b' in ml or 'moe' in ml):
print(f"Warning: {model_name} does not support bitsandbytes quantization")
return None
......@@ -365,6 +695,9 @@ class NvidiaBackend(ModelBackend):
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Re-evaluate KV-prefix support for the model about to be loaded.
self._kv_prefix_ok = None
offload_dir = kwargs.get('offload_dir')
load_in_4bit = kwargs.get('load_in_4bit', False)
load_in_8bit = kwargs.get('load_in_8bit', False)
......@@ -386,6 +719,10 @@ class NvidiaBackend(ModelBackend):
pass
self._pending_ram_gb = manual_ram_gb
self._pending_model_name = model_name
self._pending_ctx = kwargs.get('ctx')
self._pending_cache_type_k = kwargs.get('cache_type_k')
self._pending_cache_type_v = kwargs.get('cache_type_v')
print(f"Loading HuggingFace model: {model_name}")
......@@ -417,6 +754,36 @@ class NvidiaBackend(ModelBackend):
self.use_flash_attn = flash_attn and self._fa2_safe
self.check_flash_attn_support()
# FlashAttention-2's forward kernel supports a head dimension of at most
# 256. Gemma (and some other large-head-dim models) exceed this, so FA2
# raises "FlashAttention forward only supports head dimension at most
# 256" on EVERY forward pass — both the KV-prefix build and the actual
# model.generate (whose error is swallowed by the streamer thread, so the
# request silently produces no output and appears to hang). Fall back to
# SDPA, which handles any head dimension and still uses flash kernels.
if self.use_flash_attn:
head_dim = self._model_head_dim(model_name)
fa2_bad = bool(head_dim and head_dim > 256)
reason = f"head dimension {head_dim} exceeds FA2's limit of 256" if fa2_bad else None
if not fa2_bad:
# Gemma reports head_dim==256 but still raises "FlashAttention
# forward only supports head dimension at most 256" on every
# forward (its sliding-window attention path), producing empty
# replies. Treat the whole gemma family as FA2-incompatible.
try:
from transformers import AutoConfig
_cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
_tc = getattr(_cfg, 'text_config', None) or _cfg
_mt = (getattr(_tc, 'model_type', '') or getattr(_cfg, 'model_type', '') or '').lower()
if _mt.startswith('gemma'):
fa2_bad = True
reason = f"gemma family (model_type={_mt}) is incompatible with FA2"
except Exception:
pass
if fa2_bad:
self.use_flash_attn = False
print(f" Flash Attention 2 disabled: {reason} → using SDPA instead.")
self.device = self._detect_device()
self.tokenizer = AutoTokenizer.from_pretrained(
......@@ -454,6 +821,7 @@ class NvidiaBackend(ModelBackend):
load_kwargs['quantization_config'] = _qc
try:
load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
self.model = model
self.model.eval()
......@@ -514,6 +882,7 @@ class NvidiaBackend(ModelBackend):
load_kwargs.pop('dtype', None)
try:
load_kwargs = self._strip_invalid_native_quant_config(model_name, load_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
except Exception as e:
raise RuntimeError(
......@@ -521,9 +890,56 @@ class NvidiaBackend(ModelBackend):
f"The model may be too large for available VRAM. Error: {e}"
)
else:
# 'auto'/'auto-borderline': honour the dropdown's documented contract —
# "over-VRAM → straight to model offload" means that when the model's
# peak (quantized weights + KV reserve + activations) FITS in free VRAM
# we load it full-GPU on a single device (fast, no device_map split, no
# CPU staging), and only fall through to the device_map=auto offload
# ladder below when it genuinely doesn't fit. The full-GPU attempt is
# non-fatal: on OOM it falls back to the ladder (unlike strategy 'none',
# which hard-errors). Honours the large-context KV reserve so a model
# that fits *with* its 64k KV stays resident.
if (model is None and self.device == "cuda"
and offload_strategy in ('auto', 'auto-borderline')):
_fits = False
try:
if torch.cuda.is_available() and expected_vram_gb > 0:
_free, _ = torch.cuda.mem_get_info(0)
_free_gb = _free / 1e9
_kv_gb = self._kv_cache_reserve_bytes() / 1e9
_act_gb = 1.5 if _kv_gb > 0 else 0.0
_need_gb = expected_vram_gb + _kv_gb + _act_gb
_borderline = 3.0 if offload_strategy == 'auto-borderline' else 0.0
_fits = _need_gb <= (_free_gb - 0.5 + _borderline)
if _fits:
print(f"\n Auto: peak VRAM need {_need_gb:.1f} GB "
f"(weights {expected_vram_gb:.1f} + KV {_kv_gb:.1f} "
f"+ act {_act_gb:.1f}) fits in {_free_gb:.1f} GB free "
f"— loading full-GPU (no offload)")
else:
print(f"\n Auto: peak VRAM need {_need_gb:.1f} GB > "
f"{_free_gb:.1f} GB free — going straight to "
f"device_map offload")
except Exception:
_fits = False
if _fits:
cuda_device = self._derive_cuda_device()
_fg_kwargs = dict(load_kwargs)
_fg_kwargs['device_map'] = cuda_device
_fg_kwargs['low_cpu_mem_usage'] = True
_fg_kwargs = self._strip_invalid_native_quant_config(model_name, _fg_kwargs)
model = self._try_load_model(model_name, _fg_kwargs, self.device)
if model is not None:
print(f" ✓ Model loaded full-GPU on {cuda_device}")
else:
print(" ✗ Full-GPU load OOMed — falling back to "
"device_map offload ladder")
first_vram_pct = vram_percentages[0] if vram_percentages else 0.93
for vram_pct in vram_percentages:
if model is not None:
break
if self.device != "cuda":
# No CUDA device — go straight to CPU+disk loading below.
break
......@@ -627,6 +1043,14 @@ class NvidiaBackend(ModelBackend):
import torch
max_memory = {}
# Reserve VRAM for the KV cache (grows with context) plus a fixed
# activation/compute buffer, so device_map offloads enough weight layers
# to CPU instead of packing VRAM with weights and OOMing at generation.
# Uses the quantization-aware reserve so an int4 KV cache doesn't force a
# needless heavy offload.
kv_reserve = self._kv_cache_reserve_bytes()
activation_reserve = int(1.5 * 1024 ** 3) if kv_reserve > 0 else 0
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
props = torch.cuda.get_device_properties(i)
......@@ -638,7 +1062,24 @@ class NvidiaBackend(ModelBackend):
headroom = 512 * 1024 * 1024 # 512 MB for CUDA driver overhead
limit_by_fraction = int(total_vram * vram_fraction)
limit_by_free = max(0, free_vram - headroom)
max_memory[i] = min(limit_by_fraction, limit_by_free)
weight_budget = min(limit_by_fraction, limit_by_free)
# Cap the reservation so a large/mis-estimated KV cache can never
# crush the weight budget: never reserve more than 60% of the GPU
# budget for context. If the KV genuinely doesn't fit in the
# remaining 40%, KV quantization (see _kv_quant_nbits) is the lever,
# not starving the weights onto CPU.
reserved = min(kv_reserve + activation_reserve, int(weight_budget * 0.6))
if reserved > 0:
new_budget = max(weight_budget - reserved, int(weight_budget * 0.4))
print(
f" GPU {i}: reserving {reserved/1e9:.1f}GB for KV+activations "
f"(KV~{kv_reserve/1e9:.1f}GB, ctx={getattr(self, '_pending_ctx', None)}, "
f"quant={self._kv_quant_nbits()}); "
f"weight budget {weight_budget/1e9:.1f}→{new_budget/1e9:.1f}GB "
f"(rest spills to CPU)"
)
weight_budget = new_budget
max_memory[i] = weight_budget
manual_ram_gb = getattr(self, '_pending_ram_gb', None)
if manual_ram_gb:
......@@ -965,6 +1406,9 @@ class NvidiaBackend(ModelBackend):
if repeat_penalty != 1.0:
generation_kwargs["repetition_penalty"] = repeat_penalty
# Quantize the KV cache when enabled (completions never use a prefix cache).
generation_kwargs.update(self._cache_gen_kwargs(using_prefix=False))
# Mid-generation thermal checkpoint (runs on the generate thread).
_criteria = []
_therm = _make_thermal_criteria()
......@@ -998,12 +1442,36 @@ class NvidiaBackend(ModelBackend):
torch.cuda.empty_cache()
else:
generation_error = str(e)
except Exception as e:
# Any other failure (shape/cache mismatch, transformers API change, …)
# must still be recorded — otherwise it is silently swallowed.
generation_error = str(e)
print(f"Error during streaming generation: {e}")
finally:
# generate() only calls streamer.end() on its success path. If it
# raised before finishing, end the streamer here so the consumer is
# never left blocked forever on an empty queue (which freezes the
# whole event loop).
streamer.end()
thread = Thread(target=generate_with_error_handling)
thread.start()
# Pull each token from a worker thread so a blocking streamer.__next__
# never runs on (and freezes) the asyncio event loop between tokens.
import asyncio
_SENT = object()
_it = iter(streamer)
def _next_token():
try:
return next(_it)
except StopIteration:
return _SENT
try:
for text in streamer:
while True:
text = await asyncio.to_thread(_next_token)
if text is _SENT:
break
yield text
except Exception as e:
print(f"Error during stream iteration: {e}")
......@@ -1059,6 +1527,79 @@ class NvidiaBackend(ModelBackend):
del self._kv_past_key_values
self._kv_past_key_values = None
self._kv_prefix_len = 0
def _kv_prefix_supported(self) -> bool:
    """Decide (and memoise) whether the KV-prefix fast-path may be used.

    The result is stored in ``_kv_prefix_ok`` and returned directly on later
    calls while that attribute is not None. The fast-path is turned off when:

    * the model's (text) config has a ``model_type`` starting with ``gemma``,
      a ``cache_implementation`` of ``hybrid`` / ``static`` /
      ``sliding_window``, or a non-None ``sliding_window``;
    * otherwise, when the estimated KV cache for the pending model/context
      exceeds 2 GiB;
    * or when inspecting the config raises.

    Whenever it is turned off, the reason is kept in
    ``_kv_prefix_off_reason`` and a one-line notice is printed.
    """
    memo = getattr(self, "_kv_prefix_ok", None)
    if memo is not None:
        return memo

    try:
        config = getattr(self.model, "config", None)
        # Multimodal wrappers keep the language-model settings in text_config.
        text = getattr(config, "text_config", None) or config
        family = (getattr(text, "model_type", "") or "").lower()
        impl = (getattr(text, "cache_implementation", "") or "").lower()
        window = getattr(text, "sliding_window", None)

        special_cache = (
            family.startswith("gemma")
            or impl in {"hybrid", "static", "sliding_window"}
            or window is not None
        )
        blocker = None
        if special_cache:
            blocker = "hybrid/sliding-window cache"
        else:
            # A stored prefix sits next to the live generation cache, so a
            # large estimate means too much KV memory resident at once.
            kv_bytes = self._estimate_kv_cache_bytes(
                getattr(self, '_pending_model_name', None),
                getattr(self, '_pending_ctx', None),
            )
            if kv_bytes > 2 * 1024 ** 3:
                blocker = f"large KV cache (~{kv_bytes/1e9:.1f}GB at configured ctx)"
    except Exception:
        # Unknown config: take the safe route and skip the fast-path.
        blocker = "config introspection failed"

    supported = blocker is None
    if not supported:
        self._kv_prefix_off_reason = blocker
        print(
            "KV-prefix fast-path disabled for this model "
            f"({getattr(self, '_kv_prefix_off_reason', 'unsupported')}); "
            "using full forward pass"
        )
    self._kv_prefix_ok = supported
    return supported
def _kv_prefix_headroom_ok(self, min_free_gb: float = 1.5) -> bool:
    """Report whether enough CUDA memory is free to build and keep a KV prefix.

    Reads the currently free device memory and returns True when it is at
    least ``min_free_gb`` (compared in units of 1e9 bytes). If the free
    memory cannot be read, returns False so the caller skips the prefix path.
    """
    import torch
    try:
        free_bytes = torch.cuda.mem_get_info()[0]
    except Exception:
        return False
    return free_bytes / 1e9 >= min_free_gb
self._kv_timestamp = 0.0
# ------------------------------------------------------------------
......@@ -1092,8 +1633,21 @@ class NvidiaBackend(ModelBackend):
ids.add(int(self.tokenizer.eos_token_id))
except Exception:
pass
# The model's own generation_config is authoritative for the turn-end
# token(s) — e.g. gemma-4's turn terminator is <turn|> (id 106), which has
# no recognisable name in the loop below, so without this the model never
# stops after a tool call and loops to max_tokens.
try:
gc_eos = getattr(getattr(self.model, 'generation_config', None),
'eos_token_id', None)
if isinstance(gc_eos, int):
ids.add(gc_eos)
elif isinstance(gc_eos, (list, tuple)):
ids.update(int(t) for t in gc_eos if isinstance(t, int))
except Exception:
pass
for tok in ('<|im_end|>', '<|eot_id|>', '<|end|>', '<|endoftext|>',
'<|end_of_text|>', '<end_of_turn>'):
'<|end_of_text|>', '<end_of_turn>', '<turn|>'):
try:
tid = self.tokenizer.convert_tokens_to_ids(tok)
if isinstance(tid, int) and tid >= 0 and tid != getattr(
......@@ -1103,25 +1657,189 @@ class NvidiaBackend(ModelBackend):
pass
return list(ids) if ids else self.tokenizer.eos_token_id
def supports_native_tools(self) -> bool:
    """Report whether the tokenizer's chat template handles ``tools=`` natively.

    When this is True, callers pass structured tools to the template instead
    of injecting the custom ``<tool>{…}`` text prompt.

    ``chat_template`` may be either a single template string or a mapping of
    named templates:

    * string: True when it mentions ``tools`` or ``tool_calls``;
    * mapping: True when it has a ``tool_use`` template (the one transformers
      selects when tools are supplied), when a key is literally ``tools`` /
      ``tool_calls``, or when any template string in it mentions ``tools`` or
      ``tool_calls``;
    * missing/empty template: False.

    Any other template type falls back to a plain containment test and
    yields False if that test is not supported.
    """
    tmpl = getattr(self.tokenizer, 'chat_template', None)
    if not tmpl:
        return False

    def _mentions_tools(text) -> bool:
        return isinstance(text, str) and ('tools' in text or 'tool_calls' in text)

    if isinstance(tmpl, str):
        return _mentions_tools(tmpl)
    if isinstance(tmpl, dict):
        # Named templates: look at the names first, then at the contents.
        if 'tool_use' in tmpl or 'tools' in tmpl or 'tool_calls' in tmpl:
            return True
        return any(_mentions_tools(body) for body in tmpl.values())
    try:
        return 'tools' in tmpl or 'tool_calls' in tmpl
    except TypeError:
        return False
def _native_tools_payload(self, tools):
    """Convert tool definitions into ``{'type', 'function': {...}}`` dicts.

    Each tool may be a dict or an object exposing ``type`` and ``function``
    (with ``name``, ``description``, ``parameters``). Tools without a usable
    function name are skipped. Missing ``type`` defaults to ``'function'``,
    a missing/empty description to ``''`` and missing/empty parameters to
    ``{}``.

    Returns the list of converted tools, or None when ``tools`` is empty or
    nothing usable remains.
    """
    if not tools:
        return None

    def _convert(tool):
        # Pick a field reader that suits the tool's representation.
        if isinstance(tool, dict):
            spec = tool.get('function') or {}
            if not isinstance(spec, dict):
                return None
            read = spec.get
            top = tool.get
        else:
            spec = getattr(tool, 'function', None)
            if not spec:
                return None

            def read(key, default=None):
                return getattr(spec, key, default)

            def top(key, default=None):
                return getattr(tool, key, default)

        name = read('name')
        if not name:
            return None
        return {
            'type': top('type', 'function'),
            'function': {
                'name': name,
                'description': read('description') or '',
                'parameters': read('parameters') or {},
            },
        }

    converted = [entry for entry in map(_convert, tools) if entry is not None]
    return converted or None
def _build_native_tool_prompt(self, messages, native_tools, enable_thinking,
                              add_generation_prompt):
    """Render the prompt through the model's chat template with native ``tools=``.

    Messages (dicts or objects) are normalised to plain dicts while keeping
    structured ``tool_calls``, ``tool_call_id`` and ``name`` so the template
    can emit the model's own tool-call / tool-response format.

    * List-valued ``content`` is flattened to newline-joined text.
    * For ``system`` / ``developer`` messages, the injected
      "You have access to the following tools:" text block is removed so the
      model is not given two tool formats at once.
    * Tool-call ``arguments`` given as a JSON string (OpenAI style) are
      decoded to a mapping, which is the form Hugging Face chat templates
      expect; a missing value becomes ``{}``. Strings that do not decode to a
      JSON object are passed through unchanged.

    The template is tried with ``enable_thinking`` first and without it if
    that raises ``TypeError``.

    Returns the rendered prompt string, or None when the template cannot
    handle the request (the caller then falls back to the generic path).
    """
    import json as _json
    import re as _re

    def _get(m, k, default=None):
        return m.get(k, default) if isinstance(m, dict) else getattr(m, k, default)

    def _strip_injected(text):
        # Remove the custom text tool instructions; native declarations are
        # supplied through tools= instead.
        if not text or 'You have access to the following tools:' not in text:
            return text
        return _re.sub(
            r"You have access to the following tools:.*?example\.txt.*?</tool>\s*",
            "", text, count=1, flags=_re.DOTALL).lstrip()

    def _as_mapping(args):
        # Templates iterate / tojson the arguments, so hand them a mapping
        # rather than an already-encoded JSON string.
        if args is None:
            return {}
        if isinstance(args, str):
            if not args.strip():
                return {}
            try:
                decoded = _json.loads(args)
            except ValueError:
                return args
            return decoded if isinstance(decoded, dict) else args
        return args

    norm = []
    for m in messages:
        role = _get(m, 'role')
        content = _get(m, 'content') or ''
        if isinstance(content, list):
            content = '\n'.join(
                str(p.get('text', '')) if isinstance(p, dict) else str(p)
                for p in content)
        if role in ('system', 'developer'):
            content = _strip_injected(content)
        entry = {'role': role, 'content': content}

        tcs = _get(m, 'tool_calls')
        if tcs:
            norm_tcs = []
            for tc in tcs:
                fn = _get(tc, 'function') or {}
                name = _get(fn, 'name') if isinstance(fn, dict) else getattr(fn, 'name', '')
                args = _as_mapping(_get(fn, 'arguments'))
                tcid = _get(tc, 'id') or f"call_{len(norm_tcs)}"
                norm_tcs.append({'id': tcid, 'type': 'function',
                                 'function': {'name': name, 'arguments': args}})
            entry['tool_calls'] = norm_tcs

        tcid = _get(m, 'tool_call_id')
        if tcid:
            entry['tool_call_id'] = tcid
        name = _get(m, 'name')
        if name:
            entry['name'] = name
        norm.append(entry)

    attempts = (
        {'tools': native_tools, 'add_generation_prompt': add_generation_prompt,
         'enable_thinking': enable_thinking},
        {'tools': native_tools, 'add_generation_prompt': add_generation_prompt},
    )
    for kwargs in attempts:
        try:
            return self.tokenizer.apply_chat_template(norm, tokenize=False, **kwargs)
        except TypeError:
            # Template/tokenizer rejects a kwarg — retry with the smaller set.
            continue
        except Exception:
            return None
    return None
def _build_chat_prompt(self, messages, enable_thinking: bool = False,
add_generation_prompt: bool = True) -> str:
add_generation_prompt: bool = True, tools=None) -> str:
"""Build the prompt string using the MODEL's own chat template when it has
one (correct special tokens + proper `enable_thinking` handling for Qwen3).
Falls back to the legacy custom formatter when no template is available.
`enable_thinking=True` keeps reasoning <think> blocks available for callers
that ask for them; `False` (default) suppresses them via the template.
When `tools` is given and the template natively supports tools, the tools
and the structured tool_calls/tool-role turns are passed straight to the
template (native format) — see :meth:`supports_native_tools`.
"""
import json
tmpl = getattr(self.tokenizer, 'chat_template', None)
# Native-tools fast path: hand structured tools + tool turns to the model's
# own template so it renders (and the model emits) its trained tool-call
# format, instead of folding everything into custom <tool>{…} text.
native_tools = self._native_tools_payload(tools) if (
tmpl and tools and self.supports_native_tools()) else None
if native_tools:
prompt = self._build_native_tool_prompt(
messages, native_tools, enable_thinking, add_generation_prompt)
if prompt is not None:
return prompt
# else: native render failed — fall through to the generic path.
if tmpl:
# Normalise to plain {role, content} dicts for apply_chat_template.
#
# Most chat templates (gemma, mistral, …) only understand the
# system/user/assistant roles and a plain `content` string — they
# ignore `tool_calls`/`tool_call_id` and reject (or silently drop) the
# `tool` role. If we simply stripped those, an agentic client
# (opencode, etc.) would lose the record of the tool call it already
# made *and* the result it got back, so the model re-issues the same
# call every turn — an infinite tool-call loop. So we fold tool turns
# back into `content` using the same `<tool>{…}</tool>` convention the
# tool-injection prompt teaches, and render tool results as readable
# text under a role the template accepts.
def _get(m, k, default=None):
return m.get(k, default) if isinstance(m, dict) else getattr(m, k, default)
norm = []
for m in messages:
if isinstance(m, dict):
norm.append({'role': m.get('role'), 'content': m.get('content') or ''})
role = _get(m, 'role')
content = _get(m, 'content') or ''
if isinstance(content, list):
content = '\n'.join(
str(p.get('text', '')) if isinstance(p, dict) else str(p)
for p in content)
if role == 'assistant':
tcs = _get(m, 'tool_calls') or []
for tc in tcs:
fn = (tc.get('function') if isinstance(tc, dict)
else getattr(tc, 'function', None)) or {}
name = fn.get('name', '') if isinstance(fn, dict) else getattr(fn, 'name', '')
args = fn.get('arguments', '{}') if isinstance(fn, dict) else getattr(fn, 'arguments', '{}')
if not isinstance(args, str):
try:
args = json.dumps(args)
except Exception:
args = '{}'
block = f'<tool>{{"name": "{name}", "arguments": {args}}}</tool>'
content = (content + '\n' + block) if content else block
norm.append({'role': 'assistant', 'content': content})
elif role == 'tool':
# Templates that lack a `tool` role would error/drop this.
# Render the result as a user turn so the model sees it.
name = _get(m, 'name') or ''
label = f'Tool result ({name})' if name else 'Tool result'
text = f'{label}: {content}'
# Merge into the previous turn if it's also a synthesised
# user/tool message, to avoid consecutive same-role turns that
# strict templates (gemma) reject.
if norm and norm[-1]['role'] == 'user':
norm[-1]['content'] = norm[-1]['content'] + '\n' + text
else:
norm.append({'role': 'user', 'content': text})
else:
norm.append({'role': getattr(m, 'role', None),
'content': getattr(m, 'content', '') or ''})
norm.append({'role': role, 'content': content})
try:
return self.tokenizer.apply_chat_template(
norm, tokenize=False,
......@@ -1154,7 +1872,7 @@ class NvidiaBackend(ModelBackend):
max_tokens = 512
full_prompt = self._build_chat_prompt(messages, enable_thinking=enable_thinking,
add_generation_prompt=True)
add_generation_prompt=True, tools=tools)
total_input_ids = self.tokenizer(full_prompt, return_tensors="pt")['input_ids']
total_prompt_len = int(total_input_ids.shape[1])
......@@ -1168,9 +1886,9 @@ class NvidiaBackend(ModelBackend):
past_kv = None
cached_len = 0
if prefix_msgs and self._model_on_cuda():
if prefix_msgs and self._model_on_cuda() and self._kv_prefix_supported() and self._kv_prefix_headroom_ok():
prefix_text = self._build_chat_prompt(
prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False)
prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False, tools=tools)
if self._kv_cache_valid() and self._kv_prefix_text == prefix_text:
past_kv = self._kv_past_key_values
cached_len = self._kv_prefix_len
......@@ -1223,6 +1941,7 @@ class NvidiaBackend(ModelBackend):
input_ids=total_input_ids,
attention_mask=attn_mask,
**gen_kwargs,
**self._cache_gen_kwargs(using_prefix=False),
)
new_tokens = outputs[0][total_prompt_len:]
generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
......@@ -1300,7 +2019,7 @@ class NvidiaBackend(ModelBackend):
max_tokens = 512
full_prompt = self._build_chat_prompt(messages, enable_thinking=enable_thinking,
add_generation_prompt=True)
add_generation_prompt=True, tools=tools)
total_input_ids = self.tokenizer(full_prompt, return_tensors="pt")['input_ids']
total_prompt_len = int(total_input_ids.shape[1])
......@@ -1312,9 +2031,9 @@ class NvidiaBackend(ModelBackend):
past_kv = None
cached_len = 0
if prefix_msgs and self._model_on_cuda():
if prefix_msgs and self._model_on_cuda() and self._kv_prefix_supported() and self._kv_prefix_headroom_ok():
prefix_text = self._build_chat_prompt(
prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False)
prefix_msgs, enable_thinking=enable_thinking, add_generation_prompt=False, tools=tools)
if self._kv_cache_valid() and self._kv_prefix_text == prefix_text:
past_kv = self._kv_past_key_values
cached_len = self._kv_prefix_len
......@@ -1329,21 +2048,46 @@ class NvidiaBackend(ModelBackend):
temperature, top_p, do_sample = self._validate_params(temperature, top_p)
total_input_ids = total_input_ids.to(self.model.device)
if past_kv is not None and 0 < cached_len < total_prompt_len:
# Stopping criteria (thermal checkpoint + optional stop sequences) are
# independent of the KV-prefix path, so build them once and reuse across
# both the cached attempt and any full-forward fallback.
_criteria = []
_therm = _make_thermal_criteria()
if _therm is not None:
_criteria.append(_therm)
if stop:
class _StopOnSeq(StoppingCriteria):
def __init__(self, seqs, tok):
self.seqs = seqs
self.tok = tok
def __call__(self, input_ids, scores, **kw):
decoded = self.tok.decode(input_ids[0][-20:], skip_special_tokens=True)
return any(s in decoded for s in self.seqs)
_criteria.append(_StopOnSeq(stop, self.tokenizer))
stopping = StoppingCriteriaList(_criteria) if _criteria else None
import asyncio
_SENT = object()
def _build_attempt(use_prefix, plain_cache=False):
"""Build (streamer, gen_kwargs, used_cached_len) for one attempt.
use_prefix=False forces a clean full forward pass (no past_key_values);
plain_cache=True forces the default in-GPU cache (fallback path)."""
if use_prefix and past_kv is not None and 0 < cached_len < total_prompt_len:
gen_input_ids = total_input_ids[:, cached_len:]
full_attn = torch.ones(
1, total_prompt_len, dtype=torch.long, device=self.model.device
)
extra_gen = {'past_key_values': past_kv, 'attention_mask': full_attn}
used_cached = cached_len
else:
cached_len = 0
gen_input_ids = total_input_ids
extra_gen = {'attention_mask': torch.ones_like(total_input_ids)}
used_cached = 0
streamer = TextIteratorStreamer(
self.tokenizer, skip_prompt=True, skip_special_tokens=True
)
gen_kwargs = dict(
input_ids=gen_input_ids,
max_new_tokens=max_tokens,
......@@ -1356,26 +2100,25 @@ class NvidiaBackend(ModelBackend):
use_cache=True,
**extra_gen,
)
# Mid-generation thermal checkpoint (runs on the generate thread).
_criteria = []
_therm = _make_thermal_criteria()
if _therm is not None:
_criteria.append(_therm)
if stop:
class _StopOnSeq(StoppingCriteria):
def __init__(self, seqs, tok):
self.seqs = seqs
self.tok = tok
def __call__(self, input_ids, scores, **kw):
decoded = self.tok.decode(input_ids[0][-20:], skip_special_tokens=True)
return any(s in decoded for s in self.seqs)
_criteria.append(_StopOnSeq(stop, self.tokenizer))
if _criteria:
gen_kwargs['stopping_criteria'] = StoppingCriteriaList(_criteria)
if stopping is not None:
gen_kwargs['stopping_criteria'] = stopping
# Select the KV-cache strategy (quantized / offloaded) on the
# full-forward path. Not combinable with a prefix cache; plain_cache
# forces the default cache for the guaranteed-working fallback.
gen_kwargs.update(self._cache_gen_kwargs(
using_prefix=used_cached > 0, plain=plain_cache))
return streamer, gen_kwargs, used_cached
gen_error = [None]
comp_tokens = [0]
final_cached_len = [0]
async def _attempt(use_prefix, plain_cache=False):
"""Run one generation attempt, yielding decoded text. Records any
failure in gen_error[0] and the prefix length actually used."""
gen_error[0] = None
streamer, gen_kwargs, used_cached = _build_attempt(use_prefix, plain_cache)
final_cached_len[0] = used_cached
def _run():
try:
......@@ -1383,27 +2126,71 @@ class NvidiaBackend(ModelBackend):
self.model.generate(**gen_kwargs)
except Exception as e:
gen_error[0] = str(e)
print(f"Error during {'KV-cached' if use_prefix else 'full'} stream generation: {e}")
# Release whatever the failed pass reserved so the next request
# starts from a clean allocator state (esp. on OOM).
if "out of memory" in str(e).lower():
try:
torch.cuda.empty_cache()
except Exception:
pass
finally:
# generate() only ends the streamer on success; if it raised
# before finishing, end it here so the consumer never deadlocks
# on an empty queue (which would freeze the whole event loop).
streamer.end()
thread = Thread(target=_run)
thread.start()
# Pull each token from a worker thread so a blocking streamer.__next__
# never runs on (and freezes) the asyncio event loop between tokens.
_it = iter(streamer)
def _next_token():
try:
for text in streamer:
return next(_it)
except StopIteration:
return _SENT
try:
while True:
text = await asyncio.to_thread(_next_token)
if text is _SENT:
break
comp_tokens[0] += 1
yield text
except Exception as e:
print(f"Error during KV-cached stream iteration: {e}")
finally:
thread.join()
try:
# First attempt: use the cached KV prefix when one is available.
async for text in _attempt(use_prefix=True):
yield text
# If the first attempt failed before emitting any token — whether from
# a stale prefix or an unsupported quantized/offloaded cache — retry
# once with a clean full forward pass and the default in-GPU cache,
# which is guaranteed to work if the model works at all.
if gen_error[0] and comp_tokens[0] == 0:
print("generation failed before first token; retrying with plain full forward pass")
self.invalidate_kv_cache()
async for text in _attempt(use_prefix=False, plain_cache=True):
yield text
finally:
self._last_usage = {
'prompt_tokens': total_prompt_len,
'completion_tokens': comp_tokens[0],
'cached_tokens': cached_len,
'cached_tokens': final_cached_len[0],
}
if gen_error[0]:
print(f"Warning: KV-cached stream generation error: {gen_error[0]}")
print(f"Warning: stream generation error (after fallback): {gen_error[0]}")
self.invalidate_kv_cache()
# If we produced nothing, the client would otherwise receive an empty
# but successful completion. Stream a visible error notice instead.
if comp_tokens[0] == 0:
if "out of memory" in gen_error[0].lower():
yield ("[error: GPU ran out of memory during generation — "
"try a shorter prompt/context or a smaller model]")
else:
yield "[error: text generation failed — see server logs]"
def get_model_name(self) -> str:
    """Return the configured model name, or ``"unknown"`` when it is unset/empty."""
    current = self.model_name
    if not current:
        return "unknown"
    return current
......
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""ds4 (DeepSeek V4) proxy backend.
ds4-server already speaks the OpenAI HTTP API, so this backend is a thin proxy: it
forwards chat/completion requests to the managed ``ds4-server`` subprocess (whose
lifecycle is owned by :mod:`codai.api.ds4_worker`) and adapts the responses to the
:class:`~codai.backends.base.ModelBackend` contract the model manager expects.
Tool/think parsing is handled the same way as the other backends — by
``ModelParserAdapter`` over the returned text — so tools are not forwarded to
ds4-server; the text-level ``DeepSeekParser`` extracts ``<think>`` and tool calls.
"""
import asyncio
import threading
from typing import AsyncGenerator, Dict, List, Optional
from codai.backends.base import ModelBackend
class Ds4Backend(ModelBackend):
    """Proxy backend that routes generation to a managed ds4-server.

    Requests are forwarded to the server's ``/v1/chat/completions`` endpoint;
    ``tools`` arguments are accepted for interface compatibility but are not
    forwarded. Token usage reported by the server is kept in ``_last_usage``.
    """

    def __init__(self, cfg=None):
        """Store the ds4 configuration and derive model id / context size.

        Args:
            cfg: a ``codai.config.Ds4Config``-like object. When omitted, a
                default-constructed ``Ds4Config`` is used.
        """
        if cfg is None:
            from codai.config import Ds4Config
            cfg = Ds4Config()
        self._cfg = cfg
        self._model_id = getattr(cfg, "model_id", "deepseek-v4") or "deepseek-v4"
        # Base URL of the running service; set by load_model(), cleared by cleanup().
        self._url: Optional[str] = None
        self._ctx = int(getattr(cfg, "ctx", 100000) or 100000)
        self._last_usage: Dict = {}

    # ------------------------------------------------------------------ #
    # lifecycle
    # ------------------------------------------------------------------ #
    def load_model(self, model_name: str, **kwargs) -> None:
        """Ensure the ds4 service is running and remember its base URL.

        A non-empty ``model_name`` replaces the model id sent in payloads.
        Extra keyword arguments are accepted and ignored.
        """
        from codai.api import ds4_worker
        if model_name:
            self._model_id = model_name
        self._url = ds4_worker.ensure_service(self._cfg)

    def get_model_name(self) -> str:
        """Return the model id used in request payloads."""
        return self._model_id

    def get_context_size(self) -> int:
        """Return the configured context window size."""
        return self._ctx

    def get_last_usage(self) -> dict:
        """Return a copy of the most recently recorded usage counters."""
        return dict(self._last_usage)

    def cleanup(self) -> None:
        """Stop the managed service and forget its URL."""
        from codai.api import ds4_worker
        ds4_worker.stop_service(getattr(self._cfg, "model_id", self._model_id))
        self._url = None

    # ------------------------------------------------------------------ #
    # helpers
    # ------------------------------------------------------------------ #
    def _base(self) -> str:
        """Return the service base URL.

        Raises:
            RuntimeError: if the service has not been started via load_model().
        """
        if not self._url:
            raise RuntimeError("ds4 service not started")
        return self._url

    def _store_usage(self, usage: dict) -> None:
        """Record the token counters from a non-empty ``usage`` object."""
        if usage:
            self._last_usage = {
                "prompt_tokens": usage.get("prompt_tokens", 0),
                "completion_tokens": usage.get("completion_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            }

    def format_messages(self, messages) -> str:
        """Flatten messages into ``"role: content"`` lines.

        ds4-server applies DeepSeek V4's own chat template server-side; this is
        only used by callers that need a flat prompt string. Messages may be
        dicts or objects exposing ``role`` / ``content`` attributes.
        """
        parts = []
        for m in messages:
            role = m.get("role") if isinstance(m, dict) else getattr(m, "role", "")
            content = m.get("content") if isinstance(m, dict) else getattr(m, "content", "")
            parts.append(f"{role}: {content}")
        return "\n".join(parts)

    def _chat_payload(self, messages, max_tokens, temperature, top_p, stop, stream,
                      response_format=None):
        """Build the JSON body for a chat-completions request.

        ``max_tokens`` and ``stop`` are only included when truthy. A
        ``response_format`` of type ``json_object`` is forwarded; any other
        value is dropped. Handling it here keeps the streaming and
        non-streaming paths consistent.
        """
        payload = {
            "model": self._model_id,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "stream": stream,
        }
        if max_tokens:
            payload["max_tokens"] = max_tokens
        if stop:
            payload["stop"] = stop
        if response_format and response_format.get("type") == "json_object":
            payload["response_format"] = {"type": "json_object"}
        return payload

    # ------------------------------------------------------------------ #
    # chat-level generation (preferred by the manager)
    # ------------------------------------------------------------------ #
    def generate_chat(self, messages: List[Dict], max_tokens=None, temperature=0.7,
                      top_p=1.0, stop=None, tools=None, response_format=None):
        """Run a blocking chat completion and return the assistant text.

        ``tools`` is accepted but not forwarded. Usage counters from the
        response are recorded. HTTP error statuses raise via
        ``raise_for_status()``.
        """
        import requests
        payload = self._chat_payload(messages, max_tokens, temperature, top_p, stop,
                                     False, response_format=response_format)
        r = requests.post(self._base() + "/v1/chat/completions", json=payload, timeout=3600)
        r.raise_for_status()
        data = r.json()
        self._store_usage(data.get("usage", {}))
        return data["choices"][0]["message"].get("content") or ""

    async def generate_chat_stream(self, messages: List[Dict], max_tokens=None,
                                   temperature=0.7, top_p=1.0, stop=None, tools=None,
                                   response_format=None) -> AsyncGenerator[str, None]:
        """Stream a chat completion, yielding content deltas as they arrive.

        ``tools`` is accepted but not forwarded. ``response_format`` is
        forwarded the same way as in :meth:`generate_chat`.
        """
        payload = self._chat_payload(messages, max_tokens, temperature, top_p, stop,
                                     True, response_format=response_format)
        async for chunk in self._stream(self._base() + "/v1/chat/completions", payload,
                                        delta_key="delta"):
            yield chunk

    # ------------------------------------------------------------------ #
    # plain completion (fallback path)
    # ------------------------------------------------------------------ #
    def generate(self, prompt: str, max_tokens=None, temperature: float = 0.7,
                 top_p: float = 1.0, stop=None, repeat_penalty: float = 1.0,
                 presence_penalty: float = 0.0, frequency_penalty: float = 0.0) -> str:
        """Complete ``prompt`` by sending it as a single user chat message.

        The penalty parameters are accepted but not forwarded.
        """
        return self.generate_chat([{"role": "user", "content": prompt}],
                                  max_tokens, temperature, top_p, stop)

    async def generate_stream(self, prompt: str, max_tokens=None, temperature: float = 0.7,
                              top_p: float = 1.0, stop=None, repeat_penalty: float = 1.0,
                              presence_penalty: float = 0.0,
                              frequency_penalty: float = 0.0) -> AsyncGenerator[str, None]:
        """Streaming variant of :meth:`generate` (penalties are not forwarded)."""
        async for chunk in self.generate_chat_stream(
                [{"role": "user", "content": prompt}], max_tokens, temperature, top_p, stop):
            yield chunk

    # ------------------------------------------------------------------ #
    # SSE streaming: iterate the blocking requests stream on a worker thread
    # and hand chunks to the event loop through an asyncio.Queue.
    # ------------------------------------------------------------------ #
    async def _stream(self, url: str, payload: dict, delta_key: str
                      ) -> AsyncGenerator[str, None]:
        """POST ``payload`` and yield the text of each SSE ``data:`` event.

        Args:
            url: full endpoint URL.
            payload: JSON request body.
            delta_key: key inside each choice that holds the ``content`` dict.

        Raises:
            Exception: any error raised on the worker thread is re-raised here.
        """
        import json
        # We are inside a coroutine, so the running loop is the one to post to.
        loop = asyncio.get_running_loop()
        queue: asyncio.Queue = asyncio.Queue()
        _SENTINEL = object()
        # Set when the consumer stops iterating, so the worker can drop the
        # upstream connection instead of reading until generation finishes.
        # It is only checked when a line arrives, not while blocked on the socket.
        cancelled = threading.Event()

        def _emit(item):
            # The loop may already be closed if the consumer went away at
            # shutdown; there is nobody left to notify in that case.
            try:
                loop.call_soon_threadsafe(queue.put_nowait, item)
            except RuntimeError:
                pass

        def _worker():
            import requests
            try:
                with requests.post(url, json=payload, stream=True, timeout=3600) as r:
                    r.raise_for_status()
                    # Iterate raw bytes and decode as UTF-8 ourselves: with
                    # decode_unicode=True requests falls back to ISO-8859-1 for
                    # text/* responses without a charset, and str line-splitting
                    # also breaks on characters that may legally appear inside
                    # a JSON string.
                    for raw in r.iter_lines():
                        if cancelled.is_set():
                            break
                        if not raw:
                            continue
                        line = (raw.decode("utf-8", errors="replace")
                                if isinstance(raw, bytes) else raw)
                        if not line.startswith("data:"):
                            continue
                        data = line[len("data:"):].strip()
                        if data == "[DONE]":
                            break
                        try:
                            obj = json.loads(data)
                        except ValueError:
                            continue
                        choice = (obj.get("choices") or [{}])[0]
                        text = (choice.get(delta_key) or {}).get("content") or ""
                        if text:
                            _emit(text)
                        if obj.get("usage"):
                            self._store_usage(obj["usage"])
                        if choice.get("finish_reason"):
                            break
            except Exception as exc:  # surface to the consumer
                _emit(exc)
            finally:
                _emit(_SENTINEL)

        threading.Thread(target=_worker, daemon=True).start()
        try:
            while True:
                item = await queue.get()
                if item is _SENTINEL:
                    break
                if isinstance(item, Exception):
                    raise item
                yield item
        finally:
            cancelled.set()
......@@ -621,6 +621,27 @@ class VulkanBackend(ModelBackend):
else:
raise ValueError(f"Could not cache model from URL: {model_path}")
# Fallback: a configured .gguf path that no longer exists (e.g. the file was
# downloaded into the GGUF cache rather than the HF-hub snapshot the entry
# points at, or a stale snapshot hash). Look for the same filename in the
# GGUF cache dir before giving up — the model loads without re-editing the
# config entry.
if model_path.endswith('.gguf') and not os.path.exists(model_path):
try:
from codai.models.cache import get_model_cache_dir
_base = os.path.basename(model_path)
_cache = get_model_cache_dir()
_cand = os.path.join(_cache, _base)
if not os.path.exists(_cand):
import glob as _glob
_hits = _glob.glob(os.path.join(_cache, "**", _base), recursive=True)
_cand = _hits[0] if _hits else _cand
if os.path.exists(_cand):
print(f" Model path missing; resolved from GGUF cache: {_cand}")
model_path = _cand
except Exception:
pass
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model file not found: {model_path}")
......
......@@ -49,7 +49,13 @@ def build_hardware_summary() -> Dict[str, Any]:
total_vram_mb = 0
available_vram_mb = 0
# Only use torch if it's ALREADY loaded (i.e. we're in an engine). Never import
# it here — the front is torch-free and must stay that way (importing torch in
# the front is heavy and would initialise CUDA in the wrong process).
import sys as _sys
try:
if "torch" not in _sys.modules:
raise ImportError("torch not loaded (front) — using torch-free path")
import torch
if torch.cuda.is_available():
......@@ -76,6 +82,23 @@ def build_hardware_summary() -> Dict[str, Any]:
except Exception:
pass
# Torch-free path (e.g. the front, which imports no torch): enumerate every
# physical card via nvidia-smi + sysfs so VRAM is reported for the whole node.
if not gpus:
try:
from codai.frontproxy.gpu_detect import gpu_stats
for c in gpu_stats():
total_mb = int(round((c.get("mem_total") or 0) * 1024))
used_mb = int(round((c.get("mem_used") or 0) * 1024))
if total_mb <= 0:
continue
gpus.append({"name": c.get("name") or c.get("vendor"),
"total_vram_mb": total_mb})
total_vram_mb += total_mb
available_vram_mb += max(0, total_mb - used_mb)
except Exception:
pass
if not gpus:
for total_path in sorted(glob.glob("/sys/class/drm/card*/device/mem_info_vram_total")):
used_path = total_path.replace("vram_total", "vram_used")
......
......@@ -60,8 +60,13 @@ def _is_text_response(content_type: str | None) -> bool:
)
async def execute_broker_request(app, envelope):
"""Validate and execute a broker request envelope."""
async def execute_broker_request(app, envelope, executor=None):
"""Validate and execute a broker request envelope.
``executor`` is an ``async (method, path, headers, query, body) -> {status_code,
headers, body}`` callable. When omitted the request is run in-process against
``app`` via the ASGI bridge (engine / single-process mode). The front passes its
own executor that proxies to the right engine over HTTP."""
logger.debug(
"broker dispatch → op=%s request_id=%s path=%r method=%r stream=%s",
......@@ -136,6 +141,12 @@ async def execute_broker_request(app, envelope):
headers["content-type"] = envelope.content_type
started_at = perf_counter()
if executor is not None:
response = await executor(
method=envelope.method, path=envelope.path, headers=headers,
query=envelope.query, body=body,
)
else:
response = await execute_internal_request(
app,
method=envelope.method,
......
......@@ -224,6 +224,13 @@ configuration directory (--config DIR, default: OS-specific CoderAI directory).
action="store_true",
help="Dump model output: raw output, parsed output, and litellm debug info",
)
parser.add_argument(
"--debug-requests",
action="store_true",
help="Log the full request/response payloads exchanged with API clients "
"(opencode, etc.): incoming messages + tools and the outgoing "
"content/tool_calls. Use to diagnose agentic tool-call loops.",
)
parser.add_argument(
"--list-cached-models",
action="store_true",
......@@ -278,4 +285,39 @@ configuration directory (--config DIR, default: OS-specific CoderAI directory).
help="Ignore any existing pipeline cache and rebuild it from scratch this "
"run (use after changing a model's quantization/precision config).",
)
# ─── Frontend/engine split ───────────────────────────────────────────────
parser.add_argument(
"--single-process",
action="store_true",
help="Run the legacy single-process server (UI/API and all model work in "
"one process). Default boots a front proxy + supervised engine "
"subprocess(es) so the web UI stays responsive during model work.",
)
parser.add_argument(
"--engine-only",
action="store_true",
help="Run this process as an engine (binds an internal localhost port, no "
"front proxy). Normally launched automatically by the front; not "
"intended to be run by hand.",
)
parser.add_argument(
"--internal-port",
type=int,
default=None,
help="Internal port for --engine-only mode (the front assigns one per engine).",
)
parser.add_argument(
"--debug-engine",
action="store_true",
help="General engine debugging in the front/engine split (engine lifecycle, "
"spawn details, health transitions). Does NOT include the internal "
"HTTP access log — use --debug-engine-web for that.",
)
parser.add_argument(
"--debug-engine-web",
action="store_true",
help="Show the internal front↔engine HTTP requests in an engine's access log "
"(proxied calls, /internal/engine-state, /healthz, …). Suppressed by "
"default since every engine only ever serves internal front traffic.",
)
return parser.parse_args()
......@@ -34,6 +34,43 @@ class ServerConfig:
https_cert_path: Optional[str] = None
queue_max_size: int = 6
max_parallel_requests: int = 2
# Per-engine overrides for max_parallel_requests, keyed by engine name
# (e.g. {"nvidia": 4, "radeon": 1}). Each engine is a separate process and
# enforces this on itself, so the default already applies per-engine; the
# override lets a bigger card run more concurrently than a smaller one. Blank =
# use the default above.
max_parallel_requests_overrides: dict = field(default_factory=dict)
# ─── Frontend/engine split ───────────────────────────────────────────────
# By default coderai boots a thin, always-responsive *front* reverse proxy on
# the public host/port and supervises one or more *engine* subprocesses (which
# do all GPU/model work) on internal localhost ports. This keeps the web UI
# responsive while a model loads or generates. Set single_process=True (or pass
# --single-process) to keep the legacy one-process behavior.
single_process: bool = False
internal_port_base: int = 8780 # first engine binds here; +1 per extra engine
engines: int = 0 # 0 = auto (one per detected GPU, min 1)
engine_gpus: Optional[list] = None # explicit GPU indices, e.g. [0, 1]; None = auto
proxy_status_timeout: float = 2.0 # short timeout for UI/status proxying (seconds)
proxy_max_inflight: int = 64 # max concurrent proxied requests through the front
# Explicit, heterogeneous engine declarations. Auto GPU detection only finds
# NVIDIA cards and assumes one backend, and CUDA vs Vulkan device enumeration is
# inconsistent — so for mixed setups (e.g. an NVIDIA + a Radeon card, where the
# NVIDIA engine also serves GGUF via Vulkan) declare each engine with its own
# env block. When non-empty this overrides `engines`/`engine_gpus`. Each item:
# {
# "name": "nvidia", # label for logs
# "backend": "nvidia", # nvidia | vulkan (forces this engine's backend)
# "capabilities": [...], # optional; defaults from backend (see below)
# "env": { "CUDA_VISIBLE_DEVICES": "0", "GGML_VK_VISIBLE_DEVICES": "0",
# "VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json" }
# }
# Default capabilities: nvidia → ["transformers","gguf"]; vulkan → ["gguf"].
engine_specs: Optional[list] = None
# Preferred engine (by name or backend) when a model is compatible with more
# than one — e.g. a GGUF that could run on either an NVIDIA or a Radeon engine.
# None = spread to the least-loaded compatible engine. A per-model "engine" set
# in models.json overrides this for that model.
default_engine: Optional[str] = None
@dataclass
......@@ -52,6 +89,9 @@ class ModelsConfig:
hf_cache_dir: Optional[str] = None
gguf_cache_dir: Optional[str] = None
max_model_instances: int = 1 # max concurrent instances per model (global default; overridable per-model via "max_instances")
# Per-engine overrides for max_model_instances, keyed by engine name
# (e.g. {"nvidia": 2, "radeon": 1}). Applied per-engine process; blank = default.
max_model_instances_overrides: dict = field(default_factory=dict)
@dataclass
......@@ -72,6 +112,13 @@ class OffloadConfig:
max_ram_gb: Optional[float] = None
evict_idle_on_ram: bool = True # unload idle LRU models when over the RAM cap
ram_leak_watch: bool = True # background watcher samples RSS + auto-mitigates
# Leak-watch mitigation tuning. The watcher runs a mitigation ladder when RSS
# crosses ram_watch_soft_fraction of the cap (or a leak is suspected). On a
# marginal GPU the cross-thread CUDA call in that ladder can be undesirable, so
# ram_watch_cuda gates whether mitigation is allowed to call torch.cuda.empty_cache().
ram_watch_poll_seconds: float = 15.0 # how often the watcher samples RSS
ram_watch_soft_fraction: float = 0.90 # mitigate at/above this fraction of the cap
ram_watch_cuda: bool = True # allow mitigation to call CUDA empty_cache()
@dataclass
......@@ -130,6 +177,11 @@ class ThermalConfig:
cpu_resume: float = 87.0 # resume once CPU drops back to/below this
gpu_high: float = 90.0 # pause when GPU reaches this temperature
gpu_resume: float = 87.0 # resume once GPU drops back to/below this
# Per-vendor GPU threshold overrides, e.g. {"amd": {"high": 95, "resume": 92}}.
# A card uses its vendor's override when present, else the gpu_high/gpu_resume
# defaults above — so e.g. a Radeon that runs hotter can have a higher limit
# than an NVIDIA card. Keyed by vendor: "nvidia" | "amd" | "intel".
gpu_overrides: dict = field(default_factory=dict)
poll_seconds: float = 5.0 # how often to re-check while cooling down
# Proactive soft-throttle: before a hard pause, when a sensor enters the warm
# band [soft_throttle_temp, *_high) insert a short per-step sleep (scaled by
......@@ -162,6 +214,30 @@ class EnhanceConfig:
allow_rife_ncnn: bool = False # allow the external rife-ncnn-vulkan binary instead of a torch model
@dataclass
class Ds4Config:
    """Settings for serving DeepSeek V4 through the external ds4 engine.

    ds4 (antirez/DwarfStar) is a native inference engine built specifically for
    DeepSeek V4 that exposes an OpenAI-compatible HTTP server (``ds4-server``).
    With ``enabled`` set, coderai owns the whole lifecycle: on first use it
    clones + builds ds4, downloads the chosen GGUF weight variant, launches
    ``ds4-server`` as a managed subprocess, and proxies text requests to it.
    Any requested model whose name matches ``model_id`` (or contains
    ``deepseek-v4``) is routed to ds4 instead of the normal backends.
    """

    enabled: bool = False
    repo_url: str = "https://github.com/antirez/ds4"
    # None = ~/.coderai/ds4
    install_dir: Optional[str] = None
    # auto|cuda-generic|cuda-spark|metal|cpu
    build_target: str = "auto"
    # download_model.sh variant
    model_variant: str = "q4-imatrix"
    # model id/alias that routes to ds4
    model_id: str = "deepseek-v4"
    host: str = "127.0.0.1"
    # 0 = auto-pick a free port
    port: int = 0
    # ds4-server --ctx context window
    ctx: int = 100000
    # extra flags passed to ds4-server
    extra_args: str = ""
    # clone+build the binary if it's missing
    auto_build: bool = True
@dataclass
class Config:
"""Main configuration class."""
......@@ -177,6 +253,7 @@ class Config:
thermal: ThermalConfig = field(default_factory=ThermalConfig)
jobs: JobsConfig = field(default_factory=JobsConfig)
enhance: EnhanceConfig = field(default_factory=EnhanceConfig)
ds4: Ds4Config = field(default_factory=Ds4Config)
broker: BrokerConfig = field(default_factory=BrokerConfig)
system_prompt: Optional[str] = None
tools_closer_prompt: bool = False
......@@ -338,6 +415,7 @@ class ConfigManager:
thermal=ThermalConfig(**config_data.get("thermal", {})),
jobs=JobsConfig(**config_data.get("jobs", {})),
enhance=EnhanceConfig(**config_data.get("enhance", {})),
ds4=Ds4Config(**config_data.get("ds4", {})),
broker=BrokerConfig(**config_data.get("broker", {})),
system_prompt=config_data.get("system_prompt"),
tools_closer_prompt=config_data.get("tools_closer_prompt", False),
......@@ -401,6 +479,15 @@ class ConfigManager:
"https_cert_path": self.config.server.https_cert_path,
"queue_max_size": self.config.server.queue_max_size,
"max_parallel_requests": self.config.server.max_parallel_requests,
"max_parallel_requests_overrides": self.config.server.max_parallel_requests_overrides,
"single_process": self.config.server.single_process,
"internal_port_base": self.config.server.internal_port_base,
"engines": self.config.server.engines,
"engine_gpus": self.config.server.engine_gpus,
"proxy_status_timeout": self.config.server.proxy_status_timeout,
"proxy_max_inflight": self.config.server.proxy_max_inflight,
"engine_specs": self.config.server.engine_specs,
"default_engine": self.config.server.default_engine,
},
"backend": {
"type": self.config.backend.type,
......@@ -412,6 +499,8 @@ class ConfigManager:
"default_load_mode": self.config.models.default_load_mode,
"hf_cache_dir": self.config.models.hf_cache_dir,
"gguf_cache_dir": self.config.models.gguf_cache_dir,
"max_model_instances": self.config.models.max_model_instances,
"max_model_instances_overrides": self.config.models.max_model_instances_overrides,
},
"offload": {
"directory": self.config.offload.directory,
......@@ -424,7 +513,10 @@ class ConfigManager:
"flash_attention": self.config.offload.flash_attention,
"max_ram_gb": self.config.offload.max_ram_gb,
"evict_idle_on_ram": self.config.offload.evict_idle_on_ram,
"ram_leak_watch": self.config.offload.ram_leak_watch
"ram_leak_watch": self.config.offload.ram_leak_watch,
"ram_watch_poll_seconds": self.config.offload.ram_watch_poll_seconds,
"ram_watch_soft_fraction": self.config.offload.ram_watch_soft_fraction,
"ram_watch_cuda": self.config.offload.ram_watch_cuda
},
"vulkan": {
"n_gpu_layers": self.config.vulkan.n_gpu_layers,
......@@ -458,6 +550,7 @@ class ConfigManager:
"cpu_resume": self.config.thermal.cpu_resume,
"gpu_high": self.config.thermal.gpu_high,
"gpu_resume": self.config.thermal.gpu_resume,
"gpu_overrides": self.config.thermal.gpu_overrides,
"poll_seconds": self.config.thermal.poll_seconds,
"soft_throttle_enabled": self.config.thermal.soft_throttle_enabled,
"soft_throttle_temp": self.config.thermal.soft_throttle_temp,
......@@ -470,6 +563,19 @@ class ConfigManager:
"allow_ffmpeg": self.config.enhance.allow_ffmpeg,
"allow_rife_ncnn": self.config.enhance.allow_rife_ncnn,
},
"ds4": {
"enabled": self.config.ds4.enabled,
"repo_url": self.config.ds4.repo_url,
"install_dir": self.config.ds4.install_dir,
"build_target": self.config.ds4.build_target,
"model_variant": self.config.ds4.model_variant,
"model_id": self.config.ds4.model_id,
"host": self.config.ds4.host,
"port": self.config.ds4.port,
"ctx": self.config.ds4.ctx,
"extra_args": self.config.ds4.extra_args,
"auto_build": self.config.ds4.auto_build,
},
"broker": {
"enabled": self.config.broker.enabled,
"base_url": self.config.broker.base_url,
......
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Front proxy package: always-responsive web/API front + supervised engines.
See ``docs/frontend-engine-split.md`` and ``docs/process-isolation-plans.md``.
"""
from codai.frontproxy.app import run_front, build_app
__all__ = ["run_front", "build_app"]
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""The front: a thin, always-responsive reverse proxy in front of the engines.
It imports no torch/transformers/diffusers, so its event loop is never blocked by
model work. It streams requests/responses (incl. SSE) to the engine chosen by
:mod:`codai.frontproxy.router`, and serves an aggregated, cached status so the web
UI stays live even while an engine is busy loading a model.
"""
import json
import time
from typing import Optional
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from starlette.background import BackgroundTask
from codai.frontproxy.registry import EngineRegistry
from codai.frontproxy.engine_supervisor import EngineSupervisor
from codai.frontproxy import router as _router
# Hop-by-hop headers that must not be forwarded verbatim (RFC 7230 §6.1).
# Names are lower-case because _filter_headers compares against k.lower().
_HOP_BY_HOP = {
"connection", "keep-alive", "proxy-authenticate", "proxy-authorization",
"te", "trailers", "transfer-encoding", "upgrade",
}
# Request headers dropped before forwarding: the hop-by-hop set plus the
# host/length headers that the client/StreamingResponse recompute.
# Also strip any client-supplied internal token so a caller can't spoof/override the
# real one the front injects — only the front's httpx default header reaches engines.
_DROP_REQ = _HOP_BY_HOP | {"host", "content-length", "x-coderai-internal"}
# Response headers dropped before relaying to the caller: hop-by-hop plus length.
_DROP_RESP = _HOP_BY_HOP | {"content-length"}
class FrontProxy:
def __init__(self, config, config_dir=None):
    """Set up routing state, the engine registry and the two upstream clients.

    ``config_dir``, when given, is the directory that holds ``models.json``;
    without it no per-model pins are available.
    """
    import os
    import secrets

    self.config = config
    self.default_engine = getattr(config.server, "default_engine", None)

    # models.json is the source of per-model pins; the parsed map is cached
    # and rebuilt when the file's mtime changes (see _model_info).
    if config_dir:
        self._models_path = os.path.join(config_dir, "models.json")
    else:
        self._models_path = None
    self._pins: dict = {}
    self._pins_mtime: float = -1.0

    self.registry = EngineRegistry()
    self.supervisor: Optional[EngineSupervisor] = None

    # Random per-instance secret. Both clients below send it as a default
    # header on every engine request, and build_app hands the same value to
    # the supervisor so spawned engines can be told what to expect.
    self.internal_token = secrets.token_urlsafe(32)
    internal_headers = {"x-coderai-internal": self.internal_token}

    # Status/UI calls use a short timeout; generation calls have no
    # read/write/pool timeout because they may wait on a model load.
    status_timeout = config.server.proxy_status_timeout
    self._short = httpx.AsyncClient(timeout=status_timeout,
                                    headers=internal_headers)
    generation_timeout = httpx.Timeout(connect=10.0, read=None, write=None,
                                       pool=None)
    self._long = httpx.AsyncClient(timeout=generation_timeout,
                                   headers=internal_headers)

    self._status_cache: Optional[dict] = None
    self._status_cache_at: float = 0.0
    self._broker = None
    self.debug_engine = False  # --debug-engine: verbose engine lifecycle
async def aclose(self):
    """Close the short-timeout client first, then the long-running one."""
    for client in (self._short, self._long):
        await client.aclose()
# ------------------------------------------------------------------ broker
def start_broker(self):
    """Start the AISBF broker inside the front process, if it is enabled.

    Returns silently when the broker section is missing or disabled, when the
    broker modules cannot be imported, or when the runtime config is invalid
    or reports itself disabled. Otherwise a ``BrokerClient`` is created whose
    dispatcher sends each brokered request through :meth:`broker_execute`.
    """
    broker_cfg = getattr(self.config, "broker", None)
    if broker_cfg is None:
        return
    if not getattr(broker_cfg, "enabled", False):
        return

    # Imported lazily so a front without the broker package still starts.
    try:
        from codai.broker import build_broker_runtime_config, BrokerConfigError
        from codai.broker.client import BrokerClient
        from codai.broker.service import BrokerService
        from codai.broker.dispatcher import execute_broker_request
    except Exception as exc:
        print(f"[front] broker not available: {exc}", flush=True)
        return

    try:
        runtime = build_broker_runtime_config(broker_cfg)
    except BrokerConfigError as exc:
        print(f"[front] broker disabled (invalid config): {exc}", flush=True)
        return
    if not runtime.enabled:
        return

    client = BrokerClient(runtime)

    async def _dispatch(message):
        # Convert the broker message into an envelope and execute it with
        # this front as the executor (no app object is passed).
        return await execute_broker_request(
            None, client.message_to_envelope(message),
            executor=self.broker_execute)

    client.dispatcher = _dispatch
    service = BrokerService(client)  # app=None → keep our dispatcher
    self._broker = service
    service.start()
    print("[front] AISBF broker started (front-managed, routes to engines)",
          flush=True)
async def stop_broker(self):
    """Stop the broker service if one was started, then forget it."""
    if self._broker is None:
        return
    await self._broker.stop()
    self._broker = None
async def collect_models(self, headers):
    """Merge ``/v1/models`` from every healthy engine into one listing.

    Each healthy engine is queried with the short-timeout client. Engines
    that cannot be reached, that answer with unparsable JSON, or whose
    payload does not have the expected shape (a JSON object whose ``data``
    is a list of objects) are skipped rather than failing the whole listing.
    Models are de-duplicated by ``id``; the first engine to report an id wins
    and first-seen order is kept.

    Returns:
        ``("ok", {"object": "list", "data": [...]})`` normally, or
        ``("passthrough", response)`` when no model was collected and at
        least one engine answered with a non-200 status; the first such
        response is returned so the caller can relay it (e.g. an auth error).
    """
    seen, order, relay = {}, [], None
    for e in self.registry.healthy():
        try:
            r = await self._short.get(e.url + "/v1/models", headers=headers)
        except Exception:
            continue
        if r.status_code != 200:
            # Keep the first non-200 response; tested with `is None` so the
            # choice never depends on the response object's truthiness.
            if relay is None:
                relay = r
            continue
        try:
            data = r.json()
        except Exception:
            continue
        # A syntactically valid but wrongly shaped body (null, list, string,
        # or a non-list "data") is treated like unparsable JSON.
        if not isinstance(data, dict):
            continue
        entries = data.get("data")
        if not isinstance(entries, list):
            continue
        for m in entries:
            if not isinstance(m, dict):
                continue
            mid = m.get("id")
            if mid and mid not in seen:
                seen[mid] = m
                order.append(mid)
    if not order and relay is not None:
        return ("passthrough", relay)
    return ("ok", {"object": "list", "data": [seen[i] for i in order]})
async def broker_execute(self, *, method, path, headers, query, body):
    """Execute one brokered request and return a buffered response dict.

    ``GET /v1/models`` (query string and trailing slash ignored) is answered
    from :meth:`collect_models` so it reflects the union across engines, not
    a single engine's subset. Every other request goes through
    :meth:`_broker_execute_route`.

    Returns a dict with ``status_code``, ``headers`` and ``body`` (bytes).
    """
    bare_path = path.split("?", 1)[0].rstrip("/")
    wants_model_list = method.upper() == "GET" and bare_path == "/v1/models"
    if not wants_model_list:
        return await self._broker_execute_route(method=method, path=path,
                                                headers=headers, query=query,
                                                body=body)

    forwarded = {}
    for name, value in (headers or {}).items():
        if name.lower() not in _DROP_REQ:
            forwarded[name] = value
    kind, val = await self.collect_models(forwarded)
    if kind != "ok":
        # Relay the engine's own (non-200) answer unchanged.
        return {"status_code": val.status_code, "headers": dict(val.headers),
                "body": val.content}
    import json as _json
    return {"status_code": 200,
            "headers": {"content-type": "application/json"},
            "body": _json.dumps(val).encode()}
async def _broker_execute_route(self, *, method, path, headers, query, body):
    """Route one brokered request to an engine and buffer its response.

    For POSTs to an inference path the JSON body is peeked for ``model`` so
    the router can honour capabilities and per-model pins; an unparsable
    body simply routes without a model.

    Returns:
        A dict with ``status_code``, ``headers`` and ``body`` (bytes): the
        engine's response, a 503 when no engine could be picked, or a 502
        when the request to the engine raised. Error bodies are always valid
        JSON, whatever characters the exception message contains.
    """
    import json as _json

    json_headers = {"content-type": "application/json"}

    model = None
    if method.upper() == "POST" and _router.is_inference_path(path):
        try:
            model = (_json.loads(body or b"{}") or {}).get("model")
        except Exception:
            model = None
    engine = _router.pick_engine(
        self.registry, path, method, model,
        required_cap=self._required_cap(path, model),
        default_engine=self.default_engine, pinned=self._pin_for(model))
    if engine is None:
        return {"status_code": 503, "headers": json_headers,
                "body": b'{"error":"No engine is ready yet."}'}
    send_headers = {k: v for k, v in (headers or {}).items()
                    if k.lower() not in _DROP_REQ}
    try:
        r = await self._long.request(method, engine.url + path,
                                     headers=send_headers, params=query or {},
                                     content=body or b"")
    except Exception as exc:
        # Serialize with json.dumps: the exception text may contain quotes,
        # backslashes or newlines that would corrupt a hand-built literal.
        message = "engine#%s unreachable: %s" % (engine.id, exc)
        return {"status_code": 502, "headers": json_headers,
                "body": _json.dumps({"error": message},
                                    separators=(",", ":")).encode()}
    return {"status_code": r.status_code, "headers": dict(r.headers),
            "body": r.content}
# ------------------------------------------------------------------ helpers
@staticmethod
def _filter_headers(headers, drop) -> list:
return [(k, v) for k, v in headers.items() if k.lower() not in drop]
def _model_info(self, model: Optional[str]) -> dict:
    """Look up ``{"engine": ..., "backend": ...}`` for ``model``.

    The lookup table comes from :meth:`_load_pins` and is rebuilt whenever
    the mtime of models.json differs from the one last seen. The model name
    is matched case-insensitively, first in full and then by its last
    ``/``-separated component. Returns ``{}`` when there is no model, no
    models.json path, the file cannot be stat'ed, or nothing matches.
    """
    import os

    if not (model and self._models_path):
        return {}
    try:
        current_mtime = os.path.getmtime(self._models_path)
    except OSError:
        return {}
    if current_mtime != self._pins_mtime:
        self._pins = self._load_pins()
        self._pins_mtime = current_mtime

    key = model.lower()
    by_full_name = self._pins.get(key)
    if by_full_name:
        return by_full_name
    by_short_name = self._pins.get(key.split("/")[-1])
    return by_short_name or {}
def _pin_for(self, model: Optional[str]) -> Optional[str]:
    """Return the engine pin recorded for ``model`` in models.json, if any."""
    info = self._model_info(model)
    return info.get("engine")
def _load_pins(self) -> dict:
    """Parse models.json into a ``{lower-cased name: record}`` lookup table.

    Every top-level key whose value is a list is scanned; each dict entry in
    it yields one record ``{"engine": pin-or-None, "backend": name-or-None}``
    (blank strings become ``None``). The record is registered under the
    entry's ``path``, ``id`` and ``alias``, each both in full and by its last
    ``/``-separated component, all lower-cased.

    Returns an empty dict when the file cannot be read or parsed, or when its
    top level is not a JSON object.
    """
    import json as _json

    info: dict = {}
    try:
        # Context manager so the handle is closed on every path, including a
        # parse error.
        with open(self._models_path) as fh:
            data = _json.load(fh)
    except Exception:
        return info
    if not isinstance(data, dict):
        return info

    def _clean(value):
        # str() tolerates a non-string value (e.g. a number) in the file.
        return str(value or "").strip() or None

    for lst in data.values():
        if not isinstance(lst, list):
            continue
        for m in lst:
            if not isinstance(m, dict):
                continue
            rec = {"engine": _clean(m.get("engine")),
                   "backend": _clean(m.get("backend"))}
            for field_ in (m.get("path"), m.get("id"), m.get("alias")):
                if field_:
                    name = str(field_)
                    info[name.lower()] = rec
                    info[name.split("/")[-1].lower()] = rec
    return info
def _required_cap(self, path: str, model: Optional[str]) -> Optional[str]:
    """Ask the router which capability ``model`` on ``path`` requires.

    The model's backend (from models.json) and the ds4 settings (model id and
    enabled flag, when a ``ds4`` config section exists) are passed along as
    hints.
    """
    ds4 = getattr(self.config, "ds4", None)
    if ds4:
        ds4_model_id = getattr(ds4, "model_id", None)
        ds4_enabled = bool(getattr(ds4, "enabled", False))
    else:
        ds4_model_id = None
        ds4_enabled = False
    backend = self._model_info(model).get("backend")
    return _router.required_capability(
        model, path=path, backend=backend,
        ds4_model_id=ds4_model_id, ds4_enabled=ds4_enabled)
@staticmethod
def _peek_model(body: bytes, content_type: str) -> Optional[str]:
if not body or "application/json" not in (content_type or "").lower():
return None
try:
return (json.loads(body) or {}).get("model")
except Exception:
return None
# High-frequency dashboard pollers: serve with a short timeout and a graceful
# fallback so a momentarily-blocked engine loop can never hang the web UI.
# NOTE(review): nothing in the visible code reads this set — build_app registers
# the poll() routes for these two paths explicitly — confirm it is still needed.
_POLL_PATHS = {"/admin/api/tasks", "/admin/api/system-stats"}
async def poll(self, request: Request) -> Response:
    """Serve a dashboard poll from the primary engine, never hanging the UI.

    With no primary engine an ``engine: down`` placeholder is returned. For
    the tasks endpoint a 200 JSON answer gets tasks from the other engines
    merged in, plus the list of cooling engines; if that enrichment fails the
    raw engine response is relayed instead. Any failure talking to the engine
    yields an ``engine: loading`` placeholder (with the other engines' known
    tasks for the tasks endpoint).
    """
    primary = self.registry.primary()
    if primary is None:
        return JSONResponse({"engine": "down", "tasks": [], "queue": []})

    path = request.url.path
    wants_tasks = path.rstrip("/").endswith("/tasks")
    try:
        upstream = await self._short.get(
            primary.url + path,
            headers=self._filter_headers(request.headers, _DROP_REQ),
            params=request.query_params)
        if wants_tasks and upstream.status_code == 200:
            try:
                payload = upstream.json()
                payload["tasks"] = self._merge_engine_tasks(
                    primary, payload.get("tasks") or [])
                payload["cooling_engines"] = self._cooling_engines()
                return JSONResponse(payload)
            except Exception:
                # Unexpected payload shape: fall through and relay it as-is.
                pass
        relay_headers = dict(self._filter_headers(upstream.headers, _DROP_RESP))
        return Response(content=upstream.content,
                        status_code=upstream.status_code,
                        headers=relay_headers,
                        media_type=upstream.headers.get("content-type"))
    except Exception:
        # Engine busy (event loop blocked by GIL-heavy work) — don't hang the UI.
        # Still surface known running tasks from other engines.
        known = self._merge_engine_tasks(primary, []) if wants_tasks else []
        return JSONResponse({"engine": "loading", "stale": True,
                             "tasks": known, "queue": []})
async def is_admin(self, request: Request) -> bool:
    """Check whether the caller may perform a front-handled admin action.

    The caller's (filtered) headers are replayed against the primary engine's
    ``/admin/api/status``; only a 200 answer counts as authorized. No primary
    engine, or any error while asking, means "not authorized".
    """
    primary = self.registry.primary()
    if primary is None:
        return False
    try:
        probe = await self._short.get(
            primary.url + "/admin/api/status",
            headers=self._filter_headers(request.headers, _DROP_REQ))
    except Exception:
        return False
    return probe.status_code == 200
def engines_list(self) -> list:
    """Describe every registered engine as a JSON-friendly dict."""

    def _pid_of(engine):
        # Reading the pid is best-effort; any failure is reported as None.
        try:
            return engine.proc.pid if engine.proc else None
        except Exception:
            return None

    described = []
    for engine in self.registry.all():
        pid = _pid_of(engine)
        described.append({
            "id": engine.id,
            "name": engine.name,
            "backend": engine.backend,
            "gpu": engine.gpu,
            "healthy": engine.healthy,
            "primary": engine.primary,
            "vram": engine.vram,
            "cooling": bool(engine.cooling),
            "loaded_models": sorted(engine.loaded_models),
            "pid": pid,
        })
    return described
def _cooling_engines(self) -> list:
    """List the engines whose ``cooling`` info is currently set.

    Each item carries the engine name plus the ``gpu``, ``cpu`` and
    ``message`` fields of its cooling record.
    """
    return [
        {"engine": engine.name,
         "gpu": engine.cooling.get("gpu"),
         "cpu": engine.cooling.get("cpu"),
         "message": engine.cooling.get("message")}
        for engine in self.registry.all()
        if engine.cooling
    ]
def _merge_engine_tasks(self, primary, primary_tasks: list) -> list:
    """Combine the primary's tasks with those recorded on the other engines.

    Primary tasks come first and keep an ``engine`` tag they already carry
    (defaulting to the primary's name). Tasks from every other engine are
    then appended, tagged with that engine's name, unless their ``id`` was
    already seen. Non-dict entries are ignored; inputs are copied, never
    mutated.
    """
    primary_name = primary.name if primary else None

    merged = []
    for task in primary_tasks:
        if not isinstance(task, dict):
            continue
        tagged = dict(task)
        tagged.setdefault("engine", primary_name)
        merged.append(tagged)
    seen = {task.get("id") for task in merged}

    for engine in self.registry.all():
        if primary is not None and engine.id == primary.id:
            continue  # already covered by primary_tasks
        for task in engine.tasks or []:
            if not isinstance(task, dict):
                continue
            task_id = task.get("id")
            if task_id in seen:
                continue
            seen.add(task_id)
            merged.append({**task, "engine": engine.name})
    return merged
# -------------------------------------------------------------------- proxy
async def proxy(self, request: Request) -> Response:
    """Forward a request to the engine chosen by the router and stream back.

    POSTs to inference paths are buffered so the ``model`` field can drive
    routing; all other bodies are streamed straight through. Returns 503 when
    no engine can be picked and 502 when the engine cannot be reached.
    """
    path, method = request.url.path, request.method

    buffered: Optional[bytes] = None
    model = None
    if method == "POST" and _router.is_inference_path(path):
        buffered = await request.body()
        model = self._peek_model(buffered,
                                 request.headers.get("content-type", ""))

    engine = _router.pick_engine(
        self.registry, path, method, model,
        required_cap=self._required_cap(path, model),
        default_engine=self.default_engine, pinned=self._pin_for(model))
    if engine is None:
        return JSONResponse(
            {"error": "No engine is ready yet (still starting/loading)."},
            status_code=503)

    if buffered is None:
        outgoing_body = request.stream()
    else:
        outgoing_body = buffered
    upstream_request = self._long.build_request(
        method, engine.url + path,
        headers=self._filter_headers(request.headers, _DROP_REQ),
        params=request.query_params,
        content=outgoing_body)
    try:
        upstream = await self._long.send(upstream_request, stream=True)
    except Exception as exc:
        return JSONResponse(
            {"error": f"Engine#{engine.id} unreachable: {exc}"}, status_code=502)

    # Relay raw bytes as they arrive; the upstream response is closed by the
    # background task once the StreamingResponse has been sent.
    return StreamingResponse(
        upstream.aiter_raw(),
        status_code=upstream.status_code,
        headers=dict(self._filter_headers(upstream.headers, _DROP_RESP)),
        media_type=upstream.headers.get("content-type"),
        background=BackgroundTask(upstream.aclose),
    )
# ----------------------------------------------------------------- status
async def status(self, request: Request) -> Response:
    """Serve ``/admin/api/status`` with cross-engine totals and a fallback.

    The caller's request is replayed against the primary engine. A 200 answer
    is enriched by :meth:`_overlay_engine_totals`, remembered as the last
    good status and returned; any other status is relayed unchanged. With no
    primary engine, or on any error, the cached status is served with an
    ``engine`` marker of ``down`` or ``loading`` respectively.
    """
    primary = self.registry.primary()
    if primary is None:
        return self._cached_status("down")
    try:
        upstream = await self._short.get(
            primary.url + request.url.path,
            headers=self._filter_headers(request.headers, _DROP_REQ),
            params=request.query_params)
        if upstream.status_code == 200:
            merged = self._overlay_engine_totals(upstream.json())
            self._status_cache = merged
            self._status_cache_at = time.monotonic()
            return JSONResponse(merged)
        # Pass through auth redirects/errors unchanged (e.g. login needed).
        return Response(
            content=upstream.content,
            status_code=upstream.status_code,
            headers=dict(self._filter_headers(upstream.headers, _DROP_RESP)),
            media_type=upstream.headers.get("content-type"))
    except Exception:
        return self._cached_status("loading")
def _overlay_engine_totals(self, data: dict) -> dict:
    """Add per-engine details and, with several healthy engines, node totals.

    ``data`` is updated in place and returned. ``x_engines`` is always set.
    Only when more than one engine is healthy are ``vram`` (summed over the
    engines that report VRAM) and ``loaded_models`` / ``models_loaded`` (the
    union of loaded models) replaced with aggregates.
    """
    engines = self.registry.all()

    summaries = []
    all_loaded = set()
    totals = {"used": 0.0, "free": 0.0, "total": 0.0}
    any_vram = False
    healthy_count = 0
    for engine in engines:
        summaries.append({
            "id": engine.id, "name": engine.name, "backend": engine.backend,
            "gpu": engine.gpu, "healthy": engine.healthy, "vram": engine.vram,
            "capabilities": sorted(engine.capabilities),
            "loaded_models": sorted(engine.loaded_models),
        })
        all_loaded |= engine.loaded_models
        if engine.healthy:
            healthy_count += 1
        if engine.vram:
            any_vram = True
            for field in ("used", "free", "total"):
                totals[field] += engine.vram.get(field, 0.0)

    data["x_engines"] = summaries
    if healthy_count <= 1:
        return data
    if any_vram:
        data["vram"] = {"used": round(totals["used"], 2),
                        "free": round(totals["free"], 2),
                        "total": round(totals["total"], 2),
                        "gpu": f"{len(engines)} engines"}
    if all_loaded:
        data["loaded_models"] = sorted(all_loaded)
        data["models_loaded"] = len(all_loaded)
    return data
def _cached_status(self, engine_state: str) -> Response:
    """Answer from the last good status, flagged as stale.

    Falls back to an empty placeholder when nothing has been cached yet, and
    adds ``stale_age_seconds`` once a cache timestamp exists.
    """
    placeholder = {"models_loaded": 0, "loaded_models": [], "vram": None}
    body = dict(self._status_cache or placeholder)
    body.update(engine=engine_state, stale=True)
    if self._status_cache_at:
        age = time.monotonic() - self._status_cache_at
        body["stale_age_seconds"] = round(age, 1)
    return JSONResponse(body)
class _PollNoiseFilter:
    """Logging filter that drops web-UI read requests from the access log.

    A record is dropped when its method (``record.args[1]``) is GET, HEAD or
    OPTIONS and its path (``record.args[2]``, query string removed) is ``/``
    or ``/favicon.ico``, or equals / lies under ``/admin``, ``/static``,
    ``/login`` or ``/logout``. Everything else — API calls, admin mutations,
    records with unexpected args — is kept. Installed by ``_front_log_config``
    unless --debug-web is given.
    """
    _READ = ("GET", "HEAD", "OPTIONS")
    _WEB_PREFIXES = ("/admin", "/static", "/login", "/logout")
    _WEB_EXACT = ("/", "/favicon.ico")

    def filter(self, record):
        """Return False to drop ``record``, True to keep it; never raises."""
        try:
            args = record.args
            if not isinstance(args, (tuple, list)) or len(args) < 3:
                return True
            method = str(args[1]).upper()
            path = str(args[2]).split("?", 1)[0]
            if method not in self._READ:
                return True
            if path in self._WEB_EXACT:
                return False
            for prefix in self._WEB_PREFIXES:
                if path == prefix or path.startswith(prefix + "/"):
                    return False
        except Exception:
            # A malformed record must never break logging: keep it.
            pass
        return True
def _front_log_config(debug_web: bool):
    """Build the uvicorn logging config used by the front process.

    Starting from a deep copy of uvicorn's default config, every formatter
    that has an ``fmt`` gets a ``[front] `` prefix (once), the ``codai``
    logger is routed to the ``default`` handler at INFO without propagation,
    and — unless ``debug_web`` is true — ``_PollNoiseFilter`` is attached to
    the ``access`` handler.
    """
    import copy
    import uvicorn

    log_config = copy.deepcopy(uvicorn.config.LOGGING_CONFIG)

    for formatter in log_config.get("formatters", {}).values():
        if "fmt" not in formatter:
            continue
        if formatter["fmt"].startswith("[front]"):
            continue
        formatter["fmt"] = "[front] " + formatter["fmt"]

    # Surface codai/broker logs (the broker now runs here) via uvicorn's handler.
    log_config.setdefault("loggers", {})["codai"] = {
        "handlers": ["default"], "level": "INFO", "propagate": False}

    if debug_web:
        return log_config

    log_config.setdefault("filters", {})["pollnoise"] = {
        "()": "codai.frontproxy.app._PollNoiseFilter"}
    access_handler = log_config["handlers"].get("access", {})
    access_handler.setdefault("filters", []).append("pollnoise")
    return log_config
def build_app(config, config_dir=None) -> FastAPI:
    """Create the front's FastAPI app and register all of its routes.

    The ``FrontProxy`` instance is exposed as ``app.state.front``. The engine
    supervisor and the broker are started in the startup hook and stopped in
    the shutdown hook. Specific routes are registered before the catch-all so
    the front answers them itself instead of proxying them.
    """
    front = FrontProxy(config, config_dir=config_dir)
    app = FastAPI(title="CoderAI Front", docs_url=None, redoc_url=None,
                  openapi_url=None)
    app.state.front = front

    def _unauthorized():
        return JSONResponse({"detail": "Unauthorized"}, status_code=401)

    @app.on_event("startup")
    async def _startup():
        # Created here rather than in FrontProxy.__init__, so the supervisor
        # only exists once the server is actually starting.
        front.supervisor = EngineSupervisor(config, None, front.registry,
                                            models_path=front._models_path,
                                            internal_token=front.internal_token,
                                            debug=front.debug_engine)
        front.supervisor.start()
        front.start_broker()

    @app.on_event("shutdown")
    async def _shutdown():
        await front.stop_broker()
        if front.supervisor:
            front.supervisor.stop_all()
        await front.aclose()

    @app.get("/healthz", include_in_schema=False)
    async def _healthz():
        primary = front.registry.primary()
        engines = [{"id": e.id, "gpu": e.gpu, "healthy": e.healthy}
                   for e in front.registry.all()]
        return {"ok": True,
                "engine_ready": bool(primary and primary.healthy),
                "engines": engines}

    # Dashboard endpoints: answered by the cached / fallback-aware handlers
    # so a busy engine cannot stall the UI.
    @app.get("/admin/api/status", include_in_schema=False)
    async def _status(request: Request):
        return await front.status(request)

    @app.get("/admin/api/tasks", include_in_schema=False)
    async def _tasks(request: Request):
        return await front.poll(request)

    @app.get("/admin/api/system-stats", include_in_schema=False)
    async def _system_stats(request: Request):
        return await front.poll(request)

    # /v1/models is the union of what every healthy engine reports.
    @app.get("/v1/models", include_in_schema=False)
    async def _models(request: Request):
        forwarded = front._filter_headers(request.headers, _DROP_REQ)
        kind, val = await front.collect_models(forwarded)
        if kind != "passthrough":
            return JSONResponse(val)
        return Response(
            content=val.content,
            status_code=val.status_code,
            headers=dict(front._filter_headers(val.headers, _DROP_RESP)),
            media_type=val.headers.get("content-type"))

    # Engine management is owned by the front (it runs the supervisor).
    @app.get("/admin/api/engines", include_in_schema=False)
    async def _engines(request: Request):
        if await front.is_admin(request):
            return JSONResponse({"engines": front.engines_list()})
        return _unauthorized()

    @app.post("/admin/api/engines/{eid}/restart", include_in_schema=False)
    async def _engine_restart(eid: int, request: Request):
        if not await front.is_admin(request):
            return _unauthorized()
        restarted = bool(front.supervisor
                         and front.supervisor.restart_engine(eid))
        return JSONResponse({"success": restarted},
                            status_code=200 if restarted else 404)

    # Catch-all reverse proxy for everything else (admin UI, /v1 inference, files…).
    @app.api_route("/{path:path}", include_in_schema=False,
                   methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS",
                            "HEAD"])
    async def _proxy(path: str, request: Request):
        return await front.proxy(request)

    return app
def _serve_front(app, **uvicorn_kwargs) -> None:
    """Serve the front with uvicorn while owning SIGINT/SIGTERM handling.

    The goal is that Ctrl-C always tears the engines down, even if uvicorn's
    graceful shutdown hangs draining an in-flight proxy stream:

    * first signal: ask uvicorn to exit and arm a watchdog thread that, after
      6 seconds, stops the engines (``stop_all(grace=5.0)``) and forces
      uvicorn to exit;
    * second signal: force uvicorn to exit and stop the engines immediately
      (``stop_all(grace=0.0)``);
    * backstop: engines are stopped again after ``server.run()`` returns or
      raises.

    The supervisor is looked up at the moment it is needed. It is created by
    the app's startup hook, which only runs inside ``server.run()``, so a
    reference taken before that call would always be ``None`` and none of the
    stop paths above would ever fire.

    Args:
        app: The FastAPI app built by ``build_app`` (``app.state.front`` set).
        **uvicorn_kwargs: Passed through to ``uvicorn.Config``.
    """
    import signal
    import threading
    import uvicorn

    def _stop_engines(grace) -> None:
        # Resolve the supervisor lazily (see docstring).
        # NOTE(review): this may run after the shutdown hook already called
        # stop_all(); that assumes stop_all() is safe to call repeatedly —
        # confirm against EngineSupervisor.
        supervisor = getattr(app.state.front, "supervisor", None)
        if supervisor is not None:
            supervisor.stop_all(grace=grace)

    server = uvicorn.Server(uvicorn.Config(app, **uvicorn_kwargs))
    # NOTE(review): this override only disables uvicorn's own handlers on
    # versions that still call Server.install_signal_handlers — verify
    # against the pinned uvicorn version.
    server.install_signal_handlers = lambda: None  # we manage signals ourselves
    state = {"hits": 0}

    def _handle(signum, _frame):
        state["hits"] += 1
        server.should_exit = True
        if state["hits"] >= 2:
            server.force_exit = True
            _stop_engines(0.0)
            return
        print("\n[front] shutdown requested — stopping engines "
              "(Ctrl-C again to force)…", flush=True)

        def _watchdog():
            # If the graceful drain hasn't finished promptly, force engines down
            # so a stuck upstream stream can't keep them (and us) alive.
            time.sleep(6.0)
            _stop_engines(5.0)
            server.force_exit = True

        threading.Thread(target=_watchdog, daemon=True).start()

    for _sig in (signal.SIGINT, signal.SIGTERM):
        try:
            signal.signal(_sig, _handle)
        except Exception:
            # Best-effort: e.g. signal.signal raises outside the main thread.
            pass
    try:
        server.run()
    finally:
        # Backstop: whatever path we exited by, make sure no engine survives us.
        _stop_engines(5.0)
def run_front(config, args) -> None:
    """Build the front app and serve it on the configured host and port.

    ``args`` may be ``None``; otherwise its ``config``, ``debug_engine`` and
    ``debug_web`` attributes are honoured when present. With HTTPS enabled
    and both key and certificate configured, the pair is loaded once up front
    (so a bad pair fails before serving) and handed to uvicorn by path;
    if either is missing the front falls back to plain HTTP with a notice.
    """
    config_dir = getattr(args, "config", None) if args is not None else None
    app = build_app(config, config_dir=config_dir)
    app.state.front.debug_engine = getattr(args, "debug_engine", False)

    host, port = config.server.host, config.server.port
    print(f"\n[front] CoderAI front proxy on http://{host}:{port}")
    print(f"[front] Admin UI: http://{host}:{port}/admin")

    serve_kwargs = {
        "host": host,
        "port": port,
        "log_config": _front_log_config(getattr(args, "debug_web", False)),
    }
    if config.server.https:
        import ssl
        keyfile = config.server.https_key_path
        certfile = config.server.https_cert_path
        if keyfile and certfile:
            probe = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
            probe.load_cert_chain(certfile, keyfile)
            # uvicorn.Server reads ssl via Config(ssl_*), so pass cert/key paths.
            serve_kwargs["ssl_keyfile"] = keyfile
            serve_kwargs["ssl_certfile"] = certfile
        else:
            print("[front] HTTPS requested but no cert/key configured; using HTTP.")
    _serve_front(app, **serve_kwargs)
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Assign each configured model to exactly one engine.
With multiple engines, every engine would otherwise read the shared models.json and
register *every* model — so a model would appear on several engines at once. The
front instead computes a single **owner** per model and tells each engine which
models it owns; the engine then registers only those.
Owner precedence (per model):
1. The per-model ``engine`` pin (models.json), if that engine can run the model.
2. The configured default engine, if it can run the model.
3. Round-robin across the capability-compatible engines (balanced, deterministic),
so unpinned models spread out instead of all landing on one engine.
A model whose format no engine can serve is left unassigned (it can't run anyway).
"""
import json
# models.json categories that hold servable model entries.
# compute_assignment scans them in this order and keeps only the first entry
# seen for a given route key, so earlier categories take precedence.
_CATEGORIES = (
"text_models", "gguf_models", "vision_models", "image_models",
"audio_models", "tts_models", "video_models", "audio_gen_models",
"embedding_models", "spatial_models",
)
def _entry_path(entry):
    """Return the entry's path (or id) — the input for capability detection.

    A bare string is its own path; a dict yields ``path`` or else ``id``; any
    other type yields ``None``.
    """
    if isinstance(entry, dict):
        return entry.get("path") or entry.get("id")
    return entry if isinstance(entry, str) else None
def _route_key(entry):
    """Return the name clients use to address this entry.

    For a dict the preference is ``alias``, then ``path``, then ``id``; a
    bare string is its own key; any other type yields ``None``. Because the
    alias wins, two configs of the same path with different aliases get
    different keys, while configs sharing a path and no alias share one key.
    """
    if isinstance(entry, dict):
        for field in ("alias", "path"):
            value = entry.get(field)
            if value:
                return value
        return entry.get("id")
    return entry if isinstance(entry, str) else None
def _required_cap(entry, ds4_cfg):
    """Ask the router which capability a models.json entry requires.

    The entry's path (empty string if none), its ``backend`` (dict entries
    only) and the ds4 model id / enabled flag are passed as hints.
    """
    from codai.frontproxy.router import required_capability

    backend = entry.get("backend") if isinstance(entry, dict) else None
    if ds4_cfg:
        ds4_model_id = getattr(ds4_cfg, "model_id", None)
        ds4_enabled = bool(getattr(ds4_cfg, "enabled", False))
    else:
        ds4_model_id, ds4_enabled = None, False
    return required_capability(
        _entry_path(entry) or "", backend=backend,
        ds4_model_id=ds4_model_id, ds4_enabled=ds4_enabled)
def compute_assignment(engines, models_path, default_engine=None, ds4_cfg=None):
    """Return ``{engine_name: [model_identifiers]}`` with one owner per model.

    For each entry of each category in ``_CATEGORIES`` (first occurrence of a
    route key wins) the owner is chosen among the engines that can serve the
    required capability:

    1. the engine named by the entry's ``engine`` pin (matched against engine
       name or backend, case-insensitively);
    2. else the configured ``default_engine`` (same matching);
    3. else round-robin, with a separate cursor per distinct candidate set.

    Entries that no engine can serve stay unassigned. An unreadable or
    unparsable models.json, a file whose top level is not a JSON object, and
    categories that are missing, null or not a list are tolerated: they
    contribute nothing instead of raising, so one malformed section does not
    discard the assignment for every engine.

    Args:
        engines: Engine objects exposing ``name``, ``backend`` and
            ``can_serve(cap)``.
        models_path: Path to models.json.
        default_engine: Optional engine name/backend preferred for unpinned
            models.
        ds4_cfg: Optional ds4 config section, forwarded to capability
            detection.
    """
    assignment = {e.name: [] for e in engines}
    if not engines or not models_path:
        return assignment
    try:
        with open(models_path) as f:
            data = json.load(f)
    except Exception:
        return assignment
    if not isinstance(data, dict):
        return assignment

    default_engine = (default_engine or "").strip().lower()

    def _find(candidates, wanted):
        # First candidate whose name or backend equals `wanted` (lower-case).
        return next((e for e in candidates
                     if e.name.lower() == wanted
                     or (e.backend or "").lower() == wanted), None)

    rr = {}  # round-robin cursor per candidate-set signature
    seen = set()
    for cat in _CATEGORIES:
        entries = data.get(cat)
        if not isinstance(entries, list):
            continue  # missing, null or malformed category
        for entry in entries:
            ident = _route_key(entry)
            if not ident or ident in seen:
                continue
            cap = _required_cap(entry, ds4_cfg)
            candidates = [e for e in engines if e.can_serve(cap)]
            if not candidates:
                continue  # nothing can run it — leave unassigned
            owner = None
            raw_pin = entry.get("engine") if isinstance(entry, dict) else ""
            # str() tolerates a non-string pin value in the file.
            pin = str(raw_pin or "").strip().lower()
            if pin:
                owner = _find(candidates, pin)
            if owner is None and default_engine:
                owner = _find(candidates, default_engine)
            if owner is None:
                key = tuple(sorted(e.name for e in candidates))
                i = rr.get(key, 0)
                owner = candidates[i % len(candidates)]
                rr[key] = i + 1
            assignment[owner.name].append(ident)
            seen.add(ident)
    return assignment
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Spawn and supervise engine subprocesses for the front proxy.
One engine per GPU (or a configured count). Each engine is this same codebase
relaunched with ``--engine-only --internal-port P`` and ``CUDA_VISIBLE_DEVICES``
pinned to its GPU, so inside the engine its GPU is always ``cuda:0`` and the
existing per-process VRAM/eviction logic is untouched.
The supervisor polls each engine's auth-free ``/internal/engine-state`` to keep the
:class:`EngineRegistry` current (health, resident models, VRAM) and respawns an
engine that dies or stops answering — which is also how a CUDA-poisoned engine
recovers (the front and sibling engines survive).
"""
import atexit
import collections
import json
import os
import shutil
import signal
import socket
import subprocess
import sys
import threading
import time
import httpx
from codai.frontproxy.registry import Engine, EngineRegistry
def _engine_preexec():
    """Child-side hook run just before exec of an engine process.

    Two independent best-effort steps, each ignoring any failure:

    1. ``os.setsid()`` — give the engine its own session/process group, so a
       terminal Ctrl-C reaches only the front.
    2. ``prctl(PR_SET_PDEATHSIG, SIGKILL)`` through libc — have the kernel
       kill the engine if the front dies, even when the front's own cleanup
       cannot run. Linux-only; elsewhere loading libc.so.6 fails and the
       step is skipped.
    """
    pr_set_pdeathsig = 1  # prctl option number
    sigkill = 9           # signal delivered on parent death

    try:
        os.setsid()
    except Exception:
        pass

    try:
        import ctypes
        libc = ctypes.CDLL("libc.so.6", use_errno=True)
        libc.prctl(pr_set_pdeathsig, sigkill, 0, 0, 0)
    except Exception:
        pass
def _port_is_free(port: int, host: str = "127.0.0.1") -> bool:
    """Report whether ``port`` on ``host`` can be bound at this moment.

    A throwaway TCP socket (with SO_REUSEADDR) attempts the bind; an
    ``OSError`` means the port is taken. The socket is always closed.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            probe.bind((host, port))
        except OSError:
            return False
        return True
def detect_gpus() -> list:
    """List GPU indices reported by ``nvidia-smi`` (torch is not involved).

    Returns an empty list when nvidia-smi is not on PATH, exits non-zero,
    times out (10 s), or prints something that is not an integer per line.
    """
    smi = shutil.which("nvidia-smi")
    if not smi:
        return []
    query = [smi, "--query-gpu=index", "--format=csv,noheader"]
    try:
        result = subprocess.run(query, capture_output=True, text=True,
                                timeout=10)
        if result.returncode != 0:
            return []
        indices = []
        for raw in result.stdout.splitlines():
            cell = raw.strip()
            if cell:
                indices.append(int(cell))
        return indices
    except Exception:
        return []
def _gpu_selectors(spec: dict, env: dict) -> list:
    """Derive the selectors identifying the cards an engine owns.

    Selectors are NVIDIA UUIDs (``GPU-...``) and/or the vendor keywords
    ``"nvidia"``, ``"amd"``, ``"intel"``, gathered in this order from:

    * ``env["CUDA_VISIBLE_DEVICES"]`` — UUID tokens are kept verbatim, numeric
      indices fall back to ``"nvidia"``;
    * ``spec["gpus"]`` when it is a known vendor keyword;
    * ``env["VK_ICD_FILENAMES"]`` — first vendor substring found;
    * ``spec["backend"] == "nvidia"`` — only if nothing was found so far.

    Duplicates are removed keeping first-seen order. If any UUID is present
    the broad ``"nvidia"`` keyword is dropped, so separate NVIDIA engines do
    not each match every NVIDIA card.
    """
    found = []

    visible = env.get("CUDA_VISIBLE_DEVICES") or ""
    for token in (part.strip() for part in visible.split(",")):
        if token.startswith("GPU-"):
            found.append(token)       # precise NVIDIA UUID
        elif token.isdigit():
            found.append("nvidia")    # index → vendor fallback

    vendor_by_keyword = {"radeon": "amd", "amd": "amd", "intel": "intel",
                         "nvidia": "nvidia"}
    keyword = (spec.get("gpus") or "").strip().lower()
    if keyword in vendor_by_keyword:
        found.append(vendor_by_keyword[keyword])

    icd = (env.get("VK_ICD_FILENAMES") or "").lower()
    for needle, vendor in (("radeon", "amd"), ("amd", "amd"),
                           ("intel", "intel"), ("nvidia", "nvidia")):
        if needle in icd:
            found.append(vendor)
            break

    if (spec.get("backend") or "").lower() == "nvidia" and not found:
        found.append("nvidia")

    unique = []
    for selector in found:
        if selector not in unique:
            unique.append(selector)

    has_uuid = any(selector.startswith("GPU-") for selector in unique)
    if has_uuid:
        unique = [selector for selector in unique if selector != "nvidia"]
    return unique
class EngineSupervisor:
    """Plan, spawn, health-poll, respawn and stop the engine subprocesses.

    One ``Engine`` is built per configured spec (or per detected GPU vendor),
    registered in the shared registry and launched as a child process of this
    codebase in ``--engine-only`` mode. A daemon thread then polls each engine's
    ``/internal/engine-state`` endpoint and respawns engines whose process exited.
    """

    def __init__(self, config, args, registry: "EngineRegistry", models_path=None,
                 internal_token=None, debug=False):
        self.config = config
        self.args = args
        self.registry = registry
        self.models_path = models_path        # for computing per-engine model assignment
        self.internal_token = internal_token  # shared secret stamped on engine calls
        self.debug = debug                    # --debug-engine: verbose engine lifecycle
        self._health = {}                     # engine_id -> last healthy bool (for debug)
        self._stopped = threading.Event()
        self._poll_thread = None
        self._logs = {}                       # engine_id -> deque tail
        self._restart_lock = threading.RLock()
        # Next candidate internal port. _build_engines() (re)seeds it; None means
        # "start from server.internal_port_base" so _alloc_port() is usable on its own.
        self._port_cursor = None

    def _assign_models(self, engines) -> None:
        """Give each engine the set of models it owns (via CODERAI_ENGINE_MODELS), so
        a model is registered on exactly one engine. With a single engine there's
        nothing to partition — it owns everything. Any failure is reported and the
        assignment is skipped."""
        if not self.models_path or len(engines) < 2:
            return
        try:
            from codai.frontproxy.assignment import compute_assignment
            default_engine = getattr(self.config.server, "default_engine", None)
            ds4 = getattr(self.config, "ds4", None)
            assignment = compute_assignment(engines, self.models_path,
                                            default_engine, ds4)
            for e in engines:
                owned = assignment.get(e.name, [])
                e.assigned_models = set(owned)  # the front's router enforces this
                # Also hand the set to the engine so it only registers/pre-loads its
                # assigned models (avoids e.g. whisper-server starting on every
                # engine). models.json itself stays full for the admin view.
                e.env["CODERAI_ENGINE_MODELS"] = json.dumps(owned)
                print(f"[front] engine '{e.name}' assigned {len(owned)} model(s): "
                      f"{', '.join(owned) if owned else '(none)'}", flush=True)
        except Exception as exc:
            print(f"[front] model assignment skipped: {exc}", flush=True)

    def _alloc_port(self) -> int:
        """Next free internal port at/above internal_port_base, skipping the front's
        own port and any port already in use, so engines never collide with the
        front or each other (or a stale process on the base port).

        Raises:
            RuntimeError: when the scan runs past port 65535 without a free port.
        """
        p = self._port_cursor
        if p is None:
            p = self.config.server.internal_port_base
        front_port = int(getattr(self.config.server, "port", 0) or 0)
        while p == front_port or not _port_is_free(p):
            p += 1
            if p > 65535:
                raise RuntimeError("no free internal port available for an engine")
        self._port_cursor = p + 1
        return p

    # ----------------------------------------------------------------- planning
    def _build_engines(self) -> list:
        """Return the list of Engine objects to launch.

        Explicit ``engine_specs`` (heterogeneous: per-engine backend + env, e.g. an
        NVIDIA card and a Radeon card) take precedence. Otherwise auto-detect the
        LOCAL hardware and create one engine per GPU vendor actually present —
        NVIDIA (CUDA), AMD/Radeon (Vulkan), Intel (Vulkan) — so e.g. a box with an
        NVIDIA + a Radeon gets both engines without any config. A machine with no
        GPU gets a single CPU engine.
        """
        srv = self.config.server
        self._port_cursor = srv.internal_port_base
        specs = getattr(srv, "engine_specs", None)
        engines = []
        if specs:
            from codai.frontproxy.gpu_detect import vendor_env
            for idx, spec in enumerate(specs):
                backend = (spec.get("backend") or "auto").strip()
                # Vendor keyword → all of that vendor's cards on this machine. A
                # plain nvidia backend defaults to "nvidia" (unambiguous); Vulkan
                # vendors must be named ("radeon"/"amd"/"intel"). Explicit env wins.
                gpus_kw = (spec.get("gpus") or "").strip().lower()
                if not gpus_kw and not spec.get("env") and backend == "nvidia":
                    gpus_kw = "nvidia"
                detected = vendor_env(gpus_kw) if gpus_kw else {}
                explicit = {str(k): str(v) for k, v in (spec.get("env") or {}).items()}
                env = {**detected, **explicit}  # explicit overrides detected
                # Tell the engine which physical cards it owns, so thermal
                # protection scopes GPU cooldowns to this engine (CPU stays global).
                sels = _gpu_selectors(spec, env)
                if sels and "CODERAI_ENGINE_GPUS" not in env:
                    env["CODERAI_ENGINE_GPUS"] = ",".join(sels)
                caps = set(spec.get("capabilities") or [])
                engines.append(Engine(
                    id=idx, gpu=None, port=self._alloc_port(), primary=(idx == 0),
                    name=spec.get("name") or f"engine#{idx}",
                    backend=backend, env=env, capabilities=caps,
                ))
            return engines
        # Auto: one engine per GPU vendor actually present on this machine. Vendors
        # come from Vulkan enumeration AND the sysfs PCI-vendor fallback, so AMD/Intel
        # are detected even without vulkaninfo installed.
        from codai.frontproxy.gpu_detect import nvidia_gpus, gpu_vendors, vendor_env
        vendors = gpu_vendors()
        # (engine name, vendor keyword, backend). NVIDIA first so it's the primary
        # (it owns admin/sessions and has the broadest capabilities). NVIDIA needs
        # CUDA, so it's gated on nvidia-smi rather than the Vulkan/sysfs presence.
        plan = []
        if nvidia_gpus():
            plan.append(("nvidia", "nvidia", "nvidia"))
        if "amd" in vendors:
            plan.append(("radeon", "amd", "vulkan"))
        if "intel" in vendors:
            plan.append(("intel", "intel", "vulkan"))
        if not plan:
            engines.append(Engine(id=0, gpu=None, port=self._alloc_port(),
                                  primary=True, name="cpu", backend="auto", env={}))
            return engines
        for idx, (name, vkw, backend) in enumerate(plan):
            env = vendor_env(vkw)
            sels = _gpu_selectors({"backend": backend, "gpus": vkw}, env)
            if sels:
                env["CODERAI_ENGINE_GPUS"] = ",".join(sels)
            engines.append(Engine(id=idx, gpu=None, port=self._alloc_port(),
                                  primary=(idx == 0), name=name,
                                  backend=backend, env=env))
        return engines

    # ------------------------------------------------------------------ spawning
    def _engine_cmd(self, port: int) -> list:
        """Build the command to relaunch this codebase as an engine on ``port``."""
        # sys.argv[0] is the launcher script (``coderai``); preserve all original
        # args (config dir, model selection, …) and append the engine flags. Strip
        # any flag that would re-trigger front mode or fix a different port.
        passthrough = []
        skip_next = False
        for a in sys.argv[1:]:
            if skip_next:
                skip_next = False
                continue
            if a in ("--single-process", "--engine-only"):
                continue
            if a == "--internal-port":
                skip_next = True  # the value is the following argument
                continue
            if a.startswith("--internal-port="):
                continue          # same flag, value attached with "="
            passthrough.append(a)
        return [sys.executable, sys.argv[0], *passthrough,
                "--engine-only", "--internal-port", str(port)]

    def _spawn(self, engine: "Engine") -> None:
        """Launch ``engine`` as a subprocess and start pumping its output.

        The child inherits this process's environment plus the engine's own env
        block and the CODERAI_* variables set below. ``engine.proc`` is replaced
        with the new ``Popen`` object.
        """
        env = dict(os.environ)
        # Engine stdout is a pipe (not a TTY), so CPython block-buffers print()
        # output — debug lines (e.g. --debug-requests) would stall in the buffer
        # until it fills or the process exits, unlike tqdm which flushes stderr
        # itself. Force unbuffered so engine logs reach the front terminal live.
        env["PYTHONUNBUFFERED"] = "1"
        # Per-engine env block (device pinning, Vulkan ICD, etc.). Empty-string
        # values are honoured (e.g. CUDA_VISIBLE_DEVICES="" hides all CUDA cards).
        for k, v in (engine.env or {}).items():
            env[str(k)] = str(v)
        # The global host-RAM cap (offload.max_ram_gb) is SHARED across all engines,
        # not split: tell each engine the front's PID so it measures the whole
        # fleet's RAM (front + every engine + workers) against the one cap.
        env["CODERAI_FRONT_PID"] = str(os.getpid())
        # Only the primary engine talks to the AISBF broker, so N engines don't
        # register N times under the same provider id.
        if engine.primary:
            env["CODERAI_ENGINE_PRIMARY"] = "1"
        # Shared secret: the engine rejects any HTTP request that doesn't carry it,
        # so only the front (which has it) can reach the engine on localhost.
        if self.internal_token:
            env["CODERAI_INTERNAL_TOKEN"] = self.internal_token
        # Resolve this engine's concurrency limits (global default, or a per-engine
        # override keyed by engine name) and hand them down so a bigger card can run
        # more in parallel than a smaller one.
        srv = self.config.server
        mdl = self.config.models
        par = (srv.max_parallel_requests_overrides or {}).get(engine.name,
                                                              srv.max_parallel_requests)
        inst = (getattr(mdl, "max_model_instances_overrides", None) or {}).get(
            engine.name, getattr(mdl, "max_model_instances", 1))
        if par is not None:
            env["CODERAI_MAX_PARALLEL"] = str(int(par))
        if inst is not None:
            env["CODERAI_MAX_MODEL_INSTANCES"] = str(int(inst))
        # Force this engine's backend (the engine reads this in --engine-only mode
        # and overrides config.backend.type) so a Vulkan/Radeon engine doesn't
        # auto-pick CUDA, and vice-versa.
        if engine.backend and engine.backend != "auto":
            env["CODERAI_ENGINE_BACKEND"] = engine.backend
        cmd = self._engine_cmd(engine.port)
        tag = engine.name + (f"(gpu{engine.gpu})" if engine.gpu is not None else "")
        print(f"[front] launching {tag} on port {engine.port}: {' '.join(cmd)}", flush=True)
        proc = subprocess.Popen(
            cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1,
            preexec_fn=_engine_preexec if os.name == "posix" else None,
        )
        engine.proc = proc
        tail = self._logs.setdefault(engine.id, collections.deque(maxlen=30))
        threading.Thread(target=self._pump_logs, args=(tag, proc, tail),
                         daemon=True).start()

    @staticmethod
    def _pump_logs(tag, proc, tail):
        """Echo every non-empty line of the engine's output, prefixed with ``tag``,
        and keep it in ``tail`` (the bounded per-engine log tail)."""
        for line in proc.stdout:
            line = line.rstrip()
            if line:
                tail.append(line)
                print(f"[{tag}] {line}", flush=True)

    # -------------------------------------------------------------------- lifecycle
    def _set_primary(self, engines) -> None:
        """The primary engine owns admin/sessions/config. Honour the configured
        engine (server.default_engine) as the primary when it's present; otherwise
        keep the first engine (the build order) as primary."""
        de = (getattr(self.config.server, "default_engine", None) or "").strip().lower()
        if not de or len(engines) < 2:
            return
        match = next((e for e in engines
                      if e.name.lower() == de or (e.backend or "").lower() == de), None)
        if match is None:
            return  # configured engine isn't present — leave the default primary
        for e in engines:
            e.primary = (e is match)
        print(f"[front] primary engine: '{match.name}' (from settings.default_engine)",
              flush=True)

    def start(self) -> None:
        """Build, register and spawn every engine, then start the poll thread and
        register ``stop_all`` to run at interpreter exit."""
        engines = self._build_engines()
        self._set_primary(engines)    # configured engine owns admin/sessions
        self._assign_models(engines)  # set CODERAI_ENGINE_MODELS before spawning
        for engine in engines:
            self.registry.add(engine)
            self._spawn(engine)
        self._poll_thread = threading.Thread(target=self._poll_loop, daemon=True)
        self._poll_thread.start()
        atexit.register(self.stop_all)

    def _poll_engine(self, client, engine: "Engine") -> None:
        """Poll one engine once: respawn it if its process has exited, otherwise
        refresh its registry state from ``/internal/engine-state``."""
        # Respawn engines whose process has exited.
        if engine.proc is not None and engine.proc.poll() is not None:
            self._maybe_restart(engine)
            return
        healthy = False
        try:
            r = client.get(engine.url + "/internal/engine-state")
            if r.status_code == 200:
                d = r.json()
                healthy = True
                self.registry.update_state(
                    engine.id, healthy=True,
                    loaded_models=d.get("loaded_models") or [],
                    vram=d.get("vram"),
                    tasks=d.get("tasks") or [],
                    cooling=d.get("cooling"),
                )
            else:
                self.registry.update_state(engine.id, healthy=False)
        except Exception:
            # Connection refused / timeout: still-loading or dead. Mark
            # unhealthy; the process-exit check above handles true death.
            self.registry.update_state(engine.id, healthy=False)
        # --debug-engine: report health transitions (ready / lost).
        if self.debug and self._health.get(engine.id) != healthy:
            self._health[engine.id] = healthy
            print(f"[front] engine '{engine.name}' "
                  f"{'ready' if healthy else 'not responding'}", flush=True)

    def _poll_loop(self) -> None:
        """Poll every registered engine until ``stop_all`` is called.

        A failure while handling one engine (e.g. a respawn that raises) is reported
        and does not end the loop, so the remaining engines keep being monitored.
        The HTTP client is closed when the loop exits for any reason.
        """
        _auth = ({"x-coderai-internal": self.internal_token}
                 if self.internal_token else {})
        client = httpx.Client(timeout=self.config.server.proxy_status_timeout,
                              headers=_auth)
        try:
            while not self._stopped.is_set():
                for engine in self.registry.all():
                    try:
                        self._poll_engine(client, engine)
                    except Exception as exc:
                        print(f"[front] polling engine '{engine.name}' failed: {exc}",
                              flush=True)
                self._stopped.wait(self.config.server.proxy_status_timeout)
        finally:
            client.close()

    def _maybe_restart(self, engine: "Engine") -> None:
        """Respawn ``engine`` after its process exited, unless the supervisor is
        stopping or the engine has already been respawned by someone else."""
        with self._restart_lock:
            if self._stopped.is_set():
                return
            proc = engine.proc
            # Re-check under the lock: restart_engine() may have replaced the dead
            # process while we waited for the lock, and spawning again would start
            # a second engine on the same port.
            if proc is not None and proc.poll() is None:
                return
            code = proc.poll() if proc else None
            tail = " | ".join(list(self._logs.get(engine.id, []))[-3:])
            print(f"[front] engine#{engine.id} exited (code {code}); respawning. {tail}",
                  flush=True)
            self.registry.update_state(engine.id, healthy=False)
            time.sleep(1.0)  # avoid a tight crash loop
            self._spawn(engine)

    def restart_engine(self, engine_id: int) -> bool:
        """Forcibly kill and respawn one engine (e.g. it's stuck in a loop).

        Holds the restart lock so the poll loop's own respawn can't double-spawn.
        Returns False when no engine with ``engine_id`` is registered, True otherwise.
        """
        engine = self.registry.get(engine_id)
        if engine is None:
            return False
        with self._restart_lock:
            proc = engine.proc
            if proc is not None and proc.poll() is None:
                # Polite terminate first, then a hard kill if it is still alive.
                try:
                    proc.terminate()
                    proc.wait(timeout=8)
                except Exception:
                    pass
                if proc.poll() is None:
                    try:
                        proc.kill()
                        proc.wait(timeout=3)
                    except Exception:
                        pass
            self.registry.update_state(engine_id, healthy=False)
            print(f"[front] restarting engine#{engine_id} ({engine.name}) on request",
                  flush=True)
            self._spawn(engine)
        return True

    def wait_ready(self, timeout: float = 1800.0) -> bool:
        """Block until at least the primary engine answers (best effort).

        Returns True as soon as the registry's primary engine is healthy. On timeout
        or stop it returns whether the registry has any primary engine at all.
        """
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline and not self._stopped.is_set():
            prim = self.registry.primary()
            if prim and prim.healthy:
                return True
            time.sleep(1.0)
        return bool(self.registry.primary())

    def stop_all(self, grace: float = 8.0) -> None:
        """Stop every engine, escalating to SIGKILL of the engine's whole process
        group if it doesn't exit within ``grace`` seconds — so a stuck (e.g.
        mid-CUDA) engine, and any children it spawned (whisper-server, ds4), are
        guaranteed dead. Idempotent and safe to call from a signal handler."""
        self._stopped.set()

        def _signal_group(proc, sig):
            # Engines are started in their own session (setsid), so killing the
            # process group reaps the engine + its grandchildren in one shot.
            try:
                os.killpg(os.getpgid(proc.pid), sig)
            except Exception:
                try:
                    proc.send_signal(sig)
                except Exception:
                    pass

        # SIGKILL does not exist on every platform; fall back to SIGTERM there.
        kill_sig = getattr(signal, "SIGKILL", signal.SIGTERM)
        procs = [(e, e.proc) for e in self.registry.all()
                 if e.proc is not None and e.proc.poll() is None]
        # Phase 1: polite SIGTERM to each group.
        for _engine, proc in procs:
            _signal_group(proc, signal.SIGTERM)
        # Phase 2: wait up to `grace` (shared across all engines), then SIGKILL
        # whatever is still alive.
        deadline = time.monotonic() + grace
        for _engine, proc in procs:
            remaining = max(0.0, deadline - time.monotonic())
            try:
                proc.wait(timeout=remaining)
            except Exception:
                pass
        for _engine, proc in procs:
            if proc.poll() is None:
                _signal_group(proc, kill_sig)
                try:
                    proc.wait(timeout=3)
                except Exception:
                    pass
        print("[front] all engines stopped", flush=True)
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Torch-free GPU detection for the front proxy.
Turns a *vendor keyword* (``"nvidia"`` / ``"radeon"`` / ``"intel"``) into the env
that pins an engine to **all** of that vendor's cards on the local machine — so the
same ``engine_specs`` work on a 1-, 2-, or 10-card box without hand-writing UUIDs or
device indices:
* **CUDA** is pinned by UUID (stable across reboots/reordering), listing every
NVIDIA card for the NVIDIA engine and ``""`` (hidden) for the others.
* **Vulkan** is isolated by pointing ``VK_ICD_FILENAMES`` at only that vendor's ICD,
so the engine sees exactly that vendor's cards (no index fragility, no llvmpipe /
cross-vendor interference).
Everything shells out to ``nvidia-smi`` / ``vulkaninfo`` and reads the Vulkan ICD
directory; nothing imports torch.
"""
import glob
import os
import shutil
import subprocess
# Vulkan ICD search dirs (loader defaults) and per-vendor manifest filename
# patterns. Names vary by distro/driver, so several globs are listed per vendor;
# find_vulkan_icd() walks the directories in this order and skips manifests whose
# file name marks them as disabled.
_ICD_DIRS = ("/usr/share/vulkan/icd.d", "/etc/vulkan/icd.d",
             "/usr/local/share/vulkan/icd.d")
_ICD_PATTERNS = {
    "nvidia": ("nvidia_icd*.json",),
    "amd": ("radeon_icd*.json", "amd_icd*.json"),  # RADV (mesa) or AMDVLK
    "intel": ("intel_icd*.json", "intel_hasvk_icd*.json"),
}
# Vulkan vendor IDs (PCI) -> vendor keyword, for reporting. vulkan_devices() uses
# this to label each parsed device; IDs not listed here are labelled "other".
VENDOR_IDS = {0x10de: "nvidia", 0x1002: "amd", 0x8086: "intel"}
# Accepted spellings of a vendor keyword -> canonical vendor name.
_ALIASES = {
    "radeon": "amd",
    "amd": "amd",
    "nvidia": "nvidia",
    "nv": "nvidia",
    "intel": "intel",
}


def _norm_vendor(vendor: str) -> str:
    """Canonicalise a vendor keyword.

    The keyword is stripped and lower-cased, then mapped through ``_ALIASES``;
    unknown keywords come back in their stripped, lower-cased form and a falsy
    input yields ``""``.
    """
    key = (vendor or "").strip().lower()
    return _ALIASES.get(key, key)
def nvidia_gpus() -> list:
    """Describe each NVIDIA GPU as ``{'uuid', 'name', 'pci'}`` via ``nvidia-smi``.

    Returns an empty list when ``nvidia-smi`` is missing, exits non-zero, or
    anything fails while running it. Rows with an empty UUID column are ignored;
    missing name / PCI columns become empty strings.
    """
    exe = shutil.which("nvidia-smi")
    if not exe:
        return []
    try:
        proc = subprocess.run(
            [exe, "--query-gpu=uuid,name,pci.bus_id", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=10)
        if proc.returncode != 0:
            return []
        found = []
        for row in proc.stdout.splitlines():
            fields = [field.strip() for field in row.split(",")]
            if not fields[0]:
                continue
            # Pad so short rows still unpack into all three columns.
            uuid, name, pci = (fields + ["", ""])[:3]
            found.append({"uuid": uuid, "name": name, "pci": pci})
        return found
    except Exception:
        return []
def vulkan_devices() -> list:
    """Parse ``vulkaninfo --summary`` into ``[{'vendor', 'vendor_id', 'name'}]``.

    Devices are returned in the order ``vulkaninfo`` prints them. Best effort: the
    result is empty when ``vulkaninfo`` is missing, fails to run, or prints nothing
    recognisable. A key is only present when the matching line was seen.
    """
    exe = shutil.which("vulkaninfo")
    if not exe:
        return []
    try:
        summary_text = subprocess.run([exe, "--summary"], capture_output=True,
                                      text=True, timeout=15).stdout
    except Exception:
        return []

    found = []
    current = {}
    for raw in summary_text.splitlines():
        line = raw.strip()
        # A "GPU<n>:" header closes the previous device and opens a new one.
        if line.startswith("GPU") and line.endswith(":"):
            if current:
                found.append(current)
            current = {}
            continue
        if "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip().lower()
        value = value.strip()
        if key == "devicename":
            current["name"] = value
        elif key == "vendorid":
            try:
                vendor_id = int(value, 16) if value.lower().startswith("0x") else int(value)
            except ValueError:
                vendor_id = None
            current["vendor_id"] = vendor_id
            current["vendor"] = VENDOR_IDS.get(vendor_id, "other")
    if current:
        found.append(current)
    return found
# PCI vendor id (as sysfs prints it, lower-cased) -> vendor keyword.
_PCI_VENDOR = {
    "0x10de": "nvidia",
    "0x1002": "amd",
    "0x8086": "intel",
}


def sysfs_gpu_vendors() -> set:
    """Collect GPU vendor keywords from ``/sys/class/drm/card*/device/vendor``.

    A driver-independent fallback for AMD/Intel detection when ``vulkaninfo`` isn't
    installed. Only ``cardN`` entries are considered; unreadable vendor files and
    unknown vendor ids are skipped. Returns a subset of {"amd", "nvidia", "intel"}.
    """
    import re
    card_name = re.compile(r"^card\d+$")
    found = set()
    for card_dir in glob.glob("/sys/class/drm/card*"):
        if not card_name.match(os.path.basename(card_dir)):
            continue
        vendor_file = os.path.join(card_dir, "device", "vendor")
        try:
            with open(vendor_file) as fh:
                pci_id = fh.read().strip().lower()
        except OSError:
            continue
        keyword = _PCI_VENDOR.get(pci_id)
        if keyword:
            found.add(keyword)
    return found
def gpu_vendors() -> set:
    """Return every GPU vendor keyword present on this machine.

    Combines the vendors seen by ``vulkan_devices()`` with those from
    ``sysfs_gpu_vendors()``, so detection doesn't depend on vulkaninfo alone. The
    catch-all "other" vendor (llvmpipe / software rasterizers) is dropped.
    """
    from_vulkan = set()
    for device in vulkan_devices():
        vendor = device.get("vendor")
        if vendor:
            from_vulkan.add(vendor)
    combined = from_vulkan | sysfs_gpu_vendors()
    combined.discard("other")
    return combined
def find_vulkan_icd(vendor: str) -> str:
    """Locate a Vulkan ICD manifest for ``vendor``; return its path or ``''``.

    Directories are searched in ``_ICD_DIRS`` order and, within each, the vendor's
    patterns in ``_ICD_PATTERNS`` order; matches are tried alphabetically. Manifests
    with ".disabled" in their file name, and non-files, are skipped.
    """
    globs = _ICD_PATTERNS.get(_norm_vendor(vendor), ())
    for directory in _ICD_DIRS:
        for pattern in globs:
            candidates = sorted(glob.glob(os.path.join(directory, pattern)))
            for candidate in candidates:
                if ".disabled" in os.path.basename(candidate):
                    continue
                if os.path.isfile(candidate):
                    return candidate
    return ""
def vendor_env(vendor: str) -> dict:
    """Env that pins an engine to **all** of ``vendor``'s cards on this machine.

    NVIDIA: CUDA visible = all NVIDIA UUIDs (+ PCI_BUS_ID order), Vulkan ICD = nvidia.
    AMD/Intel: CUDA hidden (""), Vulkan ICD = that vendor's, so it sees only those
    cards. ``CUDA_VISIBLE_DEVICES`` (for NVIDIA) and ``VK_ICD_FILENAMES`` are simply
    omitted when the UUIDs / ICD manifest can't be found;
    ``GGML_VK_VISIBLE_DEVICES`` is always set.

    Args:
        vendor: vendor keyword or alias (normalised with ``_norm_vendor``).

    Returns:
        A dict of environment variable names to string values.
    """
    vendor = _norm_vendor(vendor)
    env = {}
    if vendor == "nvidia":
        uuids = [g["uuid"] for g in nvidia_gpus() if g.get("uuid")]
        if uuids:
            env["CUDA_VISIBLE_DEVICES"] = ",".join(uuids)
            env["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    else:
        # Non-NVIDIA engine: hide all CUDA cards so torch/llama-CUDA can't grab them.
        env["CUDA_VISIBLE_DEVICES"] = ""
    # Likewise hide AMD/Radeon cards from any engine that isn't the AMD one, so a
    # non-AMD engine can't pick up a Radeon (mirrors CUDA hiding for non-NVIDIA).
    if vendor != "amd":
        env["RADEON_VISIBLE_DEVICES"] = ""
    icd = find_vulkan_icd(vendor)
    if icd:
        env["VK_ICD_FILENAMES"] = icd
    # Positions of this vendor's cards in the machine-wide Vulkan enumeration.
    own = [i for i, d in enumerate(vulkan_devices()) if d.get("vendor") == vendor]
    if icd or not own:
        # After ICD isolation only THIS vendor's card(s) are visible to Vulkan, as
        # indices 0..n-1. Pin GGML_VK_VISIBLE_DEVICES to those indices so an
        # inherited value (e.g. a launcher exporting GGML_VK_VISIBLE_DEVICES=1 from
        # the old multi-vendor enumeration) can't select an invalid index and
        # silently fall back to CPU. Default to "0" when the count can't be
        # determined (single card).
        indices = range(max(1, len(own)))
    else:
        # No ICD manifest was found, so Vulkan is NOT isolated and still lists every
        # vendor's devices: 0..n-1 could point at another vendor's card. Use the
        # cards' real positions in the full enumeration instead.
        indices = own
    env["GGML_VK_VISIBLE_DEVICES"] = ",".join(str(i) for i in indices)
    return env
def _nvidia_stats() -> list:
    """Live per-GPU stats from ``nvidia-smi`` (which reports ALL cards regardless
    of CUDA_VISIBLE_DEVICES). Memory is converted from the reported MiB to GB.

    Returns an empty list when ``nvidia-smi`` is missing, fails, or exits non-zero.
    Rows with fewer than six columns or a non-numeric index are skipped.
    """
    exe = shutil.which("nvidia-smi")
    if not exe:
        return []
    fields = "index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,uuid"
    try:
        result = subprocess.run(
            [exe, "--query-gpu=" + fields, "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=10)
        if result.returncode != 0:
            return []
    except Exception:
        return []

    def _to_float(text):
        # Columns may be non-numeric (e.g. unsupported metrics) -> None.
        try:
            return float(text)
        except ValueError:
            return None

    def _mib_to_gb(text):
        return round((_to_float(text) or 0) / 1024, 2)

    stats = []
    for row in result.stdout.splitlines():
        cols = [col.strip() for col in row.split(",")]
        if len(cols) < 6 or not cols[0].isdigit():
            continue
        stats.append({
            "vendor": "nvidia",
            "index": int(cols[0]),
            "name": cols[1],
            "util": _to_float(cols[2]),
            "mem_used": _mib_to_gb(cols[3]),
            "mem_total": _mib_to_gb(cols[4]),
            "temp": _to_float(cols[5]),
            "uuid": cols[6] if len(cols) > 6 else None,
        })
    return stats
def _amd_stats() -> list:
    """Live per-GPU stats for AMD cards, read from sysfs. Memory is in GB.

    Only ``/sys/class/drm/cardN`` entries whose PCI vendor is ``0x1002`` are
    reported. Attributes that can't be read or aren't plain digits become ``None``;
    the temperature comes from the first readable ``temp1_input`` hwmon file.
    """
    import re
    card_name = re.compile(r"^card\d+$")

    def _slurp(path):
        # Stripped file content, or None when the attribute can't be read.
        try:
            with open(path) as fh:
                return fh.read().strip()
        except OSError:
            return None

    def _bytes_to_gb(text):
        return round(int(text) / 1e9, 2) if text and text.isdigit() else None

    stats = []
    for card_dir in sorted(glob.glob("/sys/class/drm/card*")):
        label = os.path.basename(card_dir)
        if not card_name.match(label):
            continue
        device_dir = os.path.join(card_dir, "device")
        pci_vendor = (_slurp(os.path.join(device_dir, "vendor")) or "").lower()
        if pci_vendor != "0x1002":  # AMD only (NVIDIA handled via nvidia-smi)
            continue
        busy = _slurp(os.path.join(device_dir, "gpu_busy_percent"))
        used = _slurp(os.path.join(device_dir, "mem_info_vram_used"))
        total = _slurp(os.path.join(device_dir, "mem_info_vram_total"))
        temp = None
        for sensor in glob.glob(os.path.join(device_dir, "hwmon", "hwmon*", "temp1_input")):
            try:
                with open(sensor) as fh:
                    millidegrees = int(fh.read().strip())
            except OSError:
                continue
            temp = millidegrees / 1000.0  # sensor value is in thousandths
            break
        stats.append({
            "vendor": "amd",
            "index": int(label[4:]),
            "name": f"AMD GPU ({label})",
            "util": float(busy) if busy and busy.isdigit() else None,
            "mem_used": _bytes_to_gb(used),
            "mem_total": _bytes_to_gb(total),
            "temp": temp,
        })
    return stats
def gpu_stats() -> list:
    """Live per-card stats for the GPUs on this machine, NVIDIA entries first and
    AMD entries after them:
    ``[{vendor, index, name, util%, mem_used GB, mem_total GB, temp °C, uuid}]``
    (``uuid`` is only present on NVIDIA entries).

    Independent of engine ownership — the NVIDIA and AMD readers report all cards
    regardless of CUDA_VISIBLE_DEVICES — so this shows the whole machine.
    """
    cards = _nvidia_stats()
    cards.extend(_amd_stats())
    return cards
def engine_gpu_stats() -> list:
    """Like :func:`gpu_stats`, but limited to the cards THIS engine owns.

    Ownership comes from the ``CODERAI_ENGINE_GPUS`` environment variable
    (comma-separated vendor keywords and/or NVIDIA UUIDs). When it is unset or
    blank, every card is returned (single-process / legacy mode). A card is kept
    when its vendor or its UUID is one of the selectors.

    Used by thermal protection so a hot GPU only pauses the engine(s) using it,
    while a hot CPU (read globally) still pauses everything.
    """
    all_cards = gpu_stats()
    spec = (os.environ.get("CODERAI_ENGINE_GPUS") or "").strip()
    if not spec:
        return all_cards
    wanted = {token.strip() for token in spec.split(",") if token.strip()}
    owned = []
    for card in all_cards:
        uuid = card.get("uuid")
        if card.get("vendor") in wanted or (uuid and uuid in wanted):
            owned.append(card)
    return owned
def summary() -> dict:
    """Detected hardware, for a 'detect engines' UI / debugging.

    Keys: ``nvidia`` (from ``nvidia_gpus``), ``vulkan`` (from ``vulkan_devices``)
    and ``icd`` (vendor -> ICD manifest path, ``''`` when none was found).
    """
    report = {"nvidia": nvidia_gpus(), "vulkan": vulkan_devices()}
    report["icd"] = {vendor: find_vulkan_icd(vendor)
                     for vendor in ("nvidia", "amd", "intel")}
    return report
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Front-side registry of engine subprocesses.
The front never imports torch; it knows about engines only through the small,
auth-free ``/internal/engine-state`` endpoint each engine exposes on localhost.
This module holds the shared, thread-safe view the supervisor writes and the
router/aggregator read.
"""
import threading
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set
# Default model-format capabilities implied by an engine's backend:
#   transformers — safetensors/HF models (CUDA only here)
#   gguf         — llama.cpp models (CUDA or Vulkan)
#   whisper      — whisper.cpp STT (CUDA or Vulkan)
#   ds4          — DeepSeek V4 via the native ds4 engine (CUDA-only build)
# An NVIDIA engine can do all of them; a Vulkan (e.g. Radeon) engine does GGUF and
# whisper, but not transformers and not ds4.
_DEFAULT_CAPS = {
    "nvidia": {"transformers", "gguf", "whisper", "ds4"},
    "cuda": {"transformers", "gguf", "whisper", "ds4"},
    "vulkan": {"gguf", "whisper"},
    "opencl": {"gguf", "whisper"},
    "auto": {"transformers", "gguf", "whisper", "ds4"},
}


@dataclass
class Engine:
    """One engine subprocess as seen by the front.

    Fields left at their defaults are filled in by ``__post_init__``: ``url`` from
    the port, ``name`` from the id, and ``capabilities`` from the backend.
    """

    id: int
    gpu: Optional[int]             # device hint for logs (CUDA/Vulkan index; None = n/a)
    port: int
    primary: bool = False          # the engine that owns admin/auth/config traffic
    name: str = ""                 # human label for logs
    backend: str = "auto"          # nvidia | vulkan | … (forced for this engine)
    env: dict = field(default_factory=dict)  # extra env applied at spawn
    capabilities: Set[str] = field(default_factory=set)    # model formats it can serve
    assigned_models: Set[str] = field(default_factory=set)  # routable ids it owns
    url: str = ""
    healthy: bool = False
    loaded_models: Set[str] = field(default_factory=set)
    vram: Optional[dict] = None
    tasks: list = field(default_factory=list)  # running/queued tasks on this engine
    cooling: Optional[dict] = None  # thermal cooldown state, or None when not cooling
    last_ok: float = 0.0            # monotonic time of last successful poll
    proc: object = None             # subprocess.Popen (set by the supervisor)

    def __post_init__(self):
        if not self.url:
            self.url = f"http://127.0.0.1:{self.port}"
        if not self.name:
            self.name = f"engine#{self.id}"
        if not self.capabilities:
            # Look the backend up case-insensitively (and ignoring stray spaces) so
            # a configured "Vulkan" / "NVIDIA" gets that backend's capabilities
            # rather than the unknown-backend fallback. ``backend`` itself is left
            # exactly as given.
            backend_key = (self.backend or "").strip().lower()
            self.capabilities = set(
                _DEFAULT_CAPS.get(backend_key, {"transformers", "gguf"}))

    def can_serve(self, required_cap: Optional[str]) -> bool:
        """True when no capability is required or this engine has ``required_cap``."""
        return (not required_cap) or (required_cap in self.capabilities)
class EngineRegistry:
    """Lock-protected map of engine id -> engine record.

    The supervisor writes to it (``add`` / ``update_state``); the lookup methods
    are the read side used for routing decisions.
    """

    def __init__(self):
        self._engines: Dict[int, "Engine"] = {}
        self._lock = threading.RLock()

    def add(self, engine: "Engine") -> None:
        """Register ``engine`` under its id, replacing any previous entry."""
        with self._lock:
            self._engines[engine.id] = engine

    def get(self, engine_id: int) -> Optional["Engine"]:
        """Return the engine with ``engine_id``, or None."""
        with self._lock:
            return self._engines.get(engine_id)

    def all(self) -> List["Engine"]:
        """Return a snapshot list of every registered engine."""
        with self._lock:
            return [*self._engines.values()]

    def healthy(self) -> List["Engine"]:
        """Return the engines currently marked healthy."""
        with self._lock:
            return [engine for engine in self._engines.values() if engine.healthy]

    def primary(self) -> Optional["Engine"]:
        """The engine that owns admin/session/config.

        Prefers the engine flagged primary when it is healthy, otherwise the first
        healthy engine, otherwise the (unhealthy) flagged engine, otherwise None.
        """
        with self._lock:
            flagged = None
            first_healthy = None
            for engine in self._engines.values():
                if flagged is None and engine.primary:
                    flagged = engine
                if first_healthy is None and engine.healthy:
                    first_healthy = engine
            if flagged is not None and flagged.healthy:
                return flagged
            return first_healthy if first_healthy is not None else flagged

    def by_name(self, name: Optional[str]) -> Optional["Engine"]:
        """Resolve an engine by its declared name (or, failing that, its backend).

        Used for the configured default engine and per-model pins. The comparison
        is case-insensitive. A healthy match is preferred, but the first unhealthy
        match is returned when no healthy one exists, so callers can decide.
        """
        if not name:
            return None
        wanted = name.strip().lower()
        with self._lock:
            snapshot = list(self._engines.values())
        fallback = None
        for engine in snapshot:
            labels = ((engine.name or "").lower(), (engine.backend or "").lower())
            if wanted not in labels:
                continue
            if engine.healthy:
                return engine
            if not fallback:
                fallback = engine
        return fallback

    def update_state(self, engine_id: int, *, healthy: bool,
                     loaded_models=None, vram=None, tasks=None,
                     cooling=False) -> None:
        """Record the latest poll result for one engine (unknown ids are ignored).

        ``loaded_models`` and ``vram`` are only overwritten when given. ``tasks`` is
        replaced when given and cleared when the engine is unhealthy. ``cooling``
        uses ``False`` as "not supplied": any other value (including None) is
        stored, and an unhealthy engine with no value supplied is reset to None.
        """
        with self._lock:
            engine = self._engines.get(engine_id)
            if not engine:
                return
            engine.healthy = healthy
            if healthy:
                engine.last_ok = time.monotonic()
            if loaded_models is not None:
                engine.loaded_models = set(loaded_models)
            if vram is not None:
                engine.vram = vram
            if tasks is None:
                if not healthy:
                    engine.tasks = []
            else:
                engine.tasks = list(tasks)
            if cooling is False:
                if not healthy:
                    engine.cooling = None
            else:  # explicit None clears it
                engine.cooling = cooling

    def engine_for_model(self, model_key: str,
                         required_cap: Optional[str] = None) -> Optional["Engine"]:
        """Return a healthy, capability-compatible engine that already has the model
        resident, if any.

        Matching is forgiving: exact key, same last path component, a loaded key
        ending with ``model_key``, or ``model_key`` ending with the part of the
        loaded key after its last ``:``.
        """
        if not model_key:
            return None
        short = model_key.split("/")[-1]

        def _matches(loaded):
            return (loaded == model_key
                    or loaded.split("/")[-1] == short
                    or loaded.endswith(model_key)
                    or model_key.endswith(loaded.split(":")[-1]))

        with self._lock:
            for engine in self._engines.values():
                if (engine.healthy and engine.can_serve(required_cap)
                        and any(_matches(key) for key in engine.loaded_models)):
                    return engine
        return None

    def engine_for_assigned(self, model_key: str) -> Optional["Engine"]:
        """The healthy engine the front ASSIGNED this model to, or None.

        The assignment is the authoritative routing decision (it already encodes
        pins, the default engine, and balanced auto-selection); matching is lenient
        so a short-name / alias resolves to the owner.
        """
        if not model_key:
            return None
        short = model_key.split("/")[-1]

        def _matches(assigned):
            tail = assigned.split("/")[-1]
            return (assigned == model_key
                    or tail == short
                    or assigned.endswith(model_key)
                    or model_key.endswith(tail))

        with self._lock:
            for engine in self._engines.values():
                if engine.healthy and any(_matches(key) for key in engine.assigned_models):
                    return engine
        return None

    def least_loaded(self, required_cap: Optional[str] = None) -> Optional["Engine"]:
        """Pick a healthy, capability-compatible engine to load a new model on:
        fewest resident models, then most free VRAM. None when nothing qualifies."""
        with self._lock:
            candidates = [engine for engine in self._engines.values()
                          if engine.healthy and engine.can_serve(required_cap)]
        if not candidates:
            return None

        def _rank(engine):
            free = engine.vram.get("free", 0.0) if engine.vram else 0.0
            return (len(engine.loaded_models), -free)

        # min() keeps the first of equally-ranked engines, like a stable sort would.
        return min(candidates, key=_rank)
# CoderAI - OpenAI-compatible API server
# Copyright (C) 2026 Stefy Lanza <stefy@nexlab.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
"""Decide which engine handles a proxied request.
Policy (Plan B + multi-engine):
* **Admin / auth / config / UI / status / tasks** → the **primary** engine. These
own per-process session and config state, so pinning them to one engine keeps
sessions consistent without a shared session store (that's Plan C).
* **Inference** (``/v1/...`` POST carrying a ``model``) → the engine that already
has that model resident; otherwise the least-loaded engine (which loads it on
demand). This is what lets one model load on engine A while engine B keeps
generating.
* **Everything else** (e.g. ``GET /v1/models``, file downloads) → primary.
"""
from typing import Optional
from codai.frontproxy.registry import Engine, EngineRegistry
# POST endpoints that carry a `model` and should be load-balanced across engines.
_INFERENCE_PATHS = {
    "/v1/chat/completions",
    "/v1/completions",
    "/v1/embeddings",
    "/v1/images/generations",
    "/v1/images/edits",
    "/v1/audio/speech",
    "/v1/audio/transcriptions",
    "/v1/videos/generations",
}


def is_inference_path(path: str) -> bool:
    """True when ``path`` (query string and trailing slashes ignored) is one of the
    inference endpoints in ``_INFERENCE_PATHS``."""
    route, _, _query = path.partition("?")
    return route.rstrip("/") in _INFERENCE_PATHS
def is_admin_path(path: str) -> bool:
    """True for the root page and for paths under /admin, /login, /logout or
    /static (the query string is ignored)."""
    route = path.partition("?")[0]
    return route == "/" or route.startswith(("/admin", "/login", "/logout", "/static"))
# (model, pinned engine) pairs already warned about, so each is reported once.
_warned_pins: set = set()


def _warn_bad_pin(model, pinned, cap, engine) -> None:
    """Print a one-time warning that ``model``'s pin to ``pinned`` can't be honoured.

    The stated reason depends on ``engine``: None means no such engine is declared;
    otherwise it is either unhealthy or lacks the ``cap`` capability. Repeat calls
    for the same (model, pinned) pair are silent.
    """
    pin = (model, pinned)
    if pin in _warned_pins:
        return
    _warned_pins.add(pin)
    if engine is None:
        why = f"no engine named/backed '{pinned}' is declared"
    elif engine.healthy:
        why = (f"engine '{pinned}' (backend '{engine.backend}') can't serve a "
               f"'{cap}' model (capabilities: {sorted(engine.capabilities)})")
    else:
        why = f"engine '{pinned}' is not healthy"
    print(f"[front] WARNING: model '{model}' is pinned to '{pinned}' but {why}; "
          f"falling back to a compatible engine.", flush=True)
def required_capability(model: Optional[str], path: Optional[str] = None,
                        backend: Optional[str] = None,
                        ds4_model_id: Optional[str] = None,
                        ds4_enabled: bool = False) -> Optional[str]:
    """Name the capability an engine needs in order to serve this request.

    Possible results:

    * ``whisper`` — whisper.cpp STT (transcription endpoint or a
      ``whisper-server`` model). Runs on CUDA or Vulkan.
    * ``ds4`` — DeepSeek V4 via the native ds4 engine. CUDA-only.
    * ``gguf`` — llama.cpp model. Runs on CUDA or Vulkan.
    * ``transformers`` — safetensors/HF model. CUDA-only.
    * ``None`` — no model given and nothing else identified the request.

    The request path and the configured ``backend`` are checked before the model
    name, so whisper is detected even when the model id is absent from the request
    body (multipart upload).
    """
    route = (path or "").partition("?")[0].rstrip("/")
    if route == "/v1/audio/transcriptions":
        return "whisper"
    if (backend or "") == "whisper-server":
        return "whisper"

    lowered = (model or "").lower()
    if ds4_enabled and lowered:
        if "deepseek-v4" in lowered:
            return "ds4"
        wanted = (ds4_model_id or "").lower()
        # Match either the full id or the part after the last "/".
        if wanted and wanted in (lowered, lowered.split("/")[-1]):
            return "ds4"

    # A ".gguf" suffix necessarily contains "gguf", so one substring test suffices.
    if "gguf" in lowered:
        return "gguf"
    return "transformers" if model else None
def pick_engine(registry: EngineRegistry, path: str, method: str,
                model: Optional[str], required_cap: Optional[str] = None,
                default_engine: Optional[str] = None,
                pinned: Optional[str] = None) -> Optional[Engine]:
    """Choose the engine a request should be proxied to.

    Non-inference traffic (anything that is not a POST to an inference path) goes
    to ``registry.primary()``, falling back to ``registry.least_loaded()``.

    For inference the candidates are tried in this order:

    0. the engine the front already assigned to ``model`` (if it can serve the
       capability);
    1. the per-model pin, when that engine is healthy and compatible — otherwise a
       one-time warning is printed;
    2. an engine that already has the model resident;
    3. the configured default engine, when healthy and compatible;
    4. the least-loaded compatible engine, then the least-loaded engine of any
       capability, then the primary.

    ``model`` may be absent (e.g. a multipart transcription upload); the
    model-specific steps are then skipped and routing is by capability alone.
    """
    # Admin/auth/config/UI and everything else → primary (consistent sessions).
    if method.upper() != "POST" or not is_inference_path(path):
        return registry.primary() or registry.least_loaded()

    cap = required_cap

    # 0. The front's precomputed assignment is authoritative when compatible.
    if model:
        assigned = registry.engine_for_assigned(model)
        if assigned is not None and assigned.can_serve(cap):
            return assigned

    # 1. Per-model pin (models.json "engine") — only honoured if usable.
    if pinned:
        pin_target = registry.by_name(pinned)
        if pin_target and pin_target.healthy and pin_target.can_serve(cap):
            return pin_target
        # Make the misconfiguration visible instead of silently falling back.
        _warn_bad_pin(model, pinned, cap, pin_target)

    # 2. Engine that already holds the model.
    if model:
        resident = registry.engine_for_model(model, cap)
        if resident:
            return resident

    # 3. Configured default engine.
    if default_engine:
        preferred = registry.by_name(default_engine)
        if preferred and preferred.healthy and preferred.can_serve(cap):
            return preferred

    # 4. Least-loaded compatible engine; then any engine rather than a 503.
    chosen = registry.least_loaded(cap)
    if not chosen:
        chosen = registry.least_loaded(None)
    if not chosen:
        chosen = registry.primary()
    return chosen
......@@ -20,6 +20,12 @@ import os
import logging
import threading as _t
# Reduce CUDA allocator fragmentation: expandable segments let large transient
# allocations (KV cache, attention activations) grow into reserved-but-unallocated
# memory instead of OOMing on a borderline shortfall. Must be set before torch
# initialises CUDA; honour an explicit override if the operator already set it.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# Import configuration from codai modules
from codai.cli import parse_args
from codai.config import ConfigManager
......@@ -225,6 +231,10 @@ def apply_model_entry_live(entry, model_types) -> int:
old_cfg = multi_model_manager.config.get(key)
cfg = build_runtime_model_cfg(entry, type_str)
multi_model_manager.config[key] = cfg
try:
multi_model_manager._remember_registered_type(mid, type_str)
except Exception:
pass
updated += 1
# Acceleration (Lightning/Lightx2v/LCM distill LoRA + scheduler) is FUSED
# into the pipeline at load time, so it can't be toggled on an already
......@@ -248,6 +258,45 @@ def apply_model_entry_live(entry, model_types) -> int:
return updated
def _repair_stale_model_paths(config_mgr) -> int:
    """Point stale ``.gguf`` entries in models.json at the copy in the GGUF cache.

    For every dict entry in the known model categories, the ``path`` and
    ``model_path`` values are checked: when the value ends in ``.gguf``, does not
    exist on disk, and a file with the same basename exists in the cache directory
    (top level first, then anywhere below it), the entry is rewritten in place.

    Args:
        config_mgr: object exposing ``models_data`` (dict of category -> list) and
            ``save_models()``.

    Returns:
        The number of path values rewritten. ``save_models()`` is called only when
        that number is non-zero.
    """
    import glob
    from codai.models.cache import get_model_cache_dir

    cache = get_model_cache_dir()
    if not cache or not os.path.isdir(cache):
        return 0
    cats = ("text_models", "gguf_models", "vision_models", "image_models",
            "audio_models", "tts_models", "video_models", "audio_gen_models",
            "embedding_models", "spatial_models")

    def _resolve(p):
        """Return the cache location for a missing .gguf path, or None."""
        if not p or not str(p).endswith(".gguf") or os.path.exists(p):
            return None
        base = os.path.basename(p)
        cand = os.path.join(cache, base)
        if os.path.exists(cand):
            return cand
        # Escape both parts: a filename (or cache dir) containing glob
        # metacharacters such as "[", "*" or "?" must be matched literally.
        pattern = os.path.join(glob.escape(cache), "**", glob.escape(base))
        # Sorted so the pick is deterministic when several copies exist.
        hits = sorted(glob.glob(pattern, recursive=True))
        return hits[0] if hits else None

    fixed = 0
    for cat in cats:
        # `or []` tolerates a category that is present but set to null.
        for m in config_mgr.models_data.get(cat) or []:
            if not isinstance(m, dict):
                continue
            for key in ("path", "model_path"):
                new = _resolve(m.get(key))
                if new:
                    print(f" Repaired model path: {m[key]} -> {new}")
                    m[key] = new
                    fixed += 1
    if fixed:
        config_mgr.save_models()
    return fixed
def main():
"""Main entry point for the codai server."""
# Suppress unraisable exceptions from LlamaModel.__del__
......@@ -309,6 +358,19 @@ def main():
if config.models.gguf_cache_dir:
os.environ['CODERAI_CACHE_DIR'] = config.models.gguf_cache_dir
# Repair stale .gguf paths in models.json: an entry may point at an HF-hub
# snapshot path that no longer exists while the file actually lives in the GGUF
# cache (downloads route there). Rewrite the entry to the real file so the config
# is correct (the GGUF loader has the same fallback, but this fixes it on disk).
# The front runs this before spawning engines, so they read the corrected file;
# it is idempotent (only writes when something changed).
try:
_repaired = _repair_stale_model_paths(config_mgr)
if _repaired:
print(f"Repaired {_repaired} stale model path(s) in models.json")
except Exception as _e:
logging.getLogger(__name__).debug("model-path repair skipped: %s", _e)
# Configure generation archive
_arc_dir = config.archive.directory
if not _arc_dir:
......@@ -424,6 +486,43 @@ def main():
print(f"Error listing devices: {e}")
sys.exit(0)
# ─── Frontend/engine split ───────────────────────────────────────────────
# Default boot: run the always-responsive front proxy on the public port and
# let it supervise engine subprocess(es) that do all GPU/model work. This
# process becomes the front and returns here — none of the heavy engine init
# below runs in it (so its event loop is never blocked by model work).
# --engine-only → this process IS an engine: bind an internal localhost
# port and run the full app below (the front spawns these).
# --single-process → legacy: one process, full app on the public port.
_engine_only = getattr(args, "engine_only", False)
_single_process = getattr(args, "single_process", False) or config.server.single_process
if not _engine_only and not _single_process:
from codai.frontproxy import run_front
run_front(config, args)
return
if _engine_only:
# Engines bind plain localhost HTTP; the front owns the public host + TLS.
# NOTE: don't mutate config.server here — the settings API reads it, and it
# must keep reporting the user's CONFIGURED public host/port/https. The
# actual bind target is computed separately at serve time.
# The front pins this engine's backend (so a Radeon engine uses Vulkan and
# an NVIDIA engine uses CUDA) via CODERAI_ENGINE_BACKEND; honour it over the
# shared config.backend.type. Device selection is done by the env block the
# front set (CUDA_VISIBLE_DEVICES / GGML_VK_VISIBLE_DEVICES / VK_ICD_FILENAMES).
_forced_backend = os.environ.get("CODERAI_ENGINE_BACKEND")
if _forced_backend:
config.backend.type = _forced_backend
print(f"[engine] backend forced to '{_forced_backend}' by the front")
# The front owns the AISBF broker (always-responsive, one registration for
# the whole node, routes to engines). So no engine runs its own broker
# client — that would double-register and stall when the engine loads.
if config.broker.enabled:
config.broker.enabled = False
print("[engine] broker disabled (the front manages the broker)")
# Note: model→engine assignment is enforced by the FRONT's router (each model
# is routed to its single owner engine), not by pruning models.json here —
# so the admin model list (served from the primary) stays complete.
# Migrate any GGUF files that ended up in the HF cache to the GGUF cache
_t.Thread(target=_migrate_hf_gguf_to_gguf_cache, daemon=True).start()
......@@ -519,6 +618,14 @@ def main():
set_load_mode(load_mode)
multi_model_manager.set_load_mode(load_mode)
multi_model_manager._global_max_instances = config.models.max_model_instances
# Per-engine override of the default instances-per-model, set by the front.
_mi = os.environ.get("CODERAI_MAX_MODEL_INSTANCES")
if _mi:
try:
multi_model_manager._global_max_instances = int(_mi)
config.models.max_model_instances = int(_mi)
except ValueError:
pass
print(f"\nLoad mode: {load_mode}")
if load_mode == "ondemand":
......@@ -558,6 +665,38 @@ def main():
print(f"\n=== Loading Models from Config ===")
models_config = config_mgr.models_data
# In an engine the front assigns a SUBSET of models to this engine; register and
# pre-load only those (so e.g. whisper-server doesn't start on every engine).
# config_mgr.models_data stays full, so the admin model list — served from the
# primary engine — remains complete.
# CODERAI_ENGINE_MODELS, when set by the front, is a JSON list of route-keys
# (alias / path / id) naming the models this engine should register.
_assigned_env = os.environ.get("CODERAI_ENGINE_MODELS")
if _assigned_env is not None:
    try:
        import json as _json
        _keep = set(_json.loads(_assigned_env))

        def _route_key(m):
            """Route-key of a models.json entry: the string itself, or a dict's alias/path/id."""
            if isinstance(m, str):
                return m
            if isinstance(m, dict):
                return m.get("alias") or m.get("path") or m.get("id")
            return None

        _model_cats = ("text_models", "gguf_models", "vision_models", "image_models",
                       "audio_models", "tts_models", "video_models", "audio_gen_models",
                       "embedding_models", "spatial_models")
        # Filter only the list-valued model categories; every other key of
        # models_data is passed through unchanged.
        models_config = {
            k: ([m for m in v if _route_key(m) in _keep]
                if k in _model_cats and isinstance(v, list) else v)
            for k, v in config_mgr.models_data.items()
        }
        _n = sum(len(models_config.get(c, [])) for c in _model_cats)
        print(f"[engine] registering {_n} model(s) assigned by the front")
        # Also restrict /v1/models (list_models) to the assigned subset, so the
        # per-engine model list matches what it actually serves — config_mgr's
        # full models_data is untouched (the admin model list stays complete).
        # NOTE: this must pass `_keep`; the previous `keep` was an undefined name,
        # so the NameError was swallowed below and the restriction never applied.
        multi_model_manager.set_assigned_models(_keep)
    except Exception as _e:
        print(f"[engine] assignment filter failed ({_e}); registering all models")
# Helper to find model config
def get_model_cfg(model_type, model_id):
......@@ -815,6 +954,9 @@ def main():
global_args.max_ram_gb = config.offload.max_ram_gb
global_args.evict_idle_on_ram = config.offload.evict_idle_on_ram
global_args.ram_leak_watch = config.offload.ram_leak_watch
global_args.ram_watch_poll_seconds = config.offload.ram_watch_poll_seconds
global_args.ram_watch_soft_fraction = config.offload.ram_watch_soft_fraction
global_args.ram_watch_cuda = config.offload.ram_watch_cuda
# Thermal protection settings (read live by codai.models.thermal).
global_args.thermal_cpu_enabled = config.thermal.cpu_enabled
global_args.thermal_gpu_enabled = config.thermal.gpu_enabled
......@@ -822,6 +964,7 @@ def main():
global_args.thermal_cpu_resume = config.thermal.cpu_resume
global_args.thermal_gpu_high = config.thermal.gpu_high
global_args.thermal_gpu_resume = config.thermal.gpu_resume
global_args.thermal_gpu_overrides = config.thermal.gpu_overrides
global_args.thermal_poll_seconds = config.thermal.poll_seconds
global_args.thermal_soft_throttle_enabled = config.thermal.soft_throttle_enabled
global_args.thermal_soft_throttle_temp = config.thermal.soft_throttle_temp
......@@ -850,6 +993,7 @@ def main():
global_args.debug_web = getattr(args, 'debug_web', False)
global_args.debug_thermal = getattr(args, 'debug_thermal', False)
global_args.debug_lora = getattr(args, 'debug_lora', False)
global_args.debug_requests = getattr(args, 'debug_requests', False)
global_args.dump = global_dump
global_args.file_path = config.file_path
global_args.parser = config.parser
......@@ -998,6 +1142,15 @@ def main():
from codai.queue.manager import queue_manager
queue_manager.max_size = config.server.queue_max_size
queue_manager.max_parallel_requests = config.server.max_parallel_requests
# In an engine the front may override this engine's concurrency (per-engine
# limit) via env, so a bigger card runs more in parallel than a smaller one.
_mp = os.environ.get("CODERAI_MAX_PARALLEL")
if _mp:
try:
queue_manager.max_parallel_requests = int(_mp)
config.server.max_parallel_requests = int(_mp)
except ValueError:
pass
# Configure Python logging so broker/API log calls reach the terminal.
# uvicorn is started with log_config=None to keep our config in place.
......@@ -1068,9 +1221,26 @@ def main():
# Start the server
import uvicorn
print(f"\nStarting server on http://{config.server.host}:{config.server.port}")
print(f"API docs: http://{config.server.host}:{config.server.port}/docs")
print(f"Admin UI: http://{config.server.host}:{config.server.port}/admin")
# The bind target: an engine binds 127.0.0.1:<internal-port> with plain HTTP
# (the front owns the public host + TLS); single-process uses the configured
# public host/port/https. config.server keeps the CONFIGURED values either way
# so the settings API reports them correctly.
if getattr(args, 'engine_only', False):
bind_host = "127.0.0.1"
bind_port = int(getattr(args, "internal_port", None)
or config.server.internal_port_base)
bind_https = False
# Engines are internal workers behind the front — the public API docs / Admin
# UI live on the front, so don't advertise them here (it's just confusing).
print(f"[engine] serving on http://{bind_host}:{bind_port} "
f"(internal — reach it via the front)")
else:
bind_host = config.server.host
bind_port = config.server.port
bind_https = config.server.https
print(f"\nStarting server on http://{bind_host}:{bind_port}")
print(f"API docs: http://{bind_host}:{bind_port}/docs")
print(f"Admin UI: http://{bind_host}:{bind_port}/admin")
if model_manager.backend is not None:
actual_backend = model_manager.backend_type
......@@ -1080,7 +1250,20 @@ def main():
_uvi_log_level = "debug" if global_debug else "info"
if config.server.https:
# An engine only ever receives internal front→engine traffic (localhost-only +
# token-gated), so its whole access log is internal chatter. Silence it unless
# --debug-engine by handing uvicorn a log config with uvicorn.access at WARNING
# — done via the config (not a post-hoc setLevel) because uvicorn re-applies its
# logging config on run and would otherwise reset the level back to INFO. When
# the config is used we pass log_level=None so uvicorn doesn't re-override it.
_uvi_log_config = None
if getattr(args, 'engine_only', False) and not getattr(args, 'debug_engine_web', False):
import copy as _copy
_uvi_log_config = _copy.deepcopy(uvicorn.config.LOGGING_CONFIG)
_uvi_log_config["loggers"]["uvicorn.access"]["level"] = "WARNING"
_uvi_ll = None if _uvi_log_config is not None else _uvi_log_level
if bind_https:
import ssl
ssl_keyfile = config.server.https_key_path
ssl_certfile = config.server.https_cert_path
......@@ -1102,17 +1285,17 @@ def main():
except Exception as e:
print(f"Warning: Could not generate certificate: {e}")
print("Falling back to HTTP...")
uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
log_level=_uvi_log_level, log_config=None)
uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
log_level=_uvi_ll, log_config=_uvi_log_config)
return
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ssl_context.load_cert_chain(ssl_certfile, ssl_keyfile)
uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
ssl_context=ssl_context, log_level=_uvi_log_level, log_config=None)
uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
ssl_context=ssl_context, log_level=_uvi_ll, log_config=_uvi_log_config)
else:
uvicorn.run(fastapi_app, host=config.server.host, port=config.server.port,
log_level=_uvi_log_level, log_config=None)
uvicorn.run(fastapi_app, host=bind_host, port=bind_port,
log_level=_uvi_ll, log_config=_uvi_log_config)
if __name__ == "__main__":
......
......@@ -21,6 +21,7 @@ from threading import Lock
from typing import List, Optional
import json
import os
import re
import time
......@@ -179,11 +180,15 @@ def detect_model_capabilities(model_name: str) -> ModelCapabilities:
return caps
# ── Image: upscaling (checked before general SD rule to catch SD-family upscalers) ──
if any(x in n for x in ['real-esrgan', 'esrgan', 'swinir', 'edsr',
'bsrgan', 'hat-', 'dat-',
# 'hat-'/'dat-' are short, ambiguous tokens (e.g. they appear inside
# "chat-", "update-"); require a word boundary before them so a text "chat"
# model isn't mistaken for the HAT/DAT super-resolution checkpoints.
if (any(x in n for x in ['real-esrgan', 'esrgan', 'swinir', 'edsr',
'bsrgan',
'x2-upscaler', 'x4-upscaler', 'x2_upscaler', 'x4_upscaler',
'latent-upscaler', 'latent_upscaler',
'ldm-super-resolution', 'rcan-', 'sr3-']):
'ldm-super-resolution', 'rcan-', 'sr3-'])
or re.search(r'\b[hd]at-', n)):
caps.image_upscaling = True
caps.image_to_image = True
return caps
......
......@@ -16,7 +16,7 @@
"""Model manager module - contains ModelManager, WhisperServerManager, and MultiModelManager classes."""
from typing import Optional, Dict, Any, List
from typing import Optional, Dict, Any, List, Set
import os
import random
import subprocess
......@@ -36,6 +36,36 @@ from codai.models.utils import FuzzyToolBreaker
from codai.pydantic.textrequest import ModelInfo
def get_active_ds4_config():
    """Fetch ``config.ds4`` from the admin config manager.

    Returns None when the config manager (or its ``config``) is not set, or when
    importing/reading it raises for any reason.
    """
    try:
        from codai.admin.routes import config_manager
        if config_manager is None or config_manager.config is None:
            return None
        return config_manager.config.ds4
    except Exception:
        # Best-effort lookup: any failure means "no ds4 config available".
        return None
def ds4_should_handle(model_name: str) -> bool:
    """Tell whether ``model_name`` should be served by the ds4 backend.

    False when the name is empty, when no ds4 config is available, or when that
    config is not ``enabled``. Otherwise True if the lower-cased name (or its part
    after the last ``/``) equals the configured ``model_id``, or if the name
    contains ``deepseek-v4``.
    """
    if not model_name:
        return False

    cfg = get_active_ds4_config()
    if cfg is None or not getattr(cfg, "enabled", False):
        return False

    lowered = model_name.lower()
    configured = (getattr(cfg, "model_id", "") or "").lower()
    if configured and configured in (lowered, lowered.split("/")[-1]):
        return True
    # The stock alias works without any configured model_id.
    return "deepseek-v4" in lowered
def _trim_cpu_ram() -> None:
"""Return freed CPU heap memory to the OS (and let the kernel reclaim swap).
......@@ -128,6 +158,17 @@ class ModelManager:
def load_model(self, model_name: str, backend_type: str = "auto", **kwargs):
"""Load the model with the specified backend."""
# DeepSeek V4 via ds4: when enabled, route matching models to the managed
# ds4-server proxy instead of the in-process nvidia/vulkan backends.
if ds4_should_handle(model_name):
from codai.backends.ds4 import Ds4Backend
print(f"Routing '{model_name}' to ds4 (DeepSeek V4) backend")
self.backend_type = "ds4"
self.backend = Ds4Backend(get_active_ds4_config())
self.backend.load_model(model_name, **kwargs)
self.tool_parser = ModelParserAdapter(model_name=model_name)
return
available = detect_available_backends()
# Check if model is a GGUF file
......@@ -543,6 +584,11 @@ class MultiModelManager:
self.embedding_models: List[str] = [] # text / multimodal embeddings
self.spatial_models: List[str] = [] # depth estimation, segmentation, object detection
self.config: Dict[str, Dict] = {} # Store model configurations
# In the front/engine split, the front assigns a subset of models.json to
# this engine. When set, list_models() reports only these (so /v1/models per
# engine reflects what it actually serves); None = report all (single-process).
self._assigned_model_keys: Optional[set] = None
self.model_registered_types: Dict[str, Set[str]] = {}
self.tool_parser = ModelParserAdapter()
self.current_model_key: Optional[str] = None
self.load_mode: str = "ondemand"
......@@ -571,6 +617,7 @@ class MultiModelManager:
self._pending_new_instance: set = set() # keys awaiting a second+ instance load
self._global_max_instances: int = 1 # set from config at startup
self._measured_vram_gb: Dict[str, float] = {} # actual measured VRAM delta per model key
self._last_load_errors: Dict[str, str] = {} # model_key -> last failed load message
# Callbacks that free VRAM held *outside* the model manager (e.g. the
# LoRA trainer caches its SD/SDXL base model between jobs). Each returns
# the GB it freed (or None). Invoked as a last resort during eviction.
......@@ -736,6 +783,7 @@ class MultiModelManager:
self.default_model = model_name
self.config[model_name] = config or {}
self.model_backend_types[model_name] = backend_type
self._remember_registered_type(model_name, "text")
# Download/cache the model at startup if it's a URL or HF ID
resolved_model = self.load_model(model_name)
......@@ -834,6 +882,7 @@ class MultiModelManager:
from codai.tasks import loading_task
with loading_task(self.default_model, model_type="text"):
model_manager.load_model(self.default_model, backend_type=backend_type, **kwargs)
self._last_load_errors.pop(self.default_model, None)
self.add_model(self.default_model, model_manager)
self.record_vram_delta(self.default_model, _snap)
self.current_model_key = self.default_model
......@@ -842,6 +891,7 @@ class MultiModelManager:
return model_manager
except Exception as e:
print(f"Error loading model {self.default_model}: {e}")
self._last_load_errors[self.default_model] = str(e)
self._mark_cuda_poisoned_if_fatal(e)
self._model_ready_event.set()
return None
......@@ -935,6 +985,7 @@ class MultiModelManager:
from codai.tasks import loading_task
with loading_task(model_name, model_type="text"):
model_manager.load_model(model_name, backend_type=backend_type, **kwargs)
self._last_load_errors.pop(model_name, None)
self.add_model(model_name, model_manager)
self.record_vram_delta(model_name, _snap)
self.current_model_key = model_name
......@@ -944,6 +995,7 @@ class MultiModelManager:
return model_manager
except Exception as e:
print(f"Error loading model {model_name}: {e}")
self._last_load_errors[model_name] = str(e)
self._mark_cuda_poisoned_if_fatal(e)
self._model_ready_event.set() # signal: ready (even on failure)
return None
......@@ -953,6 +1005,7 @@ class MultiModelManager:
if model_name not in self.audio_models:
self.audio_models.append(model_name)
self.config[f"audio:{model_name}"] = config or {}
self._remember_registered_type(model_name, "audio")
if isinstance(config, dict) and config.get("backend") == "whisper-server":
print(f"Registered whisper-server audio model: {model_name}")
......@@ -984,6 +1037,7 @@ class MultiModelManager:
if model_id not in self.audio_models:
self.audio_models.append(model_id)
self.config[f"audio:{model_id}"] = cfg
self._remember_registered_type(model_id, "audio")
# Register alias for round-robin routing
if alias:
wsm._alias = alias
......@@ -1019,6 +1073,7 @@ class MultiModelManager:
"""Set the text-to-speech model and download/cache it if needed."""
self.tts_model = model_name
self.config[f"tts:{model_name}"] = config or {}
self._remember_registered_type(model_name, "tts")
# Download/cache the model at startup if it's a URL or HF ID
resolved_model = self.load_model(model_name)
......@@ -1033,6 +1088,7 @@ class MultiModelManager:
if model_name not in self.image_models:
self.image_models.append(model_name)
self.config[f"image:{model_name}"] = config or {}
self._remember_registered_type(model_name, "image")
# For image models, we don't download at startup since they may be large
# and handled by different backends (diffusers vs sd.cpp)
......@@ -1044,6 +1100,7 @@ class MultiModelManager:
if model_name not in self.vision_models:
self.vision_models.append(model_name)
self.config[f"vision:{model_name}"] = config or {}
self._remember_registered_type(model_name, "vision")
resolved_model = self.load_model(model_name)
if resolved_model != model_name:
......@@ -1057,6 +1114,7 @@ class MultiModelManager:
if model_name not in self.video_models:
self.video_models.append(model_name)
self.config[f"video:{model_name}"] = config or {}
self._remember_registered_type(model_name, "video")
print(f"Registered video model: {model_name}")
def set_audio_gen_model(self, model_name: str, config: Dict = None):
......@@ -1064,6 +1122,7 @@ class MultiModelManager:
if model_name not in self.audio_gen_models:
self.audio_gen_models.append(model_name)
self.config[f"audio_gen:{model_name}"] = config or {}
self._remember_registered_type(model_name, "audio_gen")
print(f"Registered audio-gen model: {model_name}")
def set_embedding_model(self, model_name: str, config: Dict = None):
......@@ -1071,6 +1130,7 @@ class MultiModelManager:
if model_name not in self.embedding_models:
self.embedding_models.append(model_name)
self.config[f"embedding:{model_name}"] = config or {}
self._remember_registered_type(model_name, "embedding")
print(f"Registered embedding model: {model_name}")
def set_spatial_model(self, model_name: str, config: Dict = None):
......@@ -1078,11 +1138,31 @@ class MultiModelManager:
if model_name not in self.spatial_models:
self.spatial_models.append(model_name)
self.config[f"spatial:{model_name}"] = config or {}
self._remember_registered_type(model_name, "spatial")
print(f"Registered spatial model: {model_name}")
def set_model_alias(self, alias: str, model_name: str):
    """Map ``alias`` to ``model_name`` and copy the model's registered types to the alias."""
    self.model_aliases[alias] = model_name
    # The alias inherits every type already recorded for the target model.
    for inherited in self._registered_types_for(model_name):
        self._remember_registered_type(alias, inherited)
def set_assigned_models(self, keys) -> None:
    """Store the front-assigned route-keys (alias / path / id) as a set.

    Passing ``None`` clears the restriction.
    """
    if keys is None:
        self._assigned_model_keys = None
    else:
        self._assigned_model_keys = set(keys)
def _entry_assigned(self, m) -> bool:
    """Report whether a models.json entry belongs to this engine's assigned set.

    Always True when no restriction is set, and for entries that are neither a
    string nor a dict. A dict's route-key is its ``alias``, else ``path``, else
    ``id``.
    """
    allowed = self._assigned_model_keys
    if allowed is None:
        return True
    if isinstance(m, dict):
        route_key = m.get("alias") or m.get("path") or m.get("id")
    elif isinstance(m, str):
        route_key = m
    else:
        return True
    return route_key in allowed
def get_all_allowed_identifiers(self) -> set:
"""
......@@ -1204,6 +1284,12 @@ class MultiModelManager:
r_short = registered.split("/")[-1] if "/" in registered else registered
return n_short == r_short
requested_type = self._requested_type_from_registered_types(name)
if requested_type:
return requested_type
if self._registered_types_for(name):
return None
if self.default_model and _matches(self.default_model):
return "text"
for m in self.image_models:
......@@ -1231,6 +1317,115 @@ class MultiModelManager:
return "spatial"
return None
def _remember_registered_type(self, name: str, model_type: str) -> None:
    """Record ``model_type`` under both ``name`` and its short name (text after the last "/").

    Does nothing when either argument is empty.
    """
    if not (name and model_type):
        return
    registry = self.model_registered_types
    # A set collapses the two identifiers when the name has no "/" in it.
    for ident in {name, name.rsplit("/", 1)[-1]}:
        registry.setdefault(ident, set()).add(model_type)
def _registered_types_for(self, name: str) -> Set[str]:
    """Collect every type configured for ``name`` or for anything sharing its short name.

    Combines the in-memory ``model_registered_types`` with whatever the saved
    config reports, and returns a new set (empty for an empty name).
    """
    if not name:
        return set()
    tail = name.rsplit("/", 1)[-1]
    collected: Set[str] = set()
    # One pass covers the exact-name entry, the short-name entry, and any other
    # identifier whose short name matches.
    for registered, kinds in self.model_registered_types.items():
        if registered == name or registered.rsplit("/", 1)[-1] == tail:
            collected |= kinds
    # Live admin saves update models.json/config_manager immediately, but a running
    # manager may not have re-registered every category; merge the saved config so
    # multi-type entries (e.g. text+image) are seen with all their types.
    collected |= self._registered_types_from_config(name)
    return collected
def _registered_types_from_config(self, name: str) -> Set[str]:
    """Infer all configured types for a model from ``config_manager.models_data``.

    An entry matches when its ``path``/``id`` or ``alias`` (or, for a plain string
    entry, the string itself) equals ``name`` or shares its short name (the text
    after the last "/").

    The types of a dict entry come from ``model_types``, else ``model_type``, else
    the category it is listed under. Each value may be a bare type name
    (``"text"``, ``"image"``, …) or a category name (``"text_models"``, …);
    anything unrecognised falls back to the category's own type.

    Returns an empty set when ``name`` is empty or the config manager cannot be
    read.
    """
    # models.json category -> runtime type.
    cat_type = {
        "text_models": "text",
        "gguf_models": "text",
        "vision_models": "vision",
        "image_models": "image",
        "audio_models": "audio",
        "tts_models": "tts",
        "video_models": "video",
        "audio_gen_models": "audio_gen",
        "embedding_models": "embedding",
        "spatial_models": "spatial",
    }
    # Bare type names are accepted as-is; looking them up only in the category
    # map would collapse e.g. ["text", "image"] to the category default and lose
    # the multi-type information.
    known_types = set(cat_type.values())

    found: Set[str] = set()
    if not name:
        return found
    short = name.split("/")[-1]
    try:
        from codai.admin.routes import config_manager
        md = (config_manager.models_data if config_manager is not None else None) or {}
    except Exception:
        return found

    for cat, entries in md.items():
        default_type = cat_type.get(cat)
        if not default_type or not isinstance(entries, (list, tuple)):
            continue
        for entry in entries:
            if isinstance(entry, str):
                vals = [entry]
                entry_types = {default_type}
            elif isinstance(entry, dict):
                vals = [entry.get("path") or entry.get("id") or "",
                        entry.get("alias") or ""]
                raw_types = entry.get("model_types") or [entry.get("model_type") or cat]
                if isinstance(raw_types, str):
                    # A single type given as a bare string, not a list.
                    raw_types = [raw_types]
                entry_types = {
                    t if t in known_types else cat_type.get(t, default_type)
                    for t in raw_types
                }
            else:
                # Malformed entry (null, number, …): nothing to match against.
                continue
            if any(isinstance(v, str) and v
                   and (v == name or v.split("/")[-1] == short)
                   for v in vals):
                found.update(entry_types)
    return found
def _requested_type_from_registered_types(self, name: str) -> Optional[str]:
    """Return the model's type when exactly one is registered, otherwise None."""
    kinds = self._registered_types_for(name)
    if len(kinds) != 1:
        # Unknown or multi-type model: no single answer.
        return None
    (only,) = kinds
    return only
def model_supports_type(self, name: str, model_type: Optional[str]) -> bool:
    """Check whether ``name`` is registered for ``model_type``.

    An empty ``model_type`` always passes. A request for ``"text"`` is also
    satisfied by a model registered as ``"vision"``.
    """
    if not model_type:
        return True
    known = self._registered_types_for(name)
    return model_type in known or (model_type == "text" and "vision" in known)
def _config_for_model_key(self, model_key: str) -> Dict[str, Any]:
    """Look up the config for ``model_key``, trying compatible alternative keys.

    Order: the key itself; for a ``prefix:name`` key, the bare ``name``; for a
    bare key, the first non-empty ``<type>:<key>`` config among the known type
    prefixes. Returns ``{}`` when no prefixed variant has a config.
    """
    direct = self.config.get(model_key, {})
    if direct:
        return direct

    _prefix, sep, remainder = model_key.partition(":")
    if sep:
        # Typed key with no config of its own: fall back to the bare name.
        return self.config.get(remainder, {})

    typed = (
        self.config.get(f"{kind}:{model_key}", {})
        for kind in ("vision", "image", "audio", "tts", "video",
                     "audio_gen", "embedding", "spatial")
    )
    return next((candidate for candidate in typed if candidate), {})
def is_allowed_model(self, requested_or_resolved: str, model_type: str = None) -> bool:
"""
Check if a model name (raw request value *or* resolved name) is one of
......@@ -1249,9 +1444,17 @@ class MultiModelManager:
if not requested_or_resolved:
return False
# ds4-served DeepSeek V4 has no models.json entry; accept it for text when
# the ds4 worker is enabled and the name matches.
if model_type in (None, "text") and ds4_should_handle(requested_or_resolved):
return True
# If a model_type is specified, reject models registered under a
# different type (e.g. an image GGUF requested via /v1/chat/completions).
if model_type:
registered_types = self._registered_types_for(requested_or_resolved)
if registered_types and not self.model_supports_type(requested_or_resolved, model_type):
return False
registered_type = self.get_registered_model_type(requested_or_resolved)
if registered_type is not None and registered_type != model_type:
# "vision" models are acceptable for "text" endpoints (multimodal)
......@@ -1687,7 +1890,7 @@ class MultiModelManager:
# the runtime reserve (KV cache / activations / VAE-decode spike) so the
# value we cache and persist reflects the model's PEAK runtime need — not
# just its loaded weights — and future eviction frees enough headroom.
cfg = self.config.get(model_key, {})
cfg = self._config_for_model_key(model_key)
reserve_gb = self._runtime_reserve_gb(
cfg if isinstance(cfg, dict) else {}, model_key, delta_gb)
measured = round(delta_gb + reserve_gb, 3)
......@@ -2335,6 +2538,31 @@ class MultiModelManager:
_load_bpe = self._load_bytes_per_elem(cfg)
prec_factor = (_load_bpe / _storage_bpe) if _storage_bpe > 0 else 1.0
# GGUF files are ALREADY quantized on disk and llama.cpp loads the baked-in
# quantization — it ignores load_in_4bit/load_in_8bit entirely. The stored
# used_vram_gb and file-size baselines already reflect that quantized
# footprint, so applying the 4/8-bit quant multiplier (or a storage→load
# precision normalization) on top would 2–3× UNDER-estimate the real
# resident size and let the loader try to fit a model that doesn't fit.
_gguf_path = str(cfg.get("path") or resolved_name or model_key or "")
_is_gguf = (_gguf_path.endswith(".gguf") or "gguf" in _gguf_path.lower()
or cfg.get("model_type") == "gguf_models")
if _is_gguf:
quant_mult = 1.0
prec_factor = 1.0
# n_gpu_layers controls how much of a GGUF actually lands in VRAM.
# With 0 layers on the GPU the weights live in CPU RAM / are mmap'd
# from disk, so the GPU only needs compute/KV buffers — don't reserve
# the whole model (which would force needless eviction of other
# models on every load attempt). A partial positive count is left
# conservative since the total layer count isn't known here.
try:
_ngl = int(cfg.get("n_gpu_layers")) if cfg.get("n_gpu_layers") is not None else -1
except (TypeError, ValueError):
_ngl = -1
if _ngl == 0:
quant_mult = 0.0
def _dbg_est(source: str, value: float) -> float:
try:
from codai.api.state import get_global_debug
......@@ -2592,7 +2820,41 @@ class MultiModelManager:
"""Resident-set size of the server process TREE, in GB (0.0 on failure).
Offloaded weights and worker subprocesses count against the global cap, so
sum the parent plus all children (mirrors thermal.read_process_tree_cpu)."""
sum the root process plus all children (mirrors thermal.read_process_tree_cpu).
Under the front/engine split the host-RAM cap is SHARED, not split: when the
front spawned this engine it set CODERAI_FRONT_PID, so the root is the
*front* — every engine then measures the same fleet-wide total (front + all
engines + their workers) and enforces the single cap against it. In
single-process mode the root is just this process, as before."""
try:
import os
import psutil
root_pid = os.environ.get("CODERAI_FRONT_PID")
proc = None
if root_pid:
try:
proc = psutil.Process(int(root_pid))
except Exception:
proc = None
if proc is None:
proc = psutil.Process()
total = proc.memory_info().rss
for child in proc.children(recursive=True):
try:
total += child.memory_info().rss
except Exception:
pass
return total / 1e9
except Exception:
return 0.0
@staticmethod
def _get_own_ram_gb() -> float:
"""RSS of THIS engine's own process tree only (ignores the shared-fleet
root), in GB. Used for per-engine *leak* detection so unbounded growth is
attributed to the engine that actually has it — unlike the shared cap, which
uses the whole fleet (:meth:`_get_process_ram_gb`)."""
try:
import psutil
proc = psutil.Process()
......@@ -2978,7 +3240,7 @@ class MultiModelManager:
# Per-model "load" = pre-loaded (treat as loadall for this model).
# Per-model "on-request" = load when needed with VRAM management.
# =====================================================================
per_model_cfg = self.config.get(model_key, {})
per_model_cfg = self._config_for_model_key(model_key)
per_model_load_mode = per_model_cfg.get("load_mode") # "load" | "on-request" | None
if per_model_load_mode == "on-request":
......@@ -3467,6 +3729,10 @@ class MultiModelManager:
"spatial_models"):
mtype = CAT_TYPE.get(cat, "text")
for m in md.get(cat, []):
# Only list models the front assigned to THIS engine (so a
# per-engine /v1/models reflects what it actually serves).
if not self._entry_assigned(m):
continue
if isinstance(m, str):
mid = m
else:
......@@ -3550,6 +3816,12 @@ class MultiModelManager:
for alias in self.model_aliases:
_add(alias)
# --- DeepSeek V4 via ds4 (no models.json entry; surfaced when enabled) ---
ds4_cfg = get_active_ds4_config()
if ds4_cfg is not None and getattr(ds4_cfg, "enabled", False):
mid = getattr(ds4_cfg, "model_id", "deepseek-v4") or "deepseek-v4"
_add(mid, "text", {"backend": "ds4"})
return models
......
......@@ -937,12 +937,143 @@ class CommandRParser(BaseParser):
return results
def _parse_gemma_loose_value(s: str, i: int):
    """Parse one value from gemma's loose object notation starting at index i.

    Handles "strings" (with backslash escapes), numbers, true/false/null,
    nested {objects} and [arrays], and bareword fallbacks.

    Args:
        s: Text being parsed.
        i: Index at which to start; leading whitespace is skipped.

    Returns:
        ``(python_value, next_index)``. At end of input the value is ``None``
        and the index is ``len(s)``. A bareword that starts on a delimiter
        (``,``, ``}`` or ``]``) yields ``''`` with the index unchanged.
    """
    n = len(s)
    while i < n and s[i] in ' \t\r\n':
        i += 1
    if i >= n:
        return None, i
    c = s[i]
    if c == '"':
        # JSON-style string with escapes; an unterminated string runs to the
        # end of the input.
        j = i + 1
        buf = []
        while j < n:
            ch = s[j]
            if ch == '\\' and j + 1 < n:
                esc = s[j + 1]
                # \n, \t, \r are translated; any other escaped character
                # (including \" and \\) stands for itself.
                buf.append({'n': '\n', 't': '\t', 'r': '\r'}.get(esc, esc))
                j += 2
                continue
            if ch == '"':
                j += 1
                break
            buf.append(ch)
            j += 1
        return ''.join(buf), j
    if c == '{':
        return _parse_gemma_loose_object(s, i)
    if c == '[':
        arr = []
        j = i + 1
        while j < n:
            while j < n and s[j] in ' \t\r\n,':
                j += 1
            if j >= n:
                # Truncated array: stop instead of appending a phantom None.
                break
            if s[j] == ']':
                j += 1
                break
            val, nxt = _parse_gemma_loose_value(s, j)
            if nxt <= j:
                # No progress means we are sitting on a stray '}'. Leave it
                # for the enclosing object rather than looping forever.
                break
            arr.append(val)
            j = nxt
        return arr, j
    # Bareword / number / bool / null: read until a delimiter.
    j = i
    while j < n and s[j] not in ',}]':
        j += 1
    tok = s[i:j].strip()
    low = tok.lower()
    if low == 'true':
        return True, j
    if low == 'false':
        return False, j
    if low in ('null', 'none'):
        return None, j
    try:
        return int(tok), j
    except ValueError:
        pass
    try:
        return float(tok), j
    except ValueError:
        pass
    return tok, j
def _parse_gemma_loose_object(s: str, i: int):
    """Parse a {key:value,…} object (unquoted keys) starting at the '{' at i.

    Keys may be barewords or "quoted"; values are parsed with
    ``_parse_gemma_loose_value``. A key with no ``:value`` part, or an empty
    key, is ignored. Truncated input (no closing ``}``) returns what was
    parsed so far instead of raising.

    Args:
        s: Text being parsed.
        i: Index of the opening ``{``.

    Returns:
        ``(dict, next_index)`` where ``next_index`` is just past the closing
        ``}``, or ``len(s)`` when the object is unterminated.
    """
    n = len(s)
    obj = {}
    assert s[i] == '{'
    j = i + 1
    while j < n:
        while j < n and s[j] in ' \t\r\n,':
            j += 1
        if j >= n:
            # Input ended on a separator or whitespace. Previously this fell
            # through to s[j] and raised IndexError.
            break
        if s[j] == '}':
            j += 1
            break
        # Read key (bareword or "quoted").
        if s[j] == '"':
            key, j = _parse_gemma_loose_value(s, j)
        else:
            k = j
            while j < n and s[j] not in ':}':
                j += 1
            key = s[k:j].strip()
        while j < n and s[j] in ' \t\r\n':
            j += 1
        if j < n and s[j] == ':':
            j += 1
            val, j = _parse_gemma_loose_value(s, j)
            if key:
                obj[key] = val
    return obj, j
def parse_gemma_native_tool_calls(text: str, tool_names=None):
    """Extract gemma-4 native tool calls (``call:NAME{args}``) from text.

    The ``<|tool_call>…<tool_call|>`` wrapper tokens are optional; only the
    ``call:NAME{`` prefix is matched, and the argument object is parsed with
    ``_parse_gemma_loose_object``.

    Args:
        text: Decoded model output.
        tool_names: Optional container of accepted tool names. When truthy,
            calls to any other name are skipped so prose containing ``call:``
            isn't misread.

    Returns:
        A list of ``(name, args_dict)`` in order of first appearance. Calls
        whose arguments fail to parse are dropped, and exact duplicates
        (same name and same arguments) are collapsed, since a degenerate
        model loop emits the same call repeatedly.
    """
    if not text or 'call:' not in text:
        return []
    calls = []
    fingerprints = set()
    for match in re.finditer(r'call:\s*([A-Za-z_]\w*)\s*\{', text):
        tool = match.group(1)
        if tool_names and tool not in tool_names:
            continue
        try:
            # match.end() - 1 is the index of the opening '{'.
            parsed, _ = _parse_gemma_loose_object(text, match.end() - 1)
        except Exception:
            continue
        fingerprint = (tool, json.dumps(parsed, sort_keys=True, default=str))
        if fingerprint not in fingerprints:
            fingerprints.add(fingerprint)
            calls.append((tool, parsed))
    return calls
# 7. GEMMA PARSER
class GemmaParser(BaseParser):
@validate_tool_output
def parse(self, text: str) -> List[Dict]:
results = []
# gemma-4 native format: call:NAME{args} (the <|tool_call>…<tool_call|>
# markers are stripped by skip_special_tokens during decode). Restrict to
# declared tool names when we know them, to avoid matching prose.
native = parse_gemma_native_tool_calls(
text, set(self.tools.keys()) if self.tools else None)
for name, args in native:
results.append(self._to_oa(name, args))
if results:
return results
match = re.search(r'{\s*"name":\s*".*?"\s*,\s*"parameters":\s*\{.*?\}\s*\}', text, re.DOTALL)
if match:
try:
......@@ -2103,6 +2234,21 @@ class ModelParserAdapter:
if not text:
return text
# gemma-4 native: drop every `call:NAME{…}` span (balanced braces) and the
# `thought` channel residue left after skip_special_tokens strips the
# <|tool_call>/<|channel> markers.
if 'call:' in text:
while True:
m = re.search(r'call:\s*[A-Za-z_]\w*\s*\{', text)
if not m:
break
try:
_, end = _parse_gemma_loose_object(text, m.end() - 1)
except Exception:
end = m.end()
text = text[:m.start()] + text[end:]
text = re.sub(r'(?m)^\s*thought\s*$\n?', '', text)
# Custom XML format: <tool><action>...</action><object>...</object><properties>...</properties></tool>
text = re.sub(r'<tool>\s*<action>.*?</action>\s*<object>.*?</object>\s*<properties>.*?</properties>\s*</tool>', '', text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r'<tool=[^>]+>.*?</tool_call>', '', text, flags=re.DOTALL)
......
......@@ -67,6 +67,29 @@ def _watch_enabled() -> bool:
return True
def _cfg(name: str, default):
    """Look up a live-tunable knob on global_args.

    Args:
        name: Attribute name to read from the object returned by
            ``get_global_args()``.
        default: Value to use when the knob is unavailable.

    Returns:
        The attribute value when it exists and is not ``None``; otherwise
        ``default``. Any failure (including the import) also yields
        ``default``.
    """
    try:
        from codai.api.state import get_global_args
        args = get_global_args()
        if not args:
            return default
        value = getattr(args, name, None)
        return default if value is None else value
    except Exception:
        return default
def _poll_seconds() -> float:
    """Seconds to sleep between RAM-watch polls.

    Reads the live ``ram_watch_poll_seconds`` knob and falls back to the
    module default ``_POLL_SECONDS`` when the override is unusable. The
    result is passed to ``time.sleep()``, so an unparseable, zero, negative,
    NaN or infinite value would make the watcher raise or poll without
    pausing; those are rejected here.

    Returns:
        A positive, finite number of seconds when the override is usable;
        otherwise ``float(_POLL_SECONDS)``.
    """
    try:
        seconds = float(_cfg("ram_watch_poll_seconds", _POLL_SECONDS))
    except (TypeError, ValueError):
        return float(_POLL_SECONDS)
    # The chained comparison is False for NaN, zero, negatives and infinity.
    if not (0.0 < seconds < float("inf")):
        return float(_POLL_SECONDS)
    return seconds
def _soft_fraction() -> float:
    """Return the live ``ram_watch_soft_fraction`` knob as a float.

    Falls back to the module default ``_SOFT_FRACTION`` when the knob is
    unset (see ``_cfg``).
    """
    fraction = _cfg("ram_watch_soft_fraction", _SOFT_FRACTION)
    return float(fraction)
def _cuda_mitigation_enabled() -> bool:
    """Return the live ``ram_watch_cuda`` knob as a bool (``True`` when unset)."""
    enabled = _cfg("ram_watch_cuda", True)
    return bool(enabled)
def _scheduler_idle() -> bool:
"""True when no request is being served (so RSS growth isn't a live job)."""
try:
......@@ -76,7 +99,22 @@ def _scheduler_idle() -> bool:
return True
def _load_in_progress() -> bool:
    """Report whether a model is currently being loaded or switched.

    A load streams multi-GB of weights into host RAM over many seconds, so
    RSS climbs monotonically and the leak heuristic would otherwise mistake
    it for a leak. The scheduler shows no active request lease during a
    load, so the idle check alone doesn't catch this.

    Returns:
        ``True`` while the manager's ``_model_ready_event`` is not set;
        ``False`` when it is set or when the manager cannot be consulted.
    """
    try:
        from codai.models.manager import multi_model_manager
        ready = multi_model_manager._model_ready_event.is_set()
    except Exception:
        return False
    return not ready
def _process_ram_gb() -> float:
"""Whole-fleet RSS (front + all engines) — what the SHARED cap is enforced on."""
try:
from codai.models.manager import multi_model_manager
return multi_model_manager._get_process_ram_gb()
......@@ -88,13 +126,29 @@ def _process_ram_gb() -> float:
return 0.0
def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
def _own_ram_gb() -> float:
    """RSS of this engine's own process tree, as reported by the manager.

    Leak detection trends on this value so that growth is attributed to
    (and mitigated by) the engine that actually has it, not every engine
    that merely observes the shared total rising.

    Returns:
        ``multi_model_manager._get_own_ram_gb()``; if that lookup fails for
        any reason, the whole-fleet figure from ``_process_ram_gb()``.
    """
    try:
        from codai.models.manager import multi_model_manager as manager
        own_gb = manager._get_own_ram_gb()
    except Exception:
        own_gb = _process_ram_gb()
    return own_gb
def _mitigate(rss_gb: float, cap_gb: float, leak: bool, loading: bool = False) -> str:
"""Run the mitigation ladder; return a short description of what was done."""
import gc
actions = []
for _ in range(3):
gc.collect()
actions.append("gc")
# Skip CUDA empty_cache while a load is in flight (accelerate is actively
# allocating on the GPU from the main thread, and calling into the CUDA
# allocator from this background thread mid-load is needless interference),
# or when the operator has disabled CUDA mitigation via ram_watch_cuda.
if not loading and _cuda_mitigation_enabled():
try:
import torch
if torch.cuda.is_available():
......@@ -120,7 +174,7 @@ def _mitigate(rss_gb: float, cap_gb: float, leak: bool) -> str:
# Still over and eviction is enabled → unload idle LRU models.
try:
from codai.models.manager import multi_model_manager as _mm
if (_mm._get_process_ram_gb() > cap_gb * _SOFT_FRACTION
if (_mm._get_process_ram_gb() > cap_gb * _soft_fraction()
and _mm._evict_idle_on_ram_enabled()):
_mm._evict_models_for_ram(cap_gb * _EVICT_TARGET_FRACTION)
actions.append("evict_idle")
......@@ -134,18 +188,22 @@ def _loop():
global _recent
while True:
try:
time.sleep(_POLL_SECONDS)
time.sleep(_poll_seconds())
if not _watch_enabled():
continue
cap = _cap_gb()
rss = _process_ram_gb()
idle = _scheduler_idle()
# Leak heuristic: only trust growth measured while idle (a live job
# legitimately inflates RSS). Keep a short rolling window of idle samples.
rss = _process_ram_gb() # whole fleet — the shared cap is enforced on this
own = _own_ram_gb() # this engine only — leak is trended on this
loading = _load_in_progress()
idle = _scheduler_idle() and not loading
# Leak heuristic: only trust growth measured while THIS engine is idle (a
# live job — or a model load streaming weights into RAM — legitimately
# inflates RSS). Trend OWN RSS so a sibling engine's job/load can't look
# like a leak here. Keep a short rolling window of idle samples.
leak = False
if idle:
_recent.append(rss)
_recent.append(own)
_recent = _recent[-(_LEAK_SAMPLES + 1):]
if len(_recent) > _LEAK_SAMPLES:
rising = all(
......@@ -154,18 +212,20 @@ def _loop():
)
leak = rising
else:
_recent = [] # reset trend while a job runs
_recent = [] # reset trend while a job runs or a model loads
with _state_lock:
_state["rss_gb"] = round(rss, 2)
_state["own_rss_gb"] = round(own, 2)
_state["cap_gb"] = cap
_state["percent"] = round(100.0 * rss / cap, 1) if cap else None
_state["leak_suspected"] = leak
_state["samples"] += 1
# Engage the ladder when over the soft threshold or a leak is suspected.
if cap and (rss >= cap * _SOFT_FRACTION or leak):
desc = _mitigate(rss, cap, leak)
# Engage the ladder when the FLEET is over the soft threshold, or THIS
# engine is leaking (mitigation acts locally + evicts idle as needed).
if cap and (rss >= cap * _soft_fraction() or leak):
desc = _mitigate(rss, cap, leak, loading)
new_rss = _process_ram_gb()
_log.warning(
"RAM watch: RSS %.1f/%.1f GB (%.0f%%)%s — mitigation [%s] → %.1f GB",
......
......@@ -144,7 +144,20 @@ def _run(cmd, timeout=4.0) -> Optional[str]:
def _read_gpu_temp_uncached() -> Optional[float]:
"""Hottest GPU temperature in °C, or None if unreadable."""
"""Hottest GPU temperature in °C across ALL installed cards, or None.
Spans every vendor (NVIDIA via nvidia-smi, AMD via sysfs) and is scoped to the
cards THIS engine owns (``CODERAI_ENGINE_GPUS``) — so a hot GPU pauses only the
engine using it, while a hot CPU (read globally) pauses everything. In
single-process mode it covers all cards. Falls back to the per-vendor probes
below if the unified reader fails."""
try:
from codai.frontproxy.gpu_detect import engine_gpu_stats
temps = [c["temp"] for c in engine_gpu_stats() if c.get("temp") is not None]
if temps:
return max(temps)
except Exception:
pass
# NVIDIA — the inference GPU on CUDA backends.
if _NVIDIA_SMI:
out = _run([
......@@ -282,7 +295,14 @@ _gpu_util_cache: Tuple[float, Optional[float]] = (0.0, None)
def _read_gpu_util_uncached() -> Optional[float]:
"""Hottest GPU utilization in %, or None if unreadable."""
"""Busiest GPU utilization in % across ALL installed cards, or None."""
try:
from codai.frontproxy.gpu_detect import engine_gpu_stats
utils = [c["util"] for c in engine_gpu_stats() if c.get("util") is not None]
if utils:
return max(utils)
except Exception:
pass
if _NVIDIA_SMI:
out = _run([
_NVIDIA_SMI,
......@@ -441,6 +461,7 @@ class ThermalSettings:
__slots__ = (
"cpu_enabled", "gpu_enabled",
"cpu_high", "cpu_resume", "gpu_high", "gpu_resume",
"gpu_overrides",
"poll_seconds",
"soft_enabled", "soft_temp", "soft_max_sleep",
)
......@@ -449,18 +470,28 @@ class ThermalSettings:
cpu_high=90.0, cpu_resume=87.0,
gpu_high=90.0, gpu_resume=87.0,
poll_seconds=5.0,
soft_enabled=False, soft_temp=80.0, soft_max_sleep=3.0):
soft_enabled=False, soft_temp=80.0, soft_max_sleep=3.0,
gpu_overrides=None):
self.cpu_enabled = bool(cpu_enabled)
self.gpu_enabled = bool(gpu_enabled)
self.cpu_high = float(cpu_high)
self.cpu_resume = float(cpu_resume)
self.gpu_high = float(gpu_high)
self.gpu_resume = float(gpu_resume)
self.gpu_overrides = dict(gpu_overrides or {})
self.poll_seconds = max(1.0, float(poll_seconds))
self.soft_enabled = bool(soft_enabled)
self.soft_temp = float(soft_temp)
self.soft_max_sleep = max(0.0, float(soft_max_sleep))
def gpu_thresholds(self, vendor):
    """(high, resume) for a card of ``vendor``, honouring per-vendor overrides.

    ``self.gpu_overrides`` maps a vendor name to a dict with optional
    ``"high"`` / ``"resume"`` entries. Vendor matching is case-insensitive
    on both sides, so an override written as ``"NVIDIA"`` is no longer
    silently ignored. A missing, null or non-numeric entry falls back to
    the global ``gpu_high`` / ``gpu_resume`` instead of raising in the
    middle of a thermal check.

    Args:
        vendor: Vendor name of the card; ``None`` or a non-string selects
            the global thresholds.

    Returns:
        A ``(high, resume)`` tuple of floats.
    """
    overrides = self.gpu_overrides or {}
    wanted = vendor.lower() if isinstance(vendor, str) else ""
    ov = overrides.get(wanted)
    if not isinstance(ov, dict):
        # The exact lowercase key missed; retry ignoring the key's case.
        ov = next(
            (v for k, v in overrides.items()
             if isinstance(k, str) and k.lower() == wanted and isinstance(v, dict)),
            None,
        )
    if ov is None:
        return self.gpu_high, self.gpu_resume

    def _limit(key, fallback):
        # Use the override when it is a usable number, else the global value.
        try:
            return float(ov[key])
        except (KeyError, TypeError, ValueError):
            return fallback

    return _limit("high", self.gpu_high), _limit("resume", self.gpu_resume)
def _settings_from_global_args() -> ThermalSettings:
"""Build settings from the live global_args, falling back to defaults."""
......@@ -479,6 +510,7 @@ def _settings_from_global_args() -> ThermalSettings:
cpu_resume=g("thermal_cpu_resume", 87.0),
gpu_high=g("thermal_gpu_high", 90.0),
gpu_resume=g("thermal_gpu_resume", 87.0),
gpu_overrides=g("thermal_gpu_overrides", None),
poll_seconds=g("thermal_poll_seconds", 5.0),
soft_enabled=g("thermal_soft_throttle_enabled", False),
soft_temp=g("thermal_soft_throttle_temp", 80.0),
......@@ -524,6 +556,39 @@ def checkpoint(context: str = "", throttle_seconds: float = 0.0) -> None:
wait_until_safe(context=context)
def gpu_eval(settings: "ThermalSettings"):
    """Per-card GPU thermal check over the cards reported by ``engine_gpu_stats()``.

    Each card is judged against its own ``(high, resume)`` pair from
    ``settings.gpu_thresholds(vendor)``, so e.g. a Radeon limit can differ
    from an NVIDIA one.

    Args:
        settings: Object providing ``gpu_thresholds(vendor)``.

    Returns:
        ``(over_high, over_resume, worst)``:

        * ``over_high`` – some card is at or above its high threshold;
        * ``over_resume`` – some card is above its resume threshold;
        * ``worst`` – ``{name, temp, high, resume, vendor}`` for the card
          most over its own high threshold, or, when no card is over high,
          the card hottest relative to its own resume threshold. ``None``
          when no card temperature is readable (including when the GPU
          reader cannot be imported).

        The ``worst`` card therefore always matches whichever flag is set.
        With per-vendor thresholds, ranking only by the high margin could
        report a card that was not actually above its resume line.
    """
    try:
        from codai.frontproxy.gpu_detect import engine_gpu_stats
        cards = engine_gpu_stats()
    except Exception:
        cards = []
    over_high = over_resume = False
    worst_vs_high = None    # (margin over high, card info)
    worst_vs_resume = None  # (margin over resume, card info)
    for c in cards:
        t = c.get("temp")
        if t is None:
            continue
        high, resume = settings.gpu_thresholds(c.get("vendor"))
        if t >= high:
            over_high = True
        if t > resume:
            over_resume = True
        info = {"name": c.get("name"), "temp": t, "high": high,
                "resume": resume, "vendor": c.get("vendor")}
        # Strict '>' keeps the first card on ties, as before.
        if worst_vs_high is None or t - high > worst_vs_high[0]:
            worst_vs_high = (t - high, info)
        if worst_vs_resume is None or t - resume > worst_vs_resume[0]:
            worst_vs_resume = (t - resume, info)
    chosen = worst_vs_high if over_high else worst_vs_resume
    worst = chosen[1] if chosen is not None else None
    return over_high, over_resume, worst
def wait_until_safe(settings: Optional[ThermalSettings] = None,
debug: bool = False,
context: str = "") -> None:
......@@ -543,19 +608,23 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
desc0 = f" [{context}]" if context else ""
# Read current temps once (cached) and log the full picture in debug mode.
gpu_t = read_gpu_temp() if settings.gpu_enabled else None
# GPU is evaluated per-card (each card vs its own vendor threshold); gpu_t is
# the worst offender's temperature, used for messaging/soft-throttle/debug.
gpu_over, gpu_over_resume, gpu_worst = (
gpu_eval(settings) if settings.gpu_enabled else (False, False, None))
gpu_t = gpu_worst["temp"] if gpu_worst else None
cpu_t = read_cpu_temp() if settings.cpu_enabled else None
_dbg(
f"check{desc0}: "
f"GPU {_fmt(gpu_t)} (enabled={settings.gpu_enabled}, "
f"pause>={settings.gpu_high:.0f} resume<={settings.gpu_resume:.0f}) | "
f"over_high={gpu_over} over_resume={gpu_over_resume}) | "
f"CPU {_fmt(cpu_t)} (enabled={settings.cpu_enabled}, "
f"pause>={settings.cpu_high:.0f} resume<={settings.cpu_resume:.0f})"
)
hot = []
if settings.gpu_enabled and gpu_t is not None and gpu_t >= settings.gpu_high:
hot.append(("GPU", gpu_t, settings.gpu_resume))
if settings.gpu_enabled and gpu_over:
hot.append(("GPU", gpu_worst["temp"], gpu_worst["resume"]))
if settings.cpu_enabled and cpu_t is not None and cpu_t >= settings.cpu_high:
hot.append(("CPU", cpu_t, settings.cpu_resume))
......@@ -567,7 +636,7 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
# the resume line and a cooldown is already in progress.
joined = False
if not hot and _cooldown_active():
if (settings.gpu_enabled and gpu_t is not None and gpu_t > settings.gpu_resume) or \
if (settings.gpu_enabled and gpu_over_resume) or \
(settings.cpu_enabled and cpu_t is not None and cpu_t > settings.cpu_resume):
joined = True
if not hot and not joined:
......@@ -588,11 +657,12 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
# Enter cooldown: wait until *every* triggered sensor is at/below resume.
desc = f" ({context})" if context else ""
if hot:
trig = ", ".join(f"{lbl} {t:.0f}°C>={settings.gpu_high if lbl=='GPU' else settings.cpu_high:.0f}°C"
for lbl, t, _ in hot)
print(f"[thermal] Hardware too hot{desc}: {trig} — pausing requests "
f"until cooldown (GPU<={settings.gpu_resume:.0f}°C / "
f"CPU<={settings.cpu_resume:.0f}°C)")
# Each triggered sensor carries its own resume threshold (per-card for GPU).
trig = ", ".join(f"{lbl} {t:.0f}°C (resume<={r:.0f}°C)" for lbl, t, r in hot)
gpu_note = (f" [{gpu_worst['name']}]" if gpu_worst and any(h[0] == 'GPU' for h in hot)
else "")
print(f"[thermal] Hardware too hot{desc}: {trig}{gpu_note} — pausing requests "
f"until cooldown")
else:
# Joined an already-active cooldown started by another parallel worker.
print(f"[thermal] Joining active cooldown{desc} — another generation is "
......@@ -605,13 +675,15 @@ def wait_until_safe(settings: Optional[ThermalSettings] = None,
# Re-evaluate against resume thresholds (lower than trigger → hysteresis).
# CPU temps are noisy, so average a few samples for the resume decision
# (the pause check above stays single-read to react fast to spikes).
gt = read_gpu_temp() if settings.gpu_enabled else None
_, gpu_still, gpu_w2 = (gpu_eval(settings) if settings.gpu_enabled
else (False, False, None))
ct = read_cpu_temp_avg() if settings.cpu_enabled else None
still = []
if gt is not None and gt > settings.gpu_resume:
still.append(("GPU", gt, settings.gpu_resume))
if settings.gpu_enabled and gpu_still:
still.append(("GPU", gpu_w2["temp"], gpu_w2["resume"]))
if ct is not None and ct > settings.cpu_resume:
still.append(("CPU", ct, settings.cpu_resume))
gt = gpu_w2["temp"] if gpu_w2 else None
_dbg(f"cooldown{desc} {int(waited)}s: GPU {_fmt(gt)} CPU {_fmt(ct)} (avg-3) "
f"(still hot: {[s[0] for s in still] or 'none'})")
if not still:
......
python tools/video_editor.py --no-browser --host 0.0.0.0 --media-dir tools/coderai_media --session
tools/gen_township_fighters.py -c township_output/township_config.json
# DeepSeek V4 via ds4
CoderAI can serve **DeepSeek V4** (Flash / PRO) through antirez's
[ds4 / DwarfStar](https://github.com/antirez/ds4) — a native (C/CUDA/Metal)
inference engine built specifically for DeepSeek V4 that ships its own
OpenAI-compatible HTTP server (`ds4-server`).
Because ds4 is a standalone binary (not a Python package), coderai owns its whole
lifecycle as an *external worker* — the same pattern used for Parler-TTS
(`codai/api/parler_worker.py`). When enabled, coderai builds ds4, downloads the
model weights, launches `ds4-server` as a managed subprocess, and proxies text
requests to it. Everything else in coderai (tool parsing, streaming, the chat UI)
keeps working unchanged.
> **Hardware:** DeepSeek V4 is large. Per upstream you want **96 GB+ RAM**
> (256 GB+ for the Q4 variant, 512 GB for PRO). First use also clones the repo,
> compiles a native binary, and downloads several GB of weights — it is slow.
## Enabling
Admin → **Settings → DeepSeek V4 (ds4)**:
- **Enable ds4** — turn the integration on.
- **Model id / alias** (default `deepseek-v4`) — any chat request whose model name
equals this id, or contains `deepseek-v4` (case-insensitive), is routed to ds4
instead of the normal NVIDIA/Vulkan backends. All other models are unaffected.
- **Weight variant** — passed to ds4's `download_model.sh`
(`q2-imatrix`, `q2-q4-imatrix`, `q4-imatrix`, `pro-q2-imatrix`).
- **Build target** — `auto` detects CUDA (`cuda-generic`) / macOS (`metal`) /
`cpu`; override for DGX Spark (`cuda-spark`).
- **Install dir** — where ds4 is cloned/built (default `~/.coderai/ds4`, or
`$CODERAI_DS4_DIR`).
- **Auto build** — clone + `make` the `ds4-server` binary if it's missing.
- **Bind host / Port / Context** — `ds4-server --host/--port/--ctx`
(port `0` auto-picks a free port).
- **Extra args** — passed verbatim to `ds4-server`, e.g.
`--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192`.
Then send a normal request:
```sh
curl localhost:8776/v1/chat/completions -H 'Content-Type: application/json' -d '{
"model": "deepseek-v4",
"messages": [{"role":"user","content":"Hello"}]
}'
```
The first such request triggers build → download → serve (with generous timeouts);
build and download logs are streamed with a `[ds4]` prefix. The subprocess is torn
down by the model manager's normal eviction and on server shutdown.
## Building ahead of time / packaging
Runtime auto-build works, but for reproducible installs (and Docker) you can build
ds4 during setup:
```sh
./build.sh all --ds4 # clones + builds ds4-server into ~/.coderai/ds4
```
The OCI image builder (`packaging/linux/build_oci_image.sh`) auto-discovers and
bundles the prebuilt `ds4-server` binary (and its shared libraries) the same way it
bundles `whisper-server`. Model **weights are not bundled** — they are downloaded
on first use inside the container. If only the binary is shipped (no repo scripts),
coderai shallow-clones the repo at first use to obtain `download_model.sh`.
## Implementation
- `codai/config.py` — `Ds4Config`.
- `codai/api/ds4_worker.py` — clone/build, weight download, `ds4-server` lifecycle.
- `codai/backends/ds4.py` — `Ds4Backend`, an OpenAI-API proxy implementing the
`ModelBackend` interface.
- `codai/models/manager.py` — `ds4_should_handle()` routes matching models to
`Ds4Backend`; `is_allowed_model()` accepts the ds4 model id.
# Expressive TTS (emotion / delivery)
The video editor shows **Emotion** and **Delivery** dropdowns whenever the
configured TTS model advertises them (`codai/api/tts_backends.py`:
`family_emotions` / `family_styles`). Two engines support expressive control.
## Bark — in-stack, no extra deps
Works with the server's current `transformers`. Configure a Bark model as the
TTS model, e.g. `--tts-model suno/bark` (or `suno/bark-small`).
- **Delivery**: `normal`, `whispering` (`[whispers] …`), `singing` (`♪ … ♪`),
`emphasis` (UPPERCASE).
- **Emotion**: inserts a matching non-verbal cue — `laughter` → `[laughs]`,
`sigh` → `[sighs]`, `gasp` → `[gasps]`.
- **Voice**: a Bark preset like `v2/en_speaker_6`. The editor's Kokoro voice ids
don't apply and fall back to the default preset (set `voice_preset` in the
model config to change it). Speed isn't controllable in Bark.
## Parler — fully managed by coderai (no setup)
`parler-tts` pins an old `transformers`/`tokenizers`/`huggingface-hub` that
**conflict with this server** — never `pip install` it into the coderai venv.
coderai handles this for you: just use a Parler model as the TTS model
(e.g. `parler-tts/parler-tts-mini-multilingual`). The worker is launched lazily —
only when a request for that model actually arrives — and shut down when the
model is evicted, exactly like loading/unloading any other model. On first use it
1. creates a dedicated venv at `~/.coderai/parler_venv`
(override with `CODERAI_PARLER_VENV`), built `--system-site-packages` so the
base torch/numpy are reused and only the conflicting packages land in it;
2. `pip install`s parler-tts there;
3. launches `tools/parler_tts_service.py` in that venv on a local port, pointing
`HF_HUB_CACHE` at coderai's own cache and forcing **offline mode**
(`HF_HUB_OFFLINE=1`) so it loads strictly the model you **already downloaded
via the model interface** — the worker never downloads anything itself;
4. health-checks it and routes synthesis to it.
The worker is owned by `codai/api/parler_worker.py`; the backend's `cleanup()`
calls `stop_service()`, so the model manager's normal eviction tears the process
down. The first request blocks while the venv builds, then it's cached.
If the model isn't in coderai's cache, the worker fails fast with a clear error
("download '<model>' from the model interface first") instead of fetching it.
Download the Parler model through the normal HF download UI first.
The editor's **Emotion**/**Delivery** dropdowns drive it: coderai POSTs
`{text, voice, speed, emotion, style}` to the worker, which maps them into a
natural-language delivery description (whisper / shout / monotone / expressive +
emotion + pace). A fixed `description` in the model config overrides the
auto-built one. An explicit `service_url` in the config bypasses management and
talks to an externally-run service instead.
> The model must still be in the server's allowed-models registry to be
> selectable — that's the only configuration; the worker itself needs none.
# Frontend/engine split (responsive UI + multi-engine)
CoderAI boots as two layers so heavy model work never freezes the web interface:
- **front** — a thin reverse proxy on the public host/port. It imports no
torch/transformers/diffusers, so its event loop is always free. It streams
requests/responses (including SSE) to the engines and serves an aggregated,
cached status/tasks view.
- **engine(s)** — the real CoderAI app (the current server), bound to internal
localhost ports, doing all GPU/model work. One engine per GPU by default; each is
pinned with `CUDA_VISIBLE_DEVICES` so inside it the GPU is always `cuda:0` and the
existing per-process VRAM/eviction logic is unchanged.
```
client ─HTTP/SSE─▶ front (public) ─┬─ engine#0 (CUDA_VISIBLE_DEVICES=0, :8780)
• no torch ├─ engine#1 (CUDA_VISIBLE_DEVICES=1, :8781)
• always live └─ …
```
See `docs/process-isolation-plans.md` for the design rationale (this is Plan B +
multi-engine).
## Modes
| Launch | Result |
|---|---|
| `coderai` (default) | Front on the public port; auto-spawns one engine per GPU |
| `coderai --single-process` | Legacy: one process, full app on the public port |
| `coderai --engine-only --internal-port N` | One engine on `127.0.0.1:N` (the front launches these for you) |
`--engine-only` is not meant to be run by hand; the front's supervisor manages it.
## Config (`config.json` → `server`)
| Key | Default | Meaning |
|---|---|---|
| `single_process` | `false` | Force legacy one-process mode |
| `internal_port_base` | `8780` | First engine's internal port (+1 per extra engine) |
| `engines` | `0` | Number of engines; `0` = auto (one per GPU, min 1) |
| `engine_gpus` | `null` | Explicit GPU indices, e.g. `[0, 1]`; `null` = auto-detect (NVIDIA) |
| `engine_specs` | `null` | Explicit heterogeneous engines (see below). Overrides `engines`/`engine_gpus` |
| `proxy_status_timeout` | `2.0` | Short timeout (s) for status/UI proxying |
| `proxy_max_inflight` | `64` | Max concurrent proxied requests through the front |
### Heterogeneous engines (e.g. NVIDIA + Radeon)
Auto-detection only finds NVIDIA cards and assumes one backend, and CUDA vs Vulkan
device **enumeration is inconsistent** — so for a mixed setup, declare each engine
with its own backend and env block via `engine_specs`. Each engine is its own
process: the front applies the env at spawn, forces the backend
(`CODERAI_ENGINE_BACKEND`), and routes models only to capability-compatible engines.
- **Capabilities** (default from backend): `nvidia` → `["transformers","gguf"]`
(CUDA for transformers, GGUF via llama.cpp — which itself may use CUDA or Vulkan);
`vulkan` → `["gguf"]`. Override per engine with `"capabilities": [...]`.
- **Routing:** a transformers/safetensors model goes only to a `transformers`-capable
(NVIDIA) engine; a GGUF goes to whichever compatible engine already holds it, else
the least-loaded GGUF-capable engine (NVIDIA *or* Radeon).
Example `config.json` → `server.engine_specs` for an NVIDIA (`cuda:0`) + Radeon
(Vulkan device 1) box, where the NVIDIA engine also serves GGUF via the NVIDIA
Vulkan ICD:
```json
"engine_specs": [
{
"name": "nvidia",
"backend": "nvidia",
"env": {
"CUDA_VISIBLE_DEVICES": "0",
"RADEON_VISIBLE_DEVICES": "",
"VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json",
"GGML_VK_VISIBLE_DEVICES": "0"
}
},
{
"name": "radeon",
"backend": "vulkan",
"env": {
"CUDA_VISIBLE_DEVICES": "",
"GGML_VK_VISIBLE_DEVICES": "1"
}
}
]
```
The first spec is the **primary** engine (owns admin/auth/config). Empty-string env
values are honoured (`CUDA_VISIBLE_DEVICES=""` hides all CUDA cards from the Radeon
engine). `internal_port_base` assigns ports in order (8780, 8781, …).
#### An engine can own several GPUs
"One engine per GPU" is only the auto-detect default. An engine owns whatever its
`env` exposes, so to run a single large model **across two NVIDIA cards**, give one
engine both — list both CUDA UUIDs — and the NVIDIA backend shards the model over
them automatically (`device_map`/accelerate `max_memory` across every visible CUDA
device; tune per-model with `max_gpu_percent` / `balanced_gpu_percent` / `max_vram`).
Example: 2× NVIDIA (one sharding engine) + 1× Radeon:
```json
"engine_specs": [
{
"name": "nvidia-dual",
"backend": "nvidia",
"env": {
"CUDA_VISIBLE_DEVICES": "GPU-<uuidA>,GPU-<uuidB>",
"CUDA_DEVICE_ORDER": "PCI_BUS_ID",
"VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/nvidia_icd.json",
"GGML_VK_VISIBLE_DEVICES": "0"
}
},
{
"name": "radeon",
"backend": "vulkan",
"env": {
"CUDA_VISIBLE_DEVICES": "",
"VK_ICD_FILENAMES": "/usr/share/vulkan/icd.d/radeon_icd.json",
"GGML_VK_VISIBLE_DEVICES": "0"
}
}
]
```
Use **GPU UUIDs** (from `nvidia-smi --query-gpu=uuid --format=csv`) rather than
indices so the assignment survives reboots/reordering. The front reports such an
engine's VRAM as the **sum across its GPUs** (with a per-device breakdown in
`/internal/engine-state` and `x_engines`).
## Choosing which card runs a model
When a model is compatible with more than one engine (e.g. a GGUF that runs on both
the NVIDIA and Radeon engines), the card is chosen by this precedence:
1. **Per-model pin** — set `engine` on the model (Models page → *Engine / card*, or
the `"engine"` field in `models.json`) to a declared engine name. Honoured only
if that engine can serve the model's format.
2. **Already resident** — the engine that already has the model loaded (avoids a
reload).
3. **Default engine** — `server.default_engine` (Settings → *Default engine*), used
when the model is compatible with several engines.
4. **Least-loaded** compatible engine.
`default_engine` and the per-model *Engine / card* control only appear in the UI
when 2+ engines are declared.
**Bad pins are reported, not silently ignored.** Saving a per-model engine (or the
default engine) that is unknown, or that can't run the model's format (e.g. a
transformers model pinned to a Vulkan/Radeon engine), returns a warning in the admin
UI. At request time the front also logs a one-line warning (deduped per
model+engine) before falling back to a compatible engine.
## Routing
- **Inference** (`POST /v1/...` carrying a `model`) → chosen per the precedence
above, restricted to capability-compatible engines. This is what lets one model
load on engine A while engine B keeps generating.
- **Admin / auth / config / UI / status / tasks** → the **primary** engine
(engine#0). Sessions and `models.json` writes are per-process today, so pinning
these keeps sessions consistent without a shared store.
- **Status / tasks pollers** use a short timeout with a cached/empty fallback, so a
momentarily-blocked engine loop can never hang the dashboard. The front overlays
cross-engine VRAM totals (`vram`) and running tasks (tagged with their `engine`).
## Thermal protection
Thermal cooldowns are scoped to match how work is distributed:
- **CPU too hot → everything pauses.** CPU temperature is read globally, and every
engine gates on it, so all tasks back off until the CPU cools.
- **A GPU too hot → only that GPU's engine pauses.** Each engine reads only the
cards it owns (the front sets `CODERAI_ENGINE_GPUS` — NVIDIA UUIDs and/or a vendor
keyword), so a hot NVIDIA card pauses the NVIDIA engine while the Radeon engine
keeps generating, and vice-versa. Each engine is its own process with its own
cooldown state, so they're naturally independent.
Granularity is per-engine: if one engine owns several GPUs, a single hot card pauses
that engine's work on all of its cards (they share one process). In single-process
mode the GPU check covers all cards.
**Per-card thresholds.** Each card is judged against its own vendor's limit:
`thermal.gpu_high`/`gpu_resume` are the defaults, and `thermal.gpu_overrides`
(`{"amd": {"high": 95, "resume": 92}}`) raises/lowers them per vendor — so a Radeon
can run hotter than an NVIDIA card. Settings → Thermal renders one override row per
GPU vendor **detected on the machine** (never a hardcoded list).
**Which engine is cooling** is shown on the Tasks page banner (each engine reports
its cooldown via `/internal/engine-state`; the front names the cooling engine and
whether it's a GPU or CPU pause).
## Concurrency (per-engine)
Each engine is its own process with its own request queue, so concurrency limits
apply **per-engine** and total throughput is the sum across engines:
- **Max parallel requests** (`server.max_parallel_requests`) — how many requests an
engine runs at once.
- **Max instances per model** (`models.max_model_instances`) — concurrent copies of
one model (needed to run several requests against the *same* model at once).
Both take **per-engine overrides** (`*_overrides`, keyed by engine name, e.g.
`{"nvidia": 4, "radeon": 1}`) so a bigger card runs more in parallel than a smaller
one. Settings → Concurrency shows the defaults plus one override row per running
engine. The front resolves each engine's value and passes it down at spawn
(`CODERAI_MAX_PARALLEL` / `CODERAI_MAX_MODEL_INSTANCES`).
## Managing engines
The Tasks page shows an **Engines** panel (front mode only) with each engine's
health, VRAM and loaded-model count, and a **Restart** button — use it to kill an
engine that's wedged/looping; the supervisor respawns it immediately while the front
and other engines keep serving. Backed by `GET /admin/api/engines` and
`POST /admin/api/engines/{id}/restart` on the front (authorized against the primary
engine's session).
## Shared host-RAM cap
`offload.max_ram_gb` is a single **server-wide** ceiling shared by all engines, not
split into per-engine slices. The front sets `CODERAI_FRONT_PID` on each engine, so
every engine measures the same fleet-wide RSS (front + all engines + their workers)
and enforces the one cap against that total. When the combined usage crosses the
cap, each engine runs its normal mitigation/eviction (dropping its idle LRU models),
so whichever engine holds idle models frees them for the shared budget; busy models
aren't evicted. An idle engine uses ~0 of the budget; a busy one can use most of it.
VRAM is naturally per-card (each engine sees only its own GPUs via
`CUDA_VISIBLE_DEVICES`), and model eviction on swap is unchanged *within* an engine.
## Broker (runs in the front)
The AISBF broker client runs **in the front**, not in a model engine — it's
coordination/protocol work, so binding it to a GPU process would stall it whenever
that engine loads a model. Benefits:
- Never stalls during a model load (the front's loop is always free).
- One registration for the whole node, regardless of engine count.
- Advertises **aggregate** hardware: `build_hardware_summary` is torch-free in the
front (via `gpu_stats()`), so it reports the total VRAM across *every* card.
- Brokered requests dispatch through the **same router/proxy** as HTTP — a brokered
GGUF request can land on the Radeon engine, a transformers one on NVIDIA.
Engines run no broker client (`main.py` disables it under `--engine-only`); only
single-process mode keeps the broker in-process. Implementation:
`FrontProxy.start_broker` / `broker_execute` (`codai/frontproxy/app.py`) +
`execute_broker_request(..., executor=...)` (`codai/broker/dispatcher.py`).
## Model assignment (one owner per model)
With multiple engines, the front assigns each configured model to exactly **one**
owner engine and routes accordingly, so a model is never served from two engines:
- **Owner precedence:** per-model `engine` pin → default engine → balanced
round-robin across capability-compatible engines.
- Routing honours the assignment first (`registry.engine_for_assigned`); unassigned
/ ad-hoc models fall back to capability routing.
- `/v1/models` (and the broker's model list) is the **union** across engines, deduped
— the full catalogue with no duplicates.
- Engines aren't pruned, so the admin Models page (served from the primary) still
shows the complete configuration.
- **Two configs of one model** can run on different engines if they have distinct
aliases (the assignment keys on the routable id: alias → path); configs sharing a
path with no distinct alias collapse to one owner.
## Security: engines are localhost-only + token-gated
Engines bind **127.0.0.1 only** (forced regardless of the configured host, which is
the front's public bind), and the front reaches them via `http://127.0.0.1:<port>`.
On top of that, the front generates a per-run secret, passes it to each engine via
`CODERAI_INTERNAL_TOKEN`, and stamps every engine request with an
`X-Coderai-Internal` header; an engine rejects (403) any request lacking it (and the
front strips client-supplied copies so the token can't be spoofed). So nothing else
on localhost can talk to an engine and bypass the front's auth/routing. Single-process
mode sets no token and is unaffected.
## Fault isolation
The supervisor polls each engine's auth-free, localhost-only
`/internal/engine-state`. If an engine exits (including a CUDA device-side assert),
it is **respawned**; the front and sibling engines keep serving. The front's own
`/healthz` reports per-engine readiness.
## Known limitations (follow-ups)
- Admin/config/session state is pinned to the primary engine (not yet replicated —
that's "Plan C" in the design doc). Cross-engine **task visibility** works
(merged read-only); cross-engine **session sharing** does not — all admin traffic
intentionally lands on the primary.
- Placement is first-fit (model→least-loaded compatible engine); there is no live
cross-engine rebalancing/migration yet.
- Capability routing keys off the model **name** (a `.gguf`/`gguf` name → GGUF, else
transformers), matching the engine's own `is_gguf` heuristic. A transformers model
whose name happens to contain "gguf" would be mis-routed — rename or declare an
alias if that ever bites.
# Process-isolation plans: keeping the web UI responsive during model load/inference
> **Status (implemented):** Plan B + Multi-engine shipped. The front proxy lives in
> `codai/frontproxy/` (`app.py`, `engine_supervisor.py`, `registry.py`, `router.py`),
> engines run via `coderai --engine-only --internal-port N`, and default boot starts
> the front + one engine per GPU. Operator guide: `docs/frontend-engine-split.md`.
> Plan C (replicating session/config/queue ownership into the front) remains a
> follow-up — **except the broker, which has already moved into the front** (it
> registers once for the whole node, advertises aggregate VRAM torch-free, and
> dispatches brokered requests to engines through the router). Sessions/config/queue
> are the remaining Plan-C pieces.
## Problem statement
While a model loads (and, for some backends, while it generates), the web
interface and API become unresponsive.
Root cause: the server is a single process. GIL-heavy Python work blocks the
asyncio event loop that serves the UI/API. Specifically:
- **Transformers text** (`codai/backends/cuda.py`, `NvidiaBackend`) — both the
`from_pretrained` **load** and token-by-token `model.generate` hold the GIL.
Dispatching them via `asyncio.to_thread` does **not** free the loop, because
`to_thread` only helps when the worker releases the GIL.
- **Diffusers** (image/video/audio, `codai/api/images.py`, `video.py`,
`audio_gen.py`) — the `from_pretrained` **load** is GIL-heavy and freezes the
UI. The denoise loop itself is mostly torch CUDA ops that *do* release the
GIL, so the freeze is almost entirely the load.
- **Vulkan / GGUF** (`codai/backends/vulkan.py`, llama.cpp) — the native load
**releases the GIL**, so this path does *not* freeze the UI. (This is why the
existing defensive comments assume "the load releases the GIL during its C
call" — true for llama.cpp, false for the transformers/diffusers paths.)
The fix is to ensure the process serving the UI/API is not the process whose GIL
is held by model work. Three architectures achieve this with very different
cost/benefit. This document captures all three so we can choose deliberately.
> Note: an unrelated, already-shipped fix lives in `cuda.py` — Gemma-class models
> whose attention head dimension exceeds FlashAttention-2's limit of 256 now fall
> back to SDPA (`_model_head_dim`), which fixed "requests silently stop" for those
> models. That is orthogonal to the process-isolation work below.
---
## Summary comparison
| | A: model worker (out-of-process models) | B: thin resilient proxy | C: full frontend/engine split |
|---|---|---|---|
| Process boundary | Python pipeline-call layer | HTTP layer | HTTP layer + state ownership |
| Serialization burden | **High** (torch generators, callbacks, tensors, PIL) | **Low** (already HTTP) | **Low** (already HTTP) |
| Engine/model code changes | Large (text clean, diffusers invasive) | **None** (engine ≈ current app) | Moderate (engine becomes pure executor) |
| Fixes which model types | One modality at a time | **All at once** | **All at once** |
| New moving parts | Worker harness + per-modality IPC | Reverse proxy + status cache + supervisor | Proxy + relocated coordination state + supervisor |
| Crash/CUDA-poison isolation | Per-model worker | **Engine restart, front survives** | **Engine restart, front survives** |
| Effort | Text: medium. Diffusers: very large. | **Small–medium** | Large |
| Recommended role | Fallback / not preferred | **First cut (do this)** | Eventual evolution of B |
**Recommendation:** ship **B**, evolve toward **C** if/when coordination state
needs to be authoritative in the front; keep **A** only as a documented
alternative (it is the worst fit for diffusers).
---
## Shared context (applies to B and C)
- Public surface is **plain HTTP + SSE**. No inbound websockets, no mounted
sub-apps (verified). This makes a reverse-proxy split clean.
- `codai/broker/asgi_bridge.py` already drives the ASGI app from an external
transport, so the app is already transport-decoupled in spirit.
- The front process must import **no** `torch` / `transformers` / `diffusers`,
so its GIL is never held by model code and its event loop is always free.
- VRAM/GPU stats can be read by the front **without torch** via `nvidia-smi`
and sysfs/`lspci` (the existing `api_status` already reads sysfs/`lspci` for
the non-CUDA path).
---
## Plan A — Out-of-process model worker (models leave the API process)
The original approach: keep the API/UI in the main process, push the GIL-heavy
model into a child process behind a proxy backend.
### A.1 Generic worker harness
- `codai/backends/worker_client.py` — parent-side proxy implementing the
`ModelBackend` interface; spawns the child, waits on `/health`, forwards calls.
- `codai/backends/text_worker.py` — child entrypoint
(`python -m codai.backends.text_worker --port 0`) running a tiny local uvicorn
that instantiates the **real** `NvidiaBackend` and exposes `/load`,
`/generate`, `/generate_chat`, `/generate_stream` (SSE), `/generate_chat_stream`
(SSE), `/context_size`, `/usage`, `/tokenize`, `/health`, `/shutdown`.
- Wire into `ModelManager.load_model` (`codai/models/manager.py:158`): when
`backend_type == "nvidia"`, instantiate `WorkerTextBackend()` instead of
`NvidiaBackend()`, behind a default-on config flag. Instance pools, eviction,
VRAM delta accounting (`torch.cuda.mem_get_info` in the parent still sees the
child's allocations) are untouched — each instance owns a subprocess.
### A.2 Text worker (clean)
- I/O is tiny (text / SSE tokens). Streaming maps directly to SSE.
- `cleanup()` terminates the subprocess → frees VRAM.
- Bonus: a device-side CUDA assert kills only the child; parent maps the error to
the existing `cuda_context_poisoned` logic and respawns.
### A.3 Diffusers worker (very large — the blocker)
Diffusers cannot be a thin wrapper. Evidence in `codai/api/images.py`/`video.py`:
- Pipelines are stored **as live objects** in the shared registry
(`multi_model_manager.models[model_key] = pipe`) and called inline at ~dozens
of sites (txt2img, img2img, inpaint, upscale, depth, segmentation, video
modes, audio_gen).
- Pipelines are **mutated in-process**: `apply_accel_to_pipeline(pipeline, accel)`
(`images.py:345`), LoRA application, IP-Adapter wiring, scheduler swaps.
- `pipe(...)` call args are **not serializable**: `generator` (a
`torch.Generator` bound to a device), `callback_on_step_end=_step_cb` (a live
closure updating the in-process `_gen_progress`), `embed_kwargs` (prompt
embedding tensors), IP-Adapter/character/environment **PIL reference images**
(`images.py:877-899`).
Consequence: putting diffusers in a worker means **moving the entire generation
lifecycle into the worker** (load + accel + LoRA + IP-adapter + the call + output
extraction) and converting every call site to a **high-level** IPC request
(prompt, seed, steps, image bytes), serializing every input (PIL/tensors/masks/
control images), every output (images/frames/audio), and **relaying step
progress** back over IPC. Large, regression-prone rewrite of the media API.
### A.4 Assessment
- Text: medium effort, clean win.
- Diffusers: very large, fragile; payoff limited (denoise releases the GIL).
- **Not recommended** as the diffusers solution. Superseded by B.
---
## Plan B — Thin resilient reverse proxy (RECOMMENDED FIRST CUT)
Split at the HTTP boundary. The **engine** is the current app, essentially
unchanged, on an internal port. The **front** is a small async reverse proxy on
the public port whose event loop never freezes (no torch in its address space).
### B.1 Architecture
```
client ──HTTP/SSE──▶ front (public port) ──HTTP/SSE──▶ engine (internal port, all models)
                     • no torch                        • current app, unchanged
                     • always-responsive               • may freeze on GIL-heavy load
                     • status cache + timeouts         • does all GPU work
                     • supervises engine subprocess
```
### B.2 The one rule that makes it work
The front must answer **UI / status / admin** without synchronously
hard-depending on a possibly-frozen engine:
- UI / status / admin → short timeout on the engine call; on timeout serve a
**last-known status cache** plus an "engine busy loading model X" flag.
- Generation (chat / image / video / SSE) → proxied with **long timeout**. That
single request legitimately waits for the load; the rest of the UI stays live.
### B.3 New files
- `codai/frontproxy/__init__.py`
- `codai/frontproxy/app.py` — FastAPI app for the front:
- Catch-all reverse-proxy route: streams request body (chunked uploads),
forwards method/path/query/headers (incl. auth, rewriting `Host`), streams
the response back (SSE and large binary), preserves status codes.
- Status handler: proxies `/admin/api/status` with a short timeout; caches the
last success; on timeout/refusal returns cache + `{ "engine": "loading"|"down" }`.
- `/healthz` for the front itself.
- `codai/frontproxy/engine_supervisor.py` — spawn the engine subprocess
(`python -m codai.main --internal-port …`), poll `/healthz` on the engine,
restart on crash/exit (this is where CUDA-poison recovery becomes "respawn").
- HTTP client: `httpx.AsyncClient` with streaming, or `aiohttp`. Separate short-
and long-timeout clients.
### B.4 Engine-side changes (minimal)
- `codai/main.py` / `codai/cli.py`: add `--internal-port` / `--engine-only` so
the engine binds to localhost and the front owns the public port. Default boot
launches front + engine; a flag preserves the legacy single-process mode.
- Add a cheap `/healthz` on the engine (no torch, returns immediately) so the
supervisor can distinguish "loading" (slow) from "dead".
### B.5 Proxy correctness checklist (the real work)
- **SSE / streaming**: forward `text/event-stream` without buffering; flush per
chunk; propagate client disconnect to cancel the upstream request.
- **Large uploads**: stream `model-upload` / image inputs (don't buffer whole
body in memory).
- **Large downloads**: stream image/video/audio byte responses.
- **Auth / headers**: pass `Authorization`, cookies; rewrite `Host`; preserve
`Content-Type`, `Content-Length`/chunked, `Content-Disposition`.
- **Timeouts**: short for status/UI; long (or none) for generation; map engine
timeout to cached status, never to a hung front request.
- **Backpressure / limits**: bound concurrent in-flight proxied requests.
- **Redirects / error passthrough**: preserve 3xx/4xx/5xx and bodies.
### B.6 Limitations
- Does not speed up the one in-flight request waiting on a load; keeps the rest
of the UI responsive.
- True concurrency across models needs multiple engines (see "Multi-engine").
### B.7 Effort: small–medium. Engine code essentially untouched; risk concentrated
in the proxy, which is testable in isolation.
---
## Plan C — Full frontend/engine split (eventual evolution of B)
Make the front authoritative for all pure-Python coordination state so it never
needs the engine even for status; the engine becomes a pure executor.
### C.1 What moves to the front
Relocate non-GPU, pure-Python concerns out of the engine into the front (each is
serialization-trap-free):
- **Sessions / auth / API tokens** (`codai/admin` session manager).
- **Config / models.json management** (the admin "models" CRUD, `config_manager`).
- **Request queue + metrics** (`codai/queue/manager.py`).
- **Progress + model-registry view**: the engine pushes events (loaded/unloaded,
`_gen_progress` step updates, VRAM deltas) to the front over a control channel;
the front holds the authoritative cache and serves status with zero engine
dependency.
### C.2 Engine becomes pure executor
- Exposes only: load/unload, generate (all modalities), health, event stream.
- No session/config/queue logic; receives resolved requests from the front.
### C.3 Control channel
- A persistent engine→front event stream (SSE or a small socket) for progress,
load state, VRAM, and crash notifications. Front reconciles its cache; on
engine restart, front re-syncs.
### C.4 Benefits
- Status/admin are instant and always correct, even mid-load.
- Clean seam for **multi-engine** orchestration.
- Strong fault isolation: engine crash never loses UI/session/queue state.
### C.5 Effort: large, but every moved piece is plain Python (no pipeline
serialization). Best approached incrementally on top of a shipped B.
---
## Multi-engine (future, enabled by B/C)
One engine per GPU (or per hot model). The front routes a request to the engine
that holds the target model (or asks an idle engine to load it). One engine
loading no longer blocks generation on another engine. Requires:
- Engine registry in the front (which engine holds which model, health, VRAM).
- A placement/eviction policy across engines (extends the current per-process
VRAM logic to a fleet view).
---
## Decision log / open questions
- Confirm: default boot launches **front + engine** with a flag to retain the
legacy single-process mode? (Recommended yes.)
- Confirm: HTTP client — `httpx` (already a likely dependency) vs `aiohttp`.
- Confirm: status staleness budget when the engine is mid-load (e.g. serve cache
up to N seconds old, then show "engine loading").
- B → C migration order: sessions/tokens first (low risk), then config, then
queue, then progress/registry (needs the control channel).
## Recommended sequencing
1. **B** — front proxy + engine supervisor + status cache. Fixes the freeze for
all model types with no engine changes beyond `--internal-port`/`/healthz`.
2. (Optional, separate) **A.1/A.2** text worker — only if we want per-model fault
isolation *within* an engine; otherwise B already solves the UI freeze.
3. **C** — incrementally move coordination state to the front.
4. **Multi-engine** — once C's registry exists.
# Running CoderAI and the `tools/` web UIs behind nginx
Everything here works behind an nginx (or any) reverse proxy. There are two
ways to mount each service; pick per service:
* **Subdomain / root location** — the service owns `/` of a `server_name`
(e.g. `coderai.example.com`). Works for *every* service with no app changes.
* **Sub-path** — the service lives under a path (e.g. `example.com/coderai/`).
Supported by **CoderAI** and **`tools/video_editor.py`**. The other
`tools/` UIs currently need the subdomain/root form (see the table).
| Service | Root / subdomain | Sub-path (`/foo/`) |
|---------------------------------|:----------------:|:------------------:|
| CoderAI server (`codai`) | ✅ | ✅ |
| `tools/video_editor.py` | ✅ | ✅ |
| `tools/videogen.py` | ✅ | ⚠️ needs work |
| `tools/review_outputs.py` | ✅ | ⚠️ needs work |
| `tools/gen_township_fighters.py`| ✅ | ⚠️ needs work |
## Headers every proxy block needs
CoderAI builds public URLs (image/video/audio output links, redirects, admin
links) from these headers via `codai/api/urlutils.py`, and `video_editor.py`
honours `X-Forwarded-Prefix` for sub-path mounting:
```nginx
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header X-Forwarded-Host $host;
# Sub-path mounts only — tells the app its public prefix:
# proxy_set_header X-Forwarded-Prefix /coderai;
```
Also important for AI workloads:
```nginx
client_max_body_size 1024m; # large image/audio/video uploads
proxy_read_timeout 3600s; # long generations / renders
proxy_send_timeout 3600s;
proxy_buffering off; # required for SSE streaming (chat, progress)
```
## CoderAI — subdomain (root)
```nginx
server {
listen 443 ssl;
server_name coderai.example.com;
# ssl_certificate ... ; ssl_certificate_key ... ;
client_max_body_size 1024m;
location / {
proxy_pass http://127.0.0.1:8000;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header X-Forwarded-Host $host;
proxy_read_timeout 3600s;
proxy_send_timeout 3600s;
proxy_buffering off; # SSE: streamed chat + task progress
}
}
```
Optionally pin the public URL instead of trusting headers: start CoderAI with
`--url https://coderai.example.com`.
## CoderAI — sub-path (`https://example.com/coderai/`)
```nginx
location /coderai/ {
proxy_pass http://127.0.0.1:8000/; # trailing slash strips the prefix
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header X-Forwarded-Host $host;
proxy_set_header X-Forwarded-Prefix /coderai; # <-- the key line
proxy_read_timeout 3600s;
proxy_buffering off;
}
```
CoderAI reads `X-Forwarded-Prefix` into the ASGI `root_path`, so `request.url`,
redirects, `{{ root_path }}` template links, the `ROOT_PATH` JS global, and all
generated file URLs become `/coderai/...` automatically.
## `tools/video_editor.py`
Start it bound to localhost (default) and proxy to it. It works at root and at
a sub-path. For a sub-path, set `X-Forwarded-Prefix`; the page injects a
matching `<base href>` and all its API/media/render URLs are relative, so they
resolve correctly under any mount. It also strips the prefix server-side, so it
works whether or not nginx strips it.
```nginx
# Sub-path: https://example.com/editor/
location /editor/ {
proxy_pass http://127.0.0.1:8420/;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header X-Forwarded-Prefix /editor;
proxy_read_timeout 3600s; # long ffmpeg renders
proxy_send_timeout 3600s;
proxy_request_buffering off; # stream large uploads straight through
client_max_body_size 4096m; # video/music uploads from the browser machine
}
```
Run with `--no-browser` on a server. The video editor talks to CoderAI over
`--base-url` server-side (not from the browser), so the browser only ever needs
to reach the editor's own origin. Source files can be picked from the server's
media directory or uploaded from the browser machine (hence the larger
`client_max_body_size` / `proxy_request_buffering off` above).
## `tools/videogen.py`, `review_outputs.py`, `gen_township_fighters.py`
Mount each at the root of its own `server_name` (or a dedicated port). These
UIs use absolute (`/...`) asset and API paths plus SSE, so they expect to own
`/`:
```nginx
server {
listen 443 ssl;
server_name videogen.example.com;
location / {
proxy_pass http://127.0.0.1:7860; # the tool's --port
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_read_timeout 3600s;
proxy_buffering off; # these stream progress over SSE
}
}
```
Sub-path mounting for these three needs their client URLs made relative (the
same change already applied to `video_editor.py`).
......@@ -182,6 +182,8 @@ discover_local_binaries() {
"$HOME/whisper.cpp/build/bin/whisper-cli"
"$HOME/whisper.cpp/build/bin/main"
"$HOME/whisper.cpp/build/bin/server"
"/usr/local/bin/ds4-server"
"${CODERAI_DS4_DIR:-$HOME/.coderai/ds4}/ds4-server"
)
local path
for path in "${candidates[@]}"; do
......
#!/usr/bin/env python3
"""Standalone Parler-TTS HTTP microservice — run in its OWN venv.
parler-tts hard-pins an old transformers/tokenizers/huggingface-hub that conflict
with the coderai server's stack (transformers 5.x). So instead of polluting that
environment, Parler runs here behind a tiny stdlib HTTP shim, and coderai talks to
it as a remote TTS backend (``_RemoteParlerBackend``, selected when a model's
config carries a ``service_url``).
Setup (separate venv!):
python3 -m venv ~/.venvs/parler
source ~/.venvs/parler/bin/activate
pip install "git+https://github.com/huggingface/parler-tts.git" soundfile
python tools/parler_tts_service.py \
--model parler-tts/parler-tts-mini-multilingual --port 8123
Then point a coderai TTS model's config at it, e.g. in models.json:
"tts:parler-tts/parler-tts-mini-multilingual": {"service_url": "http://127.0.0.1:8123"}
Endpoints:
GET /health -> {"ok": true, "model": ..., "sampling_rate": N}
POST /speak -> audio/wav (body: {text, voice, speed, emotion, style, description?})
"""
import argparse
import io
import json
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
import numpy as np
import soundfile as sf
# Emotion / style vocabularies for the Parler controls.
# These mirror the editor's gated controls. coderai surfaces the same lists via
# codai.api.tts_backends._FAMILY_{EMOTIONS,STYLES}["parler"].
# "neutral" and "normal" are the no-op entries: build_description() adds no
# phrase to the description for either of them.
EMOTIONS = ["neutral", "happy", "sad", "angry", "excited", "calm", "fearful"]
STYLES = ["normal", "whispering", "shouting", "monotone", "expressive"]
def build_description(voice: str, speed, emotion: str, style: str, speaker: str = "") -> str:
    """Map the UI controls into a Parler natural-language delivery description.

    Args:
        voice: Requested voice. It is used as the speaker name unless it
            contains ``/`` or starts with ``af_``/``am_``/``bf_``/``bm_``
            (a path or a Kokoro id is not a Parler speaker name).
        speed: Speed multiplier. Falsy values and anything ``float()`` rejects
            count as 1.0. Below 0.9 reads as "slow", above 1.15 as "fast",
            anything else as "moderate".
        emotion: Emotion label; empty or ``"neutral"`` adds nothing.
        style: Delivery style; empty or ``"normal"`` adds nothing. Known styles
            are expanded to a phrase, unknown ones are inserted verbatim.
        speaker: Fallback speaker name used when ``voice`` is unusable.

    Returns:
        One sentence describing the delivery, followed by a fixed sentence
        asking for a clean, close-up recording.
    """
    spk = (voice or "").strip()
    if spk and ("/" in spk or spk.lower().startswith(("af_", "am_", "bf_", "bm_"))):
        spk = ""  # a path or a Kokoro id is not a Parler speaker name
    who = spk or speaker or "A speaker"

    bits = [f"{who} speaks"]

    if emotion and emotion != "neutral":
        # Pick the article by the first letter so that "angry" / "excited"
        # read "in an angry tone" rather than "in a angry tone".
        first = str(emotion)[:1].lower()
        article = "an" if first in ("a", "e", "i", "o", "u") else "a"
        bits.append(f"in {article} {emotion} tone")

    smap = {
        "whispering": "whispering softly",
        "shouting": "shouting loudly",
        "monotone": "in a flat monotone",
        "expressive": "in a very expressive, animated way",
    }
    if style and style not in ("", "normal"):
        bits.append(smap.get(style, style))

    try:
        sp = float(speed or 1.0)
    except (TypeError, ValueError):
        sp = 1.0  # unparsable speed: fall back to normal pace
    bits.append(f"at a {'slow' if sp < 0.9 else 'fast' if sp > 1.15 else 'moderate'} pace")

    return (" ".join(bits) +
            ". The recording is very high quality, the voice clear and close up "
            "with no background noise.")
class _Engine:
    """Loads the Parler model once and synthesizes to a float waveform.

    Attributes:
        model_name: The checkpoint id/path passed to the constructor.
        sr: Output sampling rate, read from the model config.
    """

    def __init__(self, model_name: str):
        """Load the model and its tokenizers onto CUDA when available, else CPU.

        Args:
            model_name: Hugging Face id or local path of a Parler-TTS checkpoint.
        """
        # Third-party imports are done here rather than at module top.
        from parler_tts import ParlerTTSForConditionalGeneration
        from transformers import AutoTokenizer
        import torch

        self.model_name = model_name
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        self._model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(self._device)
        # Tokenizer for the text to be spoken (the "prompt").
        self._tok = AutoTokenizer.from_pretrained(model_name)
        # Tokenizer for the delivery description; may differ from the prompt one.
        self._desc_tok = self._load_description_tokenizer(AutoTokenizer)
        self.sr = int(self._model.config.sampling_rate)

    def _load_description_tokenizer(self, auto_tokenizer):
        """Return the tokenizer matching the model's description text encoder.

        The multilingual Parler checkpoints document tokenizing the description
        with the tokenizer of ``model.config.text_encoder`` instead of the
        checkpoint's own (prompt) tokenizer. When that encoder is not named, is
        the checkpoint itself, or cannot be loaded, the prompt tokenizer is
        reused, which is what this service did before.
        """
        encoder_cfg = getattr(self._model.config, "text_encoder", None)
        encoder_name = getattr(encoder_cfg, "_name_or_path", "") or ""
        if not encoder_name or encoder_name == self.model_name:
            return self._tok
        try:
            return auto_tokenizer.from_pretrained(encoder_name)
        except (OSError, ValueError) as exc:
            print(f"Warning: could not load description tokenizer {encoder_name!r} "
                  f"({exc}); falling back to the prompt tokenizer")
            return self._tok

    def speak(self, text: str, description: str) -> np.ndarray:
        """Synthesize ``text`` in the manner given by ``description``.

        Args:
            text: The words to speak.
            description: Natural-language description of voice and delivery.

        Returns:
            The generated audio as a squeezed float32 NumPy array.
        """
        ids = self._desc_tok(description, return_tensors="pt").input_ids.to(self._device)
        prompt = self._tok(text, return_tensors="pt").input_ids.to(self._device)
        gen = self._model.generate(input_ids=ids, prompt_input_ids=prompt)
        return np.asarray(gen.cpu().numpy().squeeze(), dtype=np.float32)
# Process-wide engine instance read by the request handlers. It is None at
# import time and is assigned by main() before the HTTP server starts serving.
ENGINE: _Engine = None  # set in main()
class Handler(BaseHTTPRequestHandler):
    """HTTP endpoints of the Parler service: ``GET /health`` and ``POST /speak``.

    Malformed requests are answered with 400, unknown paths with 404, and
    failures during synthesis with 500; error bodies are small JSON objects of
    the form ``{"error": ...}``.
    """

    def _send(self, code, body=b"", ctype="application/json"):
        """Write a complete response: status line, headers, then ``body``."""
        self.send_response(code)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def log_message(self, fmt, *args):  # quieter logs
        """Suppress the base class's per-request logging."""
        pass

    def do_GET(self):
        """Serve ``/health`` with the model name and sampling rate; 404 otherwise."""
        if self.path.split("?")[0] != "/health":
            self._send(404, b'{"error":"not found"}')
            return
        info = {"ok": True, "model": ENGINE.model_name, "sampling_rate": ENGINE.sr}
        self._send(200, json.dumps(info).encode())

    def _read_json_object(self):
        """Read the request body and decode it as a JSON object.

        Returns:
            The decoded dict, or ``None`` when the request was malformed; in
            that case a 400 response has already been sent.
        """
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            length = -1
        if length < 0:
            # A negative length would make rfile.read() wait for end of stream.
            self._send(400, b'{"error":"invalid Content-Length"}')
            return None

        raw = self.rfile.read(length) if length else b""
        try:
            # JSONDecodeError and UnicodeDecodeError are both ValueError.
            req = json.loads(raw or b"{}")
        except ValueError:
            self._send(400, b'{"error":"invalid JSON"}')
            return None
        if not isinstance(req, dict):
            self._send(400, b'{"error":"JSON body must be an object"}')
            return None
        return req

    def do_POST(self):
        """Serve ``/speak``: synthesize the requested text and return WAV audio.

        The JSON body must carry a non-empty ``text``. An explicit
        ``description`` is used as-is; otherwise one is built from ``voice``,
        ``speed``, ``emotion`` and ``style``.
        """
        if self.path.split("?")[0] != "/speak":
            self._send(404, b'{"error":"not found"}')
            return

        req = self._read_json_object()
        if req is None:
            return

        text = req.get("text") or ""
        if not isinstance(text, str):
            self._send(400, b'{"error":"text must be a string"}')
            return
        text = text.strip()
        if not text:
            self._send(400, b'{"error":"empty text"}')
            return

        try:
            desc = req.get("description") or build_description(
                req.get("voice", ""), req.get("speed", 1.0),
                req.get("emotion", ""), req.get("style", ""))
            audio = ENGINE.speak(text, desc)
            buf = io.BytesIO()
            sf.write(buf, audio, ENGINE.sr, format="WAV")
            payload = buf.getvalue()
        except Exception as e:
            # Boundary handler: log the failure and report it to the client.
            import traceback
            traceback.print_exc()
            self._send(500, json.dumps({"error": str(e)}).encode())
            return

        # Sent outside the try so a failed write to the client is not followed
        # by an attempt to send a second (500) response on the same connection.
        self._send(200, payload, ctype="audio/wav")
def main(argv=None):
    """Parse the command line, load the Parler model, and serve HTTP requests.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    The loaded engine is stored in the module-level ``ENGINE`` before the
    server starts. This call does not return while the server is running.
    """
    global ENGINE

    parser = argparse.ArgumentParser(description="Standalone Parler-TTS HTTP service")
    parser.add_argument("--model", default="parler-tts/parler-tts-mini-multilingual")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8123)
    opts = parser.parse_args(argv)

    print(f"Loading {opts.model} …")
    ENGINE = _Engine(opts.model)
    print(f"Ready: {opts.model} @ {ENGINE.sr} Hz — serving on http://{opts.host}:{opts.port}")

    bind_address = (opts.host, opts.port)
    server = ThreadingHTTPServer(bind_address, Handler)
    server.serve_forever()


# Run the service only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"media_dir": "/storage/coderai/tools/coderai_media",
"output_dir": "/storage/coderai/video_editor_output",
"base_url": "http://127.0.0.1:8000",
"api_key": "sk-coderai-1b8b559808f9fb9927cabef33e0c1bf7ca7943f3281cbf7b7b661fc37aa9fbe0",
"voice": "feminine",
"voice_name": "af_sarah",
"tts_model": "suno/bark",
"stt_model": null,
"audio_model": null,
"video": null
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment