fix: GGUF vision/mmproj routing + VRAM estimate; Tasks page it/s + history

- api_model_load: load a GGUF/text model via llama.cpp even when it's also bucketed under image/vision (respect the entry's primary model_type), so a gemma+mmproj LLM never hits the diffusers from_pretrained() path.
- model config save: a GGUF LLM with an mmproj auto-gets the image_to_text capability and is kept out of the diffusers vision_models/image_models buckets.
- VRAM estimate: _runtime_reserve_gb scales the KV-cache reserve by the cache quantization (q4_0 ≈ 0.27× f16) so quantized-KV models at large context aren't over-estimated into needless CPU offload.
- Free disk (HF): quiet huggingface_hub's noisy not-found traceback and make the delete idempotent (repo already gone = success).
- Tasks page: generation tasks now report it/s (or s/it when slow); text keeps tok/s. Throughput computed centrally in the task registry (live EMA + run average on finish). New "Recent tasks (last 10)" history section.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

fix: GGUF vision/mmproj routing + VRAM estimate; Tasks page it/s + history
- api_model_load: load a GGUF/text model via llama.cpp even when it's also bucketed under image/vision (respect the entry's primary model_type), so a gemma+mmproj LLM never hits the diffusers from_pretrained() path.
- model config save: a GGUF LLM with an mmproj auto-gets the image_to_text capability and is kept out of the diffusers vision_models/image_models buckets.
- VRAM estimate: _runtime_reserve_gb scales the KV-cache reserve by the cache quantization (q4_0 ≈ 0.27× f16) so quantized-KV models at large context aren't over-estimated into needless CPU offload.
- Free disk (HF): quiet huggingface_hub's noisy not-found traceback and make the delete idempotent (repo already gone = success).
- Tasks page: generation tasks now report it/s (or s/it when slow); text keeps tok/s. Throughput computed centrally in the task registry (live EMA + run average on finish). New "Recent tasks (last 10)" history section.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
ade800f9 · Stefy Lanza (nextime / spora ) · 7d3d8e5b · ade800f9 · ade800f9 · ade800f9
Commit ade800f9 authored Jun 19, 2026 by Stefy Lanza (nextime / spora )
Showing with 205 additions and 38 deletions

routes.py codai/admin/routes.py +49 -4

tasks.html codai/admin/templates/tasks.html +107 -32

manager.py codai/models/manager.py +21 -1

registry.py codai/tasks/registry.py +28 -1

No files found.
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -1682,6 +1682,14 @@ def _do_delete_model(model_id: str, cache_type: str) -> dict:
    if cache_type == "hf":
        hf_dir = caches.get("huggingface")
        if hf_dir:
+            # huggingface_hub logs a WARNING + full traceback when a repo dir has
+            # already vanished (e.g. a GGUF model whose HF repo was never really
+            # cached). Quiet it during the delete — "repo gone" is exactly the
+            # end state Free disk wants, so it's not an error.
+            import logging as _logging
+            _hf_log = _logging.getLogger("huggingface_hub.utils._cache_manager")
+            _prev_lvl = _hf_log.level
+            _hf_log.setLevel(_logging.ERROR)
            try:
                from huggingface_hub import scan_cache_dir
                info = scan_cache_dir(hf_dir)
@@ -1692,13 +1700,17 @@ def _do_delete_model(model_id: str, cache_type: str) -> dict:
                    return {"success": True}
            except Exception:
                pass
-            # Fallback: remove directory directly
+            finally:
+                _hf_log.setLevel(_prev_lvl)
+            # Fallback: remove the repo dir directly if it's still there.
            safe = model_id.replace("/", "--")
            d = os.path.join(hf_dir, f"models--{safe}")
            if os.path.exists(d):
                shutil.rmtree(d, ignore_errors=True)
-                return {"success": True}
-        return {"success": False, "detail": "Model not found in HF cache"}
+            # Whether or not anything was on disk, the files are gone now — Free
+            # disk is idempotent, so report success instead of a scary error.
+            return {"success": True}
+        return {"success": False, "detail": "HF cache directory not configured"}

    if cache_type == "gguf":
        gguf_dir = get_model_cache_dir()
@@ -2032,7 +2044,13 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
    if not path:
        raise HTTPException(status_code=400, detail="path required")

-    # Find the model config entry to determine its type
+    # Find the model config entry to determine its type. A model may be
+    # registered in several categories (e.g. a vision LLM advertises image_to_text
+    # → also listed under vision_models). The category-bucket loop below would pick
+    # whichever non-text bucket it hits first, sending the model to the diffusers /
+    # transformers loader — which calls from_pretrained() and fails on a GGUF file.
+    # So the entry's DECLARED primary model_type wins, and a GGUF/llama.cpp text
+    # model always loads via the text path regardless of its other buckets.
    model_type = "text"
    model_cfg: dict = {}
    if config_manager:
@@ -2049,6 +2067,12 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
                    model_type = mtype
                    model_cfg = m if isinstance(m, dict) else {}
                    break
+        # Respect the entry's declared primary type; a text/gguf model (or any
+        # .gguf path) is a llama.cpp model and must use the text loader even when
+        # it's also bucketed under image/vision for capability routing.
+        _primary = (model_cfg.get("model_type") if isinstance(model_cfg, dict) else "") or ""
+        if _primary in ("text_models", "gguf_models") or str(path).lower().endswith(".gguf"):
+            model_type = "text"

    # Offload to a thread: request_model may block (thermal wait / busy model /
    # actual load) and would otherwise freeze the whole admin web UI event loop.
@@ -2451,6 +2475,27 @@ async def api_model_configure(request: Request, username: str = Depends(require_
        if key in data:
            entry[key] = data[key]

+    # A GGUF LLM is served by llama.cpp. Its multimodal projector (mmproj) gives
+    # it VISION INPUT, which is the `image_to_text` capability served through
+    # llama.cpp — NOT the diffusers `vision_models`/`image_models` categories
+    # (those route the .gguf to from_pretrained() and fail). So for a GGUF text
+    # model: auto-tag image_to_text when an mmproj is set, and keep it out of the
+    # diffusers buckets (it stays a llama.cpp model that advertises vision).
+    _path_l = str(path).lower()
+    _is_gguf_llm = ((_path_l.endswith(".gguf") or "gguf" in _path_l
+                     or entry.get("model_type") == "gguf_models")
+                    and entry.get("model_type") in ("text_models", "gguf_models"))
+    if _is_gguf_llm:
+        _caps = list(entry.get("capabilities") or [])
+        if entry.get("mmproj") and "image_to_text" not in _caps:
+            _caps.append("image_to_text")
+            entry["capabilities"] = _caps
+        _DIFFUSERS_CATS = {"image_models", "vision_models", "video_models", "spatial_models"}
+        _kept = [t for t in model_types if t not in _DIFFUSERS_CATS] or ["text_models"]
+        if _kept != model_types:
+            model_types = _kept
+            entry["model_types"] = model_types
+
    # Add entry to each selected category
    for mtype in model_types:
        config_manager.models_data.setdefault(mtype, []).append(entry)

--- a/codai/admin/templates/tasks.html
+++ b/codai/admin/templates/tasks.html
@@ -73,6 +73,19 @@
    </tbody>
  </table>
 </div>
+
+<div class="table-wrap" id="history-wrap" style="display:none;margin-top:1.25rem">
+  <h2 style="font-size:14px;margin:0 0 .5rem">Recent tasks <span class="dim small">(last 10)</span></h2>
+  <table>
+    <thead>
+      <tr>
+        <th>Type</th><th>Name / Model</th><th>Result</th>
+        <th style="width:220px">Throughput</th><th>Finished</th><th style="text-align:right"></th>
+      </tr>
+    </thead>
+    <tbody id="history-body"></tbody>
+  </table>
+</div>
 {% endblock %}

 {% block scripts %}
@@ -94,6 +107,15 @@ const STATUS_BADGE = {
  cancelled:'badge-user', interrupted:'badge-warn'
 };

+// Render a task's throughput with its unit. Text tasks are shown in tok/s;
+// every other kind is shown in it/s, switching to s/it when the rate is below
+// one iteration per second. Returns '' when there is no positive rate.
+function fmtRate(t) {
+  const rate = Number(t.rate) || 0;
+  if (!(rate > 0)) return '';
+  if (t.kind === 'text') return `${rate} tok/s`;
+  if (rate < 1) return `${(1 / rate).toFixed(1)} s/it`;
+  return `${rate} it/s`;
+}
+
 function progressBar(t) {
  // Downloads report a 0-100 percent; render a percentage bar (the filename /
  // rate / ETA detail is shown in the status cell, so it isn't repeated here).
@@ -103,9 +125,10 @@ function progressBar(t) {
            <span class="dim small">${pct}%</span>`;
  }
  const total = t.total || 0, step = t.step || 0;
-  // Live throughput for text generation (tokens/s), shown while running.
+  // Live throughput, shown while running: tokens/s for text, iterations/s (or
+  // seconds/iteration when slow) for generation tasks (image/video/audio/…).
  const rate = (t.rate && t.status === 'running')
-    ? ` <span class="dim small">· ${t.rate} tok/s</span>` : '';
+    ? ` <span class="dim small">· ${fmtRate(t)}</span>` : '';
  if (!total) {
    if (t.status === 'running') {
      const tok = step ? `${step} tok` : 'working…';
@@ -139,6 +162,70 @@ function actions(t) {
  return btns.join(' ') || '<span class="dim small">—</span>';
 }

+// Elapsed time between two timestamps (their difference is treated as
+// seconds), formatted as "42s", "3m 7s" or "2h 15m". Returns '' when either
+// timestamp is missing or the end precedes the start.
+function fmtDur(start, end) {
+  if (!start || !end || end < start) return '';
+  const secs = Math.round(end - start);
+  if (secs < 60) return `${secs}s`;
+  const mins = Math.floor(secs / 60);
+  if (mins < 60) return `${mins}m ${secs % 60}s`;
+  const hours = Math.floor(mins / 60);
+  return `${hours}h ${mins % 60}m`;
+}
+
+// One active-task row (main table). Returns the <tr> markup for task `t`:
+// kind badge, title (+ engine badge) and model, status cell, progress bar,
+// start time and the action buttons.
+function taskRow(t) {
+  const badge = STATUS_BADGE[t.status] || 'badge-dim';
+  const title = t.title || '(untitled)';
+  // Status cell precedence: cooling > throttling > paused > plain status.
+  // Only the throttling and plain branches show the task's own status badge.
+  let statusCell;
+  if (t.cooling) {
+    statusCell = `<span class="badge badge-warn">❄ Cooling down</span>`
+      + `<div class="dim small">${esc(t.cooling_message || 'paused for thermal cooldown')}</div>`;
+  } else if (t.throttling) {
+    statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+      + ` <span class="badge badge-user">🐢 throttling</span>`
+      + `<div class="dim small">${esc(t.throttle_message || 'CPU soft-throttle')}</div>`;
+  } else if (t.paused) {
+    statusCell = `<span class="badge badge-warn">⏸ Paused</span>`
+      + `<div class="dim small">suspended — click Resume to continue</div>`;
+  } else {
+    statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+      + (t.message ? `<div class="dim small">${esc(t.message)}</div>` : '');
+  }
+  // Task-supplied text goes through esc(); progressBar() and actions() return
+  // ready-made markup and are inserted as-is.
+  return `<tr>
+    <td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
+    <td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
+    <td>${statusCell}</td>
+    <td>${progressBar(t)}</td>
+    <td class="dim small">${fmtTime(t.started_at)}</td>
+    <td style="text-align:right">${actions(t)}</td>
+  </tr>`;
+}
+
+// One finished-task row (history table). Returns the <tr> markup for task
+// `t`: kind badge, title and model, result (status + duration), average
+// throughput with the amount of work done, finish time and a Remove button.
+function historyRow(t) {
+  const badge = STATUS_BADGE[t.status] || 'badge-dim';
+  const title = t.title || '(untitled)';
+  const dur = fmtDur(t.started_at, t.ended_at);
+  const rate = fmtRate(t);
+  const unit = t.kind === 'text' ? 'tok' : 'it';
+  // Amount of work shown beside the rate. A 'done' run reports its total; any
+  // other outcome reports the steps actually reached, because the planned
+  // total would overstate it. Each falls back to the other, so a task that
+  // has a rate but never reported a total still gets a count.
+  const work = (t.status === 'done' ? (t.total || t.step) : (t.step || t.total)) || 0;
+  // Result cell: status badge, run duration, and the task message for every
+  // outcome other than 'done'.
+  const result = `<span class="badge ${badge}">${esc(t.status)}</span>`
+    + (dur ? ` <span class="dim small">in ${dur}</span>` : '')
+    + (t.message && t.status !== 'done' ? `<div class="dim small">${esc(t.message)}</div>` : '');
+  // Throughput cell: rate (+ work count) when known, else the bare step
+  // count, else a dash.
+  let thru = '<span class="dim small">—</span>';
+  if (rate) {
+    const cnt = work ? ` <span class="dim small">· ${work} ${unit}</span>` : '';
+    thru = `<span class="dim small">${rate}</span>${cnt}`;
+  } else if (t.step) {
+    thru = `<span class="dim small">${t.step} ${unit}</span>`;
+  }
+  return `<tr>
+    <td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
+    <td><div class="td-name">${esc(title)}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
+    <td>${result}</td>
+    <td>${thru}</td>
+    <td class="dim small">${fmtTime(t.ended_at)}</td>
+    <td style="text-align:right"><button class="btn btn-ghost btn-sm" onclick="removeTask('${esc(t.id)}')">Remove</button></td>
+  </tr>`;
+}
+
 // ---- Live hardware telemetry ----
 // `frac` is a 0-100 fraction OF CAPACITY (the bar fill + colour are driven by it).
 function _utilClass(frac){ return frac == null ? 'sys-ok' : (frac >= 90 ? 'sys-hot' : frac >= 70 ? 'sys-warn' : 'sys-ok'); }
@@ -291,38 +378,26 @@ async function loadTasks() {
      sbanner.style.display = 'none';
    }

+    // Split: active tasks in the main table, the most recent finished ones in a
+    // history section (last 10) so completed generations stay visible briefly.
+    const active = tasks.filter(t => t.active);
+    const finished = tasks.filter(t => !t.active)
+      .sort((a, b) => (b.ended_at || b.started_at || 0) - (a.ended_at || a.started_at || 0))
+      .slice(0, 10);
+
    const tbody = document.getElementById('tasks-body');
-    if (!tasks.length) {
-      tbody.innerHTML = '<tr class="empty-row"><td colspan="6">No tasks yet</td></tr>';
-      return;
+    tbody.innerHTML = active.length
+      ? active.map(taskRow).join('')
+      : '<tr class="empty-row"><td colspan="6">No active tasks</td></tr>';
+
+    const hwrap = document.getElementById('history-wrap');
+    const hbody = document.getElementById('history-body');
+    if (finished.length) {
+      hwrap.style.display = '';
+      hbody.innerHTML = finished.map(historyRow).join('');
+    } else {
+      hwrap.style.display = 'none';
    }
-    tbody.innerHTML = tasks.map(t => {
-      const badge = STATUS_BADGE[t.status] || 'badge-dim';
-      const title = t.title || '(untitled)';
-      let statusCell;
-      if (t.cooling) {
-        statusCell = `<span class="badge badge-warn">❄ Cooling down</span>`
-          + `<div class="dim small">${esc(t.cooling_message || 'paused for thermal cooldown')}</div>`;
-      } else if (t.throttling) {
-        statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
-          + ` <span class="badge badge-user">🐢 throttling</span>`
-          + `<div class="dim small">${esc(t.throttle_message || 'CPU soft-throttle')}</div>`;
-      } else if (t.paused) {
-        statusCell = `<span class="badge badge-warn">⏸ Paused</span>`
-          + `<div class="dim small">suspended — click Resume to continue</div>`;
-      } else {
-        statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
-          + (t.message ? `<div class="dim small">${esc(t.message)}</div>` : '');
-      }
-      return `<tr>
-        <td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
-        <td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
-        <td>${statusCell}</td>
-        <td>${progressBar(t)}</td>
-        <td class="dim small">${fmtTime(t.started_at)}</td>
-        <td style="text-align:right">${actions(t)}</td>
-      </tr>`;
-    }).join('');
  } catch (e) {
    // transient fetch errors during a model swap are fine; keep last render.
  } finally {

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -2611,6 +2611,19 @@ class MultiModelManager:
                specs.append(v)
        return self._lora_vram_gb(specs)

+    @staticmethod
+    def _kv_cache_size_factor(cache_type) -> float:
+        """Return the KV-cache size of ``cache_type`` relative to f16 (= 1.0).
+
+        Quantized llama.cpp cache types take far less memory per element, so
+        the runtime KV reserve should shrink accordingly (q4_0 ≈ 0.27× f16).
+
+        Args:
+            cache_type: llama.cpp cache type name, e.g. ``"f16"`` or ``"q4_0"``.
+                Case, surrounding whitespace and ``-`` vs ``_`` are ignored;
+                ``None`` or an empty value means "unset".
+
+        Returns:
+            The size factor. Unset or unrecognised types fall back to f16 (1.0).
+        """
+        t = str(cache_type or "").strip().lower().replace("-", "_")
+        return {
+            "": 1.0, "f16": 1.0, "f32": 2.0, "bf16": 1.0,
+            "q8_0": 0.53,
+            "q5_1": 0.34, "q5_0": 0.33,
+            # iq4_nl packs elements into the same 4.5-bit blocks as q4_0;
+            # without an entry it would be sized as f16 and over-reserved.
+            "q4_1": 0.28, "q4_0": 0.27, "iq4_nl": 0.27,
+        }.get(t, 1.0)
+
    def _runtime_reserve_gb(self, cfg: dict, model_key: str, base_gb: float) -> float:
        """Extra VRAM the model needs at RUNTIME beyond its resident weights.

@@ -2631,7 +2644,14 @@ class MultiModelManager:
            except (TypeError, ValueError):
                n_ctx = 2048
            ctx_factor = min(3.0, max(1.0, n_ctx / 4096.0))
-            return base_gb * (0.10 + 0.08 * ctx_factor)   # ~0.18–0.34×
+            # The KV-cache part of the reserve scales with the cache quantization:
+            # a q4_0 KV cache is ~4x smaller than f16, so reserving an f16-sized KV
+            # for a quantized cache wildly over-estimates and forces needless CPU
+            # offload at large n_ctx. Scale only the KV term (0.08*ctx_factor); the
+            # 0.10 base (activations/compute buffers) is unaffected by KV dtype.
+            kv_factor = (self._kv_cache_size_factor(cfg.get("cache_type_k"))
+                         + self._kv_cache_size_factor(cfg.get("cache_type_v"))) / 2.0
+            return base_gb * (0.10 + 0.08 * ctx_factor * kv_factor)   # ~0.13–0.34×
        if mtype == "video":
            return base_gb * 0.08   # temporal activations + VAE decode (tiling-capped)
        if mtype == "image":

--- a/codai/tasks/registry.py
+++ b/codai/tasks/registry.py
@@ -81,6 +81,8 @@ class TaskRegistry:
        self._tasks: Dict[str, Task] = {}
        self._events: Dict[str, threading.Event] = {}
        self._pause_events: Dict[str, threading.Event] = {}
+        # Per-task (last_time, last_step) for throughput (it/s) estimation in step().
+        self._rate_state: Dict[str, tuple] = {}
        self._history = history

    def register(self, kind: str, *, title: str = "", model: str = "",
@@ -121,9 +123,25 @@ class TaskRegistry:
            t = self._tasks.get(tid)
            if not t:
                return
-            t.step = int(step)
+            new_step = int(step)
            if total is not None:
                t.total = int(total)
+            # Estimate throughput (iterations/s) for generation tasks from the
+            # rate of step increments, EMA-smoothed. Text generation reports its
+            # own tokens/s via update(rate=…), so it doesn't go through here.
+            now = time.time()
+            prev = self._rate_state.get(tid)
+            if prev is not None:
+                _pt, _ps = prev
+                _dt = now - _pt
+                _dn = new_step - _ps
+                if _dt >= 0.3 and _dn > 0:
+                    _inst = _dn / _dt
+                    t.rate = round(_inst if not t.rate else 0.5 * t.rate + 0.5 * _inst, 2)
+                    self._rate_state[tid] = (now, new_step)
+            else:
+                self._rate_state[tid] = (now, new_step)
+            t.step = new_step

    def current_loading_task(self) -> Optional[str]:
        """Id of the most-recently-started running ``loading`` task, if any.
@@ -152,6 +170,13 @@ class TaskRegistry:
            if message:
                t.message = message
            t.ended_at = time.time()
+            # Replace the live instantaneous rate with the run AVERAGE (it/s) so
+            # the history shows a stable, representative throughput.
+            if t.step and t.started_at:
+                _dur = (t.ended_at or t.started_at) - t.started_at
+                if _dur > 0 and not (t.kind == "text" and t.rate):
+                    t.rate = round(t.step / _dur, 2)
+            self._rate_state.pop(tid, None)
            self._prune_locked()

    def cancel(self, tid: str) -> bool:
@@ -234,6 +259,7 @@ class TaskRegistry:
        with self._lock:
            self._events.pop(tid, None)
            self._pause_events.pop(tid, None)
+            self._rate_state.pop(tid, None)
            return self._tasks.pop(tid, None) is not None

    def get(self, tid: str) -> Optional[dict]:
@@ -256,6 +282,7 @@ class TaskRegistry:
            self._tasks.pop(t.id, None)
            self._events.pop(t.id, None)
            self._pause_events.pop(t.id, None)
+            self._rate_state.pop(t.id, None)


 # Process-wide singleton.