fix: GGUF vision/mmproj routing + VRAM estimate; Tasks page it/s + history

- api_model_load: load a GGUF/text model via llama.cpp even when it's also
  bucketed under image/vision (respect the entry's primary model_type), so a
  gemma+mmproj LLM never hits the diffusers from_pretrained() path.
- model config save: a GGUF LLM with an mmproj auto-gets the image_to_text
  capability and is kept out of the diffusers vision_models/image_models buckets.
- VRAM estimate: _runtime_reserve_gb scales the KV-cache reserve by the cache
  quantization (q4_0 ≈ 0.27× f16) so quantized-KV models at large context aren't
  over-estimated into needless CPU offload.
- Free disk (HF): quiet huggingface_hub's noisy not-found traceback and make the
  delete idempotent (repo already gone = success).
- Tasks page: generation tasks now report it/s (or s/it when slow); text keeps
  tok/s. Throughput computed centrally in the task registry (live EMA + run
  average on finish). New "Recent tasks (last 10)" history section.
Co-Authored-By: 's avatarClaude Opus 4.8 <noreply@anthropic.com>
parent 7d3d8e5b
......@@ -1682,6 +1682,14 @@ def _do_delete_model(model_id: str, cache_type: str) -> dict:
if cache_type == "hf":
hf_dir = caches.get("huggingface")
if hf_dir:
# huggingface_hub logs a WARNING + full traceback when a repo dir has
# already vanished (e.g. a GGUF model whose HF repo was never really
# cached). Quiet it during the delete — "repo gone" is exactly the
# end state Free disk wants, so it's not an error.
import logging as _logging
_hf_log = _logging.getLogger("huggingface_hub.utils._cache_manager")
_prev_lvl = _hf_log.level
_hf_log.setLevel(_logging.ERROR)
try:
from huggingface_hub import scan_cache_dir
info = scan_cache_dir(hf_dir)
......@@ -1692,13 +1700,17 @@ def _do_delete_model(model_id: str, cache_type: str) -> dict:
return {"success": True}
except Exception:
pass
# Fallback: remove directory directly
finally:
_hf_log.setLevel(_prev_lvl)
# Fallback: remove the repo dir directly if it's still there.
safe = model_id.replace("/", "--")
d = os.path.join(hf_dir, f"models--{safe}")
if os.path.exists(d):
shutil.rmtree(d, ignore_errors=True)
return {"success": True}
return {"success": False, "detail": "Model not found in HF cache"}
# Whether or not anything was on disk, the files are gone now — Free
# disk is idempotent, so report success instead of a scary error.
return {"success": True}
return {"success": False, "detail": "HF cache directory not configured"}
if cache_type == "gguf":
gguf_dir = get_model_cache_dir()
......@@ -2032,7 +2044,13 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
if not path:
raise HTTPException(status_code=400, detail="path required")
# Find the model config entry to determine its type
# Find the model config entry to determine its type. A model may be
# registered in several categories (e.g. a vision LLM advertises image_to_text
# → also listed under vision_models). The category-bucket loop below would pick
# whichever non-text bucket it hits first, sending the model to the diffusers /
# transformers loader — which calls from_pretrained() and fails on a GGUF file.
# So the entry's DECLARED primary model_type wins, and a GGUF/llama.cpp text
# model always loads via the text path regardless of its other buckets.
model_type = "text"
model_cfg: dict = {}
if config_manager:
......@@ -2049,6 +2067,12 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
model_type = mtype
model_cfg = m if isinstance(m, dict) else {}
break
# Respect the entry's declared primary type; a text/gguf model (or any
# .gguf path) is a llama.cpp model and must use the text loader even when
# it's also bucketed under image/vision for capability routing.
_primary = (model_cfg.get("model_type") if isinstance(model_cfg, dict) else "") or ""
if _primary in ("text_models", "gguf_models") or str(path).lower().endswith(".gguf"):
model_type = "text"
# Offload to a thread: request_model may block (thermal wait / busy model /
# actual load) and would otherwise freeze the whole admin web UI event loop.
......@@ -2451,6 +2475,27 @@ async def api_model_configure(request: Request, username: str = Depends(require_
if key in data:
entry[key] = data[key]
# A GGUF LLM is served by llama.cpp. Its multimodal projector (mmproj) gives
# it VISION INPUT, which is the `image_to_text` capability served through
# llama.cpp — NOT the diffusers `vision_models`/`image_models` categories
# (those route the .gguf to from_pretrained() and fail). So for a GGUF text
# model: auto-tag image_to_text when an mmproj is set, and keep it out of the
# diffusers buckets (it stays a llama.cpp model that advertises vision).
_path_l = str(path).lower()
_is_gguf_llm = ((_path_l.endswith(".gguf") or "gguf" in _path_l
or entry.get("model_type") == "gguf_models")
and entry.get("model_type") in ("text_models", "gguf_models"))
if _is_gguf_llm:
_caps = list(entry.get("capabilities") or [])
if entry.get("mmproj") and "image_to_text" not in _caps:
_caps.append("image_to_text")
entry["capabilities"] = _caps
_DIFFUSERS_CATS = {"image_models", "vision_models", "video_models", "spatial_models"}
_kept = [t for t in model_types if t not in _DIFFUSERS_CATS] or ["text_models"]
if _kept != model_types:
model_types = _kept
entry["model_types"] = model_types
# Add entry to each selected category
for mtype in model_types:
config_manager.models_data.setdefault(mtype, []).append(entry)
......
......@@ -73,6 +73,19 @@
</tbody>
</table>
</div>
<div class="table-wrap" id="history-wrap" style="display:none;margin-top:1.25rem">
<h2 style="font-size:14px;margin:0 0 .5rem">Recent tasks <span class="dim small">(last 10)</span></h2>
<table>
<thead>
<tr>
<th>Type</th><th>Name / Model</th><th>Result</th>
<th style="width:220px">Throughput</th><th>Finished</th><th style="text-align:right"></th>
</tr>
</thead>
<tbody id="history-body"></tbody>
</table>
</div>
{% endblock %}
{% block scripts %}
......@@ -94,6 +107,15 @@ const STATUS_BADGE = {
cancelled:'badge-user', interrupted:'badge-warn'
};
// Format a task's throughput with the right unit. Text → tok/s; generation →
// it/s, or s/it when each iteration takes longer than a second (more readable).
function fmtRate(t) {
const r = Number(t.rate) || 0;
if (r <= 0) return '';
if (t.kind === 'text') return `${r} tok/s`;
return r >= 1 ? `${r} it/s` : `${(1 / r).toFixed(1)} s/it`;
}
function progressBar(t) {
// Downloads report a 0-100 percent; render a percentage bar (the filename /
// rate / ETA detail is shown in the status cell, so it isn't repeated here).
......@@ -103,9 +125,10 @@ function progressBar(t) {
<span class="dim small">${pct}%</span>`;
}
const total = t.total || 0, step = t.step || 0;
// Live throughput for text generation (tokens/s), shown while running.
// Live throughput, shown while running: tokens/s for text, iterations/s (or
// seconds/iteration when slow) for generation tasks (image/video/audio/…).
const rate = (t.rate && t.status === 'running')
? ` <span class="dim small">· ${t.rate} tok/s</span>` : '';
? ` <span class="dim small">· ${fmtRate(t)}</span>` : '';
if (!total) {
if (t.status === 'running') {
const tok = step ? `${step} tok` : 'working…';
......@@ -139,6 +162,70 @@ function actions(t) {
return btns.join(' ') || '<span class="dim small">—</span>';
}
function fmtDur(start, end) {
if (!start || !end || end < start) return '';
const s = Math.round(end - start);
if (s < 60) return `${s}s`;
const m = Math.floor(s / 60), ss = s % 60;
if (m < 60) return `${m}m ${ss}s`;
return `${Math.floor(m / 60)}h ${m % 60}m`;
}
// One active-task row (main table).
function taskRow(t) {
const badge = STATUS_BADGE[t.status] || 'badge-dim';
const title = t.title || '(untitled)';
let statusCell;
if (t.cooling) {
statusCell = `<span class="badge badge-warn">❄ Cooling down</span>`
+ `<div class="dim small">${esc(t.cooling_message || 'paused for thermal cooldown')}</div>`;
} else if (t.throttling) {
statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+ ` <span class="badge badge-user">🐢 throttling</span>`
+ `<div class="dim small">${esc(t.throttle_message || 'CPU soft-throttle')}</div>`;
} else if (t.paused) {
statusCell = `<span class="badge badge-warn">⏸ Paused</span>`
+ `<div class="dim small">suspended — click Resume to continue</div>`;
} else {
statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+ (t.message ? `<div class="dim small">${esc(t.message)}</div>` : '');
}
return `<tr>
<td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
<td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
<td>${statusCell}</td>
<td>${progressBar(t)}</td>
<td class="dim small">${fmtTime(t.started_at)}</td>
<td style="text-align:right">${actions(t)}</td>
</tr>`;
}
// One finished-task row (history table): result + average throughput + duration.
function historyRow(t) {
const badge = STATUS_BADGE[t.status] || 'badge-dim';
const title = t.title || '(untitled)';
const dur = fmtDur(t.started_at, t.ended_at);
const rate = fmtRate(t);
const result = `<span class="badge ${badge}">${esc(t.status)}</span>`
+ (dur ? ` <span class="dim small">in ${dur}</span>` : '')
+ (t.message && t.status !== 'done' ? `<div class="dim small">${esc(t.message)}</div>` : '');
let thru = '<span class="dim small">—</span>';
if (rate) {
const cnt = t.total ? ` <span class="dim small">· ${t.total} ${t.kind === 'text' ? 'tok' : 'it'}</span>` : '';
thru = `<span class="dim small">${rate}</span>${cnt}`;
} else if (t.step) {
thru = `<span class="dim small">${t.step} ${t.kind === 'text' ? 'tok' : 'it'}</span>`;
}
return `<tr>
<td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
<td><div class="td-name">${esc(title)}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
<td>${result}</td>
<td>${thru}</td>
<td class="dim small">${fmtTime(t.ended_at)}</td>
<td style="text-align:right"><button class="btn btn-ghost btn-sm" onclick="removeTask('${esc(t.id)}')">Remove</button></td>
</tr>`;
}
// ---- Live hardware telemetry ----
// `frac` is a 0-100 fraction OF CAPACITY (the bar fill + colour are driven by it).
function _utilClass(frac){ return frac == null ? 'sys-ok' : (frac >= 90 ? 'sys-hot' : frac >= 70 ? 'sys-warn' : 'sys-ok'); }
......@@ -291,38 +378,26 @@ async function loadTasks() {
sbanner.style.display = 'none';
}
// Split: active tasks in the main table, the most recent finished ones in a
// history section (last 10) so completed generations stay visible briefly.
const active = tasks.filter(t => t.active);
const finished = tasks.filter(t => !t.active)
.sort((a, b) => (b.ended_at || b.started_at || 0) - (a.ended_at || a.started_at || 0))
.slice(0, 10);
const tbody = document.getElementById('tasks-body');
if (!tasks.length) {
tbody.innerHTML = '<tr class="empty-row"><td colspan="6">No tasks yet</td></tr>';
return;
tbody.innerHTML = active.length
? active.map(taskRow).join('')
: '<tr class="empty-row"><td colspan="6">No active tasks</td></tr>';
const hwrap = document.getElementById('history-wrap');
const hbody = document.getElementById('history-body');
if (finished.length) {
hwrap.style.display = '';
hbody.innerHTML = finished.map(historyRow).join('');
} else {
hwrap.style.display = 'none';
}
tbody.innerHTML = tasks.map(t => {
const badge = STATUS_BADGE[t.status] || 'badge-dim';
const title = t.title || '(untitled)';
let statusCell;
if (t.cooling) {
statusCell = `<span class="badge badge-warn">❄ Cooling down</span>`
+ `<div class="dim small">${esc(t.cooling_message || 'paused for thermal cooldown')}</div>`;
} else if (t.throttling) {
statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+ ` <span class="badge badge-user">🐢 throttling</span>`
+ `<div class="dim small">${esc(t.throttle_message || 'CPU soft-throttle')}</div>`;
} else if (t.paused) {
statusCell = `<span class="badge badge-warn">⏸ Paused</span>`
+ `<div class="dim small">suspended — click Resume to continue</div>`;
} else {
statusCell = `<span class="badge ${badge}">${esc(t.status)}</span>`
+ (t.message ? `<div class="dim small">${esc(t.message)}</div>` : '');
}
return `<tr>
<td><span class="badge badge-user">${esc(KIND_LABEL[t.kind] || t.kind)}</span></td>
<td><div class="td-name">${esc(title)}${t.engine?` <span class="badge badge-user" style="font-size:9px;padding:.05rem .3rem;vertical-align:middle" title="Running on engine">${esc(t.engine)}</span>`:''}</div><div class="dim small mono">${esc(t.model || '')}</div></td>
<td>${statusCell}</td>
<td>${progressBar(t)}</td>
<td class="dim small">${fmtTime(t.started_at)}</td>
<td style="text-align:right">${actions(t)}</td>
</tr>`;
}).join('');
} catch (e) {
// transient fetch errors during a model swap are fine; keep last render.
} finally {
......
......@@ -2611,6 +2611,19 @@ class MultiModelManager:
specs.append(v)
return self._lora_vram_gb(specs)
@staticmethod
def _kv_cache_size_factor(cache_type) -> float:
"""KV-cache bytes-per-element relative to f16 (=1.0), by llama.cpp cache
type. Quantized caches are much smaller, so the runtime KV reserve should
shrink accordingly (q4_0 ≈ 0.27× f16). Unset/unknown → f16 (1.0)."""
t = str(cache_type or "").strip().lower().replace("-", "_")
return {
"": 1.0, "f16": 1.0, "f32": 2.0, "bf16": 1.0,
"q8_0": 0.53,
"q5_1": 0.34, "q5_0": 0.33,
"q4_1": 0.28, "q4_0": 0.27,
}.get(t, 1.0)
def _runtime_reserve_gb(self, cfg: dict, model_key: str, base_gb: float) -> float:
"""Extra VRAM the model needs at RUNTIME beyond its resident weights.
......@@ -2631,7 +2644,14 @@ class MultiModelManager:
except (TypeError, ValueError):
n_ctx = 2048
ctx_factor = min(3.0, max(1.0, n_ctx / 4096.0))
return base_gb * (0.10 + 0.08 * ctx_factor) # ~0.18–0.34×
# The KV-cache part of the reserve scales with the cache quantization:
# a q4_0 KV cache is ~4x smaller than f16, so reserving an f16-sized KV
# for a quantized cache wildly over-estimates and forces needless CPU
# offload at large n_ctx. Scale only the KV term (0.08*ctx_factor); the
# 0.10 base (activations/compute buffers) is unaffected by KV dtype.
kv_factor = (self._kv_cache_size_factor(cfg.get("cache_type_k"))
+ self._kv_cache_size_factor(cfg.get("cache_type_v"))) / 2.0
return base_gb * (0.10 + 0.08 * ctx_factor * kv_factor) # ~0.13–0.34×
if mtype == "video":
return base_gb * 0.08 # temporal activations + VAE decode (tiling-capped)
if mtype == "image":
......
......@@ -81,6 +81,8 @@ class TaskRegistry:
self._tasks: Dict[str, Task] = {}
self._events: Dict[str, threading.Event] = {}
self._pause_events: Dict[str, threading.Event] = {}
# Per-task (last_time, last_step) for throughput (it/s) estimation in step().
self._rate_state: Dict[str, tuple] = {}
self._history = history
def register(self, kind: str, *, title: str = "", model: str = "",
......@@ -121,9 +123,25 @@ class TaskRegistry:
t = self._tasks.get(tid)
if not t:
return
t.step = int(step)
new_step = int(step)
if total is not None:
t.total = int(total)
# Estimate throughput (iterations/s) for generation tasks from the
# rate of step increments, EMA-smoothed. Text generation reports its
# own tokens/s via update(rate=…), so it doesn't go through here.
now = time.time()
prev = self._rate_state.get(tid)
if prev is not None:
_pt, _ps = prev
_dt = now - _pt
_dn = new_step - _ps
if _dt >= 0.3 and _dn > 0:
_inst = _dn / _dt
t.rate = round(_inst if not t.rate else 0.5 * t.rate + 0.5 * _inst, 2)
self._rate_state[tid] = (now, new_step)
else:
self._rate_state[tid] = (now, new_step)
t.step = new_step
def current_loading_task(self) -> Optional[str]:
"""Id of the most-recently-started running ``loading`` task, if any.
......@@ -152,6 +170,13 @@ class TaskRegistry:
if message:
t.message = message
t.ended_at = time.time()
# Replace the live instantaneous rate with the run AVERAGE (it/s) so
# the history shows a stable, representative throughput.
if t.step and t.started_at:
_dur = (t.ended_at or t.started_at) - t.started_at
if _dur > 0 and not (t.kind == "text" and t.rate):
t.rate = round(t.step / _dur, 2)
self._rate_state.pop(tid, None)
self._prune_locked()
def cancel(self, tid: str) -> bool:
......@@ -234,6 +259,7 @@ class TaskRegistry:
with self._lock:
self._events.pop(tid, None)
self._pause_events.pop(tid, None)
self._rate_state.pop(tid, None)
return self._tasks.pop(tid, None) is not None
def get(self, tid: str) -> Optional[dict]:
......@@ -256,6 +282,7 @@ class TaskRegistry:
self._tasks.pop(t.id, None)
self._events.pop(t.id, None)
self._pause_events.pop(t.id, None)
self._rate_state.pop(t.id, None)
# Process-wide singleton.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment