quant: surface jobs on Tasks page + model list, persist across restart

- quant jobs now appear on the Tasks page (api_tasks emits kind=quantize) and as
  a live badge on the HF model-list row (polled; re-renders only on change).
- persist job state to <cache>/quantized/jobs.json; on startup a job left
  "running" is marked "interrupted" only if its owning PID is dead (merge-safe
  save so multiple processes don't clobber each other).
- gitignore the runtime model cache (models/), logs/, and the third-party
  GPTQModel/ source clone (installed into the venv, not part of this repo).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent 48be0d91
......@@ -17,6 +17,15 @@ __pycache__/
# Debug logs
debug.log
/logs/
# Runtime model cache (downloads, self-quantized checkpoints, job state).
# Root-anchored so it never shadows the tracked codai/models/ source package.
/models/
# Third-party source clone of the GPTQ quantizer — installed into the venv from
# source; the working tree is not part of this repo (it has its own .git).
/GPTQModel/
# Test files
test_*.py
......
......@@ -2449,6 +2449,37 @@ def api_tasks(username: str = Depends(require_admin)):
"restartable": False,
})
# GPTQ/AWQ quantization jobs run in in-process daemon threads (status persisted
# to disk so it survives a restart). Surface them alongside downloads/training.
try:
from codai.models import quant as _quant
for _name, _qj in _quant.all_jobs().items():
if _name in seen:
continue
seen.add(_name)
_qs = _qj.get("status") or "running"
_active = _qs == "running"
_pct = int(round((_qj.get("progress") or 0) * 100))
_msg = _qj.get("message") or ""
if _qj.get("error"):
_msg = str(_qj["error"])
tasks.append({
"id": f"quantize:{_name}",
"kind": "quantize",
"title": _name,
"model": (_qj.get("method") or "gptq").upper(),
"status": _qs,
"step": _pct, "total": 100, "percent": _pct,
"message": _msg,
"started_at": _qj.get("started"),
"active": _active,
"cancellable": False,
"pausable": False,
"restartable": _qs in ("failed", "interrupted"),
})
except Exception:
pass
# Successfully-finished work is dropped from the live list — a "done" job is
# no longer actionable, so it shouldn't clutter the view. Terminal-but-notable
# states (cancelled / error / interrupted) stay, so they can be inspected,
......
......@@ -88,6 +88,15 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
</div>
</div>
<!-- name filter (applies to HF models, GGUF files and components) -->
<div class="card" style="padding-top:.75rem;padding-bottom:.75rem">
<div style="display:flex;align-items:center;gap:.5rem">
<span class="fl">Filter</span>
<input type="text" id="local-filter" class="form-input" placeholder="Filter models by name…" style="flex:1" oninput="applyLocalFilter()">
<button class="btn btn-ghost btn-sm" onclick="document.getElementById('local-filter').value='';applyLocalFilter()">Clear</button>
</div>
</div>
<!-- HF models -->
<div class="card">
<div class="card-title">HuggingFace models <span id="hf-model-badge" class="muted small"></span></div>
......@@ -2151,7 +2160,10 @@ function _renderQuantStatus(job, caps){
el.innerHTML = `<span style="color:var(--ok,#16a34a)">✓ Quantized checkpoint ready — used automatically on next load.</span> <span class="muted">Kernels: ${esc(kernels)}</span>`;
} else if(job && job.status === 'failed'){
btn.disabled = false;
el.innerHTML = `<span style="color:var(--danger,#dc2626)">Quantization failed: ${esc(job.error||'')}</span> <span class="muted">Falls back to bitsandbytes.</span>`;
el.innerHTML = `<span style="color:var(--danger,#dc2626)">Quantization failed: ${esc(job.error||'')}</span> <span class="muted">Falls back to bitsandbytes. Click to retry.</span>`;
} else if(job && job.status === 'interrupted'){
btn.disabled = false;
el.innerHTML = `<span style="color:var(--warn,#d97706)">Quantization was interrupted by a server restart.</span> <span class="muted">Click to restart.</span>`;
} else {
btn.disabled = false;
el.innerHTML = `Marlin/ExLlama 4-bit (2–4× faster than bitsandbytes). One-time background job; checkpoint auto-used on next load. <span class="muted">Kernels: ${esc(kernels||'detecting…')}</span>`;
......@@ -2334,7 +2346,7 @@ async function loadCachedModels(){
const incompleteBadgeHf = m.incomplete ? '<span class="badge" style="background:rgba(255,160,0,.18);color:#b87200;font-size:10px;margin-left:.3rem" title="Download may be incomplete — some files are missing or truncated">⚠ incomplete</span>' : '';
const _hfConfigCount = _hfConfigs.length;
return `<tr${hlHf?' class="local-cap-highlight"':''} style="border-top:1px solid var(--border)${hlHf?';background:rgba(110,207,126,.07);outline:2px solid rgba(110,207,126,.25);outline-offset:-1px':''}${m.incomplete?';background:rgba(255,160,0,.04)':''}">
<td style="padding:.4rem .25rem;font-family:monospace;font-size:12px;max-width:260px;overflow:hidden;text-overflow:ellipsis" title="${esc(m.id)}">${esc(m.id)}${incompleteBadgeHf}${_hfConfigPills}</td>
<td style="padding:.4rem .25rem;font-family:monospace;font-size:12px;max-width:260px;overflow:hidden;text-overflow:ellipsis" title="${esc(m.id)}">${esc(m.id)}${incompleteBadgeHf}${_quantBadge(m.id)}${_hfConfigPills}</td>
<td style="text-align:right;padding:.4rem .25rem;white-space:nowrap;color:var(--text-2)">${fmtGB(m.size_gb)}</td>
<td style="text-align:right;padding:.4rem .25rem;color:var(--text-2)">${m.file_count}</td>
<td style="padding:.4rem .25rem;font-size:11px">${capBadges||'<span class="muted small">—</span>'}</td>
......@@ -2443,11 +2455,35 @@ async function loadCachedModels(){
const wsHtml = _renderWhisperServerRows(whisperModels);
if(wsHtml) ggufEl.insertAdjacentHTML('afterend', wsHtml);
resetWhisperServerBuilderDefaults();
applyLocalFilter();
}catch(e){
hfEl.innerHTML = ggufEl.innerHTML = `<span class="muted small">Error: ${esc(e.message)}</span>`;
}
}
/**
 * Apply the text in #local-filter to the HF-model, GGUF-file and component
 * lists. A row is kept when the text of its first cell (or of the whole row if
 * it has no cell) contains the query, case-insensitively; other rows are
 * hidden. Each list's badge is updated to "(shown/total)" while a query is
 * active and "(total)" otherwise. Lists that are absent or have no rows are
 * left untouched.
 */
function applyLocalFilter(){
  const input = document.getElementById('local-filter');
  const needle = (input ? input.value : '').trim().toLowerCase();
  const sections = [
    ['hf-models-list','hf-model-badge'],
    ['gguf-models-list','gguf-file-badge'],
    ['comp-list','comp-badge'],
  ];
  for(const [listId, badgeId] of sections){
    const container = document.getElementById(listId);
    if(!container) continue;
    const rows = container.querySelectorAll('tbody > tr');
    let visible = 0;
    for(const row of rows){
      const firstCell = row.querySelector('td');
      const label = (firstCell ? firstCell.textContent : row.textContent).toLowerCase();
      // An empty query matches everything.
      const keep = !needle || label.includes(needle);
      row.style.display = keep ? '' : 'none';
      if(keep) visible += 1;
    }
    const badge = document.getElementById(badgeId);
    if(badge && rows.length){
      badge.textContent = needle ? `(${visible}/${rows.length})` : `(${rows.length})`;
    }
  }
}
function toggleWhisperModelSource(){
const sourceSelect = document.getElementById('ws-model-source');
const source = sourceSelect ? sourceSelect.value : 'cached-gguf';
......@@ -2505,12 +2541,45 @@ function _instanceBadge(lookupPaths, cfgMax){
return '';
}
// ---- Quant-job badges on the model list (polled; re-renders only on change) ----
// Last-known quantization jobs keyed by model id, as returned by the status endpoint.
let _quantJobs = {};
// Signature of the job set the model list was last rendered with.
let _quantSig = '';
/**
 * Fetch the current quantization jobs into `_quantJobs` and return a signature
 * string describing them ("id:status:bucket" entries, sorted, joined by "|").
 * Any failure — network error, non-OK HTTP status, unparsable body — leaves
 * the last-known jobs in place, so the returned signature is then unchanged.
 */
async function refreshQuantJobs(){
  try{
    const r = await fetch('/admin/api/quantize-status');
    // An error response can still carry a JSON body without a `jobs` key;
    // without this check it would reset the table to {} instead of keeping
    // the last-known state.
    if(!r.ok) throw new Error(`HTTP ${r.status}`);
    _quantJobs = (await r.json()).jobs || {};
  }catch(e){ /* keep last-known on transient error */ }
  // Signature buckets progress to 5% so we only re-render on meaningful change.
  return Object.entries(_quantJobs)
    .map(([k,v])=>{
      const job = v || {};  // tolerate a null entry rather than throwing outside the try
      return `${k}:${job.status}:${Math.round((job.progress||0)*20)}`;
    }).sort().join('|');
}
/**
 * Build the HTML badge shown next to a model id in the HF model list for its
 * quantization job, read from `_quantJobs`. Returns '' when the model has no
 * job or the job is in a state with no badge.
 *
 * Text taken from the job is passed through esc() before being interpolated
 * (presumably an HTML-escaping helper defined elsewhere in this page — confirm).
 */
function _quantBadge(modelId){
  const j = _quantJobs[modelId];
  if(!j) return '';
  // String() guards against a non-string method value, which would otherwise throw.
  const m = esc(String(j.method||'gptq').toUpperCase());
  if(j.status==='running'){
    const pct = Math.round((j.progress||0)*100);
    return `<span class="badge" style="background:rgba(99,102,241,.18);color:#A5B4FC;font-size:10px;margin-left:.3rem" title="Quantizing to ${m}: ${esc(j.message||'')}">⏳ quantizing ${pct}%</span>`;
  }
  if(j.status==='failed') return `<span class="badge badge-err" style="font-size:10px;margin-left:.3rem" title="${esc(j.error||'')} — falls back to bitsandbytes">quant failed</span>`;
  if(j.status==='interrupted') return `<span class="badge" style="background:rgba(255,160,0,.18);color:#b87200;font-size:10px;margin-left:.3rem" title="Interrupted by a server restart — re-run from the model config">quant interrupted</span>`;
  if(j.status==='done'){
    // Report the job's own bit width; fall back to 4 when the job does not
    // carry a usable `bits` value.
    const bits = (Number.isInteger(j.bits) && j.bits > 0) ? j.bits : 4;
    return `<span class="badge badge-ok" style="font-size:10px;margin-left:.3rem" title="Fast-kernel ${m} checkpoint ready — used automatically on next load">${bits}-bit ready</span>`;
  }
  return '';
}
/**
 * Refresh the local-models view. Loaded-status and quant-job data are awaited
 * first, so they are current when the model list renders; the cache stats and
 * the model list are then reloaded without being awaited.
 */
async function refreshLocal(){
  await refreshLoadedStatus();
  const sig = await refreshQuantJobs();
  // Remember what this render is based on so the poller only re-renders on change.
  _quantSig = sig;
  loadCacheStats();
  loadCachedModels();
}
// Every 5 s, re-fetch the quant jobs and re-render the model list only if the
// job signature (status / bucketed progress) differs from the last render.
setInterval(() => {
  refreshQuantJobs().then(sig => {
    if (sig === _quantSig) return;
    _quantSig = sig;
    loadCachedModels();
  });
}, 5000);
loadGlobalSettings();
// Load engine/card info first so the per-model card tags render on the first paint,
// then re-render once it's available (covers the fetch resolving after the list).
......
......@@ -99,13 +99,83 @@ def find_quantized_checkpoint(model_name: str, method: str = "gptq") -> Optional
# --------------------------------------------------------------------------------
# Background quantization job
# Background quantization job (persisted so status survives a server restart)
# --------------------------------------------------------------------------------
# In-memory job table for this process. It is filled from the persisted
# jobs.json on import and written back to disk whenever a job changes.
_jobs: Dict[str, Dict[str, Any]] = {} # model_name -> job status dict
# Guards every read and write of ``_jobs`` (and the save that follows a write).
_jobs_lock = threading.Lock()
def _jobs_file() -> Path:
    """Return the path of the persisted job table: ``<cache>/quantized/jobs.json``."""
    cache_root = Path(get_model_cache_dir())
    return cache_root.joinpath("quantized", "jobs.json")
def _pid_alive(pid) -> bool:
"""True if process ``pid`` is still running. Conservative: unknown → alive."""
try:
pid = int(pid)
except (TypeError, ValueError):
return False
try:
os.kill(pid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
return True # exists but owned by another user
except Exception:
return True
def _save_jobs_locked() -> None:
    """Persist the job table to disk. Caller holds ``_jobs_lock``.

    Merges with whatever is on disk so a *different* process's jobs (the front
    and engines each import this module) aren't erased by a last-writer-wins
    overwrite. An in-memory entry replaces the on-disk one when this process
    owns it (its ``pid`` is ours), or when the on-disk entry is missing,
    malformed, or owned by a process that is no longer alive. An on-disk entry
    owned by another live process is kept as-is: our in-memory copy of it is
    only a snapshot taken at import time and may be stale.

    The write goes through a per-process temp file that is then renamed over
    ``jobs.json``, so concurrent writers never share a half-written temp file.
    Persistence is best-effort: failures are logged at debug level and never
    raised.
    """
    try:
        import json
        import os

        f = _jobs_file()
        f.parent.mkdir(parents=True, exist_ok=True)

        merged = {}
        try:
            if f.is_file():
                disk = json.loads(f.read_text())
                if isinstance(disk, dict):
                    merged.update(disk)
        except Exception:
            # Unreadable or corrupt file: fall back to writing what we hold.
            pass

        me = os.getpid()
        for name, job in _jobs.items():
            on_disk = merged.get(name)
            owned_here = isinstance(job, dict) and job.get("pid") == me
            foreign_owner_alive = (
                isinstance(on_disk, dict)
                and on_disk.get("pid") != me
                and _pid_alive(on_disk.get("pid"))
            )
            if owned_here or not foreign_owner_alive:
                merged[name] = job

        # Per-process temp name: two processes saving at once must not write
        # into the same temp file before the rename.
        tmp = f.with_name(f"{f.name}.{me}.tmp")
        tmp.write_text(json.dumps(merged))
        tmp.replace(f)
    except Exception:
        import logging

        logging.getLogger(__name__).debug(
            "could not persist quantization jobs", exc_info=True)
def _load_jobs() -> None:
    """Load persisted jobs into ``_jobs`` on import.

    A job left 'running' whose owning process is no longer alive can't still be
    running, so it is marked 'interrupted'. A job still owned by a live process
    (another engine, or this one) is left untouched. Entries that are not dicts
    are skipped, because every consumer of ``_jobs`` treats its values as dicts.

    Loading is best-effort: a missing, unreadable or malformed file leaves
    ``_jobs`` as it was, and failures are logged at debug level only.
    """
    try:
        import json

        f = _jobs_file()
        if not f.is_file():
            return
        data = json.loads(f.read_text())
        if not isinstance(data, dict):
            return
        for name, job in data.items():
            if not isinstance(job, dict):
                # A malformed entry would later break .get()/.update() callers.
                continue
            if job.get("status") == "running" and not _pid_alive(job.get("pid")):
                job["status"] = "interrupted"
                job["message"] = "interrupted by server restart"
            _jobs[name] = job
    except Exception:
        import logging

        logging.getLogger(__name__).debug(
            "could not load persisted quantization jobs", exc_info=True)
def get_job(model_name: str) -> Optional[Dict[str, Any]]:
with _jobs_lock:
j = _jobs.get(model_name)
......@@ -121,6 +191,10 @@ def _set_job(model_name: str, **fields) -> None:
with _jobs_lock:
j = _jobs.setdefault(model_name, {"model": model_name})
j.update(fields)
_save_jobs_locked()
_load_jobs()
def start_quantization(model_name: str, method: str = "gptq", bits: int = 4,
......@@ -149,7 +223,8 @@ def start_quantization(model_name: str, method: str = "gptq", bits: int = 4,
_jobs[model_name] = {"model": model_name, "method": method, "bits": bits,
"status": "running", "progress": 0.0,
"message": "starting", "started": time.time(),
"error": None, "output": None}
"pid": os.getpid(), "error": None, "output": None}
_save_jobs_locked()
t = threading.Thread(
target=_quantize_worker,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment