quant: surface jobs on Tasks page + model list, persist across restart

- quant jobs now appear on the Tasks page (api_tasks emits kind=quantize) and as a live badge on the HF model-list row (polled; re-renders only on change).
- persist job state to <cache>/quantized/jobs.json; on startup a job left "running" is marked "interrupted" only if its owning PID is dead (merge-safe save so multiple processes don't clobber each other).
- gitignore the runtime model cache (models/), logs/, and the third-party GPTQModel/ source clone (installed into the venv, not part of this repo).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

quant: surface jobs on Tasks page + model list, persist across restart
- quant jobs now appear on the Tasks page (api_tasks emits kind=quantize) and as a live badge on the HF model-list row (polled; re-renders only on change).
- persist job state to <cache>/quantized/jobs.json; on startup a job left "running" is marked "interrupted" only if its owning PID is dead (merge-safe save so multiple processes don't clobber each other).
- gitignore the runtime model cache (models/), logs/, and the third-party GPTQModel/ source clone (installed into the venv, not part of this repo).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
6d053dc1 · Stefy Lanza (nextime / spora ) · 48be0d91 · 6d053dc1 · 6d053dc1 · 6d053dc1
Commit 6d053dc1 authored Jun 18, 2026 by Stefy Lanza (nextime / spora )
Show whitespace changes
Inline Side-by-side

Showing with 188 additions and 4 deletions

.gitignore .gitignore +9 -0

routes.py codai/admin/routes.py +31 -0

models.html codai/admin/templates/models.html +71 -2

quant.py codai/models/quant.py +77 -2

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,15 @@ __pycache__/

 # Debug logs
 debug.log
+/logs/
+
+# Runtime model cache (downloads, self-quantized checkpoints, job state).
+# Root-anchored so it never shadows the tracked codai/models/ source package.
+/models/
+
+# Third-party source clone of the GPTQ quantizer — installed into the venv from
+# source; the working tree is not part of this repo (it has its own .git).
+/GPTQModel/

 # Test files
 test_*.py

--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -2449,6 +2449,37 @@ def api_tasks(username: str = Depends(require_admin)):
            "restartable": False,
        })

+    # GPTQ/AWQ quantization jobs run in in-process daemon threads (status persisted
+    # to disk so it survives a restart). Surface them alongside downloads/training.
+    try:
+        from codai.models import quant as _quant
+        for _name, _qj in _quant.all_jobs().items():
+            if _name in seen:
+                continue
+            seen.add(_name)
+            _qs = _qj.get("status") or "running"
+            _active = _qs == "running"
+            _pct = int(round((_qj.get("progress") or 0) * 100))
+            _msg = _qj.get("message") or ""
+            if _qj.get("error"):
+                _msg = str(_qj["error"])
+            tasks.append({
+                "id": f"quantize:{_name}",
+                "kind": "quantize",
+                "title": _name,
+                "model": (_qj.get("method") or "gptq").upper(),
+                "status": _qs,
+                "step": _pct, "total": 100, "percent": _pct,
+                "message": _msg,
+                "started_at": _qj.get("started"),
+                "active": _active,
+                "cancellable": False,
+                "pausable": False,
+                "restartable": _qs in ("failed", "interrupted"),
+            })
+    except Exception:
+        pass
+
    # Successfully-finished work is dropped from the live list — a "done" job is
    # no longer actionable, so it shouldn't clutter the view. Terminal-but-notable
    # states (cancelled / error / interrupted) stay, so they can be inspected,

--- a/codai/admin/templates/models.html
+++ b/codai/admin/templates/models.html
@@ -88,6 +88,15 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
    </div>
  </div>

+  <!-- name filter (applies to HF models, GGUF files and components) -->
+  <div class="card" style="padding-top:.75rem;padding-bottom:.75rem">
+    <div style="display:flex;align-items:center;gap:.5rem">
+      <span class="fl">Filter</span>
+      <input type="text" id="local-filter" class="form-input" placeholder="Filter models by name…" style="flex:1" oninput="applyLocalFilter()">
+      <button class="btn btn-ghost btn-sm" onclick="document.getElementById('local-filter').value='';applyLocalFilter()">Clear</button>
+    </div>
+  </div>
+
  <!-- HF models -->
  <div class="card">
    <div class="card-title">HuggingFace models <span id="hf-model-badge" class="muted small"></span></div>
@@ -2151,7 +2160,10 @@ function _renderQuantStatus(job, caps){
    el.innerHTML = `<span style="color:var(--ok,#16a34a)">✓ Quantized checkpoint ready — used automatically on next load.</span> <span class="muted">Kernels: ${esc(kernels)}</span>`;
  } else if(job && job.status === 'failed'){
    btn.disabled = false;
-    el.innerHTML = `<span style="color:var(--danger,#dc2626)">Quantization failed: ${esc(job.error||'')}</span> <span class="muted">Falls back to bitsandbytes.</span>`;
+    el.innerHTML = `<span style="color:var(--danger,#dc2626)">Quantization failed: ${esc(job.error||'')}</span> <span class="muted">Falls back to bitsandbytes. Click to retry.</span>`;
+  } else if(job && job.status === 'interrupted'){
+    btn.disabled = false;
+    el.innerHTML = `<span style="color:var(--warn,#d97706)">Quantization was interrupted by a server restart.</span> <span class="muted">Click to restart.</span>`;
  } else {
    btn.disabled = false;
    el.innerHTML = `Marlin/ExLlama 4-bit (2–4× faster than bitsandbytes). One-time background job; checkpoint auto-used on next load. <span class="muted">Kernels: ${esc(kernels||'detecting…')}</span>`;
@@ -2334,7 +2346,7 @@ async function loadCachedModels(){
        const incompleteBadgeHf = m.incomplete ? '<span class="badge" style="background:rgba(255,160,0,.18);color:#b87200;font-size:10px;margin-left:.3rem" title="Download may be incomplete — some files are missing or truncated">⚠ incomplete</span>' : '';
        const _hfConfigCount = _hfConfigs.length;
        return `<tr${hlHf?' class="local-cap-highlight"':''} style="border-top:1px solid var(--border)${hlHf?';background:rgba(110,207,126,.07);outline:2px solid rgba(110,207,126,.25);outline-offset:-1px':''}${m.incomplete?';background:rgba(255,160,0,.04)':''}">
-          <td style="padding:.4rem .25rem;font-family:monospace;font-size:12px;max-width:260px;overflow:hidden;text-overflow:ellipsis" title="${esc(m.id)}">${esc(m.id)}${incompleteBadgeHf}${_hfConfigPills}</td>
+          <td style="padding:.4rem .25rem;font-family:monospace;font-size:12px;max-width:260px;overflow:hidden;text-overflow:ellipsis" title="${esc(m.id)}">${esc(m.id)}${incompleteBadgeHf}${_quantBadge(m.id)}${_hfConfigPills}</td>
          <td style="text-align:right;padding:.4rem .25rem;white-space:nowrap;color:var(--text-2)">${fmtGB(m.size_gb)}</td>
          <td style="text-align:right;padding:.4rem .25rem;color:var(--text-2)">${m.file_count}</td>
          <td style="padding:.4rem .25rem;font-size:11px">${capBadges||'<span class="muted small">—</span>'}</td>
@@ -2443,11 +2455,35 @@ async function loadCachedModels(){
    const wsHtml = _renderWhisperServerRows(whisperModels);
    if(wsHtml) ggufEl.insertAdjacentHTML('afterend', wsHtml);
    resetWhisperServerBuilderDefaults();
+    applyLocalFilter();
  }catch(e){
    hfEl.innerHTML = ggufEl.innerHTML = `<span class="muted small">Error: ${esc(e.message)}</span>`;
  }
 }

+// Filter HF models, GGUF files and component rows by name match (first cell).
+function applyLocalFilter(){
+  const inp = document.getElementById('local-filter');
+  const q = (inp ? inp.value : '').trim().toLowerCase();
+  [['hf-models-list','hf-model-badge'],
+   ['gguf-models-list','gguf-file-badge'],
+   ['comp-list','comp-badge']].forEach(([listId,badgeId])=>{
+    const el = document.getElementById(listId);
+    if(!el) return;
+    const rows = el.querySelectorAll('tbody > tr');
+    let shown = 0;
+    rows.forEach(tr=>{
+      const cell = tr.querySelector('td');
+      const name = (cell ? cell.textContent : tr.textContent).toLowerCase();
+      const match = !q || name.includes(q);
+      tr.style.display = match ? '' : 'none';
+      if(match) shown++;
+    });
+    const badge = document.getElementById(badgeId);
+    if(badge && rows.length){ badge.textContent = q ? `(${shown}/${rows.length})` : `(${rows.length})`; }
+  });
+}
+
 function toggleWhisperModelSource(){
  const sourceSelect = document.getElementById('ws-model-source');
  const source = sourceSelect ? sourceSelect.value : 'cached-gguf';
@@ -2505,12 +2541,45 @@ function _instanceBadge(lookupPaths, cfgMax){
  return '';
 }

+// ---- Quant-job badges on the model list (polled; re-renders only on change) ----
+let _quantJobs = {};
+let _quantSig = '';
+async function refreshQuantJobs(){
+  try{
+    const r = await fetch('/admin/api/quantize-status');
+    _quantJobs = (await r.json()).jobs || {};
+  }catch(e){ /* keep last-known on transient error */ }
+  // Signature buckets progress to 5% so we only re-render on meaningful change.
+  return Object.entries(_quantJobs)
+    .map(([k,v])=>`${k}:${v.status}:${Math.round((v.progress||0)*20)}`).sort().join('|');
+}
+function _quantBadge(modelId){
+  const j = _quantJobs[modelId];
+  if(!j) return '';
+  const m = esc((j.method||'gptq').toUpperCase());
+  if(j.status==='running'){
+    const pct = Math.round((j.progress||0)*100);
+    return `<span class="badge" style="background:rgba(99,102,241,.18);color:#A5B4FC;font-size:10px;margin-left:.3rem" title="Quantizing to ${m}: ${esc(j.message||'')}">⏳ quantizing ${pct}%</span>`;
+  }
+  if(j.status==='failed') return `<span class="badge badge-err" style="font-size:10px;margin-left:.3rem" title="${esc(j.error||'')} — falls back to bitsandbytes">quant failed</span>`;
+  if(j.status==='interrupted') return `<span class="badge" style="background:rgba(255,160,0,.18);color:#b87200;font-size:10px;margin-left:.3rem" title="Interrupted by a server restart — re-run from the model config">quant interrupted</span>`;
+  if(j.status==='done') return `<span class="badge badge-ok" style="font-size:10px;margin-left:.3rem" title="Fast-kernel ${m} checkpoint ready — used automatically on next load">4-bit ready</span>`;
+  return '';
+}
+
 async function refreshLocal(){
  await refreshLoadedStatus();
+  _quantSig = await refreshQuantJobs();
  loadCacheStats();
  loadCachedModels();
 }

+// Poll quant jobs; re-render the list only when a job's state/progress changes.
+setInterval(async () => {
+  const sig = await refreshQuantJobs();
+  if (sig !== _quantSig) { _quantSig = sig; loadCachedModels(); }
+}, 5000);
+
 loadGlobalSettings();
 // Load engine/card info first so the per-model card tags render on the first paint,
 // then re-render once it's available (covers the fetch resolving after the list).

--- a/codai/models/quant.py
+++ b/codai/models/quant.py
@@ -99,13 +99,83 @@ def find_quantized_checkpoint(model_name: str, method: str = "gptq") -> Optional


 # --------------------------------------------------------------------------------
-# Background quantization job
+# Background quantization job (persisted so status survives a server restart)
 # --------------------------------------------------------------------------------

 _jobs: Dict[str, Dict[str, Any]] = {}     # model_name -> job status dict
 _jobs_lock = threading.Lock()


+def _jobs_file() -> Path:
+    return Path(get_model_cache_dir()) / "quantized" / "jobs.json"
+
+
+def _pid_alive(pid) -> bool:
+    """True if process ``pid`` is still running. Conservative: unknown → alive."""
+    try:
+        pid = int(pid)
+    except (TypeError, ValueError):
+        return False
+    try:
+        os.kill(pid, 0)
+        return True
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        return True  # exists but owned by another user
+    except Exception:
+        return True
+
+
+def _save_jobs_locked() -> None:
+    """Persist the job table to disk. Caller holds ``_jobs_lock``.
+
+    Merges with whatever is on disk so a *different* process's jobs (the front and
+    engines each import this module) aren't erased by a last-writer-wins overwrite.
+    This process's in-memory entries win for keys it owns.
+    """
+    try:
+        import json
+        f = _jobs_file()
+        f.parent.mkdir(parents=True, exist_ok=True)
+        merged = {}
+        try:
+            if f.is_file():
+                disk = json.loads(f.read_text())
+                if isinstance(disk, dict):
+                    merged.update(disk)
+        except Exception:
+            pass
+        merged.update(_jobs)
+        tmp = f.with_suffix(".json.tmp")
+        tmp.write_text(json.dumps(merged))
+        tmp.replace(f)
+    except Exception:
+        pass
+
+
+def _load_jobs() -> None:
+    """Load persisted jobs on import. A job left 'running' whose owning process is
+    no longer alive can't still be running → mark interrupted. A job still owned by
+    a live process (another engine, or this one) is left untouched."""
+    try:
+        import json
+        f = _jobs_file()
+        if not f.is_file():
+            return
+        data = json.loads(f.read_text())
+        if not isinstance(data, dict):
+            return
+        for name, job in data.items():
+            if (isinstance(job, dict) and job.get("status") == "running"
+                    and not _pid_alive(job.get("pid"))):
+                job["status"] = "interrupted"
+                job["message"] = "interrupted by server restart"
+            _jobs[name] = job
+    except Exception:
+        pass
+
+
 def get_job(model_name: str) -> Optional[Dict[str, Any]]:
    with _jobs_lock:
        j = _jobs.get(model_name)
@@ -121,6 +191,10 @@ def _set_job(model_name: str, **fields) -> None:
    with _jobs_lock:
        j = _jobs.setdefault(model_name, {"model": model_name})
        j.update(fields)
+        _save_jobs_locked()
+
+
+_load_jobs()


 def start_quantization(model_name: str, method: str = "gptq", bits: int = 4,
@@ -149,7 +223,8 @@ def start_quantization(model_name: str, method: str = "gptq", bits: int = 4,
        _jobs[model_name] = {"model": model_name, "method": method, "bits": bits,
                             "status": "running", "progress": 0.0,
                             "message": "starting", "started": time.time(),
-                             "error": None, "output": None}
+                             "pid": os.getpid(), "error": None, "output": None}
+        _save_jobs_locked()

    t = threading.Thread(
        target=_quantize_worker,