feat: per-model auto-compact of the conversation context (off by default)

When enabled for a model, if the prompt would exceed auto_compact_pct% of the model's context window, the conversation is shrunk to ~65% of that window before generation instead of erroring on overflow.

Per-model config (auto_compact / auto_compact_pct / auto_compact_strategy) with three strategies:
- drop_oldest: keep system messages + the most recent turns that fit.
- keep_head_tail: also keep the first user turn as an anchor + a count note.
- summarize: replace the dropped middle with a best-effort LLM summary (generated by the loaded model; falls back to a count note).

Token size is a cheap chars/4 estimate; membership uses object identity so value-equal turns don't collide.

Wired into the chat path (codai/api/text.py), the model-configure whitelist, and the model config modal UI.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

feat: per-model auto-compact of the conversation context (off by default)
When enabled for a model, if the prompt would exceed auto_compact_pct% of the model's context window, the conversation is shrunk to ~65% of that window before generation instead of erroring on overflow.

Per-model config (auto_compact / auto_compact_pct / auto_compact_strategy) with three strategies:
- drop_oldest: keep system messages + the most recent turns that fit.
- keep_head_tail: also keep the first user turn as an anchor + a count note.
- summarize: replace the dropped middle with a best-effort LLM summary (generated by the loaded model; falls back to a count note).

Token size is a cheap chars/4 estimate; membership uses object identity so value-equal turns don't collide.

Wired into the chat path (codai/api/text.py), the model-configure whitelist, and the model config modal UI.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
a019905f · Stefy Lanza (nextime / spora ) · 8c85e16a · a019905f · a019905f · a019905f
Commit a019905f authored Jun 19, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 183 additions and 3 deletions

routes.py codai/admin/routes.py +2 -1

models.html codai/admin/templates/models.html +32 -0

text.py codai/api/text.py +149 -2

No files found.
--- a/codai/admin/routes.py
+++ b/codai/admin/routes.py
@@ -2471,7 +2471,8 @@ async def api_model_configure(request: Request, username: str = Depends(require_
                "component_quantization", "output_crf", "force_vram_update",
                "balanced_gpu_percent", "acceleration",
                "cache_type_k", "cache_type_v", "turboquant", "engine",
-                "quant_backend", "kv_cache_budget_mb", "kv_cache_slots", "mmproj"):
+                "quant_backend", "kv_cache_budget_mb", "kv_cache_slots", "mmproj",
+                "auto_compact", "auto_compact_pct", "auto_compact_strategy"):
        if key in data:
            entry[key] = data[key]

--- a/codai/admin/templates/models.html
+++ b/codai/admin/templates/models.html
@@ -666,6 +666,26 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
          <input type="number" id="cfg-ram-gb" class="form-input" min="0" step="0.5" placeholder="auto">
        </div>
      </div>
+      <!-- Auto-compact: shrink an over-long conversation before generation -->
+      <div class="card-title" style="margin-top:1.25rem">Auto-compact context</div>
+      <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-autocompact" onchange="_toggleAutoCompact()"> Auto-compact when the prompt nears the context limit <span class="muted">(off by default)</span></label>
+      <div id="cfg-autocompact-opts" style="display:none;grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.5rem">
+        <div class="form-row" style="margin:0">
+          <label class="form-label">Trigger at <span class="muted">(% of context)</span></label>
+          <input type="number" id="cfg-autocompact-pct" class="form-input" min="50" max="99" step="1" value="85">
+          <span class="form-hint" style="font-size:11px">Compacts to ~65% of the context window when the prompt reaches this %.</span>
+        </div>
+        <div class="form-row" style="margin:0">
+          <label class="form-label">Strategy</label>
+          <select id="cfg-autocompact-strategy" class="form-input">
+            <option value="drop_oldest">Drop oldest — keep system + most recent turns</option>
+            <option value="keep_head_tail">Keep head + tail — also keep the first turn, drop the middle</option>
+            <option value="summarize">Summarize — replace the dropped middle with an LLM summary</option>
+          </select>
+        </div>
+      </div>
      <div style="display:flex;gap:1.5rem;flex-wrap:wrap;margin-top:.75rem">
        <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-4bit"> 4-bit quantization</label>
        <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-8bit"> 8-bit quantization</label>
@@ -2747,6 +2767,11 @@ async function freeDiskConfirm(idx){
  }catch(e){showAlert('Error: '+e.message)}
 }
+// Show the auto-compact options grid only while the enabling checkbox is ticked.
+function _toggleAutoCompact(){
+  const checkbox = document.getElementById('cfg-autocompact');
+  const enabled = checkbox.checked;
+  const optsPanel = document.getElementById('cfg-autocompact-opts');
+  optsPanel.style.display = enabled ? 'grid' : 'none';
+}
 /* ── type checkbox helpers ─────────────────────────────── */
 function _autoDetectParser(path) {
  const n = (path || '').toLowerCase();
@@ -3115,6 +3140,10 @@ function openCfgModal(idx, cfgIdx){
  }
  document.getElementById('cfg-max-gpu').value = s.max_gpu_percent != null ? s.max_gpu_percent : '';
  document.getElementById('cfg-ram-gb').value = s.manual_ram_gb != null ? s.manual_ram_gb : '';
+  document.getElementById('cfg-autocompact').checked = !!s.auto_compact;
+  document.getElementById('cfg-autocompact-pct').value = s.auto_compact_pct != null ? s.auto_compact_pct : 85;
+  document.getElementById('cfg-autocompact-strategy').value = s.auto_compact_strategy || 'drop_oldest';
+  _toggleAutoCompact();
  document.getElementById('cfg-4bit').checked = !!s.load_in_4bit;
  document.getElementById('cfg-8bit').checked = !!s.load_in_8bit;
  document.getElementById('cfg-quant-backend').value = s.quant_backend || 'auto';
@@ -3465,6 +3494,9 @@ async function saveModelConfig(){
    cache_type_k:      document.getElementById('cfg-cache-type-k').value || null,
    cache_type_v:      document.getElementById('cfg-cache-type-v').value || null,
    mmproj:            document.getElementById('cfg-mmproj').value || null,
+    auto_compact:      document.getElementById('cfg-autocompact').checked,
+    auto_compact_pct:  parseInt(document.getElementById('cfg-autocompact-pct').value) || 85,
+    auto_compact_strategy: document.getElementById('cfg-autocompact-strategy').value || 'drop_oldest',
    max_gpu_percent:   isNaN(maxGpu) ? null : maxGpu,
    manual_ram_gb:     isNaN(ramGb) ? null : ramGb,
    load_in_4bit:      document.getElementById('cfg-4bit').checked,

--- a/codai/api/text.py
+++ b/codai/api/text.py
@@ -299,6 +299,125 @@ def _normalize_tool_call_arguments(tool_calls):
    return out
+def _estimate_tokens(messages) -> int:
+    """Approximate the prompt size of ``messages`` in tokens.
+
+    Per message, counts the characters of a string ``content`` (or of every
+    string ``text`` part when ``content`` is a list), adds the JSON length of
+    any ``tool_calls`` and a flat 16 characters of overhead. The character
+    total is converted at ~4 characters per token, plus a constant 8.
+
+    This is a heuristic for deciding whether to auto-compact, not an exact
+    tokenizer count.
+    """
+    def _message_chars(msg) -> int:
+        content = msg.get("content")
+        if isinstance(content, str):
+            size = len(content)
+        elif isinstance(content, list):
+            # Multipart content: only parts carrying a string "text" are counted.
+            size = sum(len(part["text"]) for part in content
+                       if isinstance(part, dict) and isinstance(part.get("text"), str))
+        else:
+            size = 0
+        calls = msg.get("tool_calls")
+        if calls:
+            try:
+                size += len(json.dumps(calls))
+            except Exception:
+                # Tool calls that cannot be serialized add nothing to the estimate.
+                pass
+        return size + 16
+    chars = sum(_message_chars(msg) for msg in messages)
+    return int(chars / 4) + 8
+def _compact_messages(messages, n_ctx, pct, strategy, summary_text=None):
+    """Shrink an over-long message list to ~65% of ``n_ctx``.
+
+    System messages are always kept (moved to the front), followed by the most
+    recent non-system turns that fit the budget. Strategies:
+      - drop_oldest   : keep only system + the recent tail that fits.
+      - keep_head_tail: also keep the first user turn (context anchor) + a note.
+      - summarize     : keep_head_tail, but replace the dropped middle with an LLM
+                        summary (``summary_text``) when available, else a count note.
+
+    Args:
+        messages: list of chat message dicts (``role`` / ``content`` / ...).
+        n_ctx: context window size in tokens; falsy or <= 0 disables compaction.
+        pct: trigger threshold as a percentage of ``n_ctx``; clamped to 50..99,
+            and replaced by 85 when it cannot be converted to a float.
+        strategy: one of the strategy names above; any other value behaves
+            like ``drop_oldest``.
+        summary_text: optional pre-computed summary used by ``summarize``.
+
+    Returns:
+        ``(new_messages, info)``. ``info`` is ``None`` and ``messages`` is
+        returned unchanged when the estimate is under the threshold or nothing
+        can be dropped; otherwise ``info`` holds ``dropped``, ``strategy``,
+        ``before_tokens``, ``after_tokens`` and ``n_ctx``.
+    """
+    if not messages or not n_ctx or n_ctx <= 0:
+        return messages, None
+    try:
+        pct = float(pct)
+    except (TypeError, ValueError):
+        pct = 85.0
+    pct = min(99.0, max(50.0, pct))
+    est = _estimate_tokens(messages)
+    if est < n_ctx * pct / 100.0:
+        return messages, None
+    target = int(n_ctx * 0.65)
+    sys_msgs = [m for m in messages if m.get("role") == "system"]
+    body = [m for m in messages if m.get("role") != "system"]
+    # The kept tail is always a contiguous suffix body[start:], so track its
+    # start index instead of repeatedly inserting at the front of a list.
+    # The newest message is always kept, even if it alone exceeds the budget.
+    running = _estimate_tokens(sys_msgs)
+    start = len(body)
+    for idx in range(len(body) - 1, -1, -1):
+        cost = _estimate_tokens([body[idx]])
+        if start < len(body) and running + cost > target:
+            break
+        start = idx
+        running += cost
+    # Don't start the kept tail on an orphaned tool result (its assistant
+    # tool_calls turn would have been dropped) — that breaks chat templates.
+    first_non_tool = start
+    while first_non_tool < len(body) and body[first_non_tool].get("role") == "tool":
+        first_non_tool += 1
+    if first_non_tool < len(body):
+        start = first_non_tool
+    else:
+        # The whole tail is tool results (e.g. one very large tool output).
+        # Dropping them would discard the newest turns and could leave only
+        # system messages, so extend the tail back to the turn that owns them.
+        while start > 0 and body[start].get("role") == "tool":
+            start -= 1
+    tail = body[start:]
+    # Track membership by object identity — message dicts can be value-equal
+    # (e.g. duplicate "try again" turns or identical tool results).
+    tail_ids = {id(m) for m in tail}
+    head = []
+    head_ids = set()
+    if strategy in ("keep_head_tail", "summarize"):
+        first_user = next((m for m in body if m.get("role") == "user"), None)
+        if first_user is not None and id(first_user) not in tail_ids:
+            head = [first_user]
+            head_ids.add(id(first_user))
+    dropped = [m for m in body if id(m) not in head_ids and id(m) not in tail_ids]
+    if not dropped:
+        return messages, None
+    notes = []
+    if strategy == "summarize" and summary_text:
+        notes.append({"role": "system",
+                      "content": "[Summary of earlier conversation]\n" + summary_text})
+    elif strategy in ("keep_head_tail", "summarize"):
+        notes.append({"role": "system",
+                      "content": f"[Note: {len(dropped)} earlier message(s) omitted to fit the context window.]"})
+    new = sys_msgs + head + notes + tail
+    return new, {"dropped": len(dropped), "strategy": strategy,
+                 "before_tokens": est, "after_tokens": _estimate_tokens(new),
+                 "n_ctx": n_ctx}
+async def _summarize_for_compact(manager, messages, keep_recent: int = 4):
+    """Best-effort summary of the older turns, generated by the loaded model.
+
+    Builds a plain-text transcript of the non-system messages, excluding the
+    last ``keep_recent`` (or of all of them when there are not more than
+    ``keep_recent``), truncates each turn to 2000 characters and the whole
+    transcript to 12000, and asks ``manager.generate_chat`` (run in a worker
+    thread) for a concise summary.
+
+    Args:
+        manager: object exposing ``generate_chat(messages=, max_tokens=, temperature=)``.
+        messages: list of chat message dicts.
+        keep_recent: number of newest non-system turns left out of the
+            summary; negative values are treated as 0.
+
+    Returns:
+        The stripped summary string, or ``None`` when there is nothing to
+        summarize, the model returns nothing, or any error occurs (the caller
+        falls back to a count note).
+    """
+    try:
+        body = [m for m in messages if m.get("role") != "system"]
+        keep_recent = max(0, keep_recent)
+        # Use an explicit end index: body[:-0] would be an empty slice, so
+        # keep_recent=0 must not go through a negative-index slice.
+        older = body[:len(body) - keep_recent] if len(body) > keep_recent else body
+        if not older:
+            return None
+        lines = []
+        for m in older:
+            c = m.get("content")
+            if isinstance(c, list):
+                # Join only string text parts; a non-string "text" would make
+                # str.join raise and abort the whole summary.
+                c = " ".join(it["text"] for it in c
+                             if isinstance(it, dict) and isinstance(it.get("text"), str))
+            elif c is None:
+                # e.g. assistant tool-call turns: avoid a literal "None" in the transcript.
+                c = ""
+            lines.append(f"{m.get('role', '?')}: {str(c)[:2000]}")
+        convo = "\n".join(lines)[:12000]
+        prompt = [
+            {"role": "system", "content": "Summarize the following conversation "
+             "concisely, preserving key facts, decisions, code, file paths and open "
+             "tasks. Output only the summary."},
+            {"role": "user", "content": convo},
+        ]
+        out = await asyncio.to_thread(
+            manager.generate_chat, messages=prompt, max_tokens=400, temperature=0.2)
+        return (out or "").strip() or None
+    except Exception as e:
+        # Deliberately broad: summarization is optional and must never fail the request.
+        print(f"[auto-compact] summary generation failed: {e}", flush=True)
+        return None
 @router.post("/v1/chat/completions", summary="Chat completions")
 async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
    """Chat completions endpoint with streaming and tool support."""
@@ -845,8 +964,36 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
        # A list is legitimate multipart vision content — leave it intact.
        elif not isinstance(m["content"], str) and not isinstance(m["content"], list):
            messages_dict[i]["content"] = str(m["content"])
+    # Auto-compact (per-model, OFF by default): when the prompt would exceed
+    # `auto_compact_pct`% of the model's context window, shrink it to ~65% using
+    # the configured strategy (drop_oldest | keep_head_tail | summarize) instead of
+    # erroring out on overflow.
+    try:
+        from codai.models.manager import multi_model_manager as _mmm
+        _cc = _mmm._config_for_model(getattr(request, "model", None) or "") or {}
+    except Exception:
+        _cc = {}
+    if _cc.get("auto_compact"):
+        try:
+            _nctx = current_manager.get_context_size() if current_manager else 0
+        except Exception:
+            _nctx = 0
+        _pct = _cc.get("auto_compact_pct", 85)
+        _strategy = (_cc.get("auto_compact_strategy") or "drop_oldest").strip()
+        if _nctx and _estimate_tokens(messages_dict) >= _nctx * float(_pct or 85) / 100.0:
+            _summary = None
+            if _strategy == "summarize":
+                _summary = await _summarize_for_compact(current_manager, messages_dict)
+            messages_dict, _info = _compact_messages(
+                messages_dict, _nctx, _pct, _strategy, _summary)
+            if _info:
+                print(f"[auto-compact] {getattr(request, 'model', '?')}: "
+                      f"~{_info['before_tokens']}→{_info['after_tokens']} tokens "
+                      f"(n_ctx={_nctx}, dropped {_info['dropped']} msgs via "
+                      f"{_info['strategy']})", flush=True)
    # Convert tools to dict format if present
    tools_dict = None
    if request.tools: