feat: per-model auto-compact of the conversation context (off by default)

When enabled for a model, if the prompt would exceed auto_compact_pct% of the
model's context window, the conversation is shrunk to ~65% before generation
instead of erroring on overflow. Per-model config (auto_compact / auto_compact_pct
/ auto_compact_strategy) with three strategies:
  - drop_oldest    : keep system messages + the most recent turns that fit.
  - keep_head_tail : also keep the first user turn as an anchor + a count note.
  - summarize      : replace the dropped middle with a best-effort LLM summary
                     (generated by the loaded model; falls back to a count note).

Token size is a cheap chars/4 estimate; membership uses object identity so
value-equal turns don't collide. Wired into the chat path (codai/api/text.py),
the model-configure whitelist, and the model config modal UI.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent 8c85e16a
...@@ -2471,7 +2471,8 @@ async def api_model_configure(request: Request, username: str = Depends(require_ ...@@ -2471,7 +2471,8 @@ async def api_model_configure(request: Request, username: str = Depends(require_
"component_quantization", "output_crf", "force_vram_update", "component_quantization", "output_crf", "force_vram_update",
"balanced_gpu_percent", "acceleration", "balanced_gpu_percent", "acceleration",
"cache_type_k", "cache_type_v", "turboquant", "engine", "cache_type_k", "cache_type_v", "turboquant", "engine",
"quant_backend", "kv_cache_budget_mb", "kv_cache_slots", "mmproj"): "quant_backend", "kv_cache_budget_mb", "kv_cache_slots", "mmproj",
"auto_compact", "auto_compact_pct", "auto_compact_strategy"):
if key in data: if key in data:
entry[key] = data[key] entry[key] = data[key]
......
...@@ -666,6 +666,26 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson } ...@@ -666,6 +666,26 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
<input type="number" id="cfg-ram-gb" class="form-input" min="0" step="0.5" placeholder="auto"> <input type="number" id="cfg-ram-gb" class="form-input" min="0" step="0.5" placeholder="auto">
</div> </div>
</div> </div>
<!-- Auto-compact: shrink an over-long conversation before generation -->
<div class="card-title" style="margin-top:1.25rem">Auto-compact context</div>
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-autocompact" onchange="_toggleAutoCompact()"> Auto-compact when the prompt nears the context limit <span class="muted">(off by default)</span></label>
<div id="cfg-autocompact-opts" style="display:none;grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.5rem">
<div class="form-row" style="margin:0">
<label class="form-label">Trigger at <span class="muted">(% of context)</span></label>
<input type="number" id="cfg-autocompact-pct" class="form-input" min="50" max="99" step="1" value="85">
<span class="form-hint" style="font-size:11px">Compacts to ~65% of the context window when the prompt reaches this %.</span>
</div>
<div class="form-row" style="margin:0">
<label class="form-label">Strategy</label>
<select id="cfg-autocompact-strategy" class="form-input">
<option value="drop_oldest">Drop oldest — keep system + most recent turns</option>
<option value="keep_head_tail">Keep head + tail — also keep the first turn, drop the middle</option>
<option value="summarize">Summarize — replace the dropped middle with an LLM summary</option>
</select>
</div>
</div>
<div style="display:flex;gap:1.5rem;flex-wrap:wrap;margin-top:.75rem"> <div style="display:flex;gap:1.5rem;flex-wrap:wrap;margin-top:.75rem">
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-4bit"> 4-bit quantization</label> <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-4bit"> 4-bit quantization</label>
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-8bit"> 8-bit quantization</label> <label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px"><input type="checkbox" id="cfg-8bit"> 8-bit quantization</label>
...@@ -2747,6 +2767,11 @@ async function freeDiskConfirm(idx){ ...@@ -2747,6 +2767,11 @@ async function freeDiskConfirm(idx){
}catch(e){showAlert('Error: '+e.message)} }catch(e){showAlert('Error: '+e.message)}
} }
function _toggleAutoCompact(){
  // Reveal the auto-compact option grid (trigger % + strategy) only while the
  // "Auto-compact" checkbox is ticked; hide it otherwise.
  const enabled = document.getElementById('cfg-autocompact').checked;
  const optionsPanel = document.getElementById('cfg-autocompact-opts');
  if (enabled) {
    optionsPanel.style.display = 'grid';
  } else {
    optionsPanel.style.display = 'none';
  }
}
/* ── type checkbox helpers ─────────────────────────────── */ /* ── type checkbox helpers ─────────────────────────────── */
function _autoDetectParser(path) { function _autoDetectParser(path) {
const n = (path || '').toLowerCase(); const n = (path || '').toLowerCase();
...@@ -3115,6 +3140,10 @@ function openCfgModal(idx, cfgIdx){ ...@@ -3115,6 +3140,10 @@ function openCfgModal(idx, cfgIdx){
} }
document.getElementById('cfg-max-gpu').value = s.max_gpu_percent != null ? s.max_gpu_percent : ''; document.getElementById('cfg-max-gpu').value = s.max_gpu_percent != null ? s.max_gpu_percent : '';
document.getElementById('cfg-ram-gb').value = s.manual_ram_gb != null ? s.manual_ram_gb : ''; document.getElementById('cfg-ram-gb').value = s.manual_ram_gb != null ? s.manual_ram_gb : '';
document.getElementById('cfg-autocompact').checked = !!s.auto_compact;
document.getElementById('cfg-autocompact-pct').value = s.auto_compact_pct != null ? s.auto_compact_pct : 85;
document.getElementById('cfg-autocompact-strategy').value = s.auto_compact_strategy || 'drop_oldest';
_toggleAutoCompact();
document.getElementById('cfg-4bit').checked = !!s.load_in_4bit; document.getElementById('cfg-4bit').checked = !!s.load_in_4bit;
document.getElementById('cfg-8bit').checked = !!s.load_in_8bit; document.getElementById('cfg-8bit').checked = !!s.load_in_8bit;
document.getElementById('cfg-quant-backend').value = s.quant_backend || 'auto'; document.getElementById('cfg-quant-backend').value = s.quant_backend || 'auto';
...@@ -3465,6 +3494,9 @@ async function saveModelConfig(){ ...@@ -3465,6 +3494,9 @@ async function saveModelConfig(){
cache_type_k: document.getElementById('cfg-cache-type-k').value || null, cache_type_k: document.getElementById('cfg-cache-type-k').value || null,
cache_type_v: document.getElementById('cfg-cache-type-v').value || null, cache_type_v: document.getElementById('cfg-cache-type-v').value || null,
mmproj: document.getElementById('cfg-mmproj').value || null, mmproj: document.getElementById('cfg-mmproj').value || null,
auto_compact: document.getElementById('cfg-autocompact').checked,
auto_compact_pct: parseInt(document.getElementById('cfg-autocompact-pct').value) || 85,
auto_compact_strategy: document.getElementById('cfg-autocompact-strategy').value || 'drop_oldest',
max_gpu_percent: isNaN(maxGpu) ? null : maxGpu, max_gpu_percent: isNaN(maxGpu) ? null : maxGpu,
manual_ram_gb: isNaN(ramGb) ? null : ramGb, manual_ram_gb: isNaN(ramGb) ? null : ramGb,
load_in_4bit: document.getElementById('cfg-4bit').checked, load_in_4bit: document.getElementById('cfg-4bit').checked,
......
...@@ -299,6 +299,125 @@ def _normalize_tool_call_arguments(tool_calls): ...@@ -299,6 +299,125 @@ def _normalize_tool_call_arguments(tool_calls):
return out return out
def _estimate_tokens(messages) -> int:
"""Cheap prompt-size estimate (≈ chars/4 + per-message overhead). Good enough
to decide whether to auto-compact; not an exact tokenizer count."""
total = 0
for m in messages:
c = m.get("content")
if isinstance(c, str):
total += len(c)
elif isinstance(c, list):
for it in c:
if isinstance(it, dict) and isinstance(it.get("text"), str):
total += len(it["text"])
if m.get("tool_calls"):
try:
total += len(json.dumps(m["tool_calls"]))
except Exception:
pass
total += 16
return int(total / 4) + 8
def _compact_messages(messages, n_ctx, pct, strategy, summary_text=None):
    """Shrink an over-long message list to roughly 65% of ``n_ctx``.

    All ``system`` messages are kept and placed first; from the remaining
    messages the most recent contiguous run that fits the budget is kept.

    Strategies:
      - drop_oldest   : keep only system + the recent tail that fits.
      - keep_head_tail: also keep the first user turn (context anchor) and add
                        a note with the number of omitted messages.
      - summarize     : like keep_head_tail, but the note is replaced by
                        ``summary_text`` when one is supplied.

    Args:
        messages: list of chat message dicts (``role``, ``content``, ...).
        n_ctx: context window size in tokens; falsy or <= 0 disables compaction.
        pct: trigger threshold as a percentage of ``n_ctx``; clamped to
            [50, 99], defaulting to 85 when it cannot be parsed as a number.
        strategy: one of the strategy names above; any other value behaves
            like ``drop_oldest``.
        summary_text: optional summary used by the ``summarize`` strategy.

    Returns:
        ``(new_messages, info)``. ``info`` is ``None`` (and ``messages`` is
        returned unchanged) when the estimate is below the threshold or nothing
        can be dropped; otherwise a dict with ``dropped``, ``strategy``,
        ``before_tokens``, ``after_tokens`` and ``n_ctx``.
    """
    if not messages or not n_ctx or n_ctx <= 0:
        return messages, None
    try:
        pct = float(pct)
    except (TypeError, ValueError):
        pct = 85.0
    pct = min(99.0, max(50.0, pct))

    est = _estimate_tokens(messages)
    if est < n_ctx * pct / 100.0:
        return messages, None

    target = int(n_ctx * 0.65)
    sys_msgs = [m for m in messages if m.get("role") == "system"]
    body = [m for m in messages if m.get("role") != "system"]
    n_body = len(body)

    # The kept tail is always a contiguous suffix of ``body``, so track it by
    # its start index. This also keeps value-equal messages (duplicate "try
    # again" turns, identical tool results) distinct without id() bookkeeping.
    running = _estimate_tokens(sys_msgs)
    start = n_body
    for idx in range(n_body - 1, -1, -1):
        cost = _estimate_tokens([body[idx]])
        # The newest message is always kept, even if it alone is over budget.
        if start < n_body and running + cost > target:
            break
        start = idx
        running += cost

    # Don't start the kept tail on an orphaned tool result (its assistant
    # tool_calls turn would have been dropped) — that breaks chat templates.
    first_non_tool = start
    while first_non_tool < n_body and body[first_non_tool].get("role") == "tool":
        first_non_tool += 1
    if first_non_tool < n_body:
        start = first_non_tool
    else:
        # The whole tail is tool results. Dropping them would discard the most
        # recent turn entirely, so instead extend the tail backwards to include
        # the turn that issued the calls (accepting a budget overshoot).
        while start > 0 and body[start].get("role") == "tool":
            start -= 1

    head = []
    head_idx = None
    if strategy in ("keep_head_tail", "summarize"):
        first_user_idx = next(
            (i for i, m in enumerate(body) if m.get("role") == "user"), None)
        # Only anchor on the first user turn if the tail doesn't already hold it.
        if first_user_idx is not None and first_user_idx < start:
            head_idx = first_user_idx
            head = [body[head_idx]]

    dropped = [m for i, m in enumerate(body[:start]) if i != head_idx]
    if not dropped:
        return messages, None

    notes = []
    if strategy == "summarize" and summary_text:
        notes.append({"role": "system",
                      "content": "[Summary of earlier conversation]\n" + summary_text})
    elif strategy in ("keep_head_tail", "summarize"):
        notes.append({"role": "system",
                      "content": f"[Note: {len(dropped)} earlier message(s) omitted to fit the context window.]"})

    new = sys_msgs + head + notes + body[start:]
    return new, {"dropped": len(dropped), "strategy": strategy,
                 "before_tokens": est, "after_tokens": _estimate_tokens(new),
                 "n_ctx": n_ctx}
async def _summarize_for_compact(manager, messages, keep_recent: int = 4):
"""Best-effort: summarize the older turns with the loaded model itself. Returns
a summary string or None (caller falls back to a count note)."""
try:
body = [m for m in messages if m.get("role") != "system"]
older = body[:-keep_recent] if len(body) > keep_recent else body
if not older:
return None
lines = []
for m in older:
c = m.get("content")
if isinstance(c, list):
c = " ".join(it.get("text", "") for it in c if isinstance(it, dict))
lines.append(f"{m.get('role', '?')}: {str(c)[:2000]}")
convo = "\n".join(lines)[:12000]
prompt = [
{"role": "system", "content": "Summarize the following conversation "
"concisely, preserving key facts, decisions, code, file paths and open "
"tasks. Output only the summary."},
{"role": "user", "content": convo},
]
out = await asyncio.to_thread(
manager.generate_chat, messages=prompt, max_tokens=400, temperature=0.2)
return (out or "").strip() or None
except Exception as e:
print(f"[auto-compact] summary generation failed: {e}", flush=True)
return None
@router.post("/v1/chat/completions", summary="Chat completions") @router.post("/v1/chat/completions", summary="Chat completions")
async def chat_completions(request: ChatCompletionRequest, http_request: Request = None): async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
"""Chat completions endpoint with streaming and tool support.""" """Chat completions endpoint with streaming and tool support."""
...@@ -845,8 +964,36 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -845,8 +964,36 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
# A list is legitimate multipart vision content — leave it intact. # A list is legitimate multipart vision content — leave it intact.
elif not isinstance(m["content"], str) and not isinstance(m["content"], list): elif not isinstance(m["content"], str) and not isinstance(m["content"], list):
messages_dict[i]["content"] = str(m["content"]) messages_dict[i]["content"] = str(m["content"])
# Auto-compact (per-model, OFF by default): when the prompt would exceed
# `auto_compact_pct`% of the model's context window, shrink it to ~65% using
# the configured strategy (drop_oldest | keep_head_tail | summarize) instead of
# erroring out on overflow.
try:
from codai.models.manager import multi_model_manager as _mmm
_cc = _mmm._config_for_model(getattr(request, "model", None) or "") or {}
except Exception:
_cc = {}
if _cc.get("auto_compact"):
try:
_nctx = current_manager.get_context_size() if current_manager else 0
except Exception:
_nctx = 0
_pct = _cc.get("auto_compact_pct", 85)
_strategy = (_cc.get("auto_compact_strategy") or "drop_oldest").strip()
if _nctx and _estimate_tokens(messages_dict) >= _nctx * float(_pct or 85) / 100.0:
_summary = None
if _strategy == "summarize":
_summary = await _summarize_for_compact(current_manager, messages_dict)
messages_dict, _info = _compact_messages(
messages_dict, _nctx, _pct, _strategy, _summary)
if _info:
print(f"[auto-compact] {getattr(request, 'model', '?')}: "
f"~{_info['before_tokens']}→{_info['after_tokens']} tokens "
f"(n_ctx={_nctx}, dropped {_info['dropped']} msgs via "
f"{_info['strategy']})", flush=True)
# Convert tools to dict format if present # Convert tools to dict format if present
tools_dict = None tools_dict = None
if request.tools: if request.tools:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment