fix: tool-call streaming/format robustness + clear over-context error

- Streaming tool gate now withholds the gemma/qwen native `<|tool_call>` marker
  (and partials) too, not just `<tool_call>`/`call:NAME{` — so the raw marker no
  longer leaks to the client mid-stream (Kilo was executing partial calls).
- Normalize tool-call function.arguments from JSON string → dict before applying
  the chat template, so templates that render `arguments|items` (Qwen) don't
  raise "Can only get item pairs from a mapping".
- Context-window overflow now returns a meaningful error: a structured SSE error
  event (code context_length_exceeded) when streaming, or HTTP 400 with a clear
  message for non-streaming — instead of injecting "[Generation error: …]" as
  assistant content (which polluted chat history).
- Models page: unconfigured GGUF files now expose the "Free disk" button (records
  them as "to download" before deleting), matching HF models.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent ade800f9
......@@ -2465,7 +2465,7 @@ async function loadCachedModels(){
<button class="btn btn-secondary btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="openCfgModal(${idx},0)">${f.in_config?'Configure':'Add'}</button>
${f.in_config?`<button class="btn btn-ghost btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="disableModel(${idx})">Remove</button>`:''}
${toDlGg?`<button class="btn btn-ghost btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="unmarkDownload(${idx})" title="Remove this model from the download list">✕ Remove from list</button>`:''}
${!f.missing&&!toDlGg&&(f.in_config||redownloadTarget)?`<button class="btn btn-ghost btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="freeDiskConfirm(${idx})" title="Delete this file from disk to save space, but keep it in the list so you can re-download it later">🗑 Free disk</button>`:''}
${!f.missing&&!toDlGg?`<button class="btn btn-ghost btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="freeDiskConfirm(${idx})" title="Delete this file from disk to save space, but keep it in the list (as “to download”) so you can fetch it again later">🗑 Free disk</button>`:''}
${!f.missing&&!toDlGg?`<button class="btn btn-danger btn-sm" style="font-size:10px;padding:.15rem .4rem" onclick="deleteModelConfirm(${idx})">Delete</button>`:''}
</td>
</tr>`;
......
......@@ -270,6 +270,35 @@ def _normalize_vision_content(content: list) -> list:
return norm
def _normalize_tool_call_arguments(tool_calls):
"""Return tool_calls with each ``function.arguments`` as a dict (mapping)
rather than a JSON string. OpenAI/Kilo send arguments as a JSON STRING, but
several GGUF chat templates (e.g. Qwen) render them with ``arguments|items``,
which requires a mapping — otherwise llama.cpp raises "Can only get item pairs
from a mapping" while applying the template. A dict also serializes correctly
for templates that use ``arguments|tojson``, so this is safe either way."""
out = []
for tc in (tool_calls or []):
if hasattr(tc, "model_dump"):
tc = tc.model_dump()
if not isinstance(tc, dict):
out.append(tc)
continue
tc = dict(tc)
fn = tc.get("function")
if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
try:
parsed = json.loads(fn["arguments"] or "{}")
except Exception:
parsed = None
if isinstance(parsed, dict):
fn = dict(fn)
fn["arguments"] = parsed
tc["function"] = fn
out.append(tc)
return out
@router.post("/v1/chat/completions", summary="Chat completions")
async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
"""Chat completions endpoint with streaming and tool support."""
......@@ -793,8 +822,10 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
msg_dict["content"] = str(content) if content is not None else ""
# Handle tool_calls - convert to proper format if present
if msg.tool_calls:
# tool_calls should be a list of dicts with 'id', 'type', 'function' keys
msg_dict["tool_calls"] = msg.tool_calls
# tool_calls should be a list of dicts with 'id', 'type', 'function'
# keys; normalise function.arguments from a JSON string to a dict so
# templates that do `arguments|items` (Qwen, …) don't raise.
msg_dict["tool_calls"] = _normalize_tool_call_arguments(msg.tool_calls)
if msg.name:
msg_dict["name"] = msg.name
if msg.tool_call_id:
......@@ -1446,7 +1477,12 @@ import re as _re
# A complete tool span: `<tool>…</tool>` or `<tool_call>…</tool_call>`, matched
# case-insensitively and non-greedily. The `\1` backreference makes the closing
# tag name match the opening one.
_TOOL_SPAN_RE = _re.compile(r'<(tool|tool_call)\b[\s\S]*?</\1\s*>', _re.IGNORECASE)
# The start of an opening tag only (`<tool` / `<tool_call` at a word boundary);
# no closing `>` or end tag is required for a match.
_TOOL_OPEN_RE = _re.compile(r'<(?:tool|tool_call)\b', _re.IGNORECASE)
# NOTE(review): `_TOOL_OPEN_TAGS` is assigned twice in a row; the second tuple,
# which adds the `<|tool_call>` / `<|tool_call|>` forms, is the one in effect.
_TOOL_OPEN_TAGS = ('<tool>', '<tool_call>')
_TOOL_OPEN_TAGS = ('<tool>', '<tool_call>', '<|tool_call>', '<|tool_call|>')
# gemma/qwen native special-token tool marker `<|tool_call>` — usually a special
# token stripped on decode, but some GGUFs emit it as plain text. Treat it like a
# tool-open: withhold everything from it to the end so the raw marker (and the
# `call:NAME{…}` that follows) never leaks to the client as visible content.
# The pattern matches the `<|tool_call` prefix, so it covers both `<|tool_call>`
# and `<|tool_call|>`.
_NATIVE_TOOL_OPEN_RE = _re.compile(r'<\|tool_call', _re.IGNORECASE)
# gemma-4 native tool call: `call:NAME{…}` (the <|tool_call> markers are stripped
# by skip_special_tokens). Once it starts we withhold everything to the end of the
# stream — the call is surfaced as structured tool_calls after generation.
......@@ -1478,6 +1514,12 @@ def _gate_tool_content(buffer: str, final: bool = False):
emit.append(buffer[:m.start()])
held = '' if final else buffer[m.start():]
return ''.join(emit), held
# gemma/qwen native `<|tool_call>` marker — withhold from it to the end.
nm = _NATIVE_TOOL_OPEN_RE.search(buffer)
if nm:
emit.append(buffer[:nm.start()])
held = '' if final else buffer[nm.start():]
return ''.join(emit), held
# gemma-4 `call:NAME{…}` — withhold from the call onward (extracted at the end).
gm = _GEMMA_CALL_OPEN_RE.search(buffer)
if gm:
......@@ -1501,6 +1543,21 @@ def _gate_tool_content(buffer: str, final: bool = False):
return ''.join(emit), ''
def _context_overflow_detail(e) -> Optional[str]:
"""If the exception is a context-window overflow (prompt + generation exceed
the model's n_ctx), return a clear client-facing message; else None."""
s = str(e)
low = s.lower()
markers = ("exceed context window", "context window of", "requested tokens",
"exceeds n_ctx", "exceed the context", "context length",
"n_ctx", "kv cache is full", "context shift is disabled")
if any(m in low for m in markers) or ("token" in low and "exceed" in low):
return ("The conversation is too long for this model's context window "
f"({s}). Shorten the prompt or lower max_tokens, or increase the "
"model's context size (n_ctx) in its configuration.")
return None
async def stream_chat_response(
messages: List[Dict],
model_name: str,
......@@ -1918,18 +1975,16 @@ async def stream_chat_response(
yield "data: [DONE]\n\n"
except Exception as e:
print(f"Error during streaming generation: {e}")
data = {
"id": completion_id,
"object": "chat.completion.chunk",
"created": created,
"model": model_name,
"choices": [{
"index": 0,
"delta": {"content": f"\n[Generation error: {str(e)}]"},
"finish_reason": "stop",
}],
}
yield f"data: {json.dumps(data)}\n\n"
# Surface errors as a STRUCTURED error event (not as assistant content) so
# the client treats it as an error and it doesn't pollute the chat history.
_ctx = _context_overflow_detail(e)
if _ctx:
err = {"error": {"message": _ctx, "type": "invalid_request_error",
"code": "context_length_exceeded", "param": "messages"}}
else:
err = {"error": {"message": str(e), "type": "internal_error",
"code": "generation_error"}}
yield f"data: {json.dumps(err)}\n\n"
yield "data: [DONE]\n\n"
finally:
# Always clean up queue state
......@@ -2139,6 +2194,9 @@ async def generate_chat_response(
return formatted_response
except Exception as e:
print(f"Error during generation: {e}")
_ctx = _context_overflow_detail(e)
if _ctx:
raise HTTPException(status_code=400, detail=_ctx)
raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
# =============================================================================
......@@ -2361,4 +2419,7 @@ async def generate_completion_response(
}
except Exception as e:
print(f"Error during completion: {e}")
_ctx = _context_overflow_detail(e)
if _ctx:
raise HTTPException(status_code=400, detail=_ctx)
raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment