text: stop runaway tool-call loops + honor client repetition penalties

Some quantized fine-tunes (seen with an "Aggressive" Qwen3.6-35B Q4_K_M) collapse
into a runaway repetition loop — emitting a malformed parallel tool-call flood of
1700+ tokens that never terminates — when top_p=1.0 and no repetition penalty are
in effect (exactly the conditions Qwen's own docs warn cause endless repetitions).

Two fixes:

1. Anti-loop generation stop in stream_chat_response: a model-agnostic detector
   normalises away the variable parts of the tail (quoted strings, filesystem
   paths, whitespace) so a loop whose only per-cycle difference is an arg/path
   still reads as periodic, then breaks generation when a short structural unit
   repeats >=5x back-to-back. Tuned to not trip on prose, repetitive code, or a
   legit handful of distinct tool calls.

2. Honor client-supplied repetition controls. The chat paths previously forwarded
   only temperature/top_p, silently dropping repeat/presence/frequency penalty —
   so a caller (e.g. Kilo) setting them per-model had no effect. Plumb them through
   generate_chat_stream / generate_chat to both backends (cuda already accepts
   them; vulkan now does too) with graceful signature fallbacks. Defaults are
   no-ops, so unset clients are unaffected.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent 913e283a
......@@ -1972,6 +1972,9 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
compact_plan=_compact_plan,
suppress_reasoning=_suppress_reasoning,
reasoning_active=_reasoning_active,
repeat_penalty=request.repeat_penalty,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
):
yield chunk
finally:
......@@ -1996,6 +1999,9 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
enable_thinking=reasoning_enabled,
suppress_reasoning=_suppress_reasoning,
reasoning_active=_reasoning_active,
repeat_penalty=request.repeat_penalty,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
)
finally:
_release_instance()
......@@ -2131,6 +2137,37 @@ def _context_overflow_detail(e) -> Optional[str]:
return None
def _detect_runaway_repetition(text: str) -> bool:
"""Heuristic guard against a model stuck emitting the same fragment forever
(e.g. Qwen collapsing into a malformed parallel tool-call loop). It normalises
away the *variable* parts — quoted strings, filesystem paths, whitespace — so a
loop whose only difference each cycle is the path/arg still reads as periodic,
then looks for a short structural unit repeated back-to-back many times.
Tuned to fire only on genuine degeneration (>=5 identical structural periods),
so ordinary prose or code — which doesn't repeat a structural unit 5x verbatim —
won't trip it."""
tail = text[-1600:]
skel = _re.sub(r'"[^"]*"', '""', tail) # collapse quoted strings/args
skel = _re.sub(r'/[^\s"<>]+', '/', skel) # collapse filesystem paths
skel = _re.sub(r'\s+', ' ', skel)
n = len(skel)
for period in range(6, 140):
if n < period * 5:
break
unit = skel[-period:]
if not unit.strip():
continue
reps = 1
i = n - 2 * period
while i >= 0 and skel[i:i + period] == unit:
reps += 1
i -= period
if reps >= 5:
return True
return False
async def stream_chat_response(
messages: List[Dict],
model_name: str,
......@@ -2147,6 +2184,9 @@ async def stream_chat_response(
compact_plan: Optional[Dict] = None,
suppress_reasoning: bool = False,
reasoning_active: bool = False,
repeat_penalty: float = 1.0,
presence_penalty: float = 0.0,
frequency_penalty: float = 0.0,
) -> AsyncGenerator[str, None]:
"""Stream chat completion response with queue notifications."""
completion_id = f"chatcmpl-{uuid.uuid4().hex}"
......@@ -2338,6 +2378,9 @@ async def stream_chat_response(
tools=tools,
response_format=response_format,
enable_thinking=enable_thinking,
repeat_penalty=repeat_penalty,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
):
# Cooperative cancellation: stop streaming if the task was cancelled.
if task_registry.is_cancelled(_tid):
......@@ -2366,6 +2409,21 @@ async def stream_chat_response(
# Pass through all content including whitespace - it's essential for message composition
generated_text += filtered_chunk
# Anti-loop safety net: if the model has collapsed into a runaway
# repetition (e.g. a malformed parallel tool-call flood), stop pulling
# tokens instead of burning the whole context. Check periodically once
# there's enough text to judge; downstream finalisation still runs on
# what we have (the parser's repetition guard keeps the first real call).
if chunk_count % 32 == 0 and len(generated_text) > 600 \
and _detect_runaway_repetition(generated_text):
if _debug_requests_enabled():
print(f"# <<< [anti-loop] runaway repetition detected at "
f"{chunk_count} tok — stopping generation", flush=True)
logger.warning("stream_chat_response: runaway repetition detected for "
"model=%s at %d chunks; truncating generation",
model_name, chunk_count)
break
# Live progress under --debug-requests so a non-terminating / looping
# generation is visible AS IT HAPPENS — the end-of-stream response logs
# below never fire if the model never stops. The front pumps engine
......@@ -2694,6 +2752,9 @@ async def generate_chat_response(
enable_thinking: bool = False,
suppress_reasoning: bool = False,
reasoning_active: bool = False,
repeat_penalty: float = 1.0,
presence_penalty: float = 0.0,
frequency_penalty: float = 0.0,
) -> Dict:
"""Generate non-streaming chat completion response."""
completion_id = f"chatcmpl-{uuid.uuid4().hex}"
......@@ -2732,6 +2793,9 @@ async def generate_chat_response(
tools=tools,
response_format=response_format,
enable_thinking=enable_thinking,
repeat_penalty=repeat_penalty,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
)
# Always filter out malformed content
......
......@@ -1606,7 +1606,8 @@ class VulkanBackend(ModelBackend):
return val
def generate_chat(self, messages, max_tokens=None, temperature=0.7, top_p=1.0,
stop=None, tools=None, response_format=None):
stop=None, tools=None, response_format=None, enable_thinking=False,
repeat_penalty=1.0, presence_penalty=0.0, frequency_penalty=0.0):
"""Non-streaming chat completion using llama.cpp's native chat handler."""
if self.model is None:
raise RuntimeError("Model not loaded")
......@@ -1619,6 +1620,13 @@ class VulkanBackend(ModelBackend):
temperature=temperature,
top_p=top_p,
)
# Forward client-supplied repetition controls (no-op at defaults).
if repeat_penalty and repeat_penalty != 1.0:
kwargs['repeat_penalty'] = repeat_penalty
if presence_penalty:
kwargs['presence_penalty'] = presence_penalty
if frequency_penalty:
kwargs['frequency_penalty'] = frequency_penalty
if stop:
kwargs['stop'] = stop
if response_format and response_format.get('type') == 'json_object':
......@@ -1644,7 +1652,9 @@ class VulkanBackend(ModelBackend):
return content
async def generate_chat_stream(self, messages, max_tokens=None, temperature=0.7,
top_p=1.0, stop=None, tools=None, response_format=None):
top_p=1.0, stop=None, tools=None, response_format=None,
enable_thinking=False, repeat_penalty=1.0,
presence_penalty=0.0, frequency_penalty=0.0):
"""Streaming chat completion using llama.cpp's native chat handler."""
if self.model is None:
raise RuntimeError("Model not loaded")
......@@ -1658,6 +1668,13 @@ class VulkanBackend(ModelBackend):
top_p=top_p,
stream=True,
)
# Forward client-supplied repetition controls (no-op at defaults).
if repeat_penalty and repeat_penalty != 1.0:
kwargs['repeat_penalty'] = repeat_penalty
if presence_penalty:
kwargs['presence_penalty'] = presence_penalty
if frequency_penalty:
kwargs['frequency_penalty'] = frequency_penalty
if stop:
kwargs['stop'] = stop
_tc = _make_llama_thermal_criteria()
......
......@@ -390,25 +390,33 @@ class ModelManager:
def generate_chat(self, messages: List[Dict], max_tokens: Optional[int] = None,
temperature: float = 0.7, top_p: float = 1.0,
stop: Optional[List[str]] = None, tools: Optional[List] = None,
response_format: Optional[Dict] = None, enable_thinking: bool = False):
"""Generate chat completion non-streaming."""
response_format: Optional[Dict] = None, enable_thinking: bool = False,
repeat_penalty: float = 1.0, presence_penalty: float = 0.0,
frequency_penalty: float = 0.0):
"""Generate chat completion non-streaming.
Forwards client repetition controls to the backend (defaults are no-ops)."""
if self.backend is None:
raise RuntimeError("No model loaded")
_pen = dict(repeat_penalty=repeat_penalty, presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty)
# Use generate_chat if available (Vulkan backend), otherwise format and use generate
if hasattr(self.backend, 'generate_chat'):
try:
return self.backend.generate_chat(messages, max_tokens, temperature, top_p,
stop, tools, response_format,
enable_thinking=enable_thinking)
except TypeError:
# Backend doesn't accept enable_thinking (e.g. Vulkan) — call plainly.
return self.backend.generate_chat(messages, max_tokens, temperature, top_p,
stop, tools, response_format)
for _extra in (dict(enable_thinking=enable_thinking, **_pen),
dict(enable_thinking=enable_thinking), {}):
try:
return self.backend.generate_chat(messages, max_tokens, temperature, top_p,
stop, tools, response_format, **_extra)
except TypeError:
continue
else:
# Fallback for NVIDIA backend
from codai.pydantic.textrequest import ChatMessage
prompt = self.format_messages([ChatMessage(**m) for m in messages])
return self.backend.generate(prompt, max_tokens, temperature, top_p, stop)
try:
return self.backend.generate(prompt, max_tokens, temperature, top_p, stop, **_pen)
except TypeError:
return self.backend.generate(prompt, max_tokens, temperature, top_p, stop)
async def generate_stream(self, prompt: str, max_tokens: Optional[int] = None,
temperature: float = 0.7, top_p: float = 1.0,
......@@ -422,26 +430,42 @@ class ModelManager:
async def generate_chat_stream(self, messages: List[Dict], max_tokens: Optional[int] = None,
temperature: float = 0.7, top_p: float = 1.0,
stop: Optional[List[str]] = None, tools: Optional[List] = None,
response_format: Optional[Dict] = None, enable_thinking: bool = False):
"""Generate chat completion streaming."""
response_format: Optional[Dict] = None, enable_thinking: bool = False,
repeat_penalty: float = 1.0, presence_penalty: float = 0.0,
frequency_penalty: float = 0.0):
"""Generate chat completion streaming.
Client-supplied repetition controls (repeat/presence/frequency penalty) are
forwarded to the backend so a caller like Kilo can damp runaway loops via its
own per-model settings. Defaults are no-ops, so unset clients are unaffected."""
if self.backend is None:
raise RuntimeError("No model loaded")
_pen = dict(repeat_penalty=repeat_penalty, presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty)
# Use generate_chat_stream if available (Vulkan backend), otherwise format and use generate_stream
if hasattr(self.backend, 'generate_chat_stream'):
try:
_gen = self.backend.generate_chat_stream(
messages, max_tokens, temperature, top_p, stop, tools,
response_format, enable_thinking=enable_thinking)
except TypeError:
_gen = self.backend.generate_chat_stream(
messages, max_tokens, temperature, top_p, stop, tools, response_format)
# Try richest signature first, then progressively drop kwargs the backend
# doesn't accept (older signatures) so we never hard-fail on plumbing.
for _extra in (dict(enable_thinking=enable_thinking, **_pen),
dict(enable_thinking=enable_thinking), {}):
try:
_gen = self.backend.generate_chat_stream(
messages, max_tokens, temperature, top_p, stop, tools,
response_format, **_extra)
break
except TypeError:
continue
async for chunk in _gen:
yield chunk
else:
# Fallback for NVIDIA backend
from codai.pydantic.textrequest import ChatMessage
prompt = self.format_messages([ChatMessage(**m) for m in messages])
async for chunk in self.backend.generate_stream(prompt, max_tokens, temperature, top_p, stop):
try:
_gen = self.backend.generate_stream(prompt, max_tokens, temperature, top_p, stop, **_pen)
except TypeError:
_gen = self.backend.generate_stream(prompt, max_tokens, temperature, top_p, stop)
async for chunk in _gen:
yield chunk
@property
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment