vulkan: fold system message into user turn when template rejects it

Gemma's chat template has no 'system' role; llama.cpp raises "System role not supported" and the generation fails (the Kilo client always sends a system prompt). On that specific error, retry with the system message(s) folded into the first user turn — Gemma's own convention, and a no-op for models that accept system. Handles both streaming and non-streaming paths and preserves multimodal (list) content. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

vulkan: fold system message into user turn when template rejects it
Gemma's chat template has no 'system' role; llama.cpp raises "System role not supported" and the generation fails (the Kilo client always sends a system prompt). On that specific error, retry with the system message(s) folded into the first user turn — Gemma's own convention, and a no-op for models that accept system. Handles both streaming and non-streaming paths and preserves multimodal (list) content. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
39a62745 · Stefy Lanza (nextime / spora ) · eb138bfa · 39a62745
Commit 39a62745 authored Jun 19, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 70 additions and 2 deletions

vulkan.py codai/backends/vulkan.py +70 -2

No files found.
--- a/codai/backends/vulkan.py
+++ b/codai/backends/vulkan.py
@@ -1514,6 +1514,57 @@ class VulkanBackend(ModelBackend):
    # Chat-level generation (uses llama.cpp native chat template)
    # ------------------------------------------------------------------
+    @staticmethod
+    def _fold_system_into_user(messages):
+        """Merge system message(s) into the first user turn.
+
+        Some chat templates (notably Gemma) reject a 'system' role and llama.cpp
+        raises "System role not supported". Folding the system text into the first
+        user message is Gemma's own convention and is harmless for other models.
+        Preserves order and multimodal (list) content.
+
+        Args:
+            messages: Chat messages, each either a dict or an object exposing
+                ``role`` / ``content`` attributes.
+
+        Returns:
+            ``messages`` itself when it contains no system message. Otherwise a
+            new list without any system message, with the collected system text
+            prepended to the first user message (or inserted as a new leading
+            user message when no user message exists). The input is not mutated.
+        """
+        def _field(m, key):
+            # Messages may be plain dicts or attribute-style objects.
+            return m.get(key) if isinstance(m, dict) else getattr(m, key, None)
+
+        sys_texts, rest = [], []
+        saw_system = False
+        for m in messages:
+            if _field(m, 'role') != 'system':
+                rest.append(m)
+                continue
+            saw_system = True
+            c = _field(m, 'content')
+            if isinstance(c, str):
+                sys_texts.append(c)
+            elif isinstance(c, list):
+                # Only the text parts of a multimodal system message are kept.
+                sys_texts.extend(p.get('text', '') for p in c
+                                 if isinstance(p, dict) and p.get('type') == 'text')
+
+        preamble = "\n\n".join(t for t in sys_texts if t)
+        if not preamble:
+            # Nothing to fold. Still drop text-less system turns: leaving them in
+            # would make the retry fail with the very same template error.
+            return rest if saw_system else messages
+
+        out, injected = [], False
+        for m in rest:
+            if injected or _field(m, 'role') != 'user':
+                out.append(m)
+                continue
+            c = _field(m, 'content')
+            # Copy so the caller's message is left untouched.
+            nm = dict(m) if isinstance(m, dict) else {'role': 'user', 'content': c}
+            if isinstance(c, str):
+                nm['content'] = preamble + "\n\n" + c
+            elif isinstance(c, list):
+                nm['content'] = [{'type': 'text', 'text': preamble}] + c
+            else:
+                nm['content'] = preamble
+            out.append(nm)
+            injected = True
+        if not injected:
+            # No user turn to fold into: lead with a synthetic one.
+            out.insert(0, {'role': 'user', 'content': preamble})
+        return out
+    @staticmethod
+    def _is_system_role_error(exc) -> bool:
+        """Return True when the text of *exc* mentions "system role" (any case)."""
+        message = str(exc).lower()
+        return message.find('system role') != -1
    def generate_chat(self, messages, max_tokens=None, temperature=0.7, top_p=1.0,
                      stop=None, tools=None, response_format=None):
        """Non-streaming chat completion using llama.cpp's native chat handler."""
@@ -1535,7 +1586,13 @@ class VulkanBackend(ModelBackend):
            kwargs['stopping_criteria'] = _tc
        with self._gen_lock:
-            result = self.model.create_chat_completion(**kwargs)
+            try:
+                result = self.model.create_chat_completion(**kwargs)
+            except Exception as e:
+                if not self._is_system_role_error(e):
+                    raise
+                kwargs['messages'] = self._fold_system_into_user(messages)
+                result = self.model.create_chat_completion(**kwargs)
        usage = result.get('usage', {})
        self._store_usage(
            prompt_tokens=usage.get('prompt_tokens', 0),
@@ -1563,10 +1620,21 @@ class VulkanBackend(ModelBackend):
        if _tc is not None and _chat_supports_stopping_criteria():
            kwargs['stopping_criteria'] = _tc
+        # Build the completion up front so a template error (e.g. Gemma's "System
+        # role not supported") is caught here and retried with the system message
+        # folded into the first user turn — before any chunk is streamed.
+        try:
+            _completion = self.model.create_chat_completion(**kwargs)
+        except Exception as e:
+            if not self._is_system_role_error(e):
+                raise
+            kwargs['messages'] = self._fold_system_into_user(messages)
+            _completion = self.model.create_chat_completion(**kwargs)
        prompt_tokens = 0
        completion_tokens = 0
        try:
-            async for chunk in _aiter_blocking(self.model.create_chat_completion(**kwargs), lock=self._gen_lock):
+            async for chunk in _aiter_blocking(_completion, lock=self._gen_lock):
                delta = chunk['choices'][0].get('delta', {})
                text = delta.get('content') or ''
                if text: