Collect all chunks in the thread pool before yielding, to avoid issues with generators across thread boundaries

47738566 · Stefy Lanza (nextime / spora ) · bf2b3b0a · 47738566
Commit 47738566 authored Mar 01, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 24 additions and 23 deletions

coderai coderai +24 -23

No files found.
--- a/coderai
+++ b/coderai
@@ -1048,8 +1048,10 @@ class VulkanBackend(ModelBackend):
        total_content = ""
        chunk_count = 0
-        def sync_generator():
+        # Collect all chunks synchronously then yield them
-            """Synchronous generator that runs in executor."""
+        # This avoids issues with generators across thread boundaries
+        def collect_chunks():
+            """Collect all chunks from the stream."""
            print(f"DEBUG: generate_chat_stream: Calling create_chat_completion with tools={tools}")
            stream = self.model.create_chat_completion(
                messages=messages,
@@ -1061,32 +1063,31 @@ class VulkanBackend(ModelBackend):
                stream=True,
            )
            print(f"DEBUG: generate_chat_stream: Got stream object: {type(stream)}")
+            chunks = []
            for chunk in stream:
-                yield chunk
+                chunks.append(chunk)
+            print(f"DEBUG: generate_chat_stream: Collected {len(chunks)} chunks")
+            return chunks
        try:
-            # Run synchronous generator in thread pool
+            # Run the collection in thread pool
            loop = asyncio.get_event_loop()
-            gen = sync_generator()
+            chunks = await loop.run_in_executor(None, collect_chunks)
-            while True:
+            for chunk in chunks:
-                try:
+                chunk_count += 1
-                    chunk = await loop.run_in_executor(None, lambda: next(gen, None))
+                print(f"DEBUG: generate_chat_stream: Processing chunk {chunk_count}: {repr(chunk)}")
-                    if chunk is None:
+                delta = chunk["choices"][0].get("delta", {})
-                        break
+                content = delta.get("content", "")
-                    chunk_count += 1
+                # Handle Qwen3's special thinking token - skip it and continue
-                    print(f"DEBUG: generate_chat_stream: Raw chunk {chunk_count}: {repr(chunk)}")
+                # Qwen3 uses `<think>` tags for reasoning, we should pass through the content
-                    delta = chunk["choices"][0].get("delta", {})
+                if content:
-                    content = delta.get("content", "")
+                    total_content += content
+                    yield content
-                    # Handle Qwen3's special thinking token - skip it and continue
-                    # Qwen3 uses `<think>` tags for reasoning, we should pass through the content
+                # Small yield to allow other async tasks
-                    if content:
+                await asyncio.sleep(0)
-                        total_content += content
-                        yield content
-                except StopIteration:
-                    break
            print(f"DEBUG: generate_chat_stream yielded {chunk_count} chunks, total content length: {len(total_content)}")
            if chunk_count == 0 or not total_content.strip():