Add more debugging to track llama.cpp streaming response

parent b341f96a
......@@ -1048,6 +1048,7 @@ class VulkanBackend(ModelBackend):
total_content = ""
chunk_count = 0
try:
print(f"DEBUG: generate_chat_stream: Calling create_chat_completion with tools={tools}")
stream = self.model.create_chat_completion(
messages=messages,
max_tokens=max_tokens,
......@@ -1057,8 +1058,10 @@ class VulkanBackend(ModelBackend):
tools=tools,
stream=True,
)
print(f"DEBUG: generate_chat_stream: Got stream object: {type(stream)}")
for chunk in stream:
chunk_count += 1
print(f"DEBUG: generate_chat_stream: Raw chunk {chunk_count}: {repr(chunk)}")
delta = chunk["choices"][0].get("delta", {})
content = delta.get("content", "")
if content:
......@@ -1069,6 +1072,9 @@ class VulkanBackend(ModelBackend):
print(f"DEBUG: Empty stream from create_chat_completion, using fallback")
raise Exception("Empty stream response")
except Exception as e:
print(f"DEBUG: generate_chat_stream exception: {type(e).__name__}: {e}")
import traceback
traceback.print_exc()
if chunk_count == 0:
print(f"Warning: create_chat_completion stream failed ({e}), falling back to text generation")
# Fallback: format messages manually and use text generation
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment