Commit b7bfccda authored by Your Name

Pass raw mode output through formatter/parser

Raw mode now passes the generated text through OpenAIFormatter, which:
- Handles tool extraction
- Provides OpenAI compatibility
- Handles other post-processing

This ensures raw mode output is treated the same as regular mode output.
parent d11b24fc
...@@ -2349,76 +2349,63 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -2349,76 +2349,63 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
if global_debug: if global_debug:
print(f"RAW: Generated text after cleanup: {generated_text[:100]}...") print(f"RAW: Generated text after cleanup: {generated_text[:100]}...")
# Build response similar to generate_chat_response # Pass through the formatter/parser (same as regular mode)
completion_id = f"chatcmpl-{uuid.uuid4().hex}" # This handles tool extraction and OpenAI compatibility
created = int(time.time()) from codai.models.parser import OpenAIFormatter
formatter = OpenAIFormatter(response_model_name)
# Filter content # Estimate token counts
generated_text = filter_malformed_content(generated_text) prompt_tokens = len(raw_prompt_for_generation.split())
completion_tokens = len(generated_text.split()) if generated_text else 0
if global_dump:
print(f"\n{'='*80}")
print(f"=== RAW MODE OUTPUT ===")
print(f"{'='*80}")
print(generated_text)
print(f"{'='*80}\n")
response_message = {
"role": "assistant",
"content": generated_text,
}
finish_reason = "stop"
# Estimate tokens (rough approximation)
prompt_tokens = len(raw_prompt_for_generation.split())
completion_tokens = len(generated_text.split())
response = { # Format through the parser
"id": completion_id, formatted_response = formatter.format_litellm_full(
"object": "chat.completion", text=generated_text,
"created": created, prompt_tokens=prompt_tokens,
"model": response_model_name, completion_tokens=completion_tokens,
"choices": [{ tool_calls=None # Raw mode doesn't have tool calls from generation
"index": 0, )
"message": response_message,
"finish_reason": finish_reason,
}],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
},
}
# Add rate limit headers if global_debug:
headers = {} print(f"RAW: Passed through formatter, got: {formatted_response.get('choices', [{}])[0].get('message', {}).get('content', '')[:100]}...")
if 'usage' in response:
headers = current_manager.backend.get_rate_limit_headers(
prompt_tokens=response.get('usage', {}).get('prompt_tokens', 0),
completion_tokens=response.get('usage', {}).get('completion_tokens', 0)
) if hasattr(current_manager.backend, 'get_rate_limit_headers') else {}
# Add mock reasoning stats if 'mock' is in force_reasoning_args # Add mock reasoning stats if 'mock' is in force_reasoning_args
if "mock" in force_reasoning_args and response: if "mock" in force_reasoning_args and formatted_response:
# Add fake reasoning tokens to trigger VSCode plugin stats # Add fake reasoning tokens to trigger VSCode plugin stats
mock_reasoning_tokens = 50 mock_reasoning_tokens = 50
# Update usage # Update usage
if "usage" in response: if "usage" in formatted_response:
response["usage"]["completion_tokens"] += mock_reasoning_tokens formatted_response["usage"]["completion_tokens"] += mock_reasoning_tokens
response["usage"]["total_tokens"] += mock_reasoning_tokens formatted_response["usage"]["total_tokens"] += mock_reasoning_tokens
response["usage"]["completion_tokens_details"] = { formatted_response["usage"]["completion_tokens_details"] = {
"reasoning_tokens": mock_reasoning_tokens "reasoning_tokens": mock_reasoning_tokens
} }
# Add reasoning to message if not present # Add reasoning to message if not present
if "choices" in response and response["choices"]: if "choices" in formatted_response and formatted_response["choices"]:
choice = response["choices"][0] choice = formatted_response["choices"][0]
if "message" in choice and "reasoning" not in choice["message"]: if "message" in choice and "reasoning" not in choice["message"]:
choice["message"]["reasoning"] = "Processing task in optimized mode..." choice["message"]["reasoning"] = "Processing task in optimized mode..."
return JSONResponse(content=response, headers=headers) # Dump parsed output if enabled
if global_dump:
import json
print(f"\n{'='*80}")
print(f"=== RAW MODE PARSED OUTPUT (DUMP) ===")
print(f"{'='*80}")
print(json.dumps(formatted_response, indent=2))
print(f"{'='*80}\n")
# Add rate limit headers
headers = {}
if 'usage' in formatted_response:
headers = current_manager.backend.get_rate_limit_headers(
prompt_tokens=formatted_response.get('usage', {}).get('prompt_tokens', 0),
completion_tokens=formatted_response.get('usage', {}).get('completion_tokens', 0)
) if hasattr(current_manager.backend, 'get_rate_limit_headers') else {}
return JSONResponse(content=formatted_response, headers=headers)
if request.stream: if request.stream:
return StreamingResponse( return StreamingResponse(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment