Commit ceb4ae88 authored by Your Name

Add 'raw' option to --force-reasoning for native tokenizer prompt seeding

- Added 'raw' to valid force-reasoning options (chat, stop, inject, prompt, all, twopass, mock, raw)
- Implemented raw mode handler that:
  - Uses tokenizer.apply_chat_template() with add_generation_prompt=True
  - Seeds reasoning tag + commitment sentence
  - Uses two-pass generation: first captures reasoning, then gets final answer
  - Supports both streaming and non-streaming responses
  - Falls back gracefully if the tokenizer is not available

This enables using the model's native tokenizer for prompt seeding, bypassing
double-templating issues with chat APIs.
parent 9de7c79d
...@@ -2008,6 +2008,71 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -2008,6 +2008,71 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
print(f"...{seeded_prompt[-80:]}") print(f"...{seeded_prompt[-80:]}")
print(f"--- END SEEDED PROMPT ---") print(f"--- END SEEDED PROMPT ---")
# Handle 'raw' - use tokenizer's apply_chat_template for raw completion
# This bypasses the chat API middleware and uses the model's native template
if "raw" in force_reasoning_args:
# Get the tokenizer from the backend
tokenizer = None
if hasattr(current_manager, 'backend') and hasattr(current_manager.backend, 'tokenizer'):
tokenizer = current_manager.backend.tokenizer
if tokenizer is None:
print("WARNING: No tokenizer available for raw mode, falling back to prompt mode")
else:
# Extract system and user messages for template
system_prompt = "You are a helpful assistant."
user_message = ""
for msg in messages:
if msg.role == "system":
system_prompt = msg.content
elif msg.role == "user":
# Get the last user message
user_message = msg.content
# Convert messages to dict format for apply_chat_template
chat_messages = []
if system_prompt:
chat_messages.append({"role": "system", "content": system_prompt})
if user_message:
chat_messages.append({"role": "user", "content": user_message})
# Get the prompt with generation prompt (forces model to start responding)
try:
raw_prompt = tokenizer.apply_chat_template(
chat_messages,
add_generation_prompt=True,
tokenize=False
)
except Exception as e:
print(f"WARNING: apply_chat_template failed: {e}, falling back")
raw_prompt = f"System: {system_prompt}\n\nUser: {user_message}\n\nAssistant:"
# Get the reasoning tag for this model family
thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
# Append the reasoning tag with commitment sentence
# This is what forces the model to start reasoning
commitment = "Let me think about this step by step."
raw_prompt = raw_prompt + thought_tag + commitment
if global_debug:
print(f"RAW: Using raw completion with tokenizer apply_chat_template")
print(f"\n--- RAW PROMPT (last 120 chars) ---")
print(f"...{raw_prompt[-120:]}")
print(f"--- END RAW PROMPT ---")
# Mark that we're using raw mode so generate() is called instead of generate_chat()
use_raw_mode = True
raw_prompt_for_generation = raw_prompt
raw_stop_sequences = list(stop_sequences) # Copy current stop sequences
# Add the close tag to stop sequences for first pass
if close_tag not in raw_stop_sequences:
raw_stop_sequences.append(close_tag)
if global_debug:
print(f"RAW: First pass will stop at: {close_tag}")
# Prepare stop sequences # Prepare stop sequences
stop_sequences = [] stop_sequences = []
if request.stop: if request.stop:
...@@ -2104,10 +2169,216 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -2104,10 +2169,216 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
} }
}) })
# Handle raw mode - use generate() instead of generate_chat() for raw prompt completion
use_raw_mode = False
raw_prompt_for_generation = None
raw_stop_sequences = None
# Check if we set raw mode in the prompt handling section above
# The variables should already be set if raw was in force_reasoning_args
if "raw" in force_reasoning_args:
# Raw mode was already set up in the prompt handling section
# Just verify the variables exist
try:
_ = raw_prompt_for_generation
_ = raw_stop_sequences
except NameError:
# Variables not set - try to get tokenizer again
tokenizer = None
if hasattr(current_manager, 'backend') and hasattr(current_manager.backend, 'tokenizer'):
tokenizer = current_manager.backend.tokenizer
if tokenizer is not None:
# Extract system and user messages
system_prompt = "You are a helpful assistant."
user_message = ""
for msg in messages:
if msg.role == "system":
system_prompt = msg.content
elif msg.role == "user":
user_message = msg.content
# Get the prompt with generation prompt
try:
raw_prompt_for_generation = tokenizer.apply_chat_template(
[{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message}],
add_generation_prompt=True,
tokenize=False
)
except Exception as e:
raw_prompt_for_generation = f"System: {system_prompt}\n\nUser: {user_message}\n\nAssistant:"
# Get reasoning tag
thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
raw_prompt_for_generation += thought_tag + "Let me think about this step by step."
raw_stop_sequences = list(stop_sequences)
if close_tag not in raw_stop_sequences:
raw_stop_sequences.append(close_tag)
use_raw_mode = True
# Get resolved model name for response (with coderai/ prefix and proper formatting) # Get resolved model name for response (with coderai/ prefix and proper formatting)
response_model_name = get_resolved_model_name(requested_model, current_manager) response_model_name = get_resolved_model_name(requested_model, current_manager)
print(f"DEBUG: Requested model: {requested_model}, Resolved model for response: {response_model_name}") print(f"DEBUG: Requested model: {requested_model}, Resolved model for response: {response_model_name}")
# Handle raw mode - two pass: first capture reasoning, then get final answer
if use_raw_mode and raw_prompt_for_generation:
if global_debug:
print(f"RAW: Starting two-pass generation")
print(f"RAW: First pass prompt: ...{raw_prompt_for_generation[-100:]}")
if request.stream:
# For streaming, we need to handle it differently
# First pass: generate until reasoning close tag (stream it)
async def raw_stream_generate():
    """Stream a raw-mode completion as server-sent-event ``data:`` lines.

    Runs two generation passes over free variables taken from the
    surrounding scope (``raw_prompt_for_generation``, ``raw_stop_sequences``,
    ``stop_sequences``, ``request``, ``current_manager``, ``model_family``):

    1. First pass: generate from ``raw_prompt_for_generation`` with
       ``raw_stop_sequences`` and forward the output to the client. Uses
       ``current_manager.backend.generate_stream`` when the backend has it,
       otherwise a single ``current_manager.generate`` call.
    2. Second pass: generate from the raw prompt plus everything emitted so
       far (including the reasoning close tag) with ``stop_sequences`` and
       emit the result as one final chunk with ``finish_reason`` ``'stop'``.

    Yields:
        str: ``data: {json}\\n\\n`` lines, terminated by ``data: [DONE]\\n\\n``.
    """
    # Local import so this block does not depend on a file-level import.
    import inspect

    def sse_chunk(content, finish_reason=None):
        # Serialise one delta in the same shape for every emitted chunk.
        payload = {'choices': [{'delta': {'content': content}, 'finish_reason': finish_reason}]}
        return f"data: {json.dumps(payload)}\n\n"

    _, close_tag, _ = get_reasoning_stop_tokens(model_family)

    # Sampling options shared by both passes.
    sampling = {
        'max_tokens': request.max_tokens or 2048,
        'temperature': request.temperature,
        'top_p': request.top_p,
    }

    reasoning_text = ""
    backend = current_manager.backend
    if hasattr(backend, 'generate_stream'):
        stream = backend.generate_stream(
            prompt=raw_prompt_for_generation,
            stop=raw_stop_sequences,
            **sampling,
        )
        # generate_stream may be a coroutine returning an async iterator or
        # an async generator function; only the former can be awaited.
        if inspect.isawaitable(stream):
            stream = await stream
        async for chunk in stream:
            reasoning_text += chunk
            yield sse_chunk(chunk)
            # Stop consuming once the reasoning block has been closed.
            if close_tag and close_tag in reasoning_text:
                break
    else:
        # Fallback: non-streaming first pass, forwarded as a single chunk.
        # NOTE(review): this call looks synchronous and may block the event
        # loop while it runs - confirm against current_manager.generate.
        first_pass_result = current_manager.generate(
            prompt=raw_prompt_for_generation,
            stop=raw_stop_sequences,
            **sampling,
        )
        # Keep the reasoning so the second pass is conditioned on it.
        reasoning_text = first_pass_result
        yield sse_chunk(first_pass_result)

    # Close the reasoning block unless the first pass already emitted the
    # close tag; emitting it twice would duplicate it for the client and in
    # the second-pass prompt.
    if close_tag and close_tag not in reasoning_text:
        yield sse_chunk(close_tag)
        reasoning_text += close_tag

    # Second pass: continue from exactly what the client has seen so far.
    full_prompt = raw_prompt_for_generation + reasoning_text
    second_pass_result = current_manager.generate(
        prompt=full_prompt,
        stop=stop_sequences,
        **sampling,
    )
    yield sse_chunk(second_pass_result, 'stop')
    yield "data: [DONE]\n\n"
return StreamingResponse(raw_stream_generate(), media_type="text/event-stream")
# Non-streaming path (already implemented above)
# First pass: generate until reasoning close tag
first_pass_result = current_manager.generate(
prompt=raw_prompt_for_generation,
max_tokens=request.max_tokens or 2048,
temperature=request.temperature,
top_p=request.top_p,
stop=raw_stop_sequences,
)
if global_debug:
print(f"RAW: First pass result: ...{first_pass_result[-200:]}")
# Extract reasoning (everything up to the close tag)
thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
reasoning_text = ""
final_text = first_pass_result
if close_tag and close_tag in first_pass_result:
# Split at close tag
parts = first_pass_result.split(close_tag, 1)
reasoning_text = parts[0]
final_text = parts[1] if len(parts) > 1 else ""
if global_debug:
print(f"RAW: Extracted reasoning: {reasoning_text[:100]}...")
print(f"RAW: Final text: {final_text[:100]}...")
# If we have reasoning, continue with second pass to get more complete answer
# Build the full prompt with reasoning included
full_prompt = raw_prompt_for_generation + reasoning_text + (close_tag or "")
# Second pass: generate the rest (or just use what we have)
# For now, just return what we have + optionally continue
if final_text.strip():
# We have a complete answer after reasoning
generated_text = reasoning_text + (close_tag or "") + final_text
else:
# Need second pass to get answer
second_pass_result = current_manager.generate(
prompt=full_prompt,
max_tokens=request.max_tokens or 2048,
temperature=request.temperature,
top_p=request.top_p,
stop=stop_sequences,
)
generated_text = reasoning_text + (close_tag or "") + second_pass_result
# Build response similar to generate_chat_response
completion_id = f"chatcmpl-{uuid.uuid4().hex}"
created = int(time.time())
# Filter content
generated_text = filter_malformed_content(generated_text)
if global_dump:
print(f"\n{'='*80}")
print(f"=== RAW MODE OUTPUT ===")
print(f"{'='*80}")
print(generated_text)
print(f"{'='*80}\n")
response_message = {
"role": "assistant",
"content": generated_text,
}
finish_reason = "stop"
# Estimate tokens (rough approximation)
prompt_tokens = len(raw_prompt_for_generation.split())
completion_tokens = len(generated_text.split())
response = {
"id": completion_id,
"object": "chat.completion",
"created": created,
"model": response_model_name,
"choices": [{
"index": 0,
"message": response_message,
"finish_reason": finish_reason,
}],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
},
}
# Add rate limit headers
headers = {}
if 'usage' in response:
headers = current_manager.backend.get_rate_limit_headers(
prompt_tokens=response.get('usage', {}).get('prompt_tokens', 0),
completion_tokens=response.get('usage', {}).get('completion_tokens', 0)
) if hasattr(current_manager.backend, 'get_rate_limit_headers') else {}
return JSONResponse(content=response, headers=headers)
if request.stream: if request.stream:
return StreamingResponse( return StreamingResponse(
stream_chat_response( stream_chat_response(
...@@ -3137,7 +3408,7 @@ def parse_args(): ...@@ -3137,7 +3408,7 @@ def parse_args():
if not value: if not value:
return [] return []
options = [v.strip().lower() for v in value.split(',')] options = [v.strip().lower() for v in value.split(',')]
valid = {'chat', 'stop', 'inject', 'prompt', 'all', 'twopass', 'mock'} valid = {'chat', 'stop', 'inject', 'prompt', 'all', 'twopass', 'mock', 'raw'}
invalid = [o for o in options if o not in valid] invalid = [o for o in options if o not in valid]
if invalid: if invalid:
raise argparse.ArgumentTypeError(f"Invalid choices: {invalid}. Valid options: {valid}") raise argparse.ArgumentTypeError(f"Invalid choices: {invalid}. Valid options: {valid}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment