Add 'raw' option to --force-reasoning for native tokenizer prompt seeding

- Added 'raw' to valid force-reasoning options (chat, stop, inject, prompt, all, twopass, mock, raw)
- Implemented raw mode handler that:
  - Uses tokenizer.apply_chat_template() with add_generation_prompt=True
  - Seeds reasoning tag + commitment sentence
  - Uses two-pass generation: first captures reasoning, then gets final answer
  - Supports both streaming and non-streaming responses
  - Falls back gracefully if tokenizer not available

This enables using the model's native tokenizer for prompt seeding, bypassing double-templating issues with chat APIs.

Add 'raw' option to --force-reasoning for native tokenizer prompt seeding
- Added 'raw' to valid force-reasoning options (chat, stop, inject, prompt, all, twopass, mock, raw)
- Implemented raw mode handler that:
  - Uses tokenizer.apply_chat_template() with add_generation_prompt=True
  - Seeds reasoning tag + commitment sentence
  - Uses two-pass generation: first captures reasoning, then gets final answer
  - Supports both streaming and non-streaming responses
  - Falls back gracefully if tokenizer not available

This enables using the model's native tokenizer for prompt seeding, bypassing double-templating issues with chat APIs.
ceb4ae88 · Your Name · 9de7c79d · ceb4ae88
Commit ceb4ae88 authored Mar 17, 2026 by Your Name
Show whitespace changes
Inline Side-by-side

Showing with 272 additions and 1 deletion

coderai coderai +272 -1

No files found.
--- a/coderai
+++ b/coderai
@@ -2008,6 +2008,71 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
            print(f"...{seeded_prompt[-80:]}")
            print(f"--- END SEEDED PROMPT ---")
+    # Handle 'raw' - use tokenizer's apply_chat_template for raw completion
+    # This bypasses the chat API middleware and uses the model's native template
+    if "raw" in force_reasoning_args:
+        # Get the tokenizer from the backend
+        tokenizer = None
+        if hasattr(current_manager, 'backend') and hasattr(current_manager.backend, 'tokenizer'):
+            tokenizer = current_manager.backend.tokenizer
+        if tokenizer is None:
+            print("WARNING: No tokenizer available for raw mode, falling back to prompt mode")
+        else:
+            # Extract system and user messages for template
+            system_prompt = "You are a helpful assistant."
+            user_message = ""
+            for msg in messages:
+                if msg.role == "system":
+                    system_prompt = msg.content
+                elif msg.role == "user":
+                    # Get the last user message
+                    user_message = msg.content
+            # Convert messages to dict format for apply_chat_template
+            chat_messages = []
+            if system_prompt:
+                chat_messages.append({"role": "system", "content": system_prompt})
+            if user_message:
+                chat_messages.append({"role": "user", "content": user_message})
+            # Get the prompt with generation prompt (forces model to start responding)
+            try:
+                raw_prompt = tokenizer.apply_chat_template(
+                    chat_messages,
+                    add_generation_prompt=True,
+                    tokenize=False
+                )
+            except Exception as e:
+                print(f"WARNING: apply_chat_template failed: {e}, falling back")
+                raw_prompt = f"System: {system_prompt}\n\nUser: {user_message}\n\nAssistant:"
+            # Get the reasoning tag for this model family
+            thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
+            # Append the reasoning tag with commitment sentence
+            # This is what forces the model to start reasoning
+            commitment = "Let me think about this step by step."
+            raw_prompt = raw_prompt + thought_tag + commitment
+            if global_debug:
+                print(f"RAW: Using raw completion with tokenizer apply_chat_template")
+                print(f"\n--- RAW PROMPT (last 120 chars) ---")
+                print(f"...{raw_prompt[-120:]}")
+                print(f"--- END RAW PROMPT ---")
+            # Mark that we're using raw mode so generate() is called instead of generate_chat()
+            use_raw_mode = True
+            raw_prompt_for_generation = raw_prompt
+            raw_stop_sequences = list(stop_sequences)  # Copy current stop sequences
+            # Add the close tag to stop sequences for first pass
+            if close_tag not in raw_stop_sequences:
+                raw_stop_sequences.append(close_tag)
+            if global_debug:
+                print(f"RAW: First pass will stop at: {close_tag}")
    # Prepare stop sequences
    stop_sequences = []
    if request.stop:
@@ -2104,10 +2169,216 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
                }
            })
+    # Handle raw mode - use generate() instead of generate_chat() for raw prompt completion
+    use_raw_mode = False
+    raw_prompt_for_generation = None
+    raw_stop_sequences = None
+    # Check if we set raw mode in the prompt handling section above
+    # The variables should already be set if raw was in force_reasoning_args
+    if "raw" in force_reasoning_args:
+        # Raw mode was already set up in the prompt handling section
+        # Just verify the variables exist
+        try:
+            _ = raw_prompt_for_generation
+            _ = raw_stop_sequences
+        except NameError:
+            # Variables not set - try to get tokenizer again
+            tokenizer = None
+            if hasattr(current_manager, 'backend') and hasattr(current_manager.backend, 'tokenizer'):
+                tokenizer = current_manager.backend.tokenizer
+            if tokenizer is not None:
+                # Extract system and user messages
+                system_prompt = "You are a helpful assistant."
+                user_message = ""
+                for msg in messages:
+                    if msg.role == "system":
+                        system_prompt = msg.content
+                    elif msg.role == "user":
+                        user_message = msg.content
+                # Get the prompt with generation prompt
+                try:
+                    raw_prompt_for_generation = tokenizer.apply_chat_template(
+                        [{"role": "system", "content": system_prompt},
+                         {"role": "user", "content": user_message}],
+                        add_generation_prompt=True,
+                        tokenize=False
+                    )
+                except Exception as e:
+                    raw_prompt_for_generation = f"System: {system_prompt}\n\nUser: {user_message}\n\nAssistant:"
+                # Get reasoning tag
+                thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
+                raw_prompt_for_generation += thought_tag + "Let me think about this step by step."
+                raw_stop_sequences = list(stop_sequences)
+                if close_tag not in raw_stop_sequences:
+                    raw_stop_sequences.append(close_tag)
+                use_raw_mode = True
    # Get resolved model name for response (with coderai/ prefix and proper formatting)
    response_model_name = get_resolved_model_name(requested_model, current_manager)
    print(f"DEBUG: Requested model: {requested_model}, Resolved model for response: {response_model_name}")
+    # Handle raw mode - two pass: first capture reasoning, then get final answer
+    if use_raw_mode and raw_prompt_for_generation:
+        if global_debug:
+            print(f"RAW: Starting two-pass generation")
+            print(f"RAW: First pass prompt: ...{raw_prompt_for_generation[-100:]}")
+        if request.stream:
+            # For streaming, we need to handle it differently
+            # First pass: generate until reasoning close tag (stream it)
+            async def raw_stream_generate():
+                """Stream a two-pass raw-prompt completion as server-sent events.
+
+                Pass one generates from ``raw_prompt_for_generation`` using
+                ``raw_stop_sequences`` and is forwarded to the client as it
+                arrives. Pass two continues from prompt + reasoning + close tag
+                and is sent as a single final chunk, followed by ``[DONE]``.
+                """
+                _, close_tag, _ = get_reasoning_stop_tokens(model_family)
+                reasoning_text = ""
+
+                def sse_event(content, finish_reason=None):
+                    # Frame one content delta as an SSE "data:" event.
+                    payload = {'choices': [{'delta': {'content': content}, 'finish_reason': finish_reason}]}
+                    return f"data: {json.dumps(payload)}\n\n"
+
+                # Use the backend's async generate if available
+                if hasattr(current_manager.backend, 'generate_stream'):
+                    # NOTE(review): the ``await`` assumes generate_stream() is a
+                    # coroutine returning an async iterator; if it is an async
+                    # generator function the ``await`` must be dropped - confirm.
+                    async for chunk in await current_manager.backend.generate_stream(
+                        prompt=raw_prompt_for_generation,
+                        max_tokens=request.max_tokens or 2048,
+                        temperature=request.temperature,
+                        top_p=request.top_p,
+                        stop=raw_stop_sequences,
+                    ):
+                        reasoning_text += chunk
+                        yield sse_event(chunk)
+                        # Stop as soon as the close tag shows up in the stream
+                        if close_tag and close_tag in reasoning_text:
+                            break
+                else:
+                    # Fallback: non-streaming first pass. Keep the text so the
+                    # second pass is conditioned on the reasoning just produced.
+                    reasoning_text = current_manager.generate(
+                        prompt=raw_prompt_for_generation,
+                        max_tokens=request.max_tokens or 2048,
+                        temperature=request.temperature,
+                        top_p=request.top_p,
+                        stop=raw_stop_sequences,
+                    )
+                    yield sse_event(reasoning_text)
+                # Emit the close tag only if pass one did not already produce it;
+                # otherwise the client and the second-pass prompt would see it twice.
+                if close_tag and close_tag not in reasoning_text:
+                    yield sse_event(close_tag)
+                    reasoning_text += close_tag
+                # Second pass: continue from everything the client has seen so far
+                second_pass_result = current_manager.generate(
+                    prompt=raw_prompt_for_generation + reasoning_text,
+                    max_tokens=request.max_tokens or 2048,
+                    temperature=request.temperature,
+                    top_p=request.top_p,
+                    stop=stop_sequences,
+                )
+                yield sse_event(second_pass_result, 'stop')
+                yield "data: [DONE]\n\n"
+            return StreamingResponse(raw_stream_generate(), media_type="text/event-stream")
+        # Non-streaming path (already implemented above)
+        # First pass: generate until reasoning close tag
+        first_pass_result = current_manager.generate(
+            prompt=raw_prompt_for_generation,
+            max_tokens=request.max_tokens or 2048,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stop=raw_stop_sequences,
+        )
+        if global_debug:
+            print(f"RAW: First pass result: ...{first_pass_result[-200:]}")
+        # Extract reasoning (everything up to the close tag)
+        thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
+        reasoning_text = ""
+        final_text = first_pass_result
+        if close_tag and close_tag in first_pass_result:
+            # Split at close tag
+            parts = first_pass_result.split(close_tag, 1)
+            reasoning_text = parts[0]
+            final_text = parts[1] if len(parts) > 1 else ""
+        if global_debug:
+            print(f"RAW: Extracted reasoning: {reasoning_text[:100]}...")
+            print(f"RAW: Final text: {final_text[:100]}...")
+        # If we have reasoning, continue with second pass to get more complete answer
+        # Build the full prompt with reasoning included
+        full_prompt = raw_prompt_for_generation + reasoning_text + (close_tag or "")
+        # Second pass: generate the rest (or just use what we have)
+        # For now, just return what we have + optionally continue
+        if final_text.strip():
+            # We have a complete answer after reasoning
+            generated_text = reasoning_text + (close_tag or "") + final_text
+        else:
+            # Need second pass to get answer
+            second_pass_result = current_manager.generate(
+                prompt=full_prompt,
+                max_tokens=request.max_tokens or 2048,
+                temperature=request.temperature,
+                top_p=request.top_p,
+                stop=stop_sequences,
+            )
+            generated_text = reasoning_text + (close_tag or "") + second_pass_result
+        # Build response similar to generate_chat_response
+        completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+        created = int(time.time())
+        # Filter content
+        generated_text = filter_malformed_content(generated_text)
+        if global_dump:
+            print(f"\n{'='*80}")
+            print(f"=== RAW MODE OUTPUT ===")
+            print(f"{'='*80}")
+            print(generated_text)
+            print(f"{'='*80}\n")
+        response_message = {
+            "role": "assistant",
+            "content": generated_text,
+        }
+        finish_reason = "stop"
+        # Estimate tokens (rough approximation)
+        prompt_tokens = len(raw_prompt_for_generation.split()) 
+        completion_tokens = len(generated_text.split())
+        response = {
+            "id": completion_id,
+            "object": "chat.completion",
+            "created": created,
+            "model": response_model_name,
+            "choices": [{
+                "index": 0,
+                "message": response_message,
+                "finish_reason": finish_reason,
+            }],
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+        }
+        # Add rate limit headers
+        headers = {}
+        if 'usage' in response:
+            headers = current_manager.backend.get_rate_limit_headers(
+                prompt_tokens=response.get('usage', {}).get('prompt_tokens', 0),
+                completion_tokens=response.get('usage', {}).get('completion_tokens', 0)
+            ) if hasattr(current_manager.backend, 'get_rate_limit_headers') else {}
+        return JSONResponse(content=response, headers=headers)
    if request.stream:
        return StreamingResponse(
            stream_chat_response(
@@ -3137,7 +3408,7 @@ def parse_args():
        if not value:
            return []
        options = [v.strip().lower() for v in value.split(',')]
-        valid = {'chat', 'stop', 'inject', 'prompt', 'all', 'twopass', 'mock'}
+        valid = {'chat', 'stop', 'inject', 'prompt', 'all', 'twopass', 'mock', 'raw'}
        invalid = [o for o in options if o not in valid]
        if invalid:
            raise argparse.ArgumentTypeError(f"Invalid choices: {invalid}. Valid options: {valid}")