Commit b7d84534 authored by Your Name

Add debug output for flash-attention and force-reasoning mode

- Enhance the flash attention status output in NvidiaBackend so availability is always printed
- Add debug output to the chat completions endpoint for force-reasoning mode
- Show the CLI flag value, the API parameter, the reasoning action, and whether injection was performed
- Display the actual injected system prompt content when debug mode is enabled
parent b49d3f59
...@@ -30,6 +30,7 @@ class NvidiaBackend(ModelBackend): ...@@ -30,6 +30,7 @@ class NvidiaBackend(ModelBackend):
except ImportError: except ImportError:
self.flash_attn_available = False self.flash_attn_available = False
# Always print the status when model is loaded (for visibility)
if self.use_flash_attn: if self.use_flash_attn:
if self.flash_attn_available: if self.flash_attn_available:
print("Flash Attention 2: Available and enabled") print("Flash Attention 2: Available and enabled")
...@@ -38,6 +39,12 @@ class NvidiaBackend(ModelBackend): ...@@ -38,6 +39,12 @@ class NvidiaBackend(ModelBackend):
print("Install with: pip install flash-attn --no-build-isolation") print("Install with: pip install flash-attn --no-build-isolation")
print("Falling back to standard attention") print("Falling back to standard attention")
self.use_flash_attn = False self.use_flash_attn = False
else:
# Print availability status even when not requested (for transparency)
if self.flash_attn_available:
print("Flash Attention 2: Available (not enabled)")
else:
print("Flash Attention 2: Not available")
def _detect_device(self) -> str: def _detect_device(self) -> str:
"""Auto-detect available GPU or fall back to CPU.""" """Auto-detect available GPU or fall back to CPU."""
......
...@@ -1898,6 +1898,14 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -1898,6 +1898,14 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
force_reasoning_mode = getattr(global_args, 'force_reasoning', None) if global_args else None force_reasoning_mode = getattr(global_args, 'force_reasoning', None) if global_args else None
enable_thinking_api = getattr(request, 'enable_thinking', False) enable_thinking_api = getattr(request, 'enable_thinking', False)
# DEBUG: Print force_reasoning status when debug mode is enabled
if global_debug:
print(f"\n{'='*60}")
print(f"=== REASONING MODE DEBUG ===")
print(f"{'='*60}")
print(f"force_reasoning CLI flag: {force_reasoning_mode}")
print(f"enable_thinking API param: {enable_thinking_api}")
# Determine if reasoning should be enabled # Determine if reasoning should be enabled
# Force reasoning if: API param is true OR CLI flag is set (not None) # Force reasoning if: API param is true OR CLI flag is set (not None)
reasoning_enabled = enable_thinking_api or (force_reasoning_mode is not None) reasoning_enabled = enable_thinking_api or (force_reasoning_mode is not None)
...@@ -1937,6 +1945,16 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -1937,6 +1945,16 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
break break
if not system_found: if not system_found:
messages = [ChatMessage(role="system", content=system_content)] + list(messages) messages = [ChatMessage(role="system", content=system_content)] + list(messages)
# DEBUG: Print injection status
if global_debug:
print(f"reasoning_action: {reasoning_action}")
print(f"reasoning_enabled: {reasoning_enabled}")
print(f"INJECTION DONE: System prompt has been injected with agentic instructions")
print(f"\n--- INJECTED SYSTEM PROMPT ---")
print(system_content)
print(f"--- END SYSTEM PROMPT ---")
print(f"{'='*60}\n")
# Prepare stop sequences (before reasoning block to avoid UnboundLocalError) # Prepare stop sequences (before reasoning block to avoid UnboundLocalError)
stop_sequences = [] stop_sequences = []
...@@ -1954,6 +1972,12 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -1954,6 +1972,12 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
if stop_token not in stop_sequences: if stop_token not in stop_sequences:
stop_sequences.append(stop_token) stop_sequences.append(stop_token)
print(f"DEBUG: Added reasoning stop tokens for model family '{model_family}': {additional_stops}") print(f"DEBUG: Added reasoning stop tokens for model family '{model_family}': {additional_stops}")
# DEBUG: Print stop action
if global_debug:
print(f"reasoning_action: {reasoning_action}")
print(f"STOP TOKENS ADDED: Reasoning stop tokens added to generation")
print(f"{'='*60}\n")
# Format messages with tools if provided # Format messages with tools if provided
if request.tools: if request.tools:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment