#!/usr/bin/env python3
"""
OpenAI-compatible API server for HuggingFace models (NVIDIA) and GGUF models (Vulkan).
Supports CUDA (NVIDIA) and Vulkan (AMD) GPU backends, memory-aware model loading,
streaming, and tool calling.
"""

import argparse
import asyncio
import hashlib
import json
import os
import pathlib
import re
import sys
import time
import uuid
import warnings
import requests
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Dict, List, Optional, Union

import psutil
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
from pydantic import BaseModel, Field, validator, field_validator, ConfigDict
from pydantic_core import PydanticCustomError
from threading import Thread

# Import codai module for enhanced tool call parsing
from codai.models import ModelParserDispatcher, OpenAIFormatter, ToolCallParser, ModelParserAdapter

# Import from codai modules for use in this file
from codai.models.manager import ModelManager, WhisperServerManager, MultiModelManager
from codai.queue.manager import QueueManager, queue_manager
from codai.backends import NvidiaBackend, VulkanBackend, detect_available_backends
from codai.models.capabilities import ModelCapabilities, detect_model_capabilities
from codai.models.utils import get_model_family, get_reasoning_stop_tokens, get_resolved_model_name
from codai.models.cache import (
    get_model_cache_dir, get_all_cache_dirs, get_cached_model_path,
    is_huggingface_model_id, download_huggingface_model, download_model
)
from codai.pydantic.textrequest import (
    ToolFunction, Tool, ChatMessage, ChatCompletionRequest,
    CompletionRequest, ModelInfo, ModelList
)
from codai.pydantic.imagerequest import ImageGenerationRequest, ImageGenerationResponse
from codai.pydantic.transcriptionrequest import TranscriptionRequest, TranscriptionResponse
# Per-model semaphores for request concurrency control.
# NOTE(review): presumably keyed by model name; the code that populates this
# dict is not visible in this part of the file - confirm against the handlers.
model_semaphores: dict = {}
# Single-entry mutable holder for the current load mode; starts as "ondemand".
load_mode = {"mode": "ondemand"}  # Track load mode globally
# One boolean per queue category (model / image / audio / tts), all off by default.
queue_flags = {"model_1": False, "image_1": False, "audio_1": False, "tts_1": False}  # Track --X-1 flags
# =============================================================================
# Model Cache Directory
# =============================================================================

# =============================================================================
# Backend Detection and Imports
# =============================================================================

# =============================================================================
# Flash Attention Detection (for NVIDIA backend)
# =============================================================================

# =============================================================================
# Pydantic Models for API
# =============================================================================

# =============================================================================
# Audio Transcription Models
# =============================================================================

# =============================================================================
# Image Generation Models
# =============================================================================

# =============================================================================
# Content Filtering Utility
# =============================================================================

def filter_malformed_content(text: str) -> str:
    """Remove stray SEARCH/REPLACE diff markers and chat artefacts from model output.

    Falsy input (``None`` or an empty string) is returned untouched.  For any
    other text a fixed series of regex deletions is applied in order, after
    which runs of three or more newlines are collapsed to exactly two.  The
    result is never stripped: single newlines and surrounding whitespace may
    be legitimate content.
    """
    if not text:
        return text

    # (pattern, flags) pairs. Order matters: each deletion runs on the output
    # of the previous one, so an earlier pattern can consume text a later one
    # would otherwise have matched.
    deletions = (
        # Git-style conflict markers and SEARCH/REPLACE blocks
        (r'<<<<<<<\s+SEARCH.*?=======', re.DOTALL),
        (r'=======.*?>>>>>>>\s+REPLACE', re.DOTALL),
        (r'>>>>>>>\s+REPLACE', 0),
        # Other malformed fragments observed in model output
        (r'<<<<<<<\s+SEARCH\s*:start_line:\d+[^<]*', re.DOTALL),
        (r'<button>Stop Generation</button>', 0),
        (r'\<\|assistant\|\>', 0),
        (r'\</\|assistant\|\>', 0),
    )

    result = text
    for pattern, flags in deletions:
        result = re.sub(pattern, '', result, flags=flags)

    # The deletions can leave long blank gaps behind; squeeze them.
    return re.sub(r'\n{3,}', '\n\n', result)


# Control tokens and role markers that may leak into the edges of a response.
_CONTROL_TOKENS = (
    '<|im_end|>',
    '<|im_start|>',
    '<|endoftext|>',
    '<|end_of_text|>',
    '<|eot_id|>',
    '<|eom_id|>',
    'assistant',
    'Assistant',
    'ASSISTANT',
    '<|assistant|>',
    '<|model|>',
    '<|python|>',
    '<|javascript|>',
    '<|html|>',
    '\n\nassistant',
    '\nAssistant',
)

# Every token may be glued to the text by a single newline or space.  The
# variants are ordered longest-first so that a separator is consumed together
# with its token; the secondary sort key only makes the order deterministic.
_CONTROL_TOKEN_VARIANTS = tuple(sorted(
    {separator + token for token in _CONTROL_TOKENS for separator in ('', '\n', ' ')},
    key=lambda variant: (-len(variant), variant),
))


def cleanup_control_tokens(text: str) -> str:
    """
    Clean up leading/trailing control tokens from model output.

    Repeatedly removes tokens such as <|im_end|>, <|im_start|> or a bare
    'assistant' role marker from the start and from the end of ``text``.  A
    token is also removed when it is preceded by one newline or one space.
    Afterwards runs of three or more newlines are collapsed to two and the
    result is stripped of surrounding whitespace.

    Args:
        text: Raw model output. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        The text with edge control tokens removed.

    Note:
        The bare role words ('assistant', 'Assistant', 'ASSISTANT') are removed
        purely by prefix/suffix match, so prose that begins or ends with that
        word loses it as well.
    """
    if not text:
        return text

    cleaned = text

    # Strip from the start until no variant matches any more.
    while True:
        match = next((v for v in _CONTROL_TOKEN_VARIANTS if cleaned.startswith(v)), None)
        if match is None:
            break
        cleaned = cleaned[len(match):]

    # Strip from the end.  Trying the longest variant first is essential here:
    # a string that ends with '\n' + token also ends with token, so testing the
    # bare token first would leave the separator behind and hide the next
    # token (e.g. "...<|im_end|>\n<|endoftext|>").
    while True:
        match = next((v for v in _CONTROL_TOKEN_VARIANTS if cleaned.endswith(v)), None)
        if match is None:
            break
        cleaned = cleaned[:-len(match)]

    # Clean up any resulting runs of blank lines
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    # Strip leading/trailing whitespace
    return cleaned.strip()
# =============================================================================
# Tool Parsing
# =============================================================================

# =============================================================================
# Model Parser Dispatcher Wrapper - integrates model_parser module
# =============================================================================

def format_tools_for_prompt(tools: List[Tool], messages: List[ChatMessage]) -> List[ChatMessage]:
    """Inject a description of the available tools into the conversation.

    With no tools the original ``messages`` object is returned as-is.
    Otherwise a copy of the list is returned in which the tool instructions
    are prepended to the first system message, or inserted as a new leading
    system message when the conversation has none.  The input list itself is
    not modified.
    """
    if not tools:
        return messages

    def _describe(spec) -> str:
        # Name is always present; description and parameters only when truthy.
        lines = [f"Tool: {spec.name}"]
        if spec.description:
            lines.append(f"Description: {spec.description}")
        if spec.parameters:
            lines.append(f"Parameters: {json.dumps(spec.parameters, indent=2)}")
        return "\n".join(lines)

    catalogue = "\n\n".join(_describe(tool.function) for tool in tools)

    # Fixed instruction text that tells the model how to emit a tool call.
    tools_text = "".join((
        "You have access to the following tools:\n\n",
        catalogue,
        "\n\nIMPORTANT: When you need to use a tool, you MUST format your response EXACTLY as:\n",
        '<tool>{"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}</tool>',
        "\n\nRules:\n",
        "1. The content inside <tool> tags must be valid JSON\n",
        "2. Do NOT use nested XML tags like <name> or <arguments> - use JSON format only\n",
        "3. The 'name' field must match one of the available tool names exactly\n",
        "4. The 'arguments' field must be a JSON object with the required parameters\n",
        "\nExample:\n",
        'User: Read the file example.txt\n',
        'Assistant: <tool>{"name": "read_file", "arguments": {"files": [{"path": "example.txt"}]}}</tool>',
    ))

    augmented = list(messages)
    system_index = next(
        (index for index, message in enumerate(augmented) if message.role == "system"),
        None,
    )

    if system_index is None:
        # No system message yet: the tool instructions become the first message.
        augmented.insert(0, ChatMessage(role="system", content=tools_text))
    else:
        # Only the first system message is rewritten; its text follows the tools.
        existing = augmented[system_index]
        augmented[system_index] = ChatMessage(
            role="system",
            content=f"{tools_text}\n\n{existing.content or ''}",
        )

    return augmented
# =============================================================================
# Abstract Model Backend
# =============================================================================

# =============================================================================
# NVIDIA/HuggingFace Backend
# =============================================================================

# =============================================================================
# Vulkan Backend (llama-cpp-python)
# =============================================================================

# =============================================================================
# Model Manager
# =============================================================================

# =============================================================================
# Whisper Server Manager - manages whisper-server subprocess
# =============================================================================

import subprocess
import signal
import requests
import time
import threading
# =============================================================================
# Multi-Model Manager (supports audio transcription and image generation)
# =============================================================================

# Global multi-model manager; the endpoints in this file read models and the
# whisper server from it.
multi_model_manager = MultiModelManager()
# Global model manager (for backward compatibility)
model_manager = ModelManager()

# Global args for access in endpoints
# NOTE(review): presumably assigned the parsed argparse namespace at startup;
# the assignment is not visible in this part of the file.
global_args = None


# Global system prompt (set via --system-prompt flag)
# None = don't inject, True = use default, string = use custom text
global_system_prompt = None


# Global debug flag: when true, the request-logging middleware prints the full
# request body instead of a truncated preview.
global_debug = False
# NOTE(review): no reader of global_dump is visible in this part of the file.
global_dump = False
# Directory the /v1/files/{filename} endpoint serves from; None means the
# endpoint answers 404 "File path not configured".
global_file_path = None

# =============================================================================
# Queue Manager for Model Loading Notifications
# =============================================================================

# Global queue manager
# NOTE(review): this rebinds the `queue_manager` name already imported from
# codai.queue.manager at the top of the file, so this module uses a fresh
# QueueManager instance rather than the imported one - confirm that is intended.
queue_manager = QueueManager()
# =============================================================================
# FastAPI Application
# =============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: no startup work, resource release on shutdown."""
    # Startup: nothing to prepare, hand control straight to the application.
    yield

    # Shutdown: release the multi-model manager first, then the legacy manager.
    for manager in (multi_model_manager, model_manager):
        manager.cleanup()

    # Finally stop the whisper-server helper if one is attached.
    if multi_model_manager.whisper_server:
        multi_model_manager.whisper_server.stop()
# The FastAPI application object. The middleware and route decorators in this
# file register on it, and `lifespan` supplies the shutdown cleanup.
app = FastAPI(
    title="OpenAI-Compatible API",
    description="OpenAI-compatible API supporting NVIDIA (CUDA) and Vulkan backends",
    version="2.0.0",
    lifespan=lifespan,
)

# Add request logging middleware for debugging
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log all incoming requests for debugging.

    Only requests to /v1/chat/completions and /v1/completions are logged;
    every other path is passed straight to ``call_next``.

    For logged paths the middleware:
      1. reads the whole request body and prints it (in full when
         ``global_debug`` is set, otherwise truncated above 2000 characters),
      2. prints a summary of the parsed JSON (settings, messages, tools),
      3. rebuilds the ``Request`` so downstream handlers can read the body again,
      4. prints the response status and, for status >= 400, drains and prints
         the response body before returning an equivalent new response.

    Exceptions raised while handling the request are printed with a traceback
    and re-raised.
    """
    if request.url.path in ["/v1/chat/completions", "/v1/completions"]:
        body = b""
        body_str = ""
        try:
            body = await request.body()
            body_str = body.decode('utf-8')
            
            # NOTE(review): both branches print every request header verbatim,
            # which includes any Authorization / API-key header the client sent.
            # In debug mode, dump the full request
            if global_debug:
                print(f"\n{'='*80}")
                print(f"=== FULL DEBUG REQUEST ===")
                print(f"{'='*80}")
                print(f"Path: {request.url.path}")
                print(f"Method: {request.method}")
                print(f"Headers: {dict(request.headers)}")
                print(f"\n--- FULL BODY ({len(body)} bytes) ---")
                print(body_str)
                print(f"--- END FULL BODY ---")
                print(f"{'='*80}\n")
            else:
                print(f"\n{'='*60}")
                print(f"=== INCOMING REQUEST ===")
                print(f"{'='*60}")
                print(f"Path: {request.url.path}")
                print(f"Method: {request.method}")
                print(f"Headers: {dict(request.headers)}")
                print(f"\n--- RAW BODY ({len(body)} bytes) ---")
                # Print body with truncation for very large bodies:
                # first 1000 and last 1000 characters, with the omitted count between.
                if len(body_str) > 2000:
                    print(f"{body_str[:1000]}...\n... [truncated {len(body_str)-2000} chars] ...\n...{body_str[-1000:]}")
                else:
                    print(body_str)
                print(f"--- END RAW BODY ---")
            
            # Try to parse as JSON to see if it's valid
            try:
                parsed = json.loads(body_str)
                print(f"\n--- PARSED JSON STRUCTURE ---")
                # A non-object JSON body (e.g. a list) has no .keys(); that
                # AttributeError is caught by the generic handler below.
                print(f"Keys: {list(parsed.keys())}")
                
                # Display model settings
                print(f"\n--- MODEL SETTINGS ---")
                model_settings = ['model', 'temperature', 'top_p', 'n', 'max_tokens', 'stream', 'stop', 
                                 'presence_penalty', 'frequency_penalty', 'repeat_penalty',
                                 'tool_choice', 'response_format', 'user', 'enable_thinking']
                for setting in model_settings:
                    if setting in parsed:
                        value = parsed[setting]
                        # Truncate long values for display
                        if isinstance(value, str) and len(value) > 100:
                            value = value[:100] + "..."
                        elif isinstance(value, list) and len(value) > 10:
                            value = value[:10] + [f"... ({len(value)-10} more)"]
                        print(f"  {setting}: {value}")
                print(f"--- END MODEL SETTINGS ---")
                
                if 'messages' in parsed and isinstance(parsed['messages'], list):
                    print(f"\n--- MESSAGES ({len(parsed['messages'])} total) ---")
                    for i, msg in enumerate(parsed['messages']):
                        role = msg.get('role', 'unknown')
                        content = msg.get('content', '')
                        # Show preview of content (first 80 characters, newlines flattened
                        # for string content; non-string content is stringified first)
                        if isinstance(content, str):
                            content_preview = content[:80].replace('\n', ' ')
                            if len(content) > 80:
                                content_preview += "..."
                        else:
                            content_preview = str(content)[:80]
                        print(f"  [{i}] {role}: {content_preview}")
                        # Show tool_calls if present
                        if 'tool_calls' in msg and msg['tool_calls']:
                            print(f"       + tool_calls: {len(msg['tool_calls'])} call(s)")
                        # Show if there's reasoning
                        if 'reasoning' in msg and msg['reasoning']:
                            print(f"       + reasoning: {str(msg['reasoning'])[:50]}...")
                # Printed even when the body has no 'messages' list.
                print(f"--- END MESSAGES ---")
                
                # Display tools if present
                if 'tools' in parsed and parsed['tools']:
                    print(f"\n--- TOOLS ({len(parsed['tools'])} total) ---")
                    for i, tool in enumerate(parsed['tools']):
                        if isinstance(tool, dict):
                            func = tool.get('function', {})
                            name = func.get('name', 'unknown')
                            desc = func.get('description', '')
                            if desc and len(desc) > 60:
                                desc = desc[:60] + "..."
                            print(f"  [{i}] {name}: {desc}")
                
                print(f"--- END PARSED JSON ---")
            except json.JSONDecodeError as e:
                print(f"\n*** JSON Parse Error: {e} ***")
                print(f"Error at position: char {e.pos}, line {e.lineno}, column {e.colno}")
            except Exception as e:
                # Logging is best-effort: a malformed structure must not fail the request.
                print(f"\n*** Error analyzing JSON: {e} ***")
        except Exception as e:
            # Handle ClientDisconnect and other exceptions gracefully
            print(f"Error logging request: {e}")
            # Continue with empty body if we couldn't read it
            body = b""
        
        # Re-create request with body for downstream handlers (only if we successfully read it)
        if body:
            # Replays the already-consumed body as a single http.request message.
            async def receive():
                return {"type": "http.request", "body": body}
            # NOTE(review): relies on the private Request._send attribute, which
            # is not part of Starlette's public API - verify on version upgrades.
            request = Request(request.scope, receive, request._send)
    
    try:
        response = await call_next(request)
        if request.url.path in ["/v1/chat/completions", "/v1/completions"]:
            print(f"\n--- RESPONSE ---")
            print(f"Status Code: {response.status_code}")
            
            # For error responses, try to read and log the body, then create a new response
            if response.status_code >= 400:
                try:
                    # Read the response body (this consumes the iterator, which is
                    # why a fresh Response is built below)
                    response_body = b""
                    async for chunk in response.body_iterator:
                        response_body += chunk
                    
                    error_body = response_body.decode('utf-8')
                    print(f"Error Response Body: {error_body}")
                    
                    # Try to parse and pretty-print error details
                    try:
                        error_json = json.loads(error_body)
                        if 'detail' in error_json:
                            print(f"\n*** VALIDATION ERROR DETAILS ***")
                            detail = error_json['detail']
                            # A list of dicts is printed entry by entry (loc / msg / type);
                            # any other detail value is printed as-is.
                            if isinstance(detail, list):
                                for err in detail:
                                    if isinstance(err, dict):
                                        loc = err.get('loc', [])
                                        msg = err.get('msg', 'Unknown error')
                                        err_type = err.get('type', 'unknown')
                                        print(f"  - Location: {loc}")
                                        print(f"    Message: {msg}")
                                        print(f"    Type: {err_type}")
                            else:
                                print(f"  Detail: {detail}")
                            print(f"*** END ERROR DETAILS ***")
                    except Exception as parse_err:
                        print(f"Could not parse error details: {parse_err}")
                    
                    # Create new response with the same body
                    # NOTE(review): returning here skips the "--- END RESPONSE ---"
                    # footer printed below for non-error responses.
                    from starlette.responses import Response
                    return Response(
                        content=response_body,
                        status_code=response.status_code,
                        headers=dict(response.headers),
                        media_type=response.media_type
                    )
                except Exception as read_err:
                    # NOTE(review): falls through and returns the original response,
                    # whose body iterator may already be partly consumed.
                    print(f"Could not read error response body: {read_err}")
            
            print(f"--- END RESPONSE ---")
            print(f"{'='*60}\n")
        return response
    except Exception as e:
        print(f"\n*** EXCEPTION DURING REQUEST PROCESSING ***")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        import traceback
        traceback.print_exc()
        print(f"*** END EXCEPTION ***\n")
        raise
    finally:
        # Intentionally a no-op; kept as a placeholder.
        if request.url.path in ["/v1/chat/completions", "/v1/completions"]:
            pass  # End logging already done above for successful responses
@app.get("/v1/models", response_model=ModelList)
async def list_models():
    """Return every model known to the multi-model manager as a ModelList."""
    return ModelList(data=multi_model_manager.list_models())

# =============================================================================
# Static File Serving Endpoint
# =============================================================================

@app.get("/v1/files/{filename}")
async def get_file(filename: str):
    """Serve generated files (images, audio) from the file path directory.

    Args:
        filename: Bare name of a file inside ``global_file_path``. Names that
            contain a path separator, or that are ``.`` / ``..``, are rejected
            so a request cannot address anything outside that directory.

    Returns:
        A ``FileResponse`` for the requested file.

    Raises:
        HTTPException: 404 when no file directory is configured, when the
            name is not a bare file name, or when no regular file with that
            name exists.
    """
    if not global_file_path:
        raise HTTPException(status_code=404, detail="File path not configured")

    # The value comes straight from the URL, so treat it as untrusted: accept
    # only a plain file name.  Backslashes are refused as well because they
    # act as path separators on Windows.
    is_bare_name = (
        filename not in ("", ".", "..")
        and "/" not in filename
        and "\\" not in filename
        and os.path.basename(filename) == filename
    )
    if not is_bare_name:
        raise HTTPException(status_code=404, detail="File not found")

    file_path = os.path.join(global_file_path, filename)
    # isfile (not exists): a directory must yield 404 rather than reaching
    # FileResponse, which can only serve regular files.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found")

    return FileResponse(file_path)

# =============================================================================
# Audio Transcription Endpoint
# =============================================================================

from fastapi import UploadFile, File, Form

@app.post("/v1/audio/transcriptions")
async def create_transcription(
    model: str = Form(...),
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    prompt: Optional[str] = Form(None),
    response_format: Optional[str] = Form("json"),
    temperature: Optional[float] = Form(0.0),
):
    """
    Audio transcription endpoint (OpenAI-compatible).
    
    Supports:
    - OpenAI's whisper-1 model (via OpenAI API)
    - Local faster-whisper models (when --audio-model is specified)
    - whisper.cpp server (when --whisper-server is specified)
    """
    # Check if whisper-server is available FIRST (before checking audio_model)
    print(f"DEBUG: Audio request - whisper_server available: {multi_model_manager.whisper_server is not None}, running: {multi_model_manager.whisper_server.is_running() if multi_model_manager.whisper_server else 'N/A'}")
    if multi_model_manager.whisper_server and multi_model_manager.whisper_server.is_running():
        # Use whisper-server - read file and send to server
        file_content = await file.read()
        print(f"DEBUG: whisper-server transcription request - file_size={len(file_content)}, language={language}, prompt={prompt}")
        result = multi_model_manager.whisper_server.transcribe(
            file_content,
            language=language,
            prompt=prompt
        )
        print(f"DEBUG: whisper-server transcription result: {result}")
        if "error" in result:
            raise HTTPException(status_code=500, detail=result["error"])
        # Convert whisper-server response to OpenAI format
        text = result.get("text", "")
        return {
            "text": text
        }
    
    audio_model = multi_model_manager.audio_model
    
    # DEBUG: Print audio model status
    print(f"DEBUG: audio_model check - audio_models list: {multi_model_manager.audio_models}, audio_model: {audio_model}, whisper_server: {multi_model_manager.whisper_server}")
    
    # If no audio model configured, return an error
    if not audio_model:
        raise HTTPException(
            status_code=400,
            detail="Audio transcription not configured. Use --audio-model or --whisper-server to specify a model."
        )
    
    # Determine model to use - always use the configured audio model
    # The model parameter from the request is ignored in favor of the configured transcription model
    actual_model = audio_model  # This is the configured transcription model from --audio-model
    model_to_use = actual_model
    
    print(f"DEBUG: Transcription request model: {model}, using configured model: {actual_model}")
    
    # Check if Vulkan is available for whispercpp
    whisper_vulkan_available = False
    try:
        # Check if whispercpp is installed and has Vulkan support
        import whispercpp
        # Try to detect Vulkan support by checking if we can list devices
        # whispercpp doesn't have a direct Vulkan check, but we can verify by environment
        if os.environ.get('VK_DEVICE_SELECT_DEVICE'):
            whisper_vulkan_available = True
            print(f"Whisper Vulkan: Using configured Vulkan device")
        elif os.path.exists('/dev/dri'):  # Linux DRM devices exist = AMD/Intel GPU
            whisper_vulkan_available = True
            print(f"Whisper Vulkan: Auto-detected GPU")
    except ImportError:
        pass
    
    # Read file content
    file_content = await file.read()
    
    # Write to temp file
    import tempfile
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
        tmp.write(file_content)
        tmp_path = tmp.name
    
    try:
        # Check if model is a GGUF file - faster-whisper doesn't support GGUF format
        is_gguf_model = model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower()
        
        if is_gguf_model:
            # Skip faster-whisper for GGUF files - go directly to whispercpp
            print("Detected GGUF model - using whispercpp backend")
            faster_whisper_failed = True
        else:
            # Try faster-whisper first
            faster_whisper_failed = False
            try:
                from faster_whisper import WhisperModel
                
                # Determine compute type based on GPU availability
                import torch
                if torch.cuda.is_available():
                    compute_type = "float16"
                else:
                    compute_type = "int8"
                
                # Try to load the model (lazy loading)
                model_key = f"audio:{model_to_use}"
                whisper_model = multi_model_manager.get_model(model_key)
                
                if whisper_model is None:
                    print(f"Loading faster-whisper model: {model_to_use}")
                    
                    # Check if model_to_use is a URL - download it (with caching)
                    model_path = None
                    if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
                        # Check cache first
                        cached_path = get_cached_model_path(model_to_use)
                        if cached_path:
                            model_to_use = cached_path
                            print(f"Using cached model: {model_to_use}")
                        else:
                            print(f"Downloading model from URL: {model_to_use}")
                            try:
                                import requests
                                import hashlib
                                
                                # Get cache directory
                                cache_dir = get_model_cache_dir()
                                
                                # Extract filename from URL
                                url_path = model_to_use.split('?')[0]
                                filename = os.path.basename(url_path)
                                
                                if not filename.endswith('.bin') and not filename.endswith('.ggml'):
                                    filename = "whisper-model.bin"
                                
                                # Create safe filename in cache
                                url_hash = hashlib.sha256(model_to_use.encode()).hexdigest()
                                cached_filename = f"{url_hash}_{filename}"
                                model_path = os.path.join(cache_dir, cached_filename)
                                
                                # Download to cache
                                response = requests.get(model_to_use, stream=True)
                                response.raise_for_status()
                                
                                total_size = int(response.headers.get('content-length', 0))
                                downloaded = 0
                                
                                with open(model_path, 'wb') as f:
                                    for chunk in response.iter_content(chunk_size=8192*1024):
                                        if chunk:
                                            f.write(chunk)
                                            downloaded += len(chunk)
                                            if total_size > 0:
                                                percent = (downloaded / total_size) * 100
                                                print(f"Downloaded: {percent:.1f}%", end='\r')
                                
                                print(f"\nDownloaded and cached to: {model_path}")
                                model_to_use = model_path
                                
                            except Exception as e:
                                print(f"Error downloading model: {e}")
                                raise
                    
                    whisper_model = WhisperModel(
                        model_to_use,
                        device="cpu",  # faster-whisper CUDA doesn't work with AMD/Vulkan
                        compute_type=compute_type
                    )
                    # Store in multi_model_manager
                    multi_model_manager.add_model(model_key, whisper_model)
                
                # Run transcription
                segments, info = whisper_model.transcribe(
                    tmp_path,
                    language=language,
                    initial_prompt=prompt,
                    temperature=temperature or 0.0,
                )
                
                # Collect all segments
                text_parts = []
                for segment in segments:
                    text_parts.append(segment.text.strip())
                
                full_text = " ".join(text_parts)
                
                return {"text": full_text}
            
            except ImportError:
                # faster-whisper not available, will try whispercpp below
                faster_whisper_failed = True
            except Exception as e:
                # faster-whisper failed for some other reason
                print(f"Warning: faster-whisper failed to load model: {e}")
                faster_whisper_failed = True
        
        # If faster-whisper failed (not installed or couldn't load), try whispercpp
        if faster_whisper_failed:
            try:
                import whispercpp
                
                # Try to load the model (lazy loading)
                model_key = f"audio:{model_to_use}"
                whisper_model = multi_model_manager.get_model(model_key)
                
                if whisper_model is None:
                    print(f"Loading whispercpp model: {model_to_use}")
                    if whisper_vulkan_available:
                        print(f"  -> Using Vulkan GPU acceleration (device {whisper_vulkan_device})")
                    
                    # Check if model_to_use is a URL - download it (with caching)
                    model_path = None
                    if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
                        # Check cache first
                        cached_path = get_cached_model_path(model_to_use)
                        if cached_path:
                            model_path = cached_path
                            print(f"Using cached model: {model_path}")
                        else:
                            print(f"Downloading model from URL: {model_to_use}")
                            try:
                                import requests
                                import hashlib
                                
                                # Get cache directory
                                cache_dir = get_model_cache_dir()
                                
                                # Extract filename from URL
                                url_path = model_to_use.split('?')[0]
                                filename = os.path.basename(url_path)
                                
                                if not filename.endswith('.gguf'):
                                    filename = "whisper-model.gguf"
                                
                                # Create safe filename in cache
                                url_hash = hashlib.sha256(model_to_use.encode()).hexdigest()
                                cached_filename = f"{url_hash}_{filename}"
                                model_path = os.path.join(cache_dir, cached_filename)
                                
                                # Download to cache
                                response = requests.get(model_to_use, stream=True)
                                response.raise_for_status()
                                
                                total_size = int(response.headers.get('content-length', 0))
                                downloaded = 0
                                
                                with open(model_path, 'wb') as f:
                                    for chunk in response.iter_content(chunk_size=8192*1024):
                                        if chunk:
                                            f.write(chunk)
                                            downloaded += len(chunk)
                                            if total_size > 0:
                                                percent = (downloaded / total_size) * 100
                                                print(f"Downloaded: {percent:.1f}%", end='\r')
                                
                                print(f"\nDownloaded and cached to: {model_path}")
                                model_to_use = model_path
                                
                            except Exception as e:
                                print(f"Error downloading model: {e}")
                                raise
                    
                    # whispercpp needs a local file path
                    if not model_path:
                        model_path = model_to_use if os.path.isfile(model_to_use) else None
                    
                    if not model_path or not os.path.isfile(model_path):
                        raise HTTPException(
                            status_code=400,
                            detail="whispercpp requires a local GGUF file path. Cannot use URLs directly."
                        )
                    
                    # Load the whispercpp model
                    # Note: whispercpp uses model files directly, not paths like Llama
                    # whispercpp only supports:
                    # 1. Built-in model names (tiny, base, small, medium, large-v1, large)
                    # 2. Pre-converted GGUF files in whisper.cpp format (NOT HuggingFace GGUF)
                    try:
                        whisper_model = whispercpp.Whisper.from_pretrained(model_path)
                    except Exception as e:
                        error_msg = str(e).lower()
                        if 'not a valid preconverted model' in error_msg:
                            # This is expected for HuggingFace GGUF files
                            print(f"Warning: whispercpp does not support HuggingFace GGUF format")
                            print("whispercpp only supports its own pre-converted models or built-in names.")
                            print("For Vulkan audio transcription, please either:")
                            print("  1. Install PyTorch + faster-whisper: pip install torch faster-whisper")
                            print("  2. Use a built-in whispercpp model: --audio-model base")
                            raise HTTPException(
                                status_code=400,
                                detail="whispercpp does not support HuggingFace GGUF Whisper models. Use --audio-model with a built-in name (tiny/base/small/medium/large-v1/large) or install faster-whisper with PyTorch."
                            )
                        else:
                            raise
                    
                    # Store in multi_model_manager
                    multi_model_manager.add_model(model_key, whisper_model)
                
                # Run transcription
                # whispercpp returns text directly
                result = whisper_model.transcribe(tmp_path)
                
                # Collect all segments
                text_parts = []
                for segment in result:
                    text_parts.append(str(segment).strip())
                
                full_text = " ".join(text_parts) if text_parts else ""
                
                return {"text": full_text}
            
            except ImportError as e:
                # Check if it's a specific error about whispercpp not working
                error_msg = str(e).lower()
                if 'invalid elf' in error_msg or 'mach-o' in error_msg:
                    # whispercpp library failed to load - architecture mismatch
                    print(f"Warning: whispercpp library failed to load: {e}")
                    print("This usually means whispercpp was installed for a different OS/architecture.")
                    print("Try reinstalling: pip install whispercpp --force-reinstall")
                    print("Audio model will load on-demand when transcription is requested.")
                else:
                    # Neither faster-whisper nor whispercpp available
                    print(f"Warning: No audio transcription library available: {e}")
                    print("Options:")
                    print("  1. Install PyTorch + faster-whisper: pip install torch faster-whisper")
                    print("  2. Use a built-in whispercpp model: --audio-model base")
                    print("  3. Use --whisper-cpp to specify whisper.cpp CLI path")
                    print("Audio model will load on-demand when transcription is requested.")
                
                # Try whisper.cpp CLI as fallback if specified
                whisper_cpp_path = getattr(global_args, 'whisper_cpp', None)
                if whisper_cpp_path and os.path.isfile(whisper_cpp_path):
                    print(f"Using whisper.cpp CLI: {whisper_cpp_path}")
                    try:
                        import subprocess
                        
                        # Determine the model path - check if it's already cached or needs downloading
                        model_path = None
                        
                        # First check if it's a local file
                        if os.path.isfile(model_to_use):
                            model_path = model_to_use
                            print(f"DEBUG: Using local model file: {model_path}")
                        else:
                            # Check cache for downloaded model
                            cached = get_cached_model_path(model_to_use)
                            if cached and os.path.isfile(cached):
                                model_path = cached
                                print(f"DEBUG: Using cached model: {model_path}")
                            else:
                                # Download the model if not cached
                                print(f"DEBUG: Model not cached, downloading: {model_to_use}")
                                cache_dir = get_model_cache_dir()
                                model_path = download_model(model_to_use, cache_dir)
                                print(f"DEBUG: Downloaded model to: {model_path}")
                        
                        print(f"DEBUG: Whisper model: {model_to_use}")
                        print(f"DEBUG: Whisper model path (resolved): {model_path}")
                        
                        # Build whisper.cpp CLI command
                        # Usage: whisper-cli [options] file0 file1 ...
                        # Options:
                        #   -m, --model FNAME    model path
                        #   -f, --file FNAME    input audio file
                        #   -of, --output-file  output file (without extension)
                        #   -dev, --device N   GPU device ID
                        #   -otxt, --output-txt output as text file
                        cmd = [whisper_cpp_path]
                        if model_path:
                            cmd.extend(["-m", model_path])
                        cmd.extend(["-f", tmp_path])
                        cmd.extend(["-otxt"])  # Output as text
                        
                        # Add Vulkan device if specified
                        audio_vulkan_device = getattr(global_args, 'audio_vulkan_device', 0)
                        if audio_vulkan_device is not None:
                            cmd.extend(["-dev", str(audio_vulkan_device)])
                        
                        print(f"DEBUG: Running whisper.cpp command: {' '.join(cmd)}")
                        
                        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
                        
                        if result.returncode == 0:
                            # Read output - whisper.cpp -otxt outputs to stdout or a file
                            # With -otxt flag, it outputs the transcribed text
                            if result.stdout:
                                full_text = result.stdout
                            else:
                                # Try to read from a file with same name as input but .txt extension
                                output_txt = tmp_path + ".txt"
                                if os.path.exists(output_txt):
                                    with open(output_txt, 'r') as f:
                                        full_text = f.read()
                                    os.unlink(output_txt)
                                else:
                                    full_text = ""
                            return {"text": full_text}
                        else:
                            print(f"whisper.cpp CLI error: {result.stderr}")
                    except Exception as subprocess_error:
                        print(f"whisper.cpp CLI subprocess error: {subprocess_error}")
                
                # Return error response
                raise HTTPException(
                    status_code=501,
                    detail="Audio transcription not available. Install faster-whisper (requires PyTorch) or use --whisper-cpp to specify whisper.cpp CLI path."
                )
        
    finally:
        # Cleanup temp file
        os.unlink(tmp_path)
# =============================================================================
# Image Generation Endpoint
# =============================================================================

# Global load_mode tracker - will be set in main()
def get_load_mode():
    """Return the value stored under "mode" in the global load_mode mapping.

    Falls back to "ondemand" when no "mode" key is present.
    """
    current_mode = load_mode.get("mode", "ondemand")
    return current_mode

# Helper function to get CFG scale for image generation
def get_cfg_scale():
    """Get CFG scale for image generation, probing VRAM when Vulkan may be used.

    Reads ``image_cfg_scale`` from ``global_args`` (default 1.0). Any value
    other than 1.0 is treated as an explicit choice and returned unchanged.
    When the value is 1.0 and the backend settings select Vulkan (or leave
    both backends on 'auto'), ``vulkaninfo -J`` is run and the size of each
    device-local memory heap is logged.

    NOTE(review): every path reached while the value is 1.0 also returns 1.0,
    so the probe currently only produces DEBUG output and never changes the
    result.

    Returns:
        The CFG scale to use for image generation.
    """
    cfg_scale = getattr(global_args, 'image_cfg_scale', 1.0)

    # Only auto-detect when the default is in effect.
    if cfg_scale != 1.0:
        return cfg_scale

    backend = getattr(global_args, 'backend', 'auto')
    image_backend = getattr(global_args, 'image_backend', 'auto')

    # Vulkan is in play if requested globally, requested for images, or if
    # both settings are left on 'auto'.
    use_vulkan = (
        backend == 'vulkan'
        or image_backend == 'vulkan'
        or (image_backend == 'auto' and backend == 'auto')
    )
    if not use_vulkan:
        return cfg_scale

    try:
        import subprocess
        # Try vulkaninfo first
        result = subprocess.run(['vulkaninfo', '-J'], capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            import json
            data = json.loads(result.stdout)
            for dev in data.get('devices', []):
                for heap in dev.get('deviceMemoryHeap', [{}]):
                    # The default must be a dict: a heap without 'flags'
                    # (including the {} placeholder above) previously hit
                    # list.get and aborted the whole scan via the except below.
                    if heap.get('flags', {}).get('deviceLocal', False):
                        vram_mb = heap.get('size', 0) / (1024 * 1024)
                        print(f"DEBUG: Detected VRAM: {vram_mb:.0f} MB")
                        if vram_mb < 16000:  # Less than 16GB
                            print(f"DEBUG: VRAM < 16GB, using cfg_scale=1.0 for better performance")
                            return 1.0
                        # Only the first device-local heap of a device is
                        # inspected; move on to the next device.
                        break
    except Exception as e:
        # Best effort: vulkaninfo may be missing, time out, or emit JSON in
        # an unexpected shape. Default to 1.0 for Vulkan if detection fails.
        print(f"DEBUG: Could not detect VRAM: {e}")
        return 1.0

    return cfg_scale

# Helper function to save generated images and return response dict
def _host_without_port(host):
    """Return ``host`` with any ``:port`` suffix removed, ready for use in a URL.

    Bracketed IPv6 literals ("[::1]:8000") keep their brackets. Bare IPv6
    literals ("fe80::1") gain brackets so that a port can be appended
    unambiguously. Hostnames and IPv4 addresses simply lose the port.
    """
    if host.startswith('['):
        closing = host.find(']')
        return host[:closing + 1] if closing != -1 else host
    if host.count(':') > 1:
        # More than one colon cannot be host:port; treat as a bare IPv6 literal.
        return f"[{host}]"
    return host.split(':')[0]


def save_image_response(img, request_format="base64", http_request=None):
    """
    Save image to file path if configured, return response dict.

    If --file-path is set and request_format is url (not base64), return only URL.
    If --file-path is set and request_format is base64, return both URL and base64.
    If --file-path is not set, return base64 as usual.

    Args:
        img: A PIL image, or an array accepted by ``PIL.Image.fromarray``.
        request_format: Response format requested by the client; only the
            value "base64" affects the result when a file path is configured.
        http_request: Optional incoming request. Its Host header (or client
            address) supplies the host part of the URL when the URL setting
            is 'auto'.

    Returns:
        A dict with a "url" key, a "b64_json" key, or both.
    """
    import base64
    import io
    import os
    import uuid
    from PIL import Image

    # Convert to PIL Image if needed
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    def encode_png_base64():
        # Single place for PNG -> base64 so both branches stay in sync.
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    # No file-path, return base64 as usual
    if not global_file_path:
        return {"b64_json": encode_png_base64()}

    result = {}

    os.makedirs(global_file_path, exist_ok=True)
    # Generate unique filename
    filename = f"{uuid.uuid4().hex}.png"
    img.save(os.path.join(global_file_path, filename), format="PNG")

    # Determine base URL based on --url argument
    url_setting = getattr(global_args, 'url', 'auto') if global_args else 'auto'
    if url_setting == 'auto':
        if http_request:
            # The Host header is what the client used to reach the server
            client_host = http_request.headers.get('host', '')
            if not client_host:
                # Fallback to client IP if no Host header
                client_host = http_request.client.host if http_request.client else '127.0.0.1'
            # Drop the port (IPv6-safe); the configured port is appended below
            client_host = _host_without_port(client_host)
        else:
            client_host = '127.0.0.1'
        # Use the configured protocol and port in every case so the URL
        # points at this server even when no request object is available.
        use_https = getattr(global_args, 'https', False) or getattr(global_args, 'pubkey', None)
        protocol = "https" if use_https else "http"
        port = getattr(global_args, 'port', 8000)
        base_url = f"{protocol}://{client_host}:{port}"
    else:
        # Use explicitly provided URL (strip trailing slash if present)
        base_url = url_setting.rstrip('/')
    result["url"] = f"{base_url}/v1/files/{filename}"

    # If client explicitly requested base64, include it
    # Otherwise, only return URL when file-path is set
    if request_format == "base64":
        result["b64_json"] = encode_png_base64()

    return result

@app.post("/v1/images/generations")
async def create_image_generation(request: ImageGenerationRequest, http_request: Request = None):
    """
    Image generation endpoint (OpenAI-compatible).
    
    Supports:
    - Stable Diffusion via stable-diffusion-cpp-python (sd.cpp)
    - Stable Diffusion XL (via local inference with diffusers)
    - Other diffusers models
    """
    # Get or create semaphore for this model
    model_key = f"image:{request.model}" if request.model else "image"
    mode = get_load_mode()
    
    # Check if --image-1 is set (no queue, return 409 if busy)
    use_1_mode = queue_flags.get("image_1", False)
    
    # In loadall mode, allow 1 concurrent request per model
    # In ondemand mode, serialize all requests (use global semaphore)
    if mode == "loadall":
        if model_key not in model_semaphores:
            model_semaphores[model_key] = asyncio.Semaphore(1)
        semaphore = model_semaphores[model_key]
    else:
        # Use a global semaphore for ondemand mode
        if "global_image" not in model_semaphores:
            model_semaphores["global_image"] = asyncio.Semaphore(1)
        semaphore = model_semaphores["global_image"]
    
    # Try to acquire semaphore without blocking
    if use_1_mode:
        acquired = semaphore.locked()
        if acquired:
            raise HTTPException(
                status_code=409,
                detail="Image model is busy. Try again later."
            )
    
    async with semaphore:
        image_model = multi_model_manager.image_model
    
    # If no image model configured, try to use main --model as fallback
    if not image_model:
        # Try to get the main model from args
        main_model = getattr(global_args, 'model', None)
        if main_model and isinstance(main_model, list) and len(main_model) > 0:
            image_model = main_model[0]
        elif main_model:
            image_model = main_model
        
        # Check if main model is a GGUF file - can't use for image generation
        if image_model and ('.gguf' in image_model.lower() or 'gguf' in image_model.lower()):
            print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
            image_model = None  # Can't use GGUF for images
    
    # If still no image model configured, return an error
    if not image_model:
        raise HTTPException(
            status_code=400,
            detail="Image generation not configured. Use --image-model to specify a model."
        )
    
    # Determine model to use
    # Priority: 1) model specified in request, 2) default image model from --image-model
    model_to_use = request.model
    if not model_to_use or model_to_use == "image":
        # No model specified in request, use default
        model_to_use = image_model
    elif model_to_use.startswith("image:"):
        # Legacy format - strip prefix and use default
        model_to_use = image_model
    else:
        # Check if model_to_use is a valid model (URL, file, or known model)
        # If not, fallback to the configured image model to avoid HF resolution errors
        if image_model:
            is_url = model_to_use.startswith('http://') or model_to_use.startswith('https://')
            is_file = os.path.isfile(model_to_use) if model_to_use else False
            if not is_url and not is_file:
                # Unknown model name - use default instead of trying to resolve as HF
                print(f"Warning: Unknown model '{model_to_use}' in image generation request, using configured --image-model")
                model_to_use = image_model
    
    # Track errors for proper fallback chain
    diffusers_error = None
    sd_cpp_error = None
    
    # Parse size (e.g., "1024x1024")
    width, height = 1024, 1024
    if request.size:
        parts = request.size.split("x")
        if len(parts) == 2:
            try:
                width = int(parts[0])
                height = int(parts[1])
            except ValueError:
                pass
    
    # Try diffusers first (torch-based, best quality for NVIDIA)
    # Skip if it's a GGUF model (those need stable-diffusion-cpp)
    # First, cleanup any other models to free VRAM
    for key in list(multi_model_manager.models.keys()):
        # Skip image models
        if key.startswith("image:"):
            continue
        # Unload any other model (text, audio, etc.) to free VRAM
        model_to_cleanup = multi_model_manager.models.get(key)
        if model_to_cleanup is not None:
            print(f"Unloading '{key}' from VRAM to make room for diffusers image model")
            try:
                if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                    model_to_cleanup.cleanup()
                elif hasattr(model_to_cleanup, 'model') and model_to_cleanup.model is not None:
                    if hasattr(model_to_cleanup.model, 'cleanup'):
                        model_to_cleanup.model.cleanup()
            except Exception as e:
                print(f"Warning during cleanup of '{key}': {e}")
            del multi_model_manager.models[key]
    
    try:
        import torch
        from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
        
        # Check if this is a GGUF model - skip diffusers for those
        is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
                        (model_to_use.startswith('http') and '.gguf' in model_to_use))
        
        if is_gguf_model:
            print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
            raise Exception("GGUF model - use stable-diffusion-cpp instead")
        
        # Determine model key
        model_key = f"image:{model_to_use}"
        pipeline = multi_model_manager.get_model(model_key)
        
        if pipeline is None:
            print(f"Loading Stable Diffusion model: {model_to_use}")
            
            # Determine precision from CLI argument
            precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
            precision_map = {
                'bf16': torch.bfloat16,
                'f32': torch.float32,
                'f16': torch.float16,
                'f8': torch.float8_e4m3fn,
            }
            torch_dtype = precision_map.get(precision, torch.float32)
            print(f"Using precision: {precision} ({torch_dtype})")
            
            # Check if CPU offload is requested via CLI
            use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
            
            # Track loading attempts for OOM handling
            load_attempt = 0
            max_attempts = 3
            pipeline = None
            
            while pipeline is None and load_attempt < max_attempts:
                try:
                    load_attempt += 1
                    print(f"Loading attempt {load_attempt}/{max_attempts}...")
                    
                    # Try to load as Stable Diffusion XL first
                    try:
                        pipeline = StableDiffusionXLPipeline.from_pretrained(
                            model_to_use,
                            torch_dtype=torch_dtype,
                            use_safetensors=True,
                        )
                    except Exception:
                        # Try generic diffusion pipeline
                        pipeline = DiffusionPipeline.from_pretrained(
                            model_to_use,
                            torch_dtype=torch_dtype,
                            use_safetensors=True,
                        )
                    
                    # Apply memory optimizations based on attempt
                    if torch.cuda.is_available():
                        if load_attempt >= 2:
                            # Second attempt: enable attention slicing
                            print("Enabling attention slicing for lower VRAM usage...")
                            pipeline.enable_attention_slicing()
                        
                        if load_attempt >= 3 or use_sequential_offload:
                            # Third attempt or offload requested: enable sequential CPU offload
                            print("Enabling sequential CPU offload for lower VRAM usage...")
                            pipeline.enable_sequential_cpu_offload()
                        else:
                            # First attempt: try regular GPU
                            pipeline = pipeline.to("cuda")
                    else:
                        pipeline = pipeline.to("cpu")
                    
                except Exception as load_error:
                    error_msg = str(load_error).lower()
                    is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
                    
                    if is_oom and load_attempt < max_attempts:
                        print(f"OOM during model loading: {load_error}")
                        print(f"Retrying with more aggressive memory optimization...")
                        pipeline = None  # Reset for retry
                        # Clear CUDA cache
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                    else:
                        raise load_error
            
            # Enable VAE tiling if requested (for lower VRAM usage)
            if getattr(global_args, 'vae_tiling', False):
                print("Enabling VAE tiling for lower VRAM usage...")
                try:
                    pipeline.enable_vae_tiling()
                except Exception as e:
                    print(f"Warning: Could not enable VAE tiling: {e}")
            
            multi_model_manager.add_model(model_key, pipeline)
        
        # Get timestamp BEFORE calling diffusers (to avoid scope issues)
        import time as time_module
        timestamp = int(time_module.time())
        
        # Generate images
        # Use request seed if provided, otherwise use CLI default seed
        seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
        generator = None
        if seed is not None:
            generator = torch.Generator(device=pipeline.device).manual_seed(seed)
        
        # Quality: "standard" or "hd"
        quality = request.quality or "standard"
        
        # Use request parameters if provided, otherwise fall back to quality-based defaults
        num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
        cfg_scale = request.guidance_scale if request.guidance_scale else (
            getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
        )
        
        # Generate
        result = pipeline(
            prompt=request.prompt,
            negative_prompt=None,
            num_images_per_prompt=request.n,
            height=height,
            width=width,
            generator=generator,
            guidance_scale=cfg_scale,
            num_inference_steps=num_steps,
        )
        
        # Extract images
        images = []
        try:
            result_images = result.images
        except Exception as img_err:
            print(f"Warning: Could not access result.images: {img_err}")
            # Try alternative: result might have 'image' or 'output'
            result_images = getattr(result, 'image', None) or getattr(result, 'output', None)
            if result_images is None:
                raise Exception(f"Could not extract images from diffusers result: {img_err}")
        
        for img in result_images:
            # Convert to base64
            import base64
            import io
            import numpy as np
            
            # Handle NaN/Inf values in image data - convert to valid values
            if isinstance(img, np.ndarray):
                # Replace NaN and Inf with valid values
                img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
                # Clip to valid range [0, 1]
                img = np.clip(img, 0.0, 1.0)
            
            # Use helper function to save and get response
            img_data = save_image_response(img, request.response_format, http_request)
            images.append(img_data)
        
        return {
            "created": timestamp,
            "data": images
        }
        
    except ImportError as e:
        # diffusers/torch not installed - record error and try sd.cpp
        diffusers_error = str(e)
        print(f"diffusers not available: {diffusers_error}, trying stable-diffusion-cpp-python...")
    except Exception as e:
        # Other error with diffusers - record and try sd.cpp
        import traceback
        diffusers_error = str(e)
        print(f"diffusers error: {diffusers_error}")
        print(f"Traceback: {traceback.format_exc()}")
        print(f"Trying stable-diffusion-cpp-python...")
    
    # Try stable-diffusion-cpp-python (sd.cpp) as fallback
    # First, check all available image models to find one loaded via sd.cpp
    # Always check for cached models - allows dynamically loaded models to be reused across requests
    sd_model = None
    for key in multi_model_manager.models:
        if key.startswith("image:"):
            potential_model = multi_model_manager.get_model(key)
            if potential_model is not None:
                # Check if it's a stable-diffusion-cpp model
                try:
                    from stable_diffusion_cpp import StableDiffusion
                    if isinstance(potential_model, StableDiffusion):
                        sd_model = potential_model
                        print(f"Found cached stable-diffusion-cpp model with key: {key}")
                        break
                except ImportError:
                    pass
    
    # If no cached image model found, need to load one - first cleanup any existing models
    if sd_model is None:
        # Check if there's a text model loaded and unload it to free VRAM
        # Cleanup ALL models except the one we're about to load
        for key in list(multi_model_manager.models.keys()):
            # Skip the image model we'll be loading (if we find it later)
            # For now, cleanup all other models
            if key.startswith("image:"):
                continue
            # Unload any other model (text, audio, etc.) to free VRAM
            model_to_cleanup = multi_model_manager.models.get(key)
            if model_to_cleanup is not None:
                print(f"Unloading '{key}' from VRAM to make room for image model")
                try:
                    if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                        model_to_cleanup.cleanup()
                    elif hasattr(model_to_cleanup, 'model') and model_to_cleanup.model is not None:
                        if hasattr(model_to_cleanup.model, 'cleanup'):
                            model_to_cleanup.model.cleanup()
                except Exception as e:
                    print(f"Warning during cleanup of '{key}': {e}")
                del multi_model_manager.models[key]
    
    if sd_model is not None:
        # Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
        try:
            from stable_diffusion_cpp import StableDiffusion
            if isinstance(sd_model, StableDiffusion):
                print(f"Using stable-diffusion-cpp-python for image generation")
                # Use sd.cpp for generation
                # Parse size
                width, height = 512, 512
                if request.size:
                    parts = request.size.split("x")
                    if len(parts) == 2:
                        try:
                            width = int(parts[0])
                            height = int(parts[1])
                        except ValueError:
                            pass
                
                # Use default steps for Z-Image Turbo (very fast)
                steps = 4  # Default for fast generation
                
                # Generate images using sd.cpp (run in thread to not block event loop)
                # Use request seed if provided, otherwise use CLI default seed
                seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
                
                result = await asyncio.to_thread(
                    sd_model.generate_image,
                    prompt=request.prompt,
                    negative_prompt='',
                    width=width,
                    height=height,
                    cfg_scale=get_cfg_scale(),
                    sample_steps=steps,
                    seed=seed if seed is not None else 42,
                    batch_count=request.n if request.n else 1,
                )
                
                # Small delay to let Vulkan driver settle after generation
                import time
                time.sleep(0.1)
                
                # Convert results to response format
                images = []
                import base64
                import io
                from PIL import Image
                
                for img in result:
                    # Use helper function to save and get response
                    img_data = save_image_response(img, http_request=http_request)
                    images.append(img_data)
                
                return {
                    "created": int(time.time()),
                    "data": images
                }
        except ImportError as e:
            # stable-diffusion-cpp not available
            sd_cpp_error = str(e)
            print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
        except Exception as e:
            print(f"sd.cpp generation error: {e}")
            sd_cpp_error = str(e)
    else:
        # No sd.cpp model pre-loaded, try to load dynamically
        print("No pre-loaded sd.cpp model found, trying to load...")
        try:
            from stable_diffusion_cpp import StableDiffusion
            
            # Check if model_to_use is a URL and get cached path
            # Also handle HuggingFace model IDs that need to be resolved
            model_path = None
            if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
                cached_path = get_cached_model_path(model_to_use)
                if cached_path:
                    model_path = cached_path
                    print(f"Using cached model: {model_path}")
                else:
                    # Not cached - download it
                    print(f"Downloading model: {model_to_use}")
                    cache_dir = get_model_cache_dir()
                    model_path = download_model(model_to_use, cache_dir)
                    print(f"Downloaded to: {model_path}")
            elif os.path.isfile(model_to_use):
                model_path = model_to_use
            else:
                # Try to resolve as HuggingFace model ID
                print(f"Trying to resolve as HuggingFace model ID: {model_to_use}")
                try:
                    from huggingface_hub import hf_hub_download, list_repo_files
                    
                    # Parse model name (format: "org/model" or "org/model/filename.gguf")
                    parts = model_to_use.split('/')
                    if len(parts) >= 2:
                        repo_id = f"{parts[0]}/{parts[1]}"
                        
                        # First check if there's a cached GGUF file for this model
                        # Try common GGUF file patterns
                        files = list_repo_files(repo_id)
                        gguf_files = [f for f in files if f.endswith('.gguf')]
                        
                        if gguf_files:
                            # Try to find a cached version first
                            for gguf_file in gguf_files:
                                # Construct potential URL and check cache
                                potential_url = f"https://huggingface.co/{repo_id}/resolve/main/{gguf_file}"
                                cached = get_cached_model_path(potential_url)
                                if cached:
                                    model_path = cached
                                    print(f"Using cached GGUF model: {model_path}")
                                    break
                            
                            # If not cached, download the first GGUF file
                            if not model_path:
                                print(f"Downloading GGUF model from HF: {gguf_files[0]}")
                                model_path = hf_hub_download(repo_id=repo_id, filename=gguf_files[0])
                                print(f"Downloaded to: {model_path}")
                except Exception as e:
                    print(f"Could not resolve as HuggingFace model: {e}")
            
            if model_path is None:
                print("Warning: Could not resolve sd.cpp model path")
                sd_cpp_error = "Could not resolve model path"
            else:
                # Load sd.cpp model
                # Determine backend to use based on CLI args
                backend = getattr(global_args, 'backend', 'auto')
                image_backend = getattr(global_args, 'image_backend', 'auto')
                
                # Use CUDA only if explicitly requested via --backend nvidia or --image-backend nvidia
                use_cuda = (backend == 'nvidia' or backend == 'cuda' or 
                           image_backend == 'nvidia' or image_backend == 'cuda')
                
                if use_cuda:
                    print(f"Using CUDA backend for sd.cpp image generation")
                else:
                    print(f"Using Vulkan backend for sd.cpp image generation")
                
                # Build kwargs for stable-diffusion-cpp with CLI args
                sd_kwargs = {'diffusion_model_path': model_path}
                
                # Add VAE path from CLI args if provided
                vae_path = getattr(global_args, 'vae_path', None)
                if vae_path:
                    # Check if it's a URL and download if needed
                    if vae_path.startswith('http://') or vae_path.startswith('https://'):
                        cached = get_cached_model_path(vae_path)
                        if cached:
                            sd_kwargs['vae_path'] = cached
                            print(f"Using cached VAE model: {cached}")
                        else:
                            cache_dir = get_model_cache_dir()
                            sd_kwargs['vae_path'] = download_model(vae_path, cache_dir)
                    else:
                        sd_kwargs['vae_path'] = vae_path
                
                # Add LLM/CLIP path from CLI args if provided
                llm_path = getattr(global_args, 'llm_path', None)
                if llm_path:
                    if llm_path.startswith('http://') or llm_path.startswith('https://'):
                        cached = get_cached_model_path(llm_path)
                        if cached:
                            sd_kwargs['llm_path'] = cached
                            print(f"Using cached LLM model: {cached}")
                        else:
                            cache_dir = get_model_cache_dir()
                            sd_kwargs['llm_path'] = download_model(llm_path, cache_dir)
                    else:
                        sd_kwargs['llm_path'] = llm_path
                
                # Add T5XXL path from CLI args if provided
                t5xxl_path = getattr(global_args, 't5xxl_path', None)
                if t5xxl_path:
                    if t5xxl_path.startswith('http://') or t5xxl_path.startswith('https://'):
                        cached = get_cached_model_path(t5xxl_path)
                        if cached:
                            sd_kwargs['t5xxl_path'] = cached
                            print(f"Using cached T5XXL model: {cached}")
                        else:
                            cache_dir = get_model_cache_dir()
                            sd_kwargs['t5xxl_path'] = download_model(t5xxl_path, cache_dir)
                    else:
                        sd_kwargs['t5xxl_path'] = t5xxl_path
                
                # Add clip_on_cpu if specified
                if getattr(global_args, 'clip_on_cpu', False):
                    sd_kwargs['keep_clip_on_cpu'] = True
                    print(f"DEBUG: Running CLIP on CPU to save VRAM (keep_clip_on_cpu=True)")
                
                # Use all available CPU cores
                import psutil
                sd_kwargs['n_threads'] = psutil.cpu_count()
                
                sd_model = StableDiffusion(**sd_kwargs)
                
                # Cache the model for reuse on subsequent requests
                cache_key = f"image:{model_path}"
                multi_model_manager.add_model(cache_key, sd_model)
                print(f"Using stable-diffusion-cpp-python for image generation")
                
                # Generate images
                width, height = 512, 512
                if request.size:
                    parts = request.size.split("x")
                    if len(parts) == 2:
                        try:
                            width = int(parts[0])
                            height = int(parts[1])
                        except ValueError:
                            pass
                
                steps = 4
                
                # Use request seed if provided, otherwise use CLI default seed
                seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
                
                result = await asyncio.to_thread(
                    sd_model.generate_image,
                    prompt=request.prompt,
                    negative_prompt='',
                    width=width,
                    height=height,
                    cfg_scale=get_cfg_scale(),
                    sample_steps=steps,
                    seed=seed if seed is not None else 42,
                    batch_count=request.n if request.n else 1,
                )
                
                # Small delay to let Vulkan driver settle after generation
                import time
                time.sleep(0.1)
                
                # Convert results to response format
                images = []
                import base64
                import io
                from PIL import Image
                
                for img in result:
                    # Use helper function to save and get response
                    img_data = save_image_response(img, http_request=http_request)
                    images.append(img_data)
                
                return {
                    "created": int(time.time()),
                    "data": images
                }
        except ImportError as e:
            sd_cpp_error = str(e)
            print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
        except Exception as e:
            sd_cpp_error = str(e)
            print(f"sd.cpp error: {sd_cpp_error}")
    
    # Both backends failed - return error with installation instructions
    raise HTTPException(
        status_code=400,
        detail=f"Model '{model_to_use}' does not support image generation"
    )
# =============================================================================
# Text-to-Speech Endpoint
# =============================================================================

class TTSRequest(BaseModel):
    # Request body for POST /v1/audio/speech.
    # Fields not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Required. A value starting with "tts:" makes the handler use the
    # server's configured TTS model instead of this string.
    model: str
    # Required. Text handed to the TTS model for synthesis.
    input: str
    # Voice identifier forwarded to the TTS model.
    voice: Optional[str] = Field(default="af_sarah")
    # Requested audio container name.
    response_format: Optional[str] = Field(default="mp3")
    # Speed value forwarded to the TTS model.
    speed: Optional[float] = Field(default=1.0)
class TTSResponse(BaseModel):
    # Response payload for text-to-speech: one string field holding the audio.
    # Fields not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Required. base64 encoded audio
    audio: str = Field(...)
def _download_tts_model(url: str, cache_dir: str) -> str:
    """Download a TTS model file from *url* into *cache_dir*.

    The data is streamed to a uniquely named ``.part`` file next to the
    destination and moved into place with ``os.replace`` only after the
    transfer finished, so an interrupted download never leaves a truncated
    file under the final cache name.

    Args:
        url: HTTP(S) URL of the model file.
        cache_dir: Existing directory in which the file is stored.

    Returns:
        Path of the cached file, named ``<sha256(url)>_<filename>``.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
        OSError: If the file cannot be written or moved into place.
    """
    # Extract filename from URL (query string dropped)
    filename = os.path.basename(url.split('?')[0])
    if not filename.endswith(('.pt', '.bin')):
        filename = "kokoro-model.pt"

    # Create safe filename in cache
    url_hash = hashlib.sha256(url.encode()).hexdigest()
    model_path = os.path.join(cache_dir, f"{url_hash}_{filename}")
    partial_path = f"{model_path}.{uuid.uuid4().hex}.part"

    try:
        # timeout=(connect, read): the read timeout limits the wait for each
        # chunk of data, not the total duration of the download.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192 * 1024):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            print(f"Downloaded: {percent:.1f}%", end='\r')

        # Publish the complete file under its final name in one step.
        os.replace(partial_path, model_path)
    finally:
        # After a successful replace the partial file no longer exists; on
        # any failure remove the leftover so the cache directory stays clean.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

    return model_path


@app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest):
    """
    Text-to-speech endpoint (OpenAI-compatible).

    Supports:
    - Kokoro TTS models (when --tts-model is specified)

    The Kokoro model is loaded on first use and registered in
    ``multi_model_manager`` under the key ``tts:<model>`` so later requests
    reuse it. ``request.response_format`` is not consulted by this handler.

    Args:
        request: Parsed request body (model, input, voice, speed).

    Returns:
        A dict ``{"audio": <base64 string>}`` built from the bytes returned
        by the model's ``generate()`` call.

    Raises:
        HTTPException: 400 if no TTS model is configured, 501 if an
            ImportError occurs (kokoro missing), 500 for any other error
            while loading the model or generating speech.
    """
    tts_model = multi_model_manager.tts_model

    # If no TTS model configured, return an error
    if not tts_model:
        raise HTTPException(
            status_code=400,
            detail="TTS not configured. Use --tts-model to specify a model."
        )

    # Determine model to use
    # NOTE(review): a request model that does not start with "tts:" is used
    # verbatim as a URL or path below, so the client decides what the server
    # downloads and loads. Confirm this is intended for untrusted callers.
    model_to_use = request.model
    if model_to_use.startswith("tts:"):
        model_to_use = tts_model

    # Try to use kokoro if available
    try:
        # NOTE(review): confirm the installed `kokoro` package exposes a
        # `Kokoro` class with a `generate()` method as used here.
        from kokoro import Kokoro

        # Determine model key
        model_key = f"tts:{model_to_use}"
        kokoro_model = multi_model_manager.get_model(model_key)

        if kokoro_model is None:
            print(f"Loading Kokoro TTS model: {model_to_use}")

            # Check if model_to_use is a URL - download it (with caching)
            if model_to_use.startswith(('http://', 'https://')):
                # Check cache first
                cached_path = get_cached_model_path(model_to_use)
                if cached_path:
                    model_path = cached_path
                    print(f"Using cached model: {model_path}")
                else:
                    print(f"Downloading model from URL: {model_to_use}")
                    try:
                        model_path = _download_tts_model(
                            model_to_use, get_model_cache_dir()
                        )
                        print(f"\nDownloaded and cached to: {model_path}")
                    except Exception as e:
                        print(f"Error downloading model: {e}")
                        raise
            else:
                # Use local path or model name
                model_path = model_to_use

            # Load the Kokoro model
            kokoro_model = Kokoro(model_path)
            multi_model_manager.add_model(model_key, kokoro_model)

        # Generate speech
        voice = request.voice or "af_sarah"
        speed = request.speed or 1.0

        # NOTE(review): loading and generation run synchronously inside this
        # async handler and therefore block the event loop while they run.
        audio_bytes = kokoro_model.generate(request.input, voice=voice, speed=speed)

        # Convert to base64
        import base64
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')

        return {
            "audio": audio_base64
        }

    except ImportError as e:
        # kokoro not installed
        raise HTTPException(
            status_code=501,
            detail=f"TTS not available. Install kokoro: pip install kokoro. Error: {str(e)}"
        )
    except Exception as e:
        print(f"TTS error: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"TTS error: {str(e)}")
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest, http_request: Request = None):
    """Chat completions endpoint with streaming and tool support."""
    
    # Check if we should use litellm backend
    parser_type = getattr(global_args, 'parser', 'auto') if global_args else 'auto'
    
    if parser_type == 'litellm':
        # Use LiteLLM backend
        from codai.openai.litellm import get_litellm_backend, LITELLM_AVAILABLE
        
        if not LITELLM_AVAILABLE:
            raise HTTPException(
                status_code=500,
                detail="LiteLLM is not installed. Run: pip install litellm"
            )
        
        # Check for API key in request - litellm requires an API key
        # If not provided, use a fake key to allow the request to proceed
        api_key = None
        
        # Try to get API key from request body
        if hasattr(request, 'api_key') and request.api_key:
            api_key = request.api_key
        
        # If no API key in body, try to get from Authorization header
        if not api_key:
            auth_header = http_request.headers.get('Authorization', '') if http_request else ''
            if auth_header.startswith('Bearer '):
                api_key = auth_header[7:]  # Extract token after 'Bearer '
        
        # If still no API key, use a fake key to allow litellm to proceed
        # litellm will then fail with the actual provider error if needed
        if not api_key:
            api_key = "fake-key-for-local-testing"
            print("DEBUG: No API key provided, using fake key for litellm")
        
        # Determine the base URL for litellm to connect to
        # Use the server's host and port for local connections
        api_base = None
        
        # Check if model starts with 'ollama:' - use local Ollama
        if request.model and request.model.startswith('ollama:'):
            # Get the host from the request headers
            client_host = "127.0.0.1"
            if http_request:
                host_header = http_request.headers.get('host', '')
                if host_header:
                    # Strip port if present
                    if ':' in host_header:
                        client_host = host_header.split(':')[0]
                        if client_host.replace('.', '').isdigit():
                            # It's an IP, keep it
                            pass
                        else:
                            # It's a hostname, use localhost
                            client_host = "127.0.0.1"
                    else:
                        client_host = host_header
            
            # Get port from global_args or use default
            port = getattr(global_args, 'port', 11434) if global_args else 11434
            api_base = f"http://{client_host}:{port}/v1"
            print(f"DEBUG: Using api_base for Ollama: {api_base}")
        else:
            # For non-Ollama models, use the server's own URL as base
            # This allows LiteLLM to make requests to the local server
            if http_request:
                # Get the host from the request headers
                host_header = http_request.headers.get('host', '')
                if host_header:
                    # Strip port if present to reconstruct clean URL
                    if ':' in host_header:
                        client_host = host_header.split(':')[0]
                        # Keep the port from the request for consistency
                        server_port = host_header.split(':')[1] if len(host_header.split(':')) > 1 else str(getattr(global_args, 'port', 6745))
                    else:
                        client_host = host_header
                        server_port = str(getattr(global_args, 'port', 6745))
                else:
                    # Fallback to client host if no Host header
                    client_host = http_request.client.host if http_request.client else "127.0.0.1"
                    server_port = str(getattr(global_args, 'port', 6745))
            else:
                # Fallback if no http_request
                client_host = "127.0.0.1"
                server_port = str(getattr(global_args, 'port', 6745))
            
            # Determine protocol (http or https)
            use_https = getattr(global_args, 'https', False) or getattr(global_args, 'pubkey', None)
            protocol = "https" if use_https else "http"
            api_base = f"{protocol}://{client_host}:{server_port}/v1"
            print(f"DEBUG: Using api_base for local server: {api_base}")
        
        # Get or create litellm backend
        litellm_backend = get_litellm_backend(
            model=request.model,
            api_key=api_key,
            api_base=api_base,
            context_window=8192,  # Default, can be made configurable
            model_manager=multi_model_manager  # Pass for alias resolution
        )
        
        # Get the tool_parser from multi_model_manager for model-specific parsing
        tool_parser = multi_model_manager.tool_parser if hasattr(multi_model_manager, 'tool_parser') else None
        
        # Convert messages to dict format
        messages_dict = []
        for msg in request.messages:
            msg_dict = {"role": msg.role, "content": msg.content or ""}
            if hasattr(msg, 'tool_calls') and msg.tool_calls:
                msg_dict["tool_calls"] = msg.tool_calls
            if hasattr(msg, 'tool_call_id') and msg.tool_call_id:
                msg_dict["tool_call_id"] = msg.tool_call_id
            messages_dict.append(msg_dict)
        
        # Prepare tools if provided
        tools_dict = None
        if request.tools:
            tools_dict = request.tools
        
        # Generate response
        try:
            if request.stream:
                # Streaming response
                
                async def generate():
                    try:
                        async for chunk in await litellm_backend.chat_completion(
                            messages=messages_dict,
                            model=request.model,
                            temperature=request.temperature,
                            top_p=request.top_p,
                            max_tokens=request.max_tokens,
                            stop=request.stop,
                            tools=tools_dict,
                            tool_choice=request.tool_choice,
                            stream=True,
                            tool_parser=tool_parser,
                        ):
                            # Add rate limit headers
                            headers = {}
                            if 'usage' in chunk:
                                headers = litellm_backend.get_rate_limit_headers(
                                    prompt_tokens=chunk.get('usage', {}).get('prompt_tokens', 0),
                                    completion_tokens=chunk.get('usage', {}).get('completion_tokens', 0)
                                )
                            
                            # Handle Qwen tool calls if model is Qwen family
                            if 'qwen' in request.model.lower():
                                content = chunk.get('choices', [{}])[0].get('delta', {}).get('content', '')
                                tool_calls = chunk.get('choices', [{}])[0].get('delta', {}).get('tool_calls', [])
                                
                                if not tool_calls and content:
                                    # Try to parse tool calls from content
                                    tool_calls = litellm_backend.parse_qwen_tool_calls(content)
                                    if tool_calls:
                                        # Strip tool tags from content
                                        content = litellm_backend.strip_tool_tags(content)
                                        chunk['choices'][0]['delta']['content'] = content
                                        chunk['choices'][0]['delta']['tool_calls'] = tool_calls
                            
                            yield f"data: {json.dumps(chunk)}\n\n"
                        
                        yield "data: [DONE]\n\n"
                    except Exception as e:
                        yield f"data: {json.dumps({'error': {'message': str(e), 'type': 'internal_error'}})}\n\n"
                
                return StreamingResponse(generate(), media_type="text/event-stream")
            else:
                # Non-streaming response
                response = await litellm_backend.chat_completion(
                    messages=messages_dict,
                    model=request.model,
                    temperature=request.temperature,
                    top_p=request.top_p,
                    max_tokens=request.max_tokens,
                    stop=request.stop,
                    tools=tools_dict,
                    tool_choice=request.tool_choice,
                    stream=False,
                    tool_parser=tool_parser,
                )
                
                # Handle Qwen tool calls
                if 'qwen' in request.model.lower() and 'choices' in response:
                    msg = response['choices'][0].get('message', {})
                    content = msg.get('content', '')
                    tool_calls = msg.get('tool_calls', [])
                    
                    if not tool_calls and content:
                        tool_calls = litellm_backend.parse_qwen_tool_calls(content)
                        if tool_calls:
                            msg['content'] = litellm_backend.strip_tool_tags(content)
                            msg['tool_calls'] = tool_calls
                            response['choices'][0]['message'] = msg
                
                # Add rate limit headers
                headers = {}
                if 'usage' in response:
                    headers = litellm_backend.get_rate_limit_headers(
                        prompt_tokens=response.get('usage', {}).get('prompt_tokens', 0),
                        completion_tokens=response.get('usage', {}).get('completion_tokens', 0)
                    )
                
                
        except Exception as e:
            # Handle litellm errors
            error_response = {
                "error": {
                    "message": str(e),
                    "type": "internal_error",
                    "code": 500
                }
            }
            return JSONResponse(content=error_response, status_code=500)
    
    # Continue with original implementation for 'auto' parser
    # Get the model for this request
    requested_model = request.model
    
    # Try to get the appropriate model
    mm = multi_model_manager.get_model_for_request(requested_model)
    
    if mm is None:
        # Model not loaded - try to use default
        if model_manager.backend is not None:
            # Fallback to legacy model_manager
            current_manager = model_manager
        else:
            raise HTTPException(status_code=503, detail="Model not loaded")
    else:
        current_manager = mm
    
    # Inject system prompt if --system-prompt flag was provided
    messages = request.messages
    if global_system_prompt is not None:
        # Get the custom system prompt text
        if global_system_prompt is True:
            # Default system prompt
            system_addon = "You are a helpful assistant."
        else:
            # Custom system prompt provided as argument
            system_addon = str(global_system_prompt)
        
        # Check if there's already a system message
        system_found = False
        for i, msg in enumerate(messages):
            if msg.role == "system":
                # Chain the custom system prompt at the START of existing system message
                messages[i] = ChatMessage(role="system", content=system_addon + "\n\n" + msg.content)
                system_found = True
                break
        
        if not system_found:
            # No existing system message, use the custom one
            messages = [ChatMessage(role="system", content=system_addon)] + list(messages)
    
    # Enable thinking/reasoning mode if requested via API parameter OR CLI flag
    force_reasoning_args = getattr(global_args, 'force_reasoning', None) if global_args else None
    
    enable_thinking_api = getattr(request, 'enable_thinking', False)
    
    # Parse force_reasoning: can be list (from CLI) or string (legacy)
    if isinstance(force_reasoning_args, str):
        # Legacy: convert string to list
        if force_reasoning_args == "both":
            force_reasoning_args = ["inject", "stop"]
        elif force_reasoning_args == "stop":
            force_reasoning_args = ["stop"]
        elif force_reasoning_args == "inject":
            force_reasoning_args = ["inject"]
        elif force_reasoning_args == "all":
            # 'all' enables all reasoning methods
            force_reasoning_args = ["chat", "inject", "prompt", "mock", "raw", "twopass"]
        else:
            force_reasoning_args = []
    elif not force_reasoning_args:
        force_reasoning_args = []
    
    # Combine CLI args with API param
    # 'chat' from CLI enables API reasoning param
    reasoning_enabled = enable_thinking_api or (len(force_reasoning_args) > 0)
    
    # DEBUG: Print force_reasoning status when debug mode is enabled
    if global_debug:
        print(f"\n{'='*60}")
        print(f"=== REASONING MODE DEBUG ===")
        print(f"{'='*60}")
        print(f"force_reasoning CLI args: {force_reasoning_args}")
        print(f"enable_thinking API param: {enable_thinking_api}")
        # Debug stop sequences if available
        if 'raw_stop_sequences' in locals():
            print(f"stop argument for chat call: {raw_stop_sequences}")
    
    # Get model family for reasoning tokens
    model_family = get_model_family(request.model)
    
    # Check if model is qwen3 and force_reasoning is enabled
    is_qwen3 = 'qwen3' in model_family.lower() if model_family else False
    use_qwen3_penalties = is_qwen3 and force_reasoning_args
    
    # System prompt addon for qwen3 with force_reasoning
    qwen3_system_addon = ""
    if use_qwen3_penalties:
        qwen3_system_addon = "\n\nCRITICAL: Do not repeat tool calls. If a tool fails with an [ERROR], do not retry the exact same parameters. Propose a different approach or ask for clarification."
        if global_debug:
            print(f"QWEEN3: Adding penalties and system addon for qwen3 with force_reasoning")
    
    # Handle 'chat' - enable thinking API parameter
    if "chat" in force_reasoning_args or enable_thinking_api:
        # Note: This only works with compatible APIs (OpenAI-like)
        # We'll set it on the request if supported
        if hasattr(request, 'thinking'):
            request.thinking = {"type": "enabled"}
        if global_debug:
            print(f"CHAT: Reasoning API param enabled")
    
    # Handle 'inject' - system prompt injection
    # Skip for 'raw' mode since it handles everything separately
    if "raw" not in force_reasoning_args and "inject" in force_reasoning_args:
        from codai.models.templates import AgenticTemplateManager
        template_manager = AgenticTemplateManager(request.model)
        
        # Use reasoning tag (]]) when prompt is also selected for consistency
        use_reasoning_tag = "prompt" in force_reasoning_args
        
        # Get the current system prompt if exists
        system_content = None
        for msg in messages:
            if msg.role == "system":
                system_content = msg.content
                break
        if system_content:
            # Inject agentic instructions
            system_content = template_manager.get_agent_system_prompt(system_content, use_reasoning_tag=use_reasoning_tag)
        else:
            system_content = template_manager.get_agent_system_prompt("You are a helpful assistant.", use_reasoning_tag=use_reasoning_tag)
        # Update or add system message
        system_found = False
        for i, msg in enumerate(messages):
            if msg.role == "system":
                messages[i] = ChatMessage(role="system", content=system_content)
                system_found = True
                break
        if not system_found:
            messages = [ChatMessage(role="system", content=system_content)] + list(messages)
        
        if global_debug:
            print(f"INJECT: System prompt injected with agentic instructions")
            print(f"\n--- INJECTED SYSTEM PROMPT ---")
            print(system_content)
            print(f"--- END SYSTEM PROMPT ---")
    
    # Handle 'prompt' - prompt seeding (ends with thought tag)
    # Note: 'prompt' and 'raw' are mutually exclusive - raw bypasses this
    if "prompt" in force_reasoning_args and "raw" not in force_reasoning_args:
        from codai.models.templates import AgenticTemplateManager
        template_manager = AgenticTemplateManager(request.model)
        
        # Convert messages to the format expected by force_reasoning_prompt
        user_message = ""
        system_prompt = "You are a helpful assistant."
        
        # Extract system and user messages
        for msg in messages:
            if msg.role == "system":
                system_prompt = msg.content
            elif msg.role == "user":
                user_message = msg.content
        
        # Add qwen3 system addon if applicable
        if qwen3_system_addon:
            system_prompt = system_prompt + qwen3_system_addon
        
        # Get the seeded prompt (ends with thought tag)
        seeded_prompt = template_manager.force_reasoning_prompt(system_prompt, user_message)
        
        # Replace messages with the seeded prompt (as a single user message for raw completion)
        messages = [ChatMessage(role="user", content=seeded_prompt)]
        
        if global_debug:
            print(f"PROMPT: Prompt seeding applied (ends with thought tag)")
            print(f"\n--- SEEDED PROMPT (last 80 chars) ---")
            print(f"...{seeded_prompt[-80:]}")
            print(f"--- END SEEDED PROMPT ---")
    
    # Handle 'raw' - use template_manager.format_for_raw_completion for raw completion
    # This bypasses the chat API and uses the model's native template with reasoning seed
    # The template_manager.format_for_raw_completion will be called in the block below
    
    # Prepare stop sequences
    stop_sequences = []
    if request.stop:
        if isinstance(request.stop, str):
            stop_sequences = [request.stop]
        else:
            stop_sequences = list(request.stop)
    
    # Handle 'stop' - add reasoning stop tokens (also done for 'inject' and 'prompt')
    # Skip for 'raw' mode since it handles stop tokens separately
    if "raw" not in force_reasoning_args and ("stop" in force_reasoning_args or "inject" in force_reasoning_args or "prompt" in force_reasoning_args):
        _, _, additional_stops = get_reasoning_stop_tokens(model_family)
        for stop_token in additional_stops:
            if stop_token not in stop_sequences:
                stop_sequences.append(stop_token)
        
        # When using prompt seeding, also add the </think> close tag so generation stops after reasoning
        if "prompt" in force_reasoning_args:
            # Add common reasoning end tags based on model family
            if "</think>" not in stop_sequences:
                stop_sequences.append("</think>")
        
        if global_debug:
            print(f"STOP: Added reasoning stop tokens: {additional_stops}")
    
    # Format messages with tools if provided - BUT SKIP for raw mode
    # (raw mode handles tools separately via format_for_raw_completion)
    if request.tools and "raw" not in force_reasoning_args:
        messages = format_tools_for_prompt(request.tools, messages)
    
    # Get the tool_parser from the current manager
    tool_parser = current_manager.tool_parser if hasattr(current_manager, 'tool_parser') else ModelParserAdapter()
    
    # Convert messages to dict format for chat completion
    messages_dict = []
    for msg in messages:
        msg_dict = {"role": msg.role}
        # Always include content key - llama_cpp template expects it
        # Convert content to string if it's a list (multipart content)
        content = msg.content
        if content is None:
            content = ""
        elif isinstance(content, list):
            # Handle multipart content array format: [{"type": "text", "text": "..."}]
            parts = []
            for item in content:
                if isinstance(item, dict):
                    if item.get('type') == 'text' and 'text' in item:
                        parts.append(item['text'])
                    else:
                        parts.append(f"[{item.get('type', 'unknown')} content]")
                else:
                    parts.append(str(item))
            content = '\n'.join(parts)
        # Ensure content is never None - convert to string
        msg_dict["content"] = str(content) if content is not None else ""
        # Handle tool_calls - convert to proper format if present
        if msg.tool_calls:
            # tool_calls should be a list of dicts with 'id', 'type', 'function' keys
            msg_dict["tool_calls"] = msg.tool_calls
        if msg.name:
            msg_dict["name"] = msg.name
        if msg.tool_call_id:
            msg_dict["tool_call_id"] = msg.tool_call_id
        messages_dict.append(msg_dict)
    
    # Final safety check: ensure NO message has None content before passing to llama_cpp
    # Also ensure content key always exists (not just None check)
    for i, m in enumerate(messages_dict):
        # Handle missing content key entirely
        if "content" not in m:
            messages_dict[i]["content"] = ""
        # Handle None content
        elif m.get("content") is None:
            messages_dict[i]["content"] = ""
        # Handle content that's not a string (shouldn't happen but be safe)
        elif not isinstance(m["content"], str):
            messages_dict[i]["content"] = str(m["content"])
    
    # Debug: print first few messages to see their structure
    print(f"DEBUG: messages_dict[0] keys: {list(messages_dict[0].keys()) if messages_dict else 'empty'}")
    if len(messages_dict) > 1:
        print(f"DEBUG: messages_dict[1] keys: {list(messages_dict[1].keys()) if len(messages_dict) > 1 else 'empty'}")
    
    # Convert tools to dict format if present
    tools_dict = None
    if request.tools:
        tools_dict = []
        for tool in request.tools:
            tools_dict.append({
                "type": tool.type,
                "function": {
                    "name": tool.function.name,
                    "description": tool.function.description,
                    "parameters": tool.function.parameters
                }
            })
    
    # Handle raw mode - use generate() instead of generate_chat() for raw prompt completion
    # Note: These may have been set earlier in the prompt handling section
    # Initialize only if not already set
    if 'use_raw_mode' not in locals():
        use_raw_mode = False
    if 'raw_prompt_for_generation' not in locals():
        raw_prompt_for_generation = None
    if 'raw_stop_sequences' not in locals():
        raw_stop_sequences = None
    
    # Check if we need to set up raw mode (if not already done in prompt handling)
    if "raw" in force_reasoning_args and not use_raw_mode:
        # Create template_manager if not already created
        if 'template_manager' not in locals():
            from codai.models.templates import AgenticTemplateManager
            template_manager = AgenticTemplateManager(request.model)
        
        # Use template_manager.format_for_raw_completion which handles everything
        if hasattr(template_manager, 'format_for_raw_completion'):
            # Extract system and user messages
            system_prompt = "You are a helpful assistant."
            user_message = ""
            for msg in messages:
                if msg.role == "system":
                    system_prompt = msg.content
                elif msg.role == "user":
                    user_message = msg.content
            
            raw_prompt_for_generation, raw_stop_sequences = template_manager.format_for_raw_completion(
                system_prompt=system_prompt,
                user_message=user_message,
                inject_system=True,
                force_reasoning=True,
                tools=request.tools  # Pass tools for family-specific formatting
            )
            use_raw_mode = True
            
            if global_debug:
                print(f"RAW: Using template_manager.format_for_raw_completion")
                print(f"RAW: Prompt ends with: ...{raw_prompt_for_generation[-80:]}")
        else:
            if global_debug:
                print(f"RAW: template_manager.format_for_raw_completion not available")
    
    # Get resolved model name for response (with coderai/ prefix and proper formatting)
    # Use multi_model_manager to get the actual loaded models, not the individual model manager
    response_model_name = get_resolved_model_name(requested_model, multi_model_manager)
    print(f"DEBUG: Requested model: {requested_model}, Resolved model for response: {response_model_name}")
    
    # Handle raw mode - two pass: first capture reasoning, then get final answer
    if use_raw_mode and raw_prompt_for_generation:
        if global_debug:
            print(f"RAW: Starting two-pass generation")
            print(f"RAW: First pass prompt: ...{raw_prompt_for_generation[-100:]}")
        
        # Build extra params for qwen3
        extra_params = {}
        if use_qwen3_penalties:
            extra_params = {
                'repeat_penalty': 1.15,
                'presence_penalty': 1.5,
                'frequency_penalty': 0.5,
            }
        
        if request.stream:
            # For streaming, we need to handle it differently
            # First pass: generate until reasoning close tag (stream it)
            async def raw_stream_generate():
                """Stream a two-pass raw completion as OpenAI-style SSE chunks.

                First pass: generate from ``raw_prompt_for_generation`` (streamed
                through ``backend.generate_stream`` when the backend has it,
                otherwise a single ``current_manager.generate`` call) and forward
                the text to the client as it arrives.

                Second pass: re-prompt with the first-pass text appended (plus the
                reasoning close tag when the first pass did not already contain
                it) and emit either extracted tool calls or the plain content,
                followed by the ``[DONE]`` sentinel.

                Yields:
                    str: ``data: ...`` server-sent-event lines.
                """
                # Keep this local import: the enclosing function also imports json
                # locally (in its dump branch), so without it the name would resolve
                # to that possibly-unbound enclosing local.
                import json
                _, close_tag, _ = get_reasoning_stop_tokens(model_family)
                reasoning_text = ""

                # Shared envelope so every chunk carries the same fields as the
                # regular streaming path (id/object/created/model/index).
                stream_id = f"chatcmpl-{uuid.uuid4().hex}"
                created_at = int(time.time())

                def sse_chunk(delta, finish_reason=None):
                    """Serialize one chat.completion.chunk as an SSE data line."""
                    payload = {
                        "id": stream_id,
                        "object": "chat.completion.chunk",
                        "created": created_at,
                        "model": response_model_name,
                        "choices": [{
                            "index": 0,
                            "delta": delta,
                            "finish_reason": finish_reason,
                        }],
                    }
                    return f"data: {json.dumps(payload)}\n\n"

                if global_debug:
                    print(f"DEBUG: raw_stream_generate started, stream=True")

                # Use the backend's async generate if available
                if hasattr(current_manager.backend, 'generate_stream'):
                    async for chunk in current_manager.backend.generate_stream(
                        prompt=raw_prompt_for_generation,
                        max_tokens=request.max_tokens or 2048,
                        temperature=request.temperature,
                        top_p=request.top_p,
                        stop=raw_stop_sequences,
                        **extra_params,
                    ):
                        reasoning_text += chunk

                        # Debug: log first pass chunks
                        if global_debug:
                            print(f"DEBUG FIRST PASS: chunk length={len(chunk)}, total reasoning so far={len(reasoning_text)}")

                        yield sse_chunk({'content': chunk})

                        # Stop the first pass as soon as the close tag shows up
                        if close_tag and close_tag in reasoning_text:
                            if global_debug:
                                print(f"DEBUG: Close tag detected in first pass, reasoning length={len(reasoning_text)}")
                            break
                else:
                    # Fallback: non-streaming
                    if global_debug:
                        print(f"DEBUG: Using non-streaming fallback for first pass")
                    # NOTE(review): this call is made without await, so it appears to
                    # be synchronous and would block the event loop while it runs.
                    first_pass_result = current_manager.generate(
                        prompt=raw_prompt_for_generation,
                        max_tokens=request.max_tokens or 2048,
                        temperature=request.temperature,
                        top_p=request.top_p,
                        stop=raw_stop_sequences,
                        **extra_params,
                    )
                    # Record the first-pass text so the second pass continues from
                    # it (previously it was dropped and the second pass re-started
                    # from the bare prompt).
                    reasoning_text = first_pass_result or ""
                    yield sse_chunk({'content': reasoning_text})

                # Emit/append the close tag only when the first pass did not already
                # produce it; otherwise the client and the second-pass prompt would
                # both see the tag twice.
                close_tag_already_present = bool(close_tag) and close_tag in reasoning_text
                if close_tag and not close_tag_already_present:
                    yield sse_chunk({'content': close_tag})
                    full_prompt = raw_prompt_for_generation + reasoning_text + close_tag
                else:
                    full_prompt = raw_prompt_for_generation + reasoning_text

                if global_debug:
                    print(f"DEBUG: raw_stream_generate second pass, full_prompt length: {len(full_prompt)}")

                # Second pass: get the rest
                second_pass_result = current_manager.generate(
                    prompt=full_prompt,
                    max_tokens=request.max_tokens or 2048,
                    temperature=request.temperature,
                    top_p=request.top_p,
                    stop=stop_sequences,
                    **extra_params,
                )

                # In debug mode, dump the full generated text (second pass result)
                if global_debug:
                    print(f"\n{'='*80}")
                    print(f"=== RAW STREAM: FULL GENERATED TEXT (DEBUG) ===")
                    print(f"{'='*80}")
                    print(f"--- SECOND PASS RESULT ---")
                    print(second_pass_result)
                    print(f"--- END SECOND PASS RESULT ---")
                    print(f"{'='*80}\n")

                    # Also dump the reasoning text from first pass
                    print(f"\n{'='*80}")
                    print(f"=== RAW STREAM: REASONING TEXT (DEBUG) ===")
                    print(f"{'='*80}")
                    print(reasoning_text)
                    print(f"{'='*80}\n")

                # Try to extract tool calls from the second pass result;
                # if it is empty or whitespace, fall back to the reasoning text.
                extracted_tool_calls = None
                text_for_tool_extraction = second_pass_result

                if not text_for_tool_extraction or not text_for_tool_extraction.strip():
                    if global_debug:
                        print(f"DEBUG: Second pass result is empty, trying reasoning text")
                        print(f"DEBUG: Reasoning text length: {len(reasoning_text)}")
                        print(f"DEBUG: Reasoning text preview: {reasoning_text[:200] if reasoning_text else 'empty'}")
                    text_for_tool_extraction = reasoning_text

                if global_debug:
                    print(f"DEBUG: Final text for tool extraction: {text_for_tool_extraction[:200] if text_for_tool_extraction else 'empty'}")

                if request.tools and text_for_tool_extraction:
                    # Convert tools for ModelParserAdapter
                    from codai.pydantic.textrequest import Tool, ToolFunction
                    from codai.models.parser import ModelParserAdapter

                    tools_list = []
                    for t in request.tools:
                        try:
                            # Accept both plain dicts and pydantic Tool models
                            if isinstance(t, dict):
                                func_data = t.get("function", {})
                                tool_func = ToolFunction(
                                    name=func_data.get("name", ""),
                                    description=func_data.get("description"),
                                    parameters=func_data.get("parameters")
                                )
                            else:
                                tool_func = ToolFunction(
                                    name=t.function.name if hasattr(t.function, 'name') else str(t.function),
                                    description=t.function.description if hasattr(t.function, 'description') else None,
                                    parameters=t.function.parameters if hasattr(t.function, 'parameters') else None
                                )
                            tools_list.append(Tool(type=t.get("type", "function") if isinstance(t, dict) else t.type, function=tool_func))
                        except Exception as e:
                            # Best effort: a malformed tool is skipped, not fatal
                            print(f"DEBUG: Error converting tool in raw stream: {e}")
                            continue

                    if tools_list:
                        adapter = ModelParserAdapter(model_name=response_model_name)
                        extracted_tool_calls = adapter.extract_tool_calls(text_for_tool_extraction, tools_list)

                        if global_debug and extracted_tool_calls:
                            print(f"\n{'='*80}")
                            print(f"=== RAW STREAM: EXTRACTED TOOL CALLS (DEBUG) ===")
                            print(f"{'='*80}")
                            print(json.dumps(extracted_tool_calls, indent=2))
                            print(f"{'='*80}\n")
                        elif global_debug:
                            print(f"DEBUG: No tool calls found in raw stream")

                if extracted_tool_calls:
                    # Yield tool calls instead of content
                    yield sse_chunk({'tool_calls': extracted_tool_calls}, 'tool_calls')
                else:
                    # No tool calls, yield the content as usual
                    yield sse_chunk({'content': second_pass_result}, 'stop')
                yield "data: [DONE]\n\n"
            
            return StreamingResponse(raw_stream_generate(), media_type="text/event-stream")
        
        # Non-streaming path (the streaming case returned above)
        # First pass: generate until reasoning close tag
        first_pass_result = current_manager.generate(
            prompt=raw_prompt_for_generation,
            max_tokens=request.max_tokens or 2048,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=raw_stop_sequences,
            **extra_params,
        )
        
        if global_debug:
            print(f"RAW: First pass result: ...{first_pass_result[-200:]}")
        
        # Dump first pass result if --dump is enabled
        if global_dump:
            print(f"\n{'='*80}")
            print(f"=== RAW MODE: FIRST PASS RESULT (DUMP) ===")
            print(f"{'='*80}")
            print(first_pass_result)
            print(f"{'='*80}\n")
        
        # Extract reasoning (everything up to the close tag)
        thought_tag, close_tag, _ = get_reasoning_stop_tokens(model_family)
        reasoning_text = ""
        final_text = first_pass_result
        
        # Define tool tags that indicate end of reasoning
        tool_tags = ["<tool_call>", "<tool>", "<|tool_call|", "<|tool|", "<function="]
        
        if close_tag and close_tag in first_pass_result:
            # Split at close tag
            parts = first_pass_result.split(close_tag, 1)
            reasoning_text = parts[0]
            final_text = parts[1] if len(parts) > 1 else ""
        else:
            # Try to find tool tags as fallback stop markers
            earliest_tool_idx = len(first_pass_result)
            earliest_tool_tag = None
            for tag in tool_tags:
                idx = first_pass_result.find(tag)
                if idx != -1 and idx < earliest_tool_idx:
                    earliest_tool_idx = idx
                    earliest_tool_tag = tag
            
            if earliest_tool_tag:
                # Split at tool tag
                if global_debug:
                    print(f"RAW: No close tag found, using tool tag '{earliest_tool_tag}' as fallback")
                parts = first_pass_result.split(earliest_tool_tag, 1)
                reasoning_text = parts[0]
                final_text = earliest_tool_tag + (parts[1] if len(parts) > 1 else "")
        
        if global_debug:
            print(f"RAW: Extracted reasoning: {reasoning_text[:100]}...")
            print(f"RAW: Final text before cleanup: {final_text[:100]}...")
        
        # Dump extraction details if --dump is enabled
        if global_dump:
            print(f"\n{'='*80}")
            print(f"=== RAW MODE: EXTRACTION (DUMP) ===")
            print(f"{'='*80}")
            print(f"Close tag used: {close_tag}")
            print(f"\n--- REASONING TEXT ---")
            print(reasoning_text)
            print(f"\n--- FINAL TEXT (before cleanup) ---")
            print(final_text)
            print(f"{'='*80}\n")
        
        # Clean up control tokens from final text
        final_text = cleanup_control_tokens(final_text)
        
        if global_debug:
            print(f"RAW: Final text after cleanup: {final_text[:100]}...")
        
        # If we have reasoning, continue with second pass to get more complete answer
        # Build the full prompt with reasoning included
        full_prompt = raw_prompt_for_generation + reasoning_text + (close_tag or "")
        
        # Second pass: generate the rest (or just use what we have)
        # For now, just return what we have + optionally continue
        if final_text.strip():
            # We have a complete answer after reasoning
            generated_text = reasoning_text + (close_tag or "") + final_text
        else:
            # Need second pass to get answer
            second_pass_result = current_manager.generate(
                prompt=full_prompt,
                max_tokens=request.max_tokens or 2048,
                temperature=request.temperature,
                top_p=request.top_p,
                stop=stop_sequences,
                **extra_params,
            )
            # Clean up the second pass result
            second_pass_result = cleanup_control_tokens(second_pass_result)
            generated_text = reasoning_text + (close_tag or "") + second_pass_result
        
        # Additional cleanup of the full generated text
        generated_text = cleanup_control_tokens(generated_text)
        
        if global_debug:
            print(f"RAW: Generated text after cleanup: {generated_text[:100]}...")
        
        # Pass through the formatter/parser (same as regular mode)
        # Pipeline: Model output -> Extract reasoning (if raw mode) -> ModelParserAdapter (extract tools) -> OpenAIFormatter (final format)
        from codai.models.parser import OpenAIFormatter, ModelParserAdapter
        
        # Convert request tools for ModelParserAdapter
        tools_list = None
        if request.tools:
            from codai.pydantic.textrequest import Tool, ToolFunction
            tools_list = []
            for t in request.tools:
                try:
                    # Handle both dict and pydantic model formats
                    if isinstance(t, dict):
                        func_data = t.get("function", {})
                        tool_func = ToolFunction(
                            name=func_data.get("name", ""),
                            description=func_data.get("description"),
                            parameters=func_data.get("parameters")
                        )
                    else:
                        # Pydantic model
                        tool_func = ToolFunction(
                            name=t.function.name if hasattr(t.function, 'name') else str(t.function),
                            description=t.function.description if hasattr(t.function, 'description') else None,
                            parameters=t.function.parameters if hasattr(t.function, 'parameters') else None
                        )
                    tools_list.append(Tool(type=t.get("type", "function") if isinstance(t, dict) else t.type, function=tool_func))
                except Exception as e:
                    print(f"DEBUG: Error converting tool in raw mode: {e}, tool type: {type(t)}")
                    continue
        
        # Step 1: Use ModelParserAdapter to extract tool calls from final_text (NOT generated_text which includes reasoning)
        # This fixes Bug 2 and Bug 3: reasoning was appearing in both content AND reasoning fields
        # because the parser was receiving the full generated_text including reasoning
        extracted_tool_calls = None
        clean_text = final_text  # Use final_text (after reasoning) instead of generated_text (which includes reasoning)
        if tools_list:
            adapter = ModelParserAdapter(model_name=response_model_name)
            # Extract tool calls from final_text only (after reasoning is done)
            extracted_tool_calls = adapter.extract_tool_calls(final_text, tools_list)
            
            if extracted_tool_calls:
                # Strip tool calls from the text
                clean_text = adapter.strip_tool_calls_from_content(final_text)
                if global_debug:
                    print(f"RAW: Extracted {len(extracted_tool_calls)} tool calls from final_text (after reasoning)")
        
        # Estimate token counts
        prompt_tokens = len(raw_prompt_for_generation.split())
        completion_tokens = len(clean_text.split()) if clean_text else 0
        
        # Step 2: Use OpenAIFormatter for final formatting
        formatter = OpenAIFormatter(response_model_name)
        try:
            formatted_response = formatter.format_full(
                text=clean_text,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                tool_calls=extracted_tool_calls
            )
        except Exception as e:
            print(f"RAW: ERROR in formatter.format_full: {e}")
            formatted_response = None
        
        if global_debug:
            if formatted_response and isinstance(formatted_response, dict):
                try:
                    choices = formatted_response.get('choices', [])
                    if choices and len(choices) > 0:
                        message = choices[0].get('message', {}) if isinstance(choices[0], dict) else {}
                        content = message.get('content', '') if isinstance(message, dict) else ''
                        print(f"RAW: Passed through formatter, got: {str(content)[:100]}...")
                    else:
                        print(f"RAW: WARNING - formatter returned empty choices!")
                except Exception as e:
                    print(f"RAW: ERROR accessing formatter response: {e}")
            else:
                print(f"RAW: WARNING - formatter returned None or invalid response!")
        
        # Add mock reasoning stats if 'mock' is in force_reasoning_args
        # But only if we DON'T already have real reasoning from extraction
        has_real_reasoning = reasoning_text and len(reasoning_text.strip()) > 10
        
        if force_reasoning_args and "mock" in force_reasoning_args and formatted_response and not has_real_reasoning:
            # Add fake reasoning tokens to trigger VSCode plugin stats
            mock_reasoning_tokens = 50
            
            # Update usage
            if "usage" in formatted_response:
                formatted_response["usage"]["completion_tokens"] += mock_reasoning_tokens
                formatted_response["usage"]["total_tokens"] += mock_reasoning_tokens
                formatted_response["usage"]["completion_tokens_details"] = {
                    "reasoning_tokens": mock_reasoning_tokens
                }
            
            # Add reasoning to message if not present
            if "choices" in formatted_response and formatted_response["choices"]:
                choice = formatted_response["choices"][0]
                if "message" in choice and "reasoning" not in choice["message"]:
                    choice["message"]["reasoning"] = "Processing task in optimized mode..."
        elif has_real_reasoning and formatted_response:
            # We have real reasoning from extraction - add it to the message
            if "choices" in formatted_response and formatted_response["choices"]:
                choice = formatted_response["choices"][0]
                if "message" in choice:
                    choice["message"]["reasoning"] = reasoning_text.strip()
                    # Also update usage with actual reasoning tokens
                    if "usage" in formatted_response:
                        reasoning_tokens = len(reasoning_text.strip().split())
                        formatted_response["usage"]["completion_tokens_details"] = {
                            "reasoning_tokens": reasoning_tokens
                        }
        
        # Dump parsed output if enabled
        if global_dump:
            import json
            print(f"\n{'='*80}")
            print(f"=== RAW MODE PARSED OUTPUT (DUMP) ===")
            print(f"{'='*80}")
            print(json.dumps(formatted_response, indent=2))
            print(f"{'='*80}\n")
        
        # Add rate limit headers
        headers = {}
        if formatted_response and 'usage' in formatted_response:
            headers = current_manager.backend.get_rate_limit_headers(
                prompt_tokens=formatted_response.get('usage', {}).get('prompt_tokens', 0),
                completion_tokens=formatted_response.get('usage', {}).get('completion_tokens', 0)
            ) if hasattr(current_manager.backend, 'get_rate_limit_headers') else {}
        
        # Ensure we have a valid response to return
        if not formatted_response:
            # Create a minimal fallback response
            formatted_response = {
                "id": f"chatcmpl-{uuid.uuid4().hex}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": response_model_name,
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": clean_text or ""
                    },
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens
                }
            }
        
        return JSONResponse(content=formatted_response, headers=headers)
    
    if request.stream:
        return StreamingResponse(
            stream_chat_response(
                messages_dict,
                response_model_name,
                request.max_tokens,
                request.temperature,
                request.top_p,
                stop_sequences,
                tools_dict,
                current_manager,
                tool_parser,
                request.response_format,
            ),
            media_type="text/event-stream",
        )
    else:
        return await generate_chat_response(
            messages_dict,
            response_model_name,
            request.max_tokens,
            request.temperature,
            request.top_p,
            stop_sequences,
            tools_dict,
            current_manager,
            tool_parser,
            request.response_format,
            force_reasoning_args,
        )

def _manager_has_model_loaded(manager) -> bool:
    """Return True when the manager, or its backend, holds a non-None ``model``.

    If the manager has a non-None ``backend`` attribute only ``backend.model``
    is consulted; otherwise the manager's own ``model`` attribute is checked.
    """
    if manager is None:
        return False
    backend = getattr(manager, 'backend', None)
    if backend is not None:
        return getattr(backend, 'model', None) is not None
    return getattr(manager, 'model', None) is not None


async def stream_chat_response(
    messages: List[Dict],
    model_name: str,
    max_tokens: Optional[int],
    temperature: float,
    top_p: float,
    stop: List[str],
    tools: Optional[List[Dict]],
    current_manager: ModelManager,
    tool_parser: ToolCallParser,
    response_format: Optional[Dict] = None,
) -> AsyncGenerator[str, None]:
    """Stream chat completion response with queue notifications.

    Yields server-sent-event strings (``data: <json>`` followed by a blank
    line). The sequence is: optional "waiting" chunks while the model is not
    loaded, one "starting" chunk, one content chunk per generated piece, then
    either a ``tool_calls`` chunk or a final usage chunk, and finally
    ``data: [DONE]``.

    Args:
        messages: Chat messages as dicts; ``content`` may be a string, None
            or a non-string value (only strings count toward the usage estimate).
        model_name: Name echoed back in every chunk.
        max_tokens, temperature, top_p, stop, response_format: Forwarded
            unchanged to ``current_manager.generate_chat_stream``.
        tools: Tool definitions; when given, the complete output is scanned
            for tool calls after generation finishes.
        current_manager: Manager whose ``generate_chat_stream`` produces text.
        tool_parser: Parser used to strip and extract tool-call markup.

    Any exception raised during generation is reported to the client as a
    final content chunk instead of propagating. ``queue_manager.finish_processing``
    always runs once processing has been marked as started.
    """
    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())
    request_id = f"req-{uuid.uuid4().hex[:8]}"

    def sse_chunk(delta: Dict, finish_reason: Optional[str] = None,
                  queue_info: Optional[Dict] = None) -> str:
        """Serialize one ``chat.completion.chunk`` payload as an SSE data line."""
        payload = {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish_reason,
            }],
        }
        if queue_info is not None:
            payload["x_queue_info"] = queue_info
        return f"data: {json.dumps(payload)}\n\n"

    print(f"DEBUG: stream_chat_response started, stream=True, tools={tools is not None}")

    # The manager may exist while its backend has not loaded a model yet
    # (on-demand mode); in that case tell the client it is waiting.
    if not _manager_has_model_loaded(current_manager):
        # NOTE(review): if the client disconnects inside this wait loop the id
        # registered by add_waiting() is never passed to start_processing();
        # confirm QueueManager tolerates or expires such entries.
        await queue_manager.add_waiting(request_id)
        wait_interval = 2.0  # Seconds between waiting updates
        max_wait_updates = 5

        yield sse_chunk(
            {"content": "Waiting for model to load..."},
            queue_info={
                "status": "waiting",
                "message": "Model is loading, please wait...",
            },
        )

        for _ in range(max_wait_updates):
            await asyncio.sleep(wait_interval)
            # Stop waiting as soon as the model shows up instead of always
            # sleeping through every update.
            if _manager_has_model_loaded(current_manager):
                break
            wait_time = await queue_manager.get_wait_time(request_id)
            queue_pos = await queue_manager.get_queue_position(request_id)
            yield sse_chunk(
                {"content": ""},
                queue_info={
                    "status": "waiting",
                    "message": f"Waiting for model... ({int(wait_time)}s)",
                    "queue_position": queue_pos,
                    "wait_time_seconds": int(wait_time),
                },
            )

    # Mark as starting processing
    await queue_manager.start_processing(request_id, model_name)

    # Everything after start_processing() lives inside the try so that the
    # finally clause releases the queue slot even if the client disconnects
    # while the "starting" chunk is being delivered.
    try:
        yield sse_chunk(
            {"content": ""},
            queue_info={
                "status": "starting",
                "message": "Model starting",
            },
        )

        chunk_count = 0
        raw_parts = []      # malformed-filtered text, tool-call markup kept
        visible_parts = []  # text actually sent to the client

        # Debug: Print what is being passed to the model
        if global_debug:
            print(f"\n{'='*80}")
            print(f"=== MODEL INPUT (DEBUG) ===")
            print(f"{'='*80}")
            print(f"Model: {model_name}")
            print(f"Max tokens: {max_tokens}")
            print(f"Temperature: {temperature}")
            print(f"Top P: {top_p}")
            print(f"Stop sequences: {stop}")
            print(f"Tools: {tools is not None}")
            print(f"Response format: {response_format}")
            print(f"\n--- Messages ---")
            for i, msg in enumerate(messages):
                role = msg.get('role', 'unknown')
                content = msg.get('content', '')
                # Only strings can be truncated by slicing + concatenation.
                if isinstance(content, str) and len(content) > 500:
                    content = content[:500] + "... [truncated]"
                print(f"[{i}] {role}: {repr(content)}")
            print(f"{'='*80}\n")

        # Use generate_chat_stream for proper chat template handling
        async for chunk in current_manager.generate_chat_stream(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
            tools=tools,
            response_format=response_format,
        ):
            chunk_count += 1
            cleaned_chunk = filter_malformed_content(chunk)
            raw_parts.append(cleaned_chunk)

            # Tool-call markup is never shown to the client as content.
            visible_chunk = tool_parser.strip_tool_calls_from_content(cleaned_chunk)

            # Whitespace is passed through untouched - it's essential for message composition
            visible_parts.append(visible_chunk)

            yield sse_chunk({"content": visible_chunk})
            # Give the event loop a turn so the chunk is sent immediately
            await asyncio.sleep(0)

        generated_text = "".join(visible_parts)
        raw_text = "".join(raw_parts)

        print(f"DEBUG: stream_chat_response completed, {chunk_count} chunks, generated_text length: {len(generated_text)}")
        if not generated_text.strip():
            print(f"DEBUG: Warning - no content generated!")

        # In debug mode, dump the full generated text
        if global_debug:
            print(f"\n{'='*80}")
            print(f"=== FULL GENERATED TEXT (DEBUG) ===")
            print(f"{'='*80}")
            # Show both raw (actual) content and escaped representation
            print(f"--- RAW CONTENT (actual newlines shown as lines) ---")
            print(generated_text)
            print(f"--- END RAW CONTENT ---")
            print(f"--- ESCAPED CONTENT (repr() - shows \\n for newlines) ---")
            print(repr(generated_text))
            print(f"--- END ESCAPED CONTENT ---")
            print(f"{'='*80}\n")

        # Check for tool calls in complete output (for API response format)
        tool_calls = None
        if tools:
            # Convert tools back to Tool objects for parsing
            tool_objects = []
            for t in tools:
                try:
                    # Handle both dict and pydantic model formats
                    if isinstance(t, dict):
                        func_data = t.get("function", {})
                        tool_func = ToolFunction(
                            name=func_data.get("name", ""),
                            description=func_data.get("description"),
                            parameters=func_data.get("parameters")
                        )
                        tool_type = t.get("type", "function")
                    else:
                        # Pydantic model
                        tool_func = ToolFunction(
                            name=t.function.name if hasattr(t.function, 'name') else str(t.function),
                            description=t.function.description if hasattr(t.function, 'description') else None,
                            parameters=t.function.parameters if hasattr(t.function, 'parameters') else None
                        )
                        tool_type = t.type
                    tool_objects.append(Tool(type=tool_type, function=tool_func))
                except Exception as e:
                    print(f"DEBUG: Error converting tool: {e}, tool type: {type(t)}")
                    continue
            try:
                # Parse the text as it was BEFORE per-chunk stripping: a tool
                # call that arrived inside a single chunk has already been
                # removed from generated_text and would otherwise be lost.
                tool_calls = tool_parser.extract_tool_calls(raw_text, tool_objects)
            except Exception as e:
                print(f"DEBUG: Error extracting tool calls: {e}")
                tool_calls = None

        if tool_calls:
            # In debug mode, dump tool calls
            if global_debug:
                print(f"\n{'='*80}")
                print(f"=== EXTRACTED TOOL CALLS (DEBUG) ===")
                print(f"{'='*80}")
                print(json.dumps(tool_calls, indent=2))
                print(f"{'='*80}\n")
            # Tool-call markup was stripped from the streamed content,
            # so only the tool_calls chunk is sent here.
            data = {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model_name,
                "choices": [{
                    "index": 0,
                    "delta": {"tool_calls": tool_calls},
                    "finish_reason": "tool_calls",
                    "logprobs": None,
                    "native_finish_reason": "tool_calls",
                }],
            }
            yield f"data: {json.dumps(data)}\n\n"
        else:
            # Rough whitespace-based usage estimate. Message content may be
            # None (assistant tool-call turns) or a non-string value, so only
            # string content is counted instead of crashing the join.
            prompt_text = "\n".join(
                m["content"] for m in messages if isinstance(m.get("content"), str)
            )
            prompt_tokens = len(prompt_text.split())
            completion_tokens = len(generated_text.split()) if generated_text else 0

            if tools:
                # Use OpenAIFormatter for final chunk sanitization
                formatter = OpenAIFormatter(model_name)
                usage_details = {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens,
                }
                final_chunk = formatter.format_litellm_chunk("", is_final=True, usage=usage_details)
            else:
                # Build complete final chunk with all OpenAI fields
                final_chunk = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": model_name,
                    "choices": [{
                        "index": 0,
                        "finish_reason": "stop",
                        "logprobs": None,
                        "native_finish_reason": "stop",
                    }],
                    "usage": {
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens,
                        "total_tokens": prompt_tokens + completion_tokens,
                        "prompt_tokens_details": {
                            "cached_tokens": 0,
                            "audio_tokens": 0,
                        },
                        "completion_tokens_details": {
                            "reasoning_tokens": 0,
                            "audio_tokens": 0,
                        },
                    },
                    "provider": {
                        "provider_name": "coderai",
                        "provider_id": "coderai",
                    },
                    "system_fingerprint": None,
                }
            yield f"data: {json.dumps(final_chunk)}\n\n"

        yield "data: [DONE]\n\n"
    except Exception as e:
        # Report the failure in-band so the client sees a terminated stream.
        print(f"Error during streaming generation: {e}")
        yield sse_chunk({"content": f"\n[Generation error: {str(e)}]"}, finish_reason="stop")
        yield "data: [DONE]\n\n"
    finally:
        # Always clean up queue state
        await queue_manager.finish_processing()
async def generate_chat_response(
    messages: List[Dict],
    model_name: str,
    max_tokens: Optional[int],
    temperature: float,
    top_p: float,
    stop: List[str],
    tools: Optional[List[Dict]],
    current_manager: ModelManager,
    tool_parser: ToolCallParser,
    response_format: Optional[Dict] = None,
    force_reasoning_args: Optional[List[str]] = None,
) -> Dict:
    """Generate non-streaming chat completion response.

    Calls ``current_manager.generate_chat`` (not awaited), filters the output,
    extracts tool calls when ``tools`` is given, and formats the result with
    ``OpenAIFormatter.format_litellm_full``.

    Args:
        messages: Chat messages as dicts; ``content`` may be a string, None
            or a non-string value (only strings count toward the usage estimate).
        model_name: Name handed to ``OpenAIFormatter``.
        max_tokens, temperature, top_p, stop, response_format: Forwarded
            unchanged to ``current_manager.generate_chat``.
        tools: Tool definitions; when given, the output is scanned for tool
            calls and, if any are found, the markup is stripped from the content.
        current_manager: Manager whose ``generate_chat`` produces the text.
        tool_parser: Parser used to extract and strip tool-call markup.
        force_reasoning_args: When it contains ``"mock"`` and the response has
            no reasoning yet, placeholder reasoning stats are added.

    Returns:
        The formatted response produced by ``OpenAIFormatter``.

    Raises:
        HTTPException: 500 wrapping any exception raised while generating
            or formatting.
    """
    # Debug: Print what is being passed to the model
    if global_debug:
        print(f"\n{'='*80}")
        print(f"=== MODEL INPUT (DEBUG) ===")
        print(f"{'='*80}")
        print(f"Model: {model_name}")
        print(f"Max tokens: {max_tokens}")
        print(f"Temperature: {temperature}")
        print(f"Top P: {top_p}")
        print(f"Stop sequences: {stop}")
        print(f"Tools: {tools is not None}")
        print(f"Response format: {response_format}")
        print(f"\n--- Messages ---")
        for i, msg in enumerate(messages):
            role = msg.get('role', 'unknown')
            content = msg.get('content', '')
            # Only strings can be truncated by slicing + concatenation.
            if isinstance(content, str) and len(content) > 500:
                content = content[:500] + "... [truncated]"
            print(f"[{i}] {role}: {repr(content)}")
        print(f"{'='*80}\n")

    try:
        # Use generate_chat for proper chat template handling
        generated_text = current_manager.generate_chat(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
            tools=tools,
            response_format=response_format,
        )

        # Always filter out malformed content
        generated_text = filter_malformed_content(generated_text)

        # Dump raw output if enabled
        if global_dump:
            print(f"\n{'='*80}")
            print(f"=== RAW MODEL OUTPUT (DUMP) ===")
            print(f"{'='*80}")
            print(generated_text)
            print(f"{'='*80}\n")

        response_message = {
            "role": "assistant",
            "content": generated_text,
        }

        # Check for tool calls
        if tools:
            # Convert tools back to Tool objects for parsing
            tool_objects = []
            for t in tools:
                try:
                    # Handle both dict and pydantic model formats
                    if isinstance(t, dict):
                        func_data = t.get("function", {})
                        tool_func = ToolFunction(
                            name=func_data.get("name", ""),
                            description=func_data.get("description"),
                            parameters=func_data.get("parameters")
                        )
                        tool_type = t.get("type", "function")
                    else:
                        # Pydantic model
                        tool_func = ToolFunction(
                            name=t.function.name if hasattr(t.function, 'name') else str(t.function),
                            description=t.function.description if hasattr(t.function, 'description') else None,
                            parameters=t.function.parameters if hasattr(t.function, 'parameters') else None
                        )
                        tool_type = t.type
                    tool_objects.append(Tool(type=tool_type, function=tool_func))
                except Exception as e:
                    print(f"DEBUG: Error converting tool: {e}, tool type: {type(t)}")
                    continue
            try:
                tool_calls = tool_parser.extract_tool_calls(generated_text, tool_objects)
            except Exception as e:
                print(f"DEBUG: Error extracting tool calls: {e}")
                tool_calls = None
            if tool_calls:
                # Always strip tool call format from content
                clean_content = tool_parser.strip_tool_calls_from_content(generated_text)
                response_message["content"] = clean_content if clean_content.strip() else None
                response_message["tool_calls"] = tool_calls

        # Rough whitespace-based estimate since the tokenizer is not used here.
        # Message content may be None (assistant tool-call turns) or a
        # non-string value, so only string content is counted; joining the raw
        # values would raise TypeError and fail the whole request.
        prompt_text = "\n".join(
            m["content"] for m in messages if isinstance(m.get("content"), str)
        )
        prompt_tokens = len(prompt_text.split())
        completion_tokens = len(generated_text.split()) if generated_text else 0

        # Use OpenAIFormatter for final sanitization
        formatter = OpenAIFormatter(model_name)
        formatted_response = formatter.format_litellm_full(
            text=response_message.get("content", ""),
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            tool_calls=response_message.get("tool_calls")
        )

        # Add mock reasoning stats if 'mock' is in force_reasoning_args,
        # but only when the response does not already carry real reasoning.
        # The truthiness check on formatted_response comes first so an empty
        # or missing response is never probed with "in".
        if force_reasoning_args and "mock" in force_reasoning_args and formatted_response:
            choices = formatted_response.get("choices") or []
            message = choices[0].get("message") if choices else None
            existing_reasoning = message.get("reasoning") if message is not None else None

            if not existing_reasoning:
                # Add fake reasoning tokens to trigger VSCode plugin stats
                mock_reasoning_tokens = 50

                usage = formatted_response.get("usage")
                if usage is not None:
                    usage["completion_tokens"] += mock_reasoning_tokens
                    usage["total_tokens"] += mock_reasoning_tokens
                    usage["completion_tokens_details"] = {
                        "reasoning_tokens": mock_reasoning_tokens
                    }

                # Add reasoning to message if the key is not present at all
                if message is not None and "reasoning" not in message:
                    message["reasoning"] = "Processing task in optimized mode..."

        # Dump parsed output if enabled
        if global_dump:
            print(f"\n{'='*80}")
            print(f"=== PARSED OUTPUT (DUMP) ===")
            print(f"{'='*80}")
            print(json.dumps(formatted_response, indent=2))
            print(f"{'='*80}\n")

        return formatted_response
    except Exception as e:
        print(f"Error during generation: {e}")
        raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}") from e
@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """Text completions endpoint.

    Resolves the manager for ``request.model`` (falling back to the legacy
    ``model_manager`` when it has a backend), normalizes ``prompt`` and
    ``stop`` to lists, and dispatches to the streaming or non-streaming
    completion helper.

    Only the first prompt is used when ``prompt`` is a list.

    Raises:
        HTTPException: 503 when no model is available for the request,
            400 when ``prompt`` is an empty list.
    """
    # Try to get the appropriate model for this request
    current_manager = multi_model_manager.get_model_for_request(request.model)
    if current_manager is None:
        # Model not loaded - fall back to the legacy model_manager if it has a backend
        if model_manager.backend is None:
            raise HTTPException(status_code=503, detail="Model not loaded")
        current_manager = model_manager

    prompts = request.prompt if isinstance(request.prompt, list) else [request.prompt]
    if not prompts:
        # An empty list would otherwise fail with IndexError on prompts[0]
        # and surface as an unhandled server error.
        raise HTTPException(status_code=400, detail="prompt must not be empty")

    stop_sequences = []
    if request.stop:
        stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop

    # Both helpers take the same positional arguments.
    generation_args = (
        prompts[0],
        request.model,
        request.max_tokens,
        request.temperature,
        request.top_p,
        stop_sequences,
        current_manager,
    )

    if request.stream:
        return StreamingResponse(
            stream_completion_response(*generation_args),
            media_type="text/event-stream",
        )
    return await generate_completion_response(*generation_args)
async def stream_completion_response(
    prompt: str,
    model_name: str,
    max_tokens: Optional[int],
    temperature: float,
    top_p: float,
    stop: List[str],
    current_manager: ModelManager,
) -> AsyncGenerator[str, None]:
    """Stream completion response.

    Yields one server-sent-event string per piece produced by
    ``current_manager.generate_stream``, then a terminating chunk with
    ``finish_reason == "stop"`` and finally ``data: [DONE]``.

    Every chunk, including the terminating one, carries the same ``id``,
    ``object``, ``created`` and ``model`` fields and a complete ``choices``
    entry, so a client can parse all events with one schema.

    If generation raises, the error is printed and the stream is still closed
    with the terminating chunk and ``[DONE]``.
    """
    completion_id = f"cmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    def sse_chunk(text: str, finish_reason: Optional[str]) -> str:
        """Serialize one ``text_completion`` chunk as an SSE data line."""
        payload = {
            "id": completion_id,
            "object": "text_completion",
            "created": created,
            "model": model_name,
            "choices": [{
                "text": text,
                "index": 0,
                "logprobs": None,
                "finish_reason": finish_reason,
            }],
        }
        return f"data: {json.dumps(payload)}\n\n"

    try:
        async for chunk in current_manager.generate_stream(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        ):
            yield sse_chunk(chunk, None)
    except Exception as e:
        # Best effort: log and fall through so the stream is closed cleanly.
        print(f"Error during streaming completion: {e}")

    # Terminating chunk has the same shape as the content chunks.
    yield sse_chunk("", "stop")
    yield "data: [DONE]\n\n"
async def generate_completion_response(
    prompt: str,
    model_name: str,
    max_tokens: Optional[int],
    temperature: float,
    top_p: float,
    stop: List[str],
    current_manager: ModelManager,
) -> Dict:
    """Produce a complete (non-streamed) ``text_completion`` payload.

    ``current_manager.generate`` is called directly (not awaited) with the
    sampling parameters. Token usage is measured with the manager's tokenizer
    when it is truthy, otherwise approximated by whitespace-separated words.

    Returns:
        A dict with ``id``, ``object``, ``created``, ``model``, a single
        choice holding the generated text, and a ``usage`` section.

    Raises:
        HTTPException: 500 wrapping any exception raised while generating
            or counting tokens.
    """
    response_id = f"cmpl-{uuid.uuid4().hex}"
    issued_at = int(time.time())

    try:
        output = current_manager.generate(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )

        # Real token counts when a tokenizer exists, word counts otherwise.
        encoder = current_manager.tokenizer
        if encoder:
            n_prompt, n_completion = (
                len(encoder.encode(text)) for text in (prompt, output)
            )
        else:
            n_prompt, n_completion = (
                len(text.split()) for text in (prompt, output)
            )

        only_choice = {
            "text": output,
            "index": 0,
            "logprobs": None,
            "finish_reason": "stop",
        }
        usage = {
            "prompt_tokens": n_prompt,
            "completion_tokens": n_completion,
            "total_tokens": n_prompt + n_completion,
        }
        return {
            "id": response_id,
            "object": "text_completion",
            "created": issued_at,
            "model": model_name,
            "choices": [only_choice],
            "usage": usage,
        }
    except Exception as e:
        print(f"Error during completion: {e}")
        raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
# =============================================================================
# Main Entry Point
# =============================================================================

def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="OpenAI-compatible API server supporting NVIDIA (CUDA) and Vulkan backends"
    )
    parser.add_argument(
        "--model",
        type=str,
        action="append",
        default=None,
        help="Model name, path, or URL for text-to-text LLM. Can be specified multiple times for multiple models.",
    )
    parser.add_argument(
        "--model-alias",
        type=str,
        action="append",
        default=None,
        dest="model_aliases",
        nargs=2,
        metavar=("ALIAS", "MODEL"),
        help="Register an alias for a model. Format: --model-alias <alias_name> <actual_model>",
    )
    parser.add_argument(
        "--backend",
        type=str,
        choices=["auto", "nvidia", "vulkan", "opencl"],
        default="auto",
        help="Backend to use: auto (detect), nvidia (CUDA), vulkan (AMD), or opencl",
    )
    parser.add_argument(
        "--image-backend",
        type=str,
        choices=["auto", "nvidia", "vulkan", "opencl"],
        default="auto",
        help="Image generation backend: auto, nvidia (CUDA), vulkan (AMD), or opencl",
    )
    parser.add_argument(
        "--audio-backend",
        type=str,
        choices=["auto", "nvidia", "vulkan", "opencl"],
        default="auto",
        help="Audio transcription backend: auto, nvidia (CUDA), vulkan (AMD), or opencl",
    )
    parser.add_argument(
        "--tts-backend",
        type=str,
        choices=["auto", "nvidia", "vulkan", "opencl"],
        default="auto",
        help="TTS backend: auto, nvidia (CUDA), vulkan (AMD), or opencl",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind to (default: 8000)",
    )
    parser.add_argument(
        "--url",
        type=str,
        default="auto",
        help="Base URL for media downloads: 'auto' (use request IP) or explicit URL (e.g., http://myserver:8000)",
    )
    parser.add_argument(
        "--https",
        action="store_true",
        help="Enable HTTPS with auto-generated certificate",
    )
    parser.add_argument(
        "--privkey",
        type=str,
        default=None,
        help="Path to HTTPS private key file",
    )
    parser.add_argument(
        "--pubkey",
        type=str,
        default=None,
        help="Path to HTTPS certificate file",
    )
    parser.add_argument(
        "--offload-dir",
        type=str,
        default="./offload",
        help="Directory for disk offload (NVIDIA backend only, default: ./offload)",
    )
    parser.add_argument(
        "--load-in-4bit",
        action="store_true",
        help="Load model in 4-bit precision (NVIDIA backend only, requires bitsandbytes)",
    )
    parser.add_argument(
        "--load-in-8bit",
        action="store_true",
        help="Load model in 8-bit precision (NVIDIA backend only, requires bitsandbytes)",
    )
    parser.add_argument(
        "--ram",
        type=float,
        default=None,
        help="Maximum CPU RAM to use for model offloading in GB (NVIDIA backend only). Auto-detected if not specified. Disk offloading only occurs after this limit is exceeded.",
    )
    parser.add_argument(
        "--flash-attn",
        action="store_true",
        help="Use Flash Attention 2 (NVIDIA backend only, requires flash-attn package)",
    )
    parser.add_argument(
        "--offload-strategy",
        type=str,
        choices=["auto", "conservative", "balanced", "aggressive", "sequential"],
        default="auto",
        help="Offload strategy for NVIDIA backend (default: auto)",
    )
    parser.add_argument(
        "--max-gpu-percent",
        type=float,
        default=None,
        help="Maximum GPU VRAM to use as percentage (0-100). Overrides offload-strategy. Lower values offload more to CPU/RAM (default: None = use offload-strategy)",
    )
    parser.add_argument(
        "--n-gpu-layers",
        type=int,
        default=-1,
        help="Number of layers to offload to GPU (Vulkan backend only, default: -1 = all layers)",
    )
    parser.add_argument(
        "--n-ctx",
        type=int,
        action="append",
        default=None,
        help="Context window size (Vulkan backend). Can be specified multiple times, one per --model.",
    )
    parser.add_argument(
        "--vulkan-device",
        type=int,
        default=0,
        help="Vulkan GPU device ID to use (Vulkan backend only, default: 0). Use --vulkan-list-devices to see available devices",
    )
    parser.add_argument(
        "--vulkan-single-gpu",
        action="store_true",
        help="Force Vulkan to use only the specified GPU device (prevents layer distribution across multiple GPUs)",
    )
    parser.add_argument(
        "--vulkan-list-devices",
        action="store_true",
        help="List available Vulkan GPU devices and exit",
    )
    parser.add_argument(
        "--hf-chat-template",
        action="append",
        default=[],
        help="Use HuggingFace apply_chat_template. Examples: --hf-chat-template auto (all models), --hf-chat-template text (all text), --hf-chat-template mymodel:llama3 (specific model with template). Can be repeated.",
    )
    parser.add_argument(
        "--system-prompt",
        nargs="?",
        const=True,
        default=None,
        help="Inject a system prompt at the beginning of conversations. Use without a value for a default prompt, or provide custom text.",
    )
    # Multi-model arguments
    parser.add_argument(
        "--tts-model",
        type=str,
        default=None,
        help="Model for text-to-speech (e.g., kokoro, or path/URL to Kokoro model). Can be specified multiple times.",
    )
    parser.add_argument(
        "--audio-model",
        type=str,
        action="append",
        default=None,
        help="Model for audio transcription (e.g., whisper-1, base, or path to faster-whisper model). Can be specified multiple times for multiple models.",
    )
    parser.add_argument(
        "--audio-1",
        action="store_true",
        help="Disable request queue for audio models - return 409 if model is busy",
    )
    parser.add_argument(
        "--image-model",
        type=str,
        action="append",
        default=None,
        help="Model for image generation (e.g., stable-diffusion-xl-base-1.0). Can be specified multiple times for multiple models.",
    )
    parser.add_argument(
        "--vision-model",
        type=str,
        action="append",
        default=None,
        help="Model for image/video-to-text (e.g., llava-1.5, LLaVA). Supports vulkan and cuda backends.",
    )
    parser.add_argument(
        "--image-1",
        action="store_true",
        help="Disable request queue for image models - return 409 if model is busy",
    )
    parser.add_argument(
        "--llm-path",
        type=str,
        default=None,
        help="Path to CLIP LLM model for image generation (stable-diffusion-cpp-python).",
    )
    parser.add_argument(
        "--vae-path",
        type=str,
        default=None,
        help="Path to VAE model for image generation (stable-diffusion-cpp-python).",
    )
    parser.add_argument(
        "--image-sample-method",
        type=str,
        default="res_multistep",
        help="Sample method for image generation (default: res_multistep for Z-Image Turbo).",
    )
    parser.add_argument(
        "--image-steps",
        type=int,
        default=4,
        help="Number of inference steps for image generation (default: 4 for Z-Image Turbo).",
    )
    parser.add_argument(
        "--image-width",
        type=int,
        default=512,
        help="Image width for generation (default: 512).",
    )
    parser.add_argument(
        "--image-height",
        type=int,
        default=512,
        help="Image height for generation (default: 512).",
    )
    parser.add_argument(
        "--image-cfg-scale",
        type=float,
        default=1.0,
        help="CFG scale for image generation (default: 1.0 for Z-Image Turbo).",
    )
    parser.add_argument(
        "--image-precision",
        type=str,
        default="f32",
        choices=["bf16", "f32", "f16", "f8"],
        help="Model precision for image generation (default: f32). bf16 recommended for modern GPUs.",
    )
    parser.add_argument(
        "--image-cpu-offload",
        action="store_true",
        help="Enable sequential CPU offload for image models (lower VRAM usage).",
    )
    parser.add_argument(
        "--image-seed",
        type=int,
        default=None,
        help="Default seed for image generation (default: random).",
    )
    parser.add_argument(
        "--vae-tiling",
        action="store_true",
        help="Enable VAE tiling for lower VRAM usage (sd.cpp only).",
    )
    parser.add_argument(
        "--clip-on-cpu",
        action="store_true",
        help="Run CLIP on CPU to save VRAM (sd.cpp only).",
    )
    parser.add_argument(
        "--loadall",
        action="store_true",
        help="Pre-load all models (main, audio, image) at startup instead of on-demand",
    )
    parser.add_argument(
        "--loadswap",
        action="store_true",
        help="Keep all models loaded, swapping active model between VRAM and RAM (only active model in VRAM)",
    )
    parser.add_argument(
        "--nopreload",
        action="store_true",
        help="Disable model preloading. Models will load on first request instead of at startup",
    )
    parser.add_argument(
        "--audio-ctx",
        type=int,
        action="append",
        default=None,
        help="Audio model context size in milliseconds. Can be specified multiple times, one per --audio-model.",
    )
    parser.add_argument(
        "--audio-offload",
        type=float,
        default=None,
        help="Audio model GPU offload percentage (0-100). If not set, uses CPU",
    )
    parser.add_argument(
        "--audio-vulkan-device",
        type=int,
        default=0,
        help="Vulkan GPU device ID to use for Whisper audio transcription (default: 0). Only used when using Vulkan backend.",
    )
    parser.add_argument(
        "--image-vulkan-device",
        type=int,
        default=None,
        help="Vulkan GPU device ID to use for image generation models (default: same as --vulkan-device). Use --vulkan-list-devices to see available devices",
    )

    parser.add_argument(
        "--whisper-cpp",
        type=str,
        default=None,
        help="Path to whisper.cpp CLI executable (e.g., ~/whisper.cpp/build/bin/whisper-cli). Uses Vulkan if available.",
    )
    parser.add_argument(
        "--whisper-server",
        type=str,
        default=None,
        help="Path to whisper.cpp server executable (e.g., ~/whisper.cpp/build/bin/whisper-server). Keeps model loaded in VRAM.",
    )
    parser.add_argument(
        "--whisper-server-port",
        type=int,
        default=8744,
        help="Port for whisper-server (default: 8744).",
    )
    parser.add_argument(
        "--image-ctx",
        type=int,
        action="append",
        default=None,
        help="Image model context size. Can be specified multiple times, one per --image-model.",
    )
    parser.add_argument(
        "--image-offload",
        type=float,
        default=None,
        help="Vision model GPU offload percentage (0-100). If not set, loads fully on GPU",
    )
    parser.add_argument(
        "--list-cached-models",
        action="store_true",
        help="List all cached models in the model cache directory",
    )
    parser.add_argument(
        "--remove-all-models",
        action="store_true",
        help="Remove all cached models from the model cache directory",
    )
    parser.add_argument(
        "--remove-model",
        type=str,
        default=None,
        help="Remove a specific cached model by name or hash (partial match)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode - dumps full request/response to stdout for troubleshooting",
    )
    parser.add_argument(
        "--dump",
        action="store_true",
        help="Dump model output: raw output, parsed output, and litellm debug info",
    )
    parser.add_argument(
        "--file-path",
        type=str,
        default=None,
        help="Path to store generated files (images, audio). If specified, files will be saved here and served over web.",
    )
    parser.add_argument(
        "--parser",
        type=str,
        default="auto",
        choices=["auto", "litellm"],
        help="Tool call parser to use: 'auto' for internal parser, 'litellm' for LiteLLM's parser. Default: auto",
    )
    # Custom type for comma-separated reasoning options
    def reasoning_choices(value):
        """Parse a comma-separated ``--force-reasoning`` value into a list of modes.

        Intended for use as an argparse ``type=`` callable.

        Args:
            value: Raw option string such as ``"chat,inject"``. A falsy value
                yields an empty list.

        Returns:
            The lower-cased modes in the order given, without duplicates.
            When ``all`` is present, the fixed expansion list below is
            returned instead.

        Raises:
            argparse.ArgumentTypeError: If any entry is not a recognised mode.
        """
        if not value:
            return []
        valid = ('chat', 'stop', 'inject', 'prompt', 'all', 'twopass', 'mock', 'raw')
        # Skip empty segments so a trailing or doubled comma ("chat," or
        # "chat,,raw") is not reported as the invalid choice ''.
        tokens = (part.strip().lower() for part in value.split(','))
        # dict.fromkeys removes duplicates while keeping first-seen order.
        options = list(dict.fromkeys(tok for tok in tokens if tok))
        invalid = [o for o in options if o not in valid]
        if invalid:
            # Sort the valid options so the message is identical on every run
            # (interpolating a set gives an arbitrary order).
            raise argparse.ArgumentTypeError(
                f"Invalid choices: {invalid}. Valid options: {sorted(valid)}"
            )
        # Expand 'all' to all options
        # NOTE(review): 'stop' is not part of this expansion - confirm that is intentional.
        if 'all' in options:
            options = ['chat', 'inject', 'prompt', 'mock', 'raw', 'twopass']
        return options
    
    parser.add_argument(
        "--force-reasoning",
        type=reasoning_choices,
        default=None,
        help="Force reasoning. Options: 'chat' (API), 'stop' (tokens), 'inject' (sys prompt), 'prompt' (seeding), 'twopass' (2 calls), 'mock' (fake stats), 'raw' (raw completion), 'all' (all options). Combine: --force-reasoning chat,inject.",
    )
    return parser.parse_args()
def main():
    """Main entry point."""
    global global_system_prompt, model_manager, multi_model_manager, global_debug, global_dump, global_args, global_file_path
    
    # Suppress unraisable exceptions from LlamaModel.__del__
    import sys
    original_unraisablehook = sys.unraisablehook
    def suppress_llama_del_errors(unraisable):
        """Unraisable-exception hook that silences one specific LlamaModel error.

        An AttributeError whose text mentions 'sampler', raised for an object
        whose repr mentions 'LlamaModel', is dropped. Everything else is
        passed to the hook that was installed before this one.
        """
        error = unraisable.exc_value
        # Nested guards keep the original evaluation order: repr()/str() are
        # only computed once the exception type has matched.
        if isinstance(error, AttributeError):
            if 'LlamaModel' in repr(unraisable.object) and 'sampler' in str(error):
                return None  # Ignore this specific error
        original_unraisablehook(unraisable)
    sys.unraisablehook = suppress_llama_del_errors
    
    # Optional: set process name if procname is available
    try:
        import procname
        procname.setprocname("coderai")
    except ImportError:
        pass
    args = parse_args()
    
    # Store args globally for access in endpoints
    global_args = args
    
    # Set global system prompt from --system-prompt flag
    global_system_prompt = args.system_prompt
    
    # Set global debug flag
    global_debug = args.debug
    # Set global dump flag (enables debug as well for litellm output)
    global_dump = args.dump
    if global_dump:
        global_debug = True
    # Set global file path for storing generated files
    global_file_path = args.file_path
    if global_debug:
        # Print the full command line that was used to invoke coderai
        import shlex
        cmd_line = ' '.join(shlex.quote(arg) for arg in sys.argv)
        print(f"\n{'='*80}")
        print(f"=== COMMAND LINE: {cmd_line}")
        print(f"{'='*80}\n")
        print("DEBUG MODE ENABLED - Full requests and replies will be dumped to stdout")
    
    # Handle --vulkan-list-devices
    if args.vulkan_list_devices:
        print("\nListing Vulkan devices...")
        try:
            import subprocess
            result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
            if result.returncode == 0:
                print(result.stdout)
            else:
                print("Could not run vulkaninfo. Make sure vulkan-tools is installed.")
        except Exception as e:
            print(f"Error listing devices: {e}")
        sys.exit(0)
    
    # Handle --list-cached-models
    if args.list_cached_models:
        print("\n=== Listing Cached Models ===")
        
        caches = get_all_cache_dirs()
        if not caches:
            print("No model cache directories found.")
            sys.exit(0)
        
        all_files = []
        for cache_name, cache_dir in caches.items():
            print(f"\n--- {cache_name.upper()} Cache ({cache_dir}) ---")
            if not os.path.exists(cache_dir):
                print(f"  (directory does not exist)")
                continue
                
            files = os.listdir(cache_dir)
            if not files:
                print(f"  No cached files.")
                continue
            
            # For diffusers and huggingface, show directory structure
            if cache_name in ('diffusers', 'huggingface'):
                for root, dirs, files in os.walk(cache_dir):
                    for f in files:
                        filepath = os.path.join(root, f)
                        rel_path = os.path.relpath(filepath, cache_dir)
                        size = os.path.getsize(filepath)
                        all_files.append((cache_name, rel_path, size))
            else:
                for f in files:
                    filepath = os.path.join(cache_dir, f)
                    if os.path.isfile(filepath):
                        size = os.path.getsize(filepath)
                        all_files.append((cache_name, f, size))
        
        if not all_files:
            print("\nNo cached models found.")
            sys.exit(0)
        
        # Calculate totals
        total_size = sum(size for _, _, size in all_files)
        
        print(f"\n=== Summary ===")
        print(f"Total: {len(all_files)} files, {total_size / (1024*1024*1024):.2f} GB")
        print("\nCache locations:")
        for cache_name, cache_dir in caches.items():
            print(f"  {cache_name}: {cache_dir}")
        
        sys.exit(0)
    
    # Handle --remove-all-models
    if args.remove_all_models:
        print("\n=== Removing All Cached Models ===")
        
        import shutil
        caches = get_all_cache_dirs()
        
        if not caches:
            print("No cache directories found.")
            sys.exit(0)
        
        total_removed = 0
        for cache_name, cache_dir in caches.items():
            if not os.path.exists(cache_dir):
                continue
                
            files = os.listdir(cache_dir)
            if not files:
                continue
            
            print(f"\nRemoving from {cache_name} cache ({cache_dir})...")
            print(f"  Found {len(files)} file(s). Deleting...")
            
            # For diffusers, remove entire directory tree
            if cache_name == 'diffusers':
                for item in os.listdir(cache_dir):
                    item_path = os.path.join(cache_dir, item)
                    if os.path.isdir(item_path):
                        shutil.rmtree(item_path)
                    else:
                        os.remove(item_path)
                    print(f"  Deleted: {item}")
                    total_removed += 1
            else:
                for f in files:
                    filepath = os.path.join(cache_dir, f)
                    os.remove(filepath)
                    print(f"  Deleted: {f}")
                    total_removed += 1
        
        print(f"\n=== Removed {total_removed} item(s) from all caches ===")
        sys.exit(0)
    
    # Handle --remove-model
    if args.remove_model:
        print(f"\n=== Removing Cached Model Matching: {args.remove_model} ===")
        
        import shutil
        caches = get_all_cache_dirs()
        
        if not caches:
            print("No cache directories found.")
            sys.exit(0)
        
        all_matching = []
        for cache_name, cache_dir in caches.items():
            if not os.path.exists(cache_dir):
                continue
            
            # For diffusers and huggingface, search recursively
            if cache_name in ('diffusers', 'huggingface'):
                for root, dirs, files in os.walk(cache_dir):
                    for f in files:
                        if args.remove_model.lower() in f.lower():
                            filepath = os.path.join(root, f)
                            rel_path = os.path.relpath(filepath, cache_dir)
                            size = os.path.getsize(filepath)
                            all_matching.append((cache_name, rel_path, filepath, size))
            else:
                files = os.listdir(cache_dir)
                for f in files:
                    if args.remove_model.lower() in f.lower():
                        filepath = os.path.join(cache_dir, f)
                        if os.path.isfile(filepath):
                            size = os.path.getsize(filepath)
                            all_matching.append((cache_name, f, filepath, size))
        
        if not all_matching:
            print(f"No cached models found matching: {args.remove_model}")
            print(f"\nUse --list-cached-models to see available models.")
            sys.exit(0)
        
        print(f"\nFound {len(all_matching)} matching file(s):")
        for cache_name, filename, filepath, size in all_matching:
            print(f"  [{cache_name}] {filename} ({size / (1024*1024):.1f} MB)")
        
        # Delete the matches immediately; no interactive confirmation is requested
        print(f"\nDeleting {len(all_matching)} file(s)...")
        for cache_name, filename, filepath, size in all_matching:
            try:
                os.remove(filepath)
                print(f"  Deleted: [{cache_name}] {filename}")
            except Exception as e:
                print(f"  Failed to delete {filename}: {e}")
        
        print(f"\nRemoved {len(all_matching)} cached model file(s).")
        sys.exit(0)
    
    # Get model names from args - support multiple models
    model_names = args.model if args.model else []
    
    # Helper function to get config value by index with fallback
    def get_ctx_by_index(ctx_list, index, default):
        """Return ``ctx_list[index]`` when that entry exists, else ``default``.

        Args:
            ctx_list: Sequence of per-model values; may be None or empty.
            index: Position of the model whose value is wanted.
            default: Value returned when the list is missing, empty, or
                shorter than ``index + 1``.
        """
        if not ctx_list:
            return default
        return ctx_list[index] if index < len(ctx_list) else default
    
    # Validate: must have at least one model specified
    audio_models = args.audio_model if args.audio_model else []
    image_models = args.image_model if args.image_model else []
    vision_models = args.vision_model if args.vision_model else []
    
    if not model_names and not audio_models and not image_models and not vision_models and args.tts_model is None:
        print("Error: At least one of --model, --audio-model, --image-model, --vision-model, or --tts-model must be specified.")
        print("")
        print("For NVIDIA backend (HuggingFace models):")
        print("  - microsoft/DialoGPT-medium")
        print("  - meta-llama/Llama-2-7b-chat-hf (requires auth)")
        print("  - TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        print("  - Use multiple --model flags for multiple models")
        print("")
        print("For Vulkan backend (GGUF models):")
        print("  - Local path: ./phi-3-mini-4k-instruct-q4_k_m.gguf")
        print("  - HuggingFace: microsoft/Phi-3-mini-4k-instruct-gguf")
        print("  - URL: https://huggingface.co/.../model.gguf")
        print("")
        print("For audio transcription:")
        print("  - --audio-model base")
        print("")
        print("For text-to-speech:")
        print("  - --tts-model kokoro")
        print("")
        print("For image generation:")
        print("  - --image-model stabilityai/stable-diffusion-xl-base-1.0")
        sys.exit(1)
    
    # Print loaded models info
    if model_names:
        print(f"\nText model(s): {model_names}")
        if len(model_names) > 1:
            # Load mode will be determined below
            print(f"Multiple models configured - load mode will be set based on --loadall/--loadswap flags")
    
    # Detect available backends
    available = detect_available_backends()
    
    # If user explicitly requests nvidia/cuda backend with a GGUF model, 
    # remove vulkan from available since we'll use CUDA instead
    if model_names:
        first_model = model_names[0]
        is_gguf_model = first_model.endswith('.gguf') or 'gguf' in first_model.lower()
        if is_gguf_model and args.backend in ('nvidia', 'cuda'):
            # When using nvidia/cuda backend with GGUF, vulkan uses CUDA, so remove it
            if 'vulkan' in available:
                del available['vulkan']
    
    print("\nAvailable backends:")
    for name, available_flag in available.items():
        status = "✓" if available_flag else "✗"
        print(f"  [{status}] {name}")
    print("")
    
    # Load the main model (only if specified)
    if model_names:
        # Enable verbose mode when debug is set (for better troubleshooting output from llama-cpp)
        verbose = args.debug if hasattr(args, 'debug') else False
        
        load_kwargs = {
            'offload_dir': args.offload_dir,
            'load_in_4bit': args.load_in_4bit,
            'load_in_8bit': args.load_in_8bit,
            'manual_ram_gb': args.ram,
            'flash_attn': args.flash_attn,
            'offload_strategy': args.offload_strategy,
            'max_gpu_percent': args.max_gpu_percent,
            'n_gpu_layers': args.n_gpu_layers,
            'n_ctx': get_ctx_by_index(args.n_ctx, 0, 2048),
            'main_gpu': args.vulkan_device,
            'single_gpu': args.vulkan_single_gpu,
            'verbose': verbose,
        }
        
        # Load the first model
        first_model_name = model_names[0]
        try:
            model_manager.load_model(
                model_name=first_model_name,
                backend_type=args.backend,
                **load_kwargs
            )
            # Register with multi_model_manager
            multi_model_manager.set_default_model(first_model_name, load_kwargs, args.backend)
            multi_model_manager.add_model(first_model_name, model_manager)
            print(f"\nMain text model loaded: {first_model_name}")
        except Exception as e:
            print(f"\nError loading model: {e}")
            error_str = str(e).lower()
            print("\nTroubleshooting:")
            if args.backend == "vulkan":
                print("  - For Vulkan, ensure you have Vulkan drivers installed")
                print("  - Make sure you're using a GGUF format model")
                print("  - Run build.sh with 'vulkan' argument first")
            else:
                print("  - For NVIDIA, ensure PyTorch with CUDA is installed")
                print("  - Run build.sh with 'nvidia' argument first")
                if "tokenizer" in error_str or "sentencepiece" in error_str or "tiktoken" in error_str:
                    print("  - Tokenizer error: ensure sentencepiece and tiktoken are installed")
                    print("    pip install sentencepiece tiktoken tokenizers")
                # Check if trying to load GGUF model with NVIDIA backend
                if "gguf" in first_model_name.lower():
                    print(f"\n  *** IMPORTANT: '{first_model_name}' appears to be a GGUF model ***")
                    print("  GGUF models are NOT compatible with the NVIDIA backend.")
                    print("  Use --backend vulkan instead, or choose a HuggingFace Transformers model.")
                    print("\n  Example Vulkan command:")
                    print(f"    coderai --backend vulkan --model {first_model_name}")
            sys.exit(1)
    else:
        print("\nNo main text model specified (--model). Running with audio/image/TTS models only.")
    
    # Determine load mode BEFORE setting up other models
    load_mode = "ondemand"
    if args.loadall:
        load_mode = "loadall"
    elif args.loadswap:
        load_mode = "loadswap"
    
    # Set load mode in multi_model_manager
    multi_model_manager.set_load_mode(load_mode)
    
    # Load models based on mode and count
    if len(model_names) > 1:
        # Multiple models - handle based on load mode
        print(f"\n=== Multiple Models Mode: {load_mode} ===")
        
        if load_mode == "loadall":
            # Load all models into VRAM
            # Skip the first model if it was already loaded and registered by the main-model step above
            start_index = 1 if model_names[0] in multi_model_manager.models else 0
            for i in range(start_index, len(model_names)):
                model_name = model_names[i]
                print(f"\nLoading model {i+1}/{len(model_names)}: {model_name}")
                try:
                    manager = ModelManager()
                    manager.load_model(
                        model_name=model_name,
                        backend_type=args.backend,
                        **load_kwargs
                    )
                    multi_model_manager.add_model(model_name, manager)
                    print(f"Loaded: {model_name}")
                except Exception as e:
                    print(f"Error loading {model_name}: {e}")
        
        elif load_mode == "loadswap":
            # First model in VRAM, others in RAM
            # Skip the first model if it was already loaded and registered by the main-model step above
            start_index = 1 if model_names[0] in multi_model_manager.models else 0
            for i in range(start_index, len(model_names)):
                model_name = model_names[i]
                # In loadswap, all additional models go to RAM (CPU-only)
                print(f"\nLoading model {i+1}/{len(model_names)}: {model_name} (RAM)")
                try:
                    manager = ModelManager()
                    # Modify kwargs for CPU-only loading
                    swap_kwargs = load_kwargs.copy()
                    swap_kwargs['n_gpu_layers'] = 0  # Force CPU only for swap mode
                    manager.load_model(
                        model_name=model_name,
                        backend_type=args.backend,
                        **swap_kwargs
                    )
                    multi_model_manager.add_model(model_name, manager)
                    print(f"Loaded: {model_name} (RAM)")
                except Exception as e:
                    print(f"Error loading {model_name}: {e}")
        
        else:  # ondemand
            # First model was already loaded by the main-model step above
            # Just register other models but don't load them
            print(f"\nFirst model already loaded: {model_names[0]}")
            
            # Register other models but don't load them
            for model_name in model_names[1:]:
                multi_model_manager.set_default_model(model_name, load_kwargs, args.backend)
            
            print(f"Other models will load on-demand: {model_names[1:]}")
    # A single text model (if any) was already loaded by the main-model step above
    
    # Determine load mode BEFORE setting up other models
    load_mode = "ondemand"
    if args.loadall:
        load_mode = "loadall"
    elif args.loadswap:
        load_mode = "loadswap"
    
    # Set load mode in multi_model_manager
    multi_model_manager.set_load_mode(load_mode)
    
    # Pre-load models based on mode
    print(f"DEBUG: load_mode at line 4710 = '{load_mode}', backend = {args.backend}")
    if load_mode in ("loadall", "loadswap"):
        # Load all models into VRAM (or RAM for CUDA loadswap)
        mode_name = "Load All" if load_mode == "loadall" else "Load Swap"
        print(f"\n=== {mode_name} Mode ===")
        
        # Load main text model first
        if model_names:
            print(f"Pre-loading main text model: {model_names[0]}")
        
        # Load image model (first one only in loadall mode currently)
        print(f"DEBUG: image_models check at line 4718: {image_models}, backend = {args.backend}")
        # Only preload image model if loadall or loadswap mode is set
        if image_models and not getattr(args, 'nopreload', False) and load_mode in ("loadall", "loadswap"):
            print(f"Pre-loading image model: {image_models[0]}")
            
            # Get the original model name
            original_model_name = image_models[0]
            
            # Check if it's a URL first (before any processing)
            is_url = original_model_name.startswith('http://') or original_model_name.startswith('https://')
            
            # Strip query parameters from URL if present
            model_name = original_model_name
            if '?' in model_name:
                model_name = model_name.split('?')[0]
            
            # Check if the image model is a GGUF model
            is_gguf = model_name.endswith('.gguf') or 'gguf' in model_name.lower()
            
            if is_gguf:
                # GGUF image model - try llama.cpp first, then fall back to stable-diffusion-cpp-python
                print(f"Detected GGUF image model, loading with llama.cpp...")
                try:
                    from llama_cpp import Llama
                    
                    # Download GGUF model if needed (similar to VulkanBackend)
                    model_path = None
                    
                    # Check if it's a URL - download directly
                    if is_url:
                        print(f"Image model is a URL: {original_model_name}")
                        cached_path = get_cached_model_path(original_model_name)
                        if cached_path:
                            model_path = cached_path
                            print(f"Using cached GGUF model: {model_path}")
                        else:
                            print(f"Downloading GGUF model: {original_model_name}")
                            cache_dir = get_model_cache_dir()
                            model_path = download_model(original_model_name, cache_dir)
                    elif os.path.isfile(model_name):
                        # Local file
                        model_path = model_name
                        print(f"Loading local GGUF model: {model_path}")
                    else:
                        # Try to download from HuggingFace Hub
                        print(f"Trying to resolve as HuggingFace model: {model_name}")
                        try:
                            from huggingface_hub import hf_hub_download, list_repo_files
                            parts = model_name.split('/')
                            if len(parts) >= 2:
                                repo_id = f"{parts[0]}/{parts[1]}"
                                print(f"Looking for GGUF files in repo: {repo_id}")
                                files = list_repo_files(repo_id)
                                gguf_files = [f for f in files if f.endswith('.gguf')]
                                if not gguf_files:
                                    raise ValueError(f"No GGUF files found in {repo_id}")
                                filename = gguf_files[0]
                                model_path = hf_hub_download(repo_id=repo_id, filename=filename)
                                print(f"Downloaded GGUF model to: {model_path}")
                        except Exception as e:
                            print(f"Could not resolve GGUF model path: {e}")
                            print(f"Image model will load on first request")
                            model_path = None
                    
                    if model_path and os.path.isfile(model_path):
                        # Use the cached path for the model key
                        model_key = f"image:{model_path}"
                        
                        # Load with llama.cpp
                        n_gpu_layers = -1  # Load all layers to GPU
                        n_ctx = 2048
                        
                        print(f"Loading GGUF model from: {model_path}")
                        file_size = os.path.getsize(model_path)
                        print(f"GGUF model file size: {file_size / (1024*1024):.1f} MB")
                        
                        # Verify it's a valid GGUF file (check magic bytes)
                        with open(model_path, 'rb') as f:
                            magic = f.read(8)
                            print(f"File magic bytes: {magic}")
                            if not magic.startswith(b'GGUF'):
                                print(f"ERROR: File is NOT a valid GGUF! Expected 'GGUF', got: {magic}")
                                print(f"This means the download returned an HTML error page instead of the model.")
                                print(f"The URL must be a DIRECT download link (ends with .gguf, not a model page)")
                                print(f"Example: https://huggingface.co/owner/repo/resolve/main/model.gguf")
                                print(f"Image model will load on first request")
                            else:
                                # Valid GGUF, try to load
                                try:
                                    llama_model = Llama(
                                        model_path=model_path,
                                        n_gpu_layers=n_gpu_layers,
                                        n_ctx=n_ctx,
                                        verbose=True,
                                    )
                                    multi_model_manager.add_model(model_key, llama_model)
                                    print(f"GGUF image model loaded successfully: {original_model_name}")
                                except Exception as llama_error:
                                    print(f"llama.cpp load error: {llama_error}")
                                    print(f"Trying stable-diffusion-cpp-python fallback...")
                                    # Try stable-diffusion-cpp-python as fallback
                                    try:
                                        from stable_diffusion_cpp import StableDiffusion

                                        # Registry key for the sd.cpp model; assigned only once the
                                        # import above has succeeded.
                                        model_key = f"image:{model_path}"
                                        print(f"Loading with sd.cpp: {model_path}")

                                        # For models like Z-Image-Turbo/Flux the GGUF file is passed as
                                        # diffusion_model_path; companion files (text encoder, VAE) come
                                        # from the CLI or are looked up in the same directory.
                                        model_dir = os.path.dirname(model_path)
                                        # NOTE(review): model_name is not read again in this block; the
                                        # reassignment is kept in case later code relies on it.
                                        model_name = os.path.basename(model_path)

                                        clip_l_path = None
                                        t5xxl_path = None
                                        vae_path = None

                                        # Text-encoder/LLM file from the CLI: Hugging Face model ID,
                                        # URL (cached or downloaded), or local path.
                                        if args.llm_path:
                                            if is_huggingface_model_id(args.llm_path):
                                                print(f"Attempting to download LLM model from Hugging Face: {args.llm_path}")
                                                cache_dir = get_model_cache_dir()
                                                clip_l_path = download_huggingface_model(args.llm_path, cache_dir, '.gguf')
                                                if clip_l_path:
                                                    print(f"Downloaded LLM model to: {clip_l_path}")
                                                else:
                                                    print(f"Warning: Failed to download LLM model from Hugging Face, will try as local path")
                                                    # Honour the warning: use the argument as-is, but only
                                                    # when it names an existing local file.
                                                    if os.path.isfile(args.llm_path):
                                                        clip_l_path = args.llm_path
                                            elif args.llm_path.startswith(('http://', 'https://')):
                                                cached = get_cached_model_path(args.llm_path)
                                                if cached:
                                                    clip_l_path = cached
                                                    print(f"Using cached LLM model: {clip_l_path}")
                                                else:
                                                    cache_dir = get_model_cache_dir()
                                                    clip_l_path = download_model(args.llm_path, cache_dir)
                                                    print(f"Downloaded LLM model to: {clip_l_path}")
                                            else:
                                                clip_l_path = args.llm_path

                                        # VAE file from the CLI: URL (cached or downloaded) or local path.
                                        if args.vae_path:
                                            if args.vae_path.startswith(('http://', 'https://')):
                                                cached = get_cached_model_path(args.vae_path)
                                                if cached:
                                                    vae_path = cached
                                                    print(f"Using cached VAE model: {vae_path}")
                                                else:
                                                    cache_dir = get_model_cache_dir()
                                                    vae_path = download_model(args.vae_path, cache_dir)
                                                    print(f"Downloaded VAE model to: {vae_path}")
                                            else:
                                                vae_path = args.vae_path

                                        # Scan the model directory for companion files, but never
                                        # override a value that was given on the command line.
                                        if (not args.llm_path or not args.vae_path) and os.path.exists(model_dir):
                                            for fname in os.listdir(model_dir):
                                                lowered = fname.lower()
                                                if not args.llm_path and 'clip_l' in lowered and fname.endswith(('.safetensors', '.bin')):
                                                    clip_l_path = os.path.join(model_dir, fname)
                                                elif 't5xxl' in lowered and fname.endswith(('.safetensors', '.bin')):
                                                    t5xxl_path = os.path.join(model_dir, fname)
                                                elif not args.vae_path and fname.endswith('.safetensors') and 'ae' in lowered:
                                                    vae_path = os.path.join(model_dir, fname)

                                        # Build constructor kwargs from whatever files were resolved.
                                        sd_kwargs = {'diffusion_model_path': model_path}

                                        if clip_l_path:
                                            sd_kwargs['llm_path'] = clip_l_path
                                            print(f"DEBUG: Adding llm_path to sd_kwargs: {clip_l_path}")
                                        else:
                                            print(f"DEBUG: clip_l_path is None or empty, not adding to sd_kwargs")
                                            print(f"DEBUG: args.llm_path = {args.llm_path}")
                                        # Only pass a VAE path that actually resolved, so a failed
                                        # download does not hand None to the constructor.
                                        if vae_path:
                                            sd_kwargs['vae_path'] = vae_path
                                        if t5xxl_path:
                                            sd_kwargs['t5xxl_path'] = t5xxl_path

                                        # sd.cpp-specific options from CLI args
                                        if getattr(global_args, 'vae_tiling', False):
                                            # VAE tiling is handled internally in newer builds
                                            print(f"DEBUG: VAE tiling is handled internally in stable-diffusion-cpp-python")
                                        if getattr(global_args, 'clip_on_cpu', False):
                                            sd_kwargs['keep_clip_on_cpu'] = True
                                            print(f"DEBUG: Running CLIP on CPU to save VRAM (keep_clip_on_cpu=True)")

                                        # Use all available CPU cores for processing.
                                        # NOTE(review): the function-scope import is kept deliberately;
                                        # if psutil is imported elsewhere in this function the name is
                                        # function-local and must be bound before use here.
                                        import psutil
                                        sd_thread_count = psutil.cpu_count()
                                        if sd_thread_count:
                                            # cpu_count() may return None; leave the library default then.
                                            sd_kwargs['n_threads'] = sd_thread_count
                                        print(f"DEBUG: Using {sd_thread_count} CPU cores for sd.cpp")

                                        # sample_method and steps are not valid constructor arguments,
                                        # so they are not added to sd_kwargs here.

                                        sd_model = StableDiffusion(**sd_kwargs)
                                        multi_model_manager.add_model(model_key, sd_model)
                                        # Add alias for "image"
                                        multi_model_manager.add_model("image", sd_model)

                                        print(f"Image model loaded successfully via sd.cpp: {original_model_name}")
                                    except ImportError as sd_error:
                                        print(f"stable-diffusion-cpp-python not installed: {sd_error}")
                                        print(f"Image model will load on first request")
                                    except Exception as sd_error:
                                        print(f"sd.cpp load error: {sd_error}")
                                        print(f"Image model will load on first request")
                    else:
                        print(f"Could not load GGUF image model: no valid model path")
                        
                except ImportError as e:
                    print(f"Warning: llama_cpp not installed: {e}")
                    print(f"Image model will load on first request")
                except Exception as e:
                    print(f"Warning: Failed to pre-load GGUF image model: {e}")
                    print(f"Image model will load on first request")
            else:
                # Load diffusers image model (Stable Diffusion)
                try:
                    import torch
                    from diffusers import StableDiffusionXLPipeline, DiffusionPipeline

                    # Diffusers models are addressed by name; no local path is resolved here.
                    model_key = f"image:{model_name}"
                    print(f"Loading diffusers pipeline: {model_name}")

                    # Both loaders receive the same options. SDXL is attempted first and
                    # the generic pipeline is the fallback when that raises.
                    diffusers_load_options = dict(torch_dtype=torch.float32, use_safetensors=True)
                    try:
                        pipeline = StableDiffusionXLPipeline.from_pretrained(
                            model_name, **diffusers_load_options
                        )
                    except Exception as e:
                        print(f"SDXL failed, trying generic pipeline: {e}")
                        pipeline = DiffusionPipeline.from_pretrained(
                            model_name, **diffusers_load_options
                        )

                    # Place the pipeline on the GPU when CUDA is available; attention
                    # slicing is only enabled in that case.
                    cuda_ready = torch.cuda.is_available()
                    pipeline = pipeline.to("cuda" if cuda_ready else "cpu")
                    if cuda_ready:
                        pipeline.enable_attention_slicing()

                    # Register under the model-specific key and under the "image" alias.
                    for registry_key in (model_key, "image"):
                        multi_model_manager.add_model(registry_key, pipeline)

                    print(f"Image model loaded successfully: {model_name}")

                except ImportError as e:
                    print(f"Warning: diffusers not installed, image model will load on first request: {e}")
                except Exception as e:
                    print(f"Warning: Failed to pre-load image model: {e}")
                    print(f"  Image model will load on first request")
        
        # Load audio model
        print(f"DEBUG: audio_models check at line 4970: {audio_models}")
        if audio_models:
            print(f"Pre-loading audio model: {audio_models[0]}")
        
        # Load TTS model
        if args.tts_model:
            print(f"Pre-loading TTS model: {args.tts_model}")
            
    elif load_mode == "loadswap":
        # Load models in order: model > image > audio > TTS, keep active in VRAM
        # For Vulkan backend, load all models to VRAM like loadall (VRAM is not limited like CUDA)
        print("\n=== Load Swap Mode ===")
        
        # For Vulkan, use same preloading as loadall
        # This branch only runs when load_mode == "loadswap", so the image
        # announcement depends solely on image_models and the nopreload flag.
        announce_image = bool(image_models) and not getattr(args, 'nopreload', False)
        if args.backend == "vulkan":
            # Vulkan: Load all models to GPU like loadall
            if model_names:
                print(f"Pre-loading main text model: {model_names[0]}")
            if announce_image:
                print(f"Pre-loading image model: {image_models[0]}")
            if audio_models:
                print(f"Pre-loading audio model: {audio_models[0]}")
            if args.tts_model:
                print(f"Pre-loading TTS model: {args.tts_model}")
        else:
            # NVIDIA/CUDA: First model in VRAM, others in RAM
            if model_names:
                print(f"Main text model will be in VRAM: {model_names[0]}")
            if announce_image:
                print(f"Image model in RAM: {image_models[0]}")
            if audio_models:
                print(f"Audio model in RAM: {audio_models[0]}")
            if args.tts_model:
                print(f"TTS model in RAM: {args.tts_model}")
        
    else:
        # No flags: only one model gets loaded (the main text model if specified)
        print("\n=== On-Demand Mode ===")
        print("Models will load on first request")
    
    # Set up audio model if specified (with pre-loading if in loadall/loadswap mode)
    print(f"DEBUG: models in manager before audio setup: {list(multi_model_manager.models.keys())}")
    if audio_models:
        print(f"\nAudio transcription model(s): {audio_models}")

        # Report the Vulkan device chosen for Whisper when one was given.
        if getattr(args, 'audio_vulkan_device', None) is not None:
            print(f"  Using Vulkan device: {args.audio_vulkan_device}")

        # Register every audio model with its per-index context size and the
        # shared offload setting.
        print(f"DEBUG: Registering audio models: {audio_models}")
        for idx, audio_m in enumerate(audio_models):
            multi_model_manager.set_audio_model(audio_m, {
                'ctx': get_ctx_by_index(args.audio_ctx, idx, 0),
                'offload': args.audio_offload,
            })
        print(f"DEBUG: After registration, audio_models in manager: {multi_model_manager.audio_models}")

        # Pre-load first audio model at startup if:
        # - Using loadall or loadswap mode, OR
        # - No main model is specified (only audio model configured)
        print(f"DEBUG: load_mode at line 5015 = '{load_mode}', model_names = {model_names}, audio_models = {audio_models}")
        # Coerce to bool so the flag (and its debug output) is never the
        # audio_models list itself.
        should_preload = load_mode in ("loadall", "loadswap") or bool(not model_names and audio_models)
        print(f"DEBUG: should_preload = {should_preload}")
        
        # Initialize whisper-server if specified
        if args.whisper_server:
            print(f"\nWhisper server: {args.whisper_server}")
            print(f"  Port: {args.whisper_server_port}")

            # Reuse the manager already attached to multi_model_manager, or
            # create one and attach it.
            whisper_server_mgr = multi_model_manager.whisper_server
            if whisper_server_mgr is not None:
                print("Whisper server already running, using existing instance")
            else:
                whisper_server_mgr = WhisperServerManager(
                    server_path=args.whisper_server,
                    port=args.whisper_server_port
                )
                multi_model_manager.whisper_server = whisper_server_mgr

            # The server is started right away when pre-loading is requested or
            # when whisper_cpp is not set.
            print(f"DEBUG: whisper-server start check - audio_models={audio_models}, should_preload={should_preload}, whisper_cpp={args.whisper_cpp}")
            if audio_models and (should_preload or not args.whisper_cpp):
                model_to_use = audio_models[0]
                gpu_device = getattr(args, 'audio_vulkan_device', 0) or 0
                print(f"DEBUG: Starting whisper-server with gpu_device={gpu_device}")
                actual_model_path = whisper_server_mgr.start(model_path=model_to_use, gpu_device=gpu_device)
                if not actual_model_path:
                    print("Warning: Failed to start whisper-server, falling back to other backends")
                else:
                    # When start() reports a different path than the one requested
                    # (e.g. a URL was given), store that path in the manager's list.
                    path_changed = model_to_use != actual_model_path
                    if path_changed and multi_model_manager.audio_models and multi_model_manager.audio_models[0] == model_to_use:
                        multi_model_manager.audio_models[0] = actual_model_path
                    print(f"Whisper server started with model: {actual_model_path}")
        elif should_preload:
            print(f"Pre-loading audio model... {audio_models[0]}")

            # Use first audio model for pre-loading
            model_to_use = audio_models[0]
            # Any name containing "gguf" (which includes a ".gguf" suffix) is
            # treated as a GGUF model.
            is_gguf_model = 'gguf' in model_to_use.lower()

            # faster-whisper doesn't support GGUF files, so for those the attempt
            # is skipped entirely and the whispercpp fallback is requested.
            faster_whisper_failed = is_gguf_model
            if is_gguf_model:
                print("Detected GGUF model - using whispercpp backend")
            else:
                try:
                    # Try faster-whisper first (requires torch)
                    from faster_whisper import WhisperModel
                    import torch

                    model_path = None

                    # URL models are loaded from the local cache, downloading first
                    # when no cached copy exists.
                    if model_to_use.startswith(('http://', 'https://')):
                        cached_path = get_cached_model_path(model_to_use)
                        if cached_path:
                            model_path = cached_path
                            print(f"Using cached model: {model_path}")
                        else:
                            # Download with progress
                            cache_dir = get_model_cache_dir()
                            model_path = download_model(model_to_use, cache_dir)
                        if not model_path:
                            raise RuntimeError(f"no local copy available for {model_to_use}")
                        # Load from the local file in both the cached and the
                        # freshly-downloaded case, never from the URL string.
                        model_to_use = model_path

                    # Always int8 on CPU: faster-whisper CUDA doesn't work with
                    # AMD/Vulkan GPUs.
                    compute_type = "int8"
                    whisper_model = WhisperModel(
                        model_to_use,
                        device="cpu",
                        compute_type=compute_type
                    )

                    # Store in multi_model_manager
                    model_key = f"audio:{audio_models[0]}"
                    multi_model_manager.add_model(model_key, whisper_model)
                    print(f"Audio model loaded successfully (faster-whisper)")

                    # Warn if using CPU (no CUDA available)
                    if not torch.cuda.is_available():
                        print("Note: faster-whisper is running on CPU (no CUDA GPU detected)")
                        print("      For GPU acceleration, use NVIDIA GPU with CUDA or wait for Vulkan support.")

                except ImportError:
                    # faster-whisper (or torch) not available, will try whispercpp below
                    faster_whisper_failed = True
                except Exception as e:
                    # faster-whisper failed for some other reason
                    print(f"Warning: faster-whisper failed to load model: {e}")
                    faster_whisper_failed = True
            
            # If faster-whisper failed (not installed or couldn't load), try whispercpp
            if faster_whisper_failed:
                # Reset so a value left over from the faster-whisper attempt is not reused.
                model_path = None

                # Vulkan is reported as usable when a device was selected through
                # VK_DEVICE_SELECT_DEVICE or a /dev/dri node exists.
                whisper_vulkan_available = False
                whisper_vulkan_device = os.environ.get('VK_DEVICE_SELECT_DEVICE', '0')

                try:
                    import whispercpp
                except ImportError as e:
                    # Neither faster-whisper nor whispercpp available
                    print(f"Debug: whispercpp import failed: {e}")
                    print(f"Warning: No audio transcription library available: {e}")
                    print("Options:")
                    print("  1. Install PyTorch + faster-whisper: pip install torch faster-whisper")
                    print("  2. Use a built-in whispercpp model: --audio-model base")
                    print("Audio model will load on-demand when transcription is requested.")
                else:
                    if os.environ.get('VK_DEVICE_SELECT_DEVICE'):
                        whisper_vulkan_available = True
                        print(f"Whisper Vulkan: Will use GPU device {whisper_vulkan_device}")
                    elif os.path.exists('/dev/dri'):
                        whisper_vulkan_available = True
                        print(f"Whisper Vulkan: Auto-detected GPU, will use device {whisper_vulkan_device}")

                    try:
                        model_to_use = audio_models[0]
                        model_path = None

                        # URL models come from the cache, downloading when needed.
                        if model_to_use.startswith(('http://', 'https://')):
                            cached_path = get_cached_model_path(model_to_use)
                            if cached_path:
                                model_path = cached_path
                                print(f"Using cached model: {model_path}")
                            else:
                                # Download with progress
                                cache_dir = get_model_cache_dir()
                                model_path = download_model(model_to_use, cache_dir)
                                model_to_use = model_path
                        else:
                            # whispercpp takes a local file or a built-in model name
                            # (tiny, base, small, medium, large-v1, large); either way
                            # the configured value is passed through unchanged.
                            model_path = model_to_use

                        if not model_path:
                            raise ValueError(f"no local model file available for {audio_models[0]}")

                        if model_path != model_to_use and not os.path.isfile(model_path):
                            # Only reachable for a cached URL whose file is missing on disk.
                            print(f"Warning: whispercpp requires a local GGUF file or built-in model name, not: {model_to_use}")
                            print("For Vulkan audio transcription, use a built-in model name (tiny/base/small/medium/large-v1/large)")
                            print("or install faster-whisper with PyTorch for HuggingFace GGUF support.")
                            print("Audio model will load on-demand when transcription is requested.")
                        else:
                            # Load the whispercpp model
                            try:
                                whisper_model = whispercpp.Whisper.from_pretrained(model_path)

                                # Store in multi_model_manager
                                model_key = f"audio:{audio_models[0]}"
                                multi_model_manager.add_model(model_key, whisper_model)
                                print(f"Audio model loaded successfully (whispercpp)")
                                if whisper_vulkan_available:
                                    print(f"  -> Using Vulkan GPU acceleration (device {whisper_vulkan_device})")
                            except Exception as e:
                                error_msg = str(e).lower()
                                if 'not a valid preconverted model' in error_msg:
                                    print(f"Warning: whispercpp does not support this model format")
                                    print("whispercpp only supports built-in model names or pre-converted GGUF files.")
                                    print("For Vulkan audio transcription, please either:")
                                    print("  1. Install PyTorch + faster-whisper: pip install torch faster-whisper")
                                    print("  2. Use a built-in whispercpp model: --audio-model base")
                                    print("Audio model will load on-demand when transcription is requested.")
                                else:
                                    print(f"Warning: Could not pre-load audio model with whispercpp: {e}")
                                    print("Audio model will load on-demand when transcription is requested.")
                    except Exception as e:
                        print(f"Warning: Could not pre-load audio model with whispercpp: {e}")
                        print("Audio model will load on-demand when transcription is requested.")
    
    # Register the text-to-speech model when one was configured
    if args.tts_model:
        print(f"\nText-to-speech model: {args.tts_model}")
        multi_model_manager.set_tts_model(args.tts_model, {})

        # When TTS is the only configured model, only a notice is printed:
        # TTS models load on-demand and nothing is loaded at this point.
        tts_is_only_model = not (model_names or audio_models or image_models)
        if tts_is_only_model:
            print("Pre-loading TTS model...")
    
    # Set up image model if specified
    if image_models:
        print(f"\nImage generation model(s): {image_models}")

        # The first image model is registered with the full set of
        # generation options taken from the CLI.
        primary_image_options = {
            'ctx': get_ctx_by_index(args.image_ctx, 0, 0),
            'offload': args.image_offload,
            'llm_path': args.llm_path,
            'vae_path': args.vae_path,
            'sample_method': args.image_sample_method,
            'steps': args.image_steps,
            'width': args.image_width,
            'height': args.image_height,
            'cfg_scale': args.image_cfg_scale,
        }
        multi_model_manager.set_image_model(image_models[0], primary_image_options)

        # Every further image model is registered with its per-index context
        # size and the shared offload setting only.
        for idx in range(1, len(image_models)):
            img_m = image_models[idx]
            multi_model_manager.set_image_model(img_m, {
                'ctx': get_ctx_by_index(args.image_ctx, idx, 0),
                'offload': args.image_offload,
            })
        
        # Pre-load image model if configured and in loadall/loadswap mode
        if image_models and not getattr(args, 'nopreload', False) and load_mode in ("loadall", "loadswap"):
            print(f"Pre-loading image model...")
            
            # Keep the name exactly as configured; it is used for downloads and messages.
            original_model_name = image_models[0]

            # URL detection runs on the unmodified name.
            is_url = original_model_name.startswith(('http://', 'https://'))

            # Drop everything from the first '?' on (query parameters), if any.
            model_name = original_model_name.split('?')[0]

            # Any name containing "gguf" (which includes a ".gguf" suffix) counts as GGUF.
            is_gguf = 'gguf' in model_name.lower()
            
            if is_gguf:
                # GGUF for image - use stable-diffusion-cpp-python
                print(f"Detected GGUF image model, loading with llama.cpp...")
                try:
                    from llama_cpp import Llama

                    # Resolve the GGUF file to a local path. Sources, in order: a URL
                    # (cached or downloaded), an existing local file, or a file from a
                    # HuggingFace Hub repository.
                    model_path = None

                    if is_url:
                        print(f"Image model is a URL: {original_model_name}")
                        cached_path = get_cached_model_path(original_model_name)
                        if cached_path:
                            model_path = cached_path
                            print(f"Using cached GGUF model: {model_path}")
                        else:
                            print(f"Downloading GGUF model: {original_model_name}")
                            cache_dir = get_model_cache_dir()
                            model_path = download_model(original_model_name, cache_dir)
                    elif os.path.isfile(model_name):
                        model_path = model_name
                        print(f"Loading local GGUF model: {model_name}")
                    else:
                        # Try to download from HuggingFace Hub
                        print(f"Trying to resolve as HuggingFace model: {model_name}")
                        try:
                            from huggingface_hub import hf_hub_download, list_repo_files
                            parts = model_name.split('/')
                            if len(parts) >= 2:
                                repo_id = f"{parts[0]}/{parts[1]}"
                                print(f"Looking for GGUF files in repo: {repo_id}")
                                files = list_repo_files(repo_id)
                                gguf_files = [f for f in files if f.endswith('.gguf')]
                                if not gguf_files:
                                    raise ValueError(f"No GGUF files found in {repo_id}")
                                # Honour an explicit file given after "<org>/<repo>/";
                                # otherwise fall back to the first GGUF file in the repo.
                                requested_file = '/'.join(parts[2:])
                                filename = requested_file if requested_file in gguf_files else gguf_files[0]
                                model_path = hf_hub_download(repo_id=repo_id, filename=filename)
                                print(f"Downloaded GGUF model to: {model_path}")
                            else:
                                # Not a "<org>/<repo>" id: nothing to look up on the Hub.
                                print(f"Could not resolve GGUF model path: '{model_name}' is not a local file or a <org>/<repo> id")
                        except Exception as e:
                            print(f"Could not resolve GGUF model path: {e}")
                            model_path = None
                    
                    if model_path and os.path.isfile(model_path):
                        n_gpu_layers = -1
                        n_ctx = 2048
                        
                        print(f"Loading GGUF model from: {model_path}")
                        file_size = os.path.getsize(model_path)
                        print(f"GGUF model file size: {file_size / (1024*1024):.1f} MB")
                        
                        # Verify it's a valid GGUF file (check magic bytes)
                        with open(model_path, 'rb') as f:
                            magic = f.read(8)
                            print(f"File magic bytes: {magic}")
                            if not magic.startswith(b'GGUF'):
                                print(f"ERROR: File is NOT a valid GGUF! Expected 'GGUF', got: {magic}")
                                print(f"The URL must be a DIRECT download link (ends with .gguf)")
                                print(f"Image model will load on first request")
                            else:
                                try:
                                    llama_model = Llama(
                                        model_path=model_path,
                                        n_gpu_layers=n_gpu_layers,
                                        n_ctx=n_ctx,
                                        verbose=True,
                                    )
                                    multi_model_manager.add_model(model_key, llama_model)
                                    print(f"GGUF image model loaded successfully: {original_model_name}")
                                except Exception as llama_error:
                                    print(f"llama.cpp load error: {llama_error}")
                                    print(f"Trying stable-diffusion-cpp-python fallback...")
                                    # Try stable-diffusion-cpp-python as fallback
                                    try:
                                        from stable_diffusion_cpp import StableDiffusion
                                        
                                        # Initialize model_key to avoid unbound variable error
                                        model_key = None
                                        
                                        print(f"Loading with sd.cpp: {model_path}")
                                        # For models like Z-Image-Turbo/Flux, use diffusion_model_path
                                        # Look for additional model files in same directory
                                        model_dir = os.path.dirname(model_path)
                                        model_name = os.path.basename(model_path)
                                        
                                        # Try to find additional model files
                                        clip_l_path = None
                                        t5xxl_path = None
                                        vae_path = None
                                        
                                        # Use CLI arguments if provided, download and cache if URL
                                        if args.llm_path:
                                            # Check if it's a Hugging Face model ID, URL, or local path
                                            if is_huggingface_model_id(args.llm_path):
                                                # Download from Hugging Face
                                                print(f"Attempting to download LLM model from Hugging Face: {args.llm_path}")
                                                cache_dir = get_model_cache_dir()
                                                clip_l_path = download_huggingface_model(args.llm_path, cache_dir, '.gguf')
                                                if clip_l_path:
                                                    print(f"Downloaded LLM model to: {clip_l_path}")
                                                else:
                                                    print(f"Warning: Failed to download LLM model from Hugging Face, will try as local path")
                                            elif args.llm_path.startswith('http://') or args.llm_path.startswith('https://'):
                                                cached = get_cached_model_path(args.llm_path)
                                                if cached:
                                                    clip_l_path = cached
                                                    print(f"Using cached LLM model: {clip_l_path}")
                                                else:
                                                    cache_dir = get_model_cache_dir()
                                                    clip_l_path = download_model(args.llm_path, cache_dir)
                                                    print(f"Downloaded LLM model to: {clip_l_path}")
                                            else:
                                                clip_l_path = args.llm_path
                                        if args.vae_path:
                                            # Check if it's a URL and download if needed
                                            if args.vae_path.startswith('http://') or args.vae_path.startswith('https://'):
                                                cached = get_cached_model_path(args.vae_path)
                                                if cached:
                                                    vae_path = cached
                                                    print(f"Using cached VAE model: {vae_path}")
                                                else:
                                                    cache_dir = get_model_cache_dir()
                                                    vae_path = download_model(args.vae_path, cache_dir)
                                                    print(f"Downloaded VAE model to: {vae_path}")
                                            else:
                                                vae_path = args.vae_path
                                        
                                        # Look for common file patterns only if CLI args not provided
                                        if not args.llm_path or not args.vae_path:
                                            for f in os.listdir(model_dir) if os.path.exists(model_dir) else []:
                                                if not args.llm_path and 'clip_l' in f.lower() and f.endswith(('.safetensors', '.bin')):
                                                    clip_l_path = os.path.join(model_dir, f)
                                                elif 't5xxl' in f.lower() and f.endswith(('.safetensors', '.bin')):
                                                    t5xxl_path = os.path.join(model_dir, f)
                                                elif not args.vae_path and f.endswith('.safetensors') and 'ae' in f.lower():
                                                    vae_path = os.path.join(model_dir, f)
                                        
                                        # Build kwargs based on available files
                                        sd_kwargs = {'diffusion_model_path': model_path}
                                        
                                        if clip_l_path:
                                            sd_kwargs['llm_path'] = clip_l_path
                                            print(f"DEBUG: Adding llm_path to sd_kwargs: {clip_l_path}")
                                        else:
                                            print(f"DEBUG: clip_l_path is None or empty, not adding to sd_kwargs")
                                            print(f"DEBUG: args.llm_path = {args.llm_path}")
                                        if args.vae_path:
                                            sd_kwargs['vae_path'] = vae_path
                                        elif vae_path:
                                            sd_kwargs['vae_path'] = vae_path
                                        if t5xxl_path:
                                            sd_kwargs['t5xxl_path'] = t5xxl_path
                                        
                                        # Add sd.cpp-specific options from CLI args
                                        if getattr(global_args, 'vae_tiling', False):
                                            # VAE tiling is handled internally in newer builds
                                            print(f"DEBUG: VAE tiling is handled internally in stable-diffusion-cpp-python")
                                        if getattr(global_args, 'clip_on_cpu', False):
                                            sd_kwargs['keep_clip_on_cpu'] = True
                                            print(f"DEBUG: Running CLIP on CPU to save VRAM (keep_clip_on_cpu=True)")
                                        
                                        # Use all available CPU cores for processing
                                        import psutil
                                        sd_kwargs['n_threads'] = psutil.cpu_count()
                                        print(f"DEBUG: Using {psutil.cpu_count()} CPU cores for sd.cpp")

                                        # Add generation parameters from CLI args
                                        # sd_kwargs['sample_method'] = args.image_sample_method  # Not valid for __init__
                                        # sd_kwargs['steps'] = args.image_steps  # Not valid for __init__
                                        
                                        # Define model_key for adding to manager
                                        model_key = f"image:{model_path}"
                                        sd_model = StableDiffusion(**sd_kwargs)
                                        multi_model_manager.add_model(model_key, sd_model)
                                        # Add alias for "image" 
                                        multi_model_manager.add_model("image", sd_model)
                                        
                                        print(f"Image model loaded successfully via sd.cpp: {original_model_name}")
                                    except ImportError as sd_error:
                                        print(f"stable-diffusion-cpp-python not installed: {sd_error}")
                                        print(f"Image model will load on first request")
                                    except Exception as sd_error:
                                        print(f"sd.cpp load error: {sd_error}")
                                        print(f"Image model will load on first request")
                    else:
                        print(f"Could not load GGUF image model: no valid model path")
                        
                except ImportError as e:
                    print(f"Warning: llama_cpp not installed: {e}")
                except Exception as e:
                    print(f"Warning: Failed to pre-load GGUF image model: {e}")
                try:
                    from llama_cpp import Llama
                    from llama_cpp import Llama
                    
                    # Download GGUF model if needed (similar to VulkanBackend)
                    model_path = None
                    if model_name.startswith('http://') or model_name.startswith('https://'):
                        cached_path = get_cached_model_path(model_name)
                        if cached_path:
                            model_path = cached_path
                            print(f"Using cached GGUF model: {model_path}")
                        else:
                            print(f"Downloading GGUF model: {model_name}")
                            cache_dir = get_model_cache_dir()
                            model_path = download_model(model_name, cache_dir)
                    elif os.path.isfile(model_name):
                        model_path = model_name
                        print(f"Loading local GGUF model: {model_path}")
                    else:
                        # Try to download from HuggingFace Hub
                        try:
                            from huggingface_hub import hf_hub_download, list_repo_files
                            parts = model_name.split('/')
                            if len(parts) >= 2:
                                repo_id = f"{parts[0]}/{parts[1]}"
                                files = list_repo_files(repo_id)
                                gguf_files = [f for f in files if f.endswith('.gguf')]
                                if not gguf_files:
                                    raise ValueError(f"No GGUF files found in {repo_id}")
                                filename = gguf_files[0]
                                model_path = hf_hub_download(repo_id=repo_id, filename=filename)
                                print(f"Downloaded GGUF model to: {model_path}")
                        except Exception as e:
                            print(f"Could not resolve GGUF model path: {e}")
                            model_path = None
                    
                    if model_path and os.path.isfile(model_path):
                        # Use the cached path for the model key
                        model_key = f"image:{model_path}"
                        
                        # Load with llama.cpp
                        n_gpu_layers = -1  # Load all layers to GPU
                        n_ctx = 2048
                        
                        llama_model = Llama(
                            model_path=model_path,
                            n_gpu_layers=n_gpu_layers,
                            n_ctx=n_ctx,
                            verbose=False,
                        )
                        multi_model_manager.add_model(model_key, llama_model)
                        print(f"GGUF image model loaded successfully: {model_name}")
                    else:
                        print(f"Could not load GGUF image model: no valid model path")
                        
                except ImportError as e:
                    print(f"Warning: llama_cpp not installed: {e}")
                except Exception as e:
                    print(f"Warning: Failed to pre-load GGUF image model: {e}")
            else:
                # Load diffusers image model (Stable Diffusion)
                try:
                    import torch
                    from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
                    
                    # Use model name directly for diffusers (model_path is only set in GGUF branch)
                    model_key = f"image:{model_name}"
                    print(f"Loading diffusers pipeline: {model_name}")
                    
                    # Try to load as Stable Diffusion XL first
                    try:
                        pipeline = StableDiffusionXLPipeline.from_pretrained(
                            model_name,
                            torch_dtype=torch.float32,
                            use_safetensors=True,
                        )
                    except Exception as e:
                        print(f"SDXL failed, trying generic pipeline: {e}")
                        pipeline = DiffusionPipeline.from_pretrained(
                            model_name,
                            torch_dtype=torch.float32,
                            use_safetensors=True,
                        )
                    
                    if torch.cuda.is_available():
                        pipeline = pipeline.to("cuda")
                        pipeline.enable_attention_slicing()
                    else:
                        pipeline = pipeline.to("cpu")
                    
                    multi_model_manager.add_model(model_key, pipeline)
                    # Add alias for "image"
                    multi_model_manager.add_model("image", pipeline)
                    
                    print(f"Image model loaded successfully: {model_name}")
                    
                except ImportError as e:
                    print(f"Warning: diffusers not installed: {e}")
                except Exception as e:
                    print(f"Warning: Failed to pre-load image model: {e}")
    
    # Register model aliases if specified
    if args.model_aliases:
        print(f"\nRegistering model aliases:")
        for alias, model in args.model_aliases:
            multi_model_manager.set_model_alias(alias, model)
            print(f"  {alias} -> {model}")
    
    # Start the server
    import uvicorn
    print(f"\nStarting server on http://{args.host}:{args.port}")
    print(f"API documentation available at http://{args.host}:{args.port}/docs")
    if model_manager.backend is not None:
        # Show actual backend being used
        actual_backend = model_manager.backend_type
        if hasattr(model_manager.backend, 'force_cuda') and model_manager.backend.force_cuda:
            actual_backend = "cuda (via llama-cpp-python)"
        print(f"Using backend: {actual_backend}")
    
    # Print available models
    models = multi_model_manager.list_models()
    print(f"Available models: {[m.id for m in models]}")
    
    # Run server with or without HTTPS
    if args.https:
        import ssl
        
        # Determine SSL context
        ssl_keyfile = None
        ssl_certfile = None
        
        if args.privkey and args.pubkey:
            # Use provided certificates
            ssl_keyfile = args.privkey
            ssl_certfile = args.pubkey
            print(f"Using HTTPS with custom certificates: {args.pubkey}")
        else:
            # Auto-generate self-signed certificate
            print("Generating self-signed HTTPS certificate...")
            import subprocess
            try:
                # Generate self-signed cert
                cert_path = "./cert.pem"
                key_path = "./key.pem"
                subprocess.run([
                    "openssl", "req", "-x509", "-newkey", "rsa:4096",
                    "-keyout", key_path, "-out", cert_path,
                    "-days", "365", "-nodes",
                    "-subj", "/CN=localhost"
                ], check=True, capture_output=True)
                ssl_keyfile = key_path
                ssl_certfile = cert_path
                print(f"Generated self-signed certificate: {cert_path}")
            except Exception as e:
                print(f"Warning: Could not generate certificate: {e}")
                print("Falling back to HTTP...")
                uvicorn.run(app, host=args.host, port=args.port)
                return
        
        # Run with HTTPS
        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
        ssl_context.load_cert_chain(ssl_certfile, ssl_keyfile)
        uvicorn.run(app, host=args.host, port=args.port, ssl=ssl_context)
    else:
        uvicorn.run(app, host=args.host, port=args.port)
# Script entry point: call main() only when this file is executed directly
# (e.g. `python <this file>`), not when it is imported as a module.
# main() finishes by handing control to uvicorn.run(), so this call blocks
# for as long as the server is running.
if __name__ == "__main__":
    main()
