Centralize model resolution and VRAM management in MultiModelManager.request_model()

- Added request_model() method to MultiModelManager that handles:
  1. Alias resolution (image, audio, tts, vision, default, custom aliases)
  2. VRAM management (unloading previous models in ondemand mode)
  3. Checking if model is already loaded
- Simplified codai/api/images.py:
  - Uses request_model() for model resolution and VRAM management
  - Extracted helper functions: _is_gguf_model(), _load_diffusers_pipeline(), _generate_with_diffusers(), _generate_with_sdcpp(), _load_sdcpp_model()
  - Removed duplicated sd.cpp generation code
  - Fixed semaphore scope (all generation now inside semaphore block)
- Simplified codai/api/tts.py:
  - Uses request_model() instead of duplicated VRAM management code
  - Removed duplicate get_cached_model_path() and get_model_cache_dir() wrappers
- Simplified codai/api/transcriptions.py:
  - Uses request_model() instead of duplicated VRAM management code
- Simplified codai/api/text.py:
  - Both /v1/chat/completions and /v1/completions use request_model()
  - Removed duplicated VRAM management blocks

Centralize model resolution and VRAM management in MultiModelManager.request_model()
- Added request_model() method to MultiModelManager that handles:
  1. Alias resolution (image, audio, tts, vision, default, custom aliases)
  2. VRAM management (unloading previous models in ondemand mode)
  3. Checking if model is already loaded
- Simplified codai/api/images.py:
  - Uses request_model() for model resolution and VRAM management
  - Extracted helper functions: _is_gguf_model(), _load_diffusers_pipeline(), _generate_with_diffusers(), _generate_with_sdcpp(), _load_sdcpp_model()
  - Removed duplicated sd.cpp generation code
  - Fixed semaphore scope (all generation now inside semaphore block)
- Simplified codai/api/tts.py:
  - Uses request_model() instead of duplicated VRAM management code
  - Removed duplicate get_cached_model_path() and get_model_cache_dir() wrappers
- Simplified codai/api/transcriptions.py:
  - Uses request_model() instead of duplicated VRAM management code
- Simplified codai/api/text.py:
  - Both /v1/chat/completions and /v1/completions use request_model()
  - Removed duplicated VRAM management blocks
e004541a · Your Name · a5b64c4c · e004541a · e004541a · e004541a
Commit e004541a authored Mar 19, 2026 by Your Name
Showing with 558 additions and 696 deletions

images.py codai/api/images.py +379 -502

text.py codai/api/text.py +12 -72

transcriptions.py codai/api/transcriptions.py +21 -57

tts.py codai/api/tts.py +19 -65

manager.py codai/models/manager.py +127 -0

No files found.
--- a/codai/api/images.py
+++ b/codai/api/images.py
@@ -178,6 +178,265 @@ def set_queue_flags(flags):
    queue_flags = flags


+def _is_gguf_model(model_name: str) -> bool:
+    """Check if a model name/path indicates a GGUF model."""
+    if not model_name:
+        return False
+    return (model_name.endswith('.gguf') or 
+            'gguf' in model_name.lower() or
+            (model_name.startswith('http') and '.gguf' in model_name))
+
+
+def _load_diffusers_pipeline(model_name: str, global_args):
+    """
+    Try to load a model using the diffusers library.
+    
+    Returns the loaded pipeline or None if diffusers can't handle this model.
+    Raises Exception if loading fails for other reasons.
+    """
+    from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
+    import torch
+    
+    # Determine precision from CLI argument (--image-precision)
+    precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
+    precision_map = {
+        'bf16': torch.bfloat16,
+        'f32': torch.float32,
+        'f16': torch.float16,
+    }
+    if hasattr(torch, 'float8_e4m3fn'):
+        precision_map['f8'] = torch.float8_e4m3fn
+    dtype = precision_map.get(precision, torch.float32)
+    print(f"Using precision: {precision} ({dtype})")
+    
+    # Check if CPU offload is requested via CLI
+    use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
+    
+    # Track loading attempts for OOM handling
+    pipeline = None
+    load_attempt = 0
+    max_attempts = 3
+    
+    while pipeline is None and load_attempt < max_attempts:
+        try:
+            load_attempt += 1
+            print(f"Loading attempt {load_attempt}/{max_attempts}...")
+            
+            # Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
+            try:
+                pipeline = StableDiffusionXLPipeline.from_pretrained(
+                    model_name,
+                    torch_dtype=dtype,
+                    use_safetensors=True,
+                )
+            except Exception:
+                # Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
+                pipeline = DiffusionPipeline.from_pretrained(
+                    model_name,
+                    torch_dtype=dtype,
+                    use_safetensors=True,
+                )
+            
+            # Apply memory optimizations based on attempt
+            if torch.cuda.is_available():
+                if load_attempt >= 2:
+                    # Second attempt: enable attention slicing
+                    print("Enabling attention slicing for lower VRAM usage...")
+                    if hasattr(pipeline, 'enable_attention_slicing'):
+                        pipeline.enable_attention_slicing()
+                
+                if load_attempt >= 3 or use_sequential_offload:
+                    # Third attempt or offload requested: enable sequential CPU offload
+                    print("Enabling sequential CPU offload for lower VRAM usage...")
+                    if hasattr(pipeline, 'enable_sequential_cpu_offload'):
+                        pipeline.enable_sequential_cpu_offload()
+                else:
+                    # First attempt: try regular GPU
+                    pipeline = pipeline.to("cuda")
+            else:
+                pipeline = pipeline.to("cpu")
+            
+        except Exception as load_error:
+            error_msg = str(load_error).lower()
+            is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
+            
+            if is_oom and load_attempt < max_attempts:
+                print(f"OOM during model loading: {load_error}")
+                print(f"Retrying with more aggressive memory optimization...")
+                pipeline = None  # Reset for retry
+            else:
+                print(f"Failed to load model (attempt {load_attempt}): {load_error}")
+                if load_attempt >= max_attempts:
+                    raise
+                pipeline = None
+    
+    return pipeline
+
+
+def _generate_with_diffusers(pipeline, request, global_args, http_request=None):
+    """Generate images using a diffusers pipeline."""
+    import torch
+    import numpy as np
+    import time as time_module
+    
+    # Determine size
+    width, height = 1024, 1024
+    if request.size:
+        parts = request.size.split("x")
+        if len(parts) == 2:
+            try:
+                width = int(parts[0])
+                height = int(parts[1])
+            except ValueError:
+                pass
+    
+    # Check for nan/inf in dimensions
+    if width != width or width == float('inf'):
+        width = 512
+    if height != height or height == float('inf'):
+        height = 512
+    
+    # Enable memory optimizations
+    try:
+        if hasattr(pipeline, 'enable_attention_slicing'):
+            pipeline.enable_attention_slicing(slice_size="auto")
+        if hasattr(pipeline, 'enable_vae_slicing'):
+            pipeline.enable_vae_slicing()
+    except Exception as e:
+        print(f"Warning: Could not enable memory optimizations: {e}")
+    
+    # Get timestamp BEFORE calling diffusers
+    timestamp = int(time_module.time())
+    
+    # Generate images
+    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
+    generator = None
+    if seed is not None:
+        generator = torch.Generator(device=pipeline.device).manual_seed(seed)
+    
+    # Quality: "standard" or "hd"
+    quality = request.quality or "standard"
+    
+    # Use request parameters if provided, otherwise fall back to quality-based defaults
+    num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
+    cfg_scale = request.guidance_scale if request.guidance_scale else (
+        getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
+    )
+    
+    # Generate
+    result = pipeline(
+        prompt=request.prompt,
+        negative_prompt=None,
+        num_images_per_prompt=request.n,
+        height=height,
+        width=width,
+        generator=generator,
+        guidance_scale=cfg_scale,
+        num_inference_steps=num_steps,
+    )
+    
+    # Extract images
+    images = []
+    try:
+        result_images = result.images
+    except Exception as img_err:
+        print(f"Warning: Could not access result.images: {img_err}")
+        result_images = getattr(result, 'image', None) or getattr(result, 'output', None)
+        if result_images is None:
+            raise Exception(f"Could not extract images from diffusers result: {img_err}")
+    
+    for img in result_images:
+        # Debug: print image type and value range
+        print(f"DEBUG: Image type: {type(img)}")
+        if isinstance(img, np.ndarray):
+            print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}")
+            img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
+            img = np.clip(img, 0.0, 1.0)
+            print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}")
+        
+        img_data = save_image_response(img, request.response_format, http_request)
+        images.append(img_data)
+    
+    return {
+        "created": timestamp,
+        "data": images
+    }
+
+
+async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None):
+    """Generate images using stable-diffusion-cpp-python."""
+    import time
+    
+    # Parse size
+    width, height = 512, 512
+    if request.size:
+        parts = request.size.split("x")
+        if len(parts) == 2:
+            try:
+                width = int(parts[0])
+                height = int(parts[1])
+            except ValueError:
+                pass
+    
+    # Use default steps for fast generation
+    steps = 4
+    
+    # Use request seed if provided, otherwise use CLI default seed
+    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
+    
+    result = await asyncio.to_thread(
+        sd_model.generate_image,
+        prompt=request.prompt,
+        negative_prompt='',
+        width=width,
+        height=height,
+        cfg_scale=get_cfg_scale(),
+        sample_steps=steps,
+        seed=seed if seed is not None else 42,
+        batch_count=request.n if request.n else 1,
+    )
+    
+    # Small delay to let Vulkan driver settle after generation
+    time.sleep(0.1)
+    
+    # Convert results to response format
+    images = []
+    for img in result:
+        img_data = save_image_response(img, http_request=http_request)
+        images.append(img_data)
+    
+    return {
+        "created": int(time.time()),
+        "data": images
+    }
+
+
+def _load_sdcpp_model(model_path: str, global_args):
+    """
+    Try to load a model using stable-diffusion-cpp-python.
+    
+    Returns the loaded StableDiffusion model or None.
+    """
+    from stable_diffusion_cpp import StableDiffusion
+    
+    print(f"Loading sd.cpp model from: {model_path}")
+    
+    # Build sd.cpp constructor args from config
+    kwargs = {
+        'model_path': model_path,
+    }
+    
+    # Add optional paths from CLI args
+    if global_args:
+        if hasattr(global_args, 'vae_path') and global_args.vae_path:
+            kwargs['vae_path'] = global_args.vae_path
+        if hasattr(global_args, 'llm_path') and global_args.llm_path:
+            kwargs['lora_model_dir'] = global_args.llm_path
+    
+    sd_model = StableDiffusion(**kwargs)
+    return sd_model
+
+
 # =============================================================================
 # Router and Endpoints
 # =============================================================================
@@ -225,518 +484,136 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
            )
    
    async with semaphore:
-        image_model = multi_model_manager.image_model
-    
-    # If no image model configured, try to use main --model as fallback
-    if not image_model:
-        # Try to get the main model from args
-        main_model = getattr(global_args, 'model', None)
-        if main_model and isinstance(main_model, list) and len(main_model) > 0:
-            image_model = main_model[0]
-        elif main_model:
-            image_model = main_model
-        
-        # Check if main model is a GGUF file - can't use for image generation
-        if image_model and ('.gguf' in image_model.lower() or 'gguf' in image_model.lower()):
-            print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
-            image_model = None  # Can't use GGUF for images
-    
-    # If still no image model configured, return an error
-    if not image_model:
-        raise HTTPException(
-            status_code=400,
-            detail="Image generation not configured. Use --image-model to specify a model."
+        # =====================================================================
+        # Step 1: Ask the manager to resolve the model and manage VRAM
+        # =====================================================================
+        model_info = multi_model_manager.request_model(
+            requested_model=request.model,
+            model_type="image"
        )
-    
-    # Determine model to use
-    # Priority: 1) model specified in request, 2) default image model from --image-model
-    model_to_use = request.model
-    if not model_to_use or model_to_use == "image":
-        # No model specified in request, use default
-        model_to_use = image_model
-    elif model_to_use.startswith("image:"):
-        # Legacy format - strip prefix and use default
-        model_to_use = image_model
-    else:
-        # Check if model_to_use is a valid model (URL, file, or known model)
-        # If not, fallback to the configured image model to avoid HF resolution errors
-        if image_model:
-            is_url = model_to_use.startswith('http://') or model_to_use.startswith('https://')
-            is_file = os.path.isfile(model_to_use) if model_to_use else False
-            if not is_url and not is_file:
-                # Unknown model name - use default instead of trying to resolve as HF
-                print(f"Warning: Unknown model '{model_to_use}' in image generation request, using configured --image-model")
-                model_to_use = image_model
-    
-    # Check if model is loaded
-    model_key = f"image:{model_to_use}"
-    pipeline = multi_model_manager.get_model(model_key)
-    
-    # In ondemand mode, if ANY model is loaded in VRAM and it's different from what we need,
-    # fully unload it first to free VRAM
-    if mode == "ondemand":
-        from codai.models.manager import model_manager
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
        
-        if has_any_model:
-            # Resolve both the requested image model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-            
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-            
-            # Compare: if they're different models (even if both are image models), unload first
-            already_loaded = (requested_canonical and loaded_canonical and 
-                            requested_canonical == loaded_canonical)
-            
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading new model...")
-                multi_model_manager.unload_all_models()
-                if model_manager.backend is not None:
-                    try:
-                        model_manager.cleanup()
-                    except:
-                        pass
+        model_name = model_info['model_name']
+        model_key = model_info['model_key']
+        pipeline = model_info['model_object']
        
-        # Try diffusers first
-        try:
-            from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
-            import torch
-            
-            # Check if model is XL
-            is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower()
-            
-            # Check if it's a GGUF model - skip diffusers for those
-            is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
-                            (model_to_use.startswith('http') and '.gguf' in model_to_use))
-            
-            if is_gguf_model:
-                print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
-                raise Exception("GGUF model - use stable-diffusion-cpp instead")
-            
-            print(f"Loading diffusers model: {model_to_use}")
-            
-            # Determine precision from CLI argument (--image-precision)
-            precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
-            precision_map = {
-                'bf16': torch.bfloat16,
-                'f32': torch.float32,
-                'f16': torch.float16,
-            }
-            if hasattr(torch, 'float8_e4m3fn'):
-                precision_map['f8'] = torch.float8_e4m3fn
-            dtype = precision_map.get(precision, torch.float32)
-            print(f"Using precision: {precision} ({dtype})")
-            
-            # Check if CPU offload is requested via CLI
-            use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
-            
-            # Track loading attempts for OOM handling
-            load_attempt = 0
-            max_attempts = 3
-            
-            while pipeline is None and load_attempt < max_attempts:
-                try:
-                    load_attempt += 1
-                    print(f"Loading attempt {load_attempt}/{max_attempts}...")
-                    
-                    # Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
-                    try:
-                        pipeline = StableDiffusionXLPipeline.from_pretrained(
-                            model_to_use,
-                            torch_dtype=dtype,
-                            use_safetensors=True,
-                        )
-                    except Exception:
-                        # Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
-                        pipeline = DiffusionPipeline.from_pretrained(
-                            model_to_use,
-                            torch_dtype=dtype,
-                            use_safetensors=True,
-                        )
-                    
-                    # Apply memory optimizations based on attempt
-                    if torch.cuda.is_available():
-                        if load_attempt >= 2:
-                            # Second attempt: enable attention slicing
-                            print("Enabling attention slicing for lower VRAM usage...")
-                            if hasattr(pipeline, 'enable_attention_slicing'):
-                                pipeline.enable_attention_slicing()
-                        
-                        if load_attempt >= 3 or use_sequential_offload:
-                            # Third attempt or offload requested: enable sequential CPU offload
-                            print("Enabling sequential CPU offload for lower VRAM usage...")
-                            if hasattr(pipeline, 'enable_sequential_cpu_offload'):
-                                pipeline.enable_sequential_cpu_offload()
-                        else:
-                            # First attempt: try regular GPU
-                            pipeline = pipeline.to("cuda")
-                    else:
-                        pipeline = pipeline.to("cpu")
-                    
-                except Exception as load_error:
-                    error_msg = str(load_error).lower()
-                    is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
-                    
-                    if is_oom and load_attempt < max_attempts:
-                        print(f"OOM during model loading: {load_error}")
-                        print(f"Retrying with more aggressive memory optimization...")
-                        pipeline = None  # Reset for retry
-                    else:
-                        print(f"Failed to load model (attempt {load_attempt}): {load_error}")
-                        if load_attempt >= max_attempts:
-                            raise
-                        pipeline = None
-            
-            # Cache the model
-            if pipeline is not None:
-                multi_model_manager.add_model(model_key, pipeline)
-                print(f"Loaded diffusers model: {model_to_use}")
+        # If no image model configured, try to use main --model as fallback
+        if not model_name:
+            main_model = getattr(global_args, 'model', None)
+            if main_model and isinstance(main_model, list) and len(main_model) > 0:
+                model_name = main_model[0]
+            elif main_model:
+                model_name = main_model
            
-        except ImportError as e:
-            # diffusers not installed
-            diffusers_error = str(e)
-            print(f"diffusers not available: {diffusers_error}")
-        except Exception as e:
-            import traceback
-            diffusers_error = str(e)
-            print(f"diffusers error: {diffusers_error}")
-            print(f"Traceback: {traceback.format_exc()}")
-    
-    # Try diffusers if available
-    if pipeline is not None:
-        try:
-            # Determine size
-            width, height = 1024, 1024
-            if request.size:
-                parts = request.size.split("x")
-                if len(parts) == 2:
-                    try:
-                        width = int(parts[0])
-                        height = int(parts[1])
-                    except ValueError:
-                        pass
+            # Check if main model is a GGUF file - can't use for image generation
+            if model_name and _is_gguf_model(model_name):
+                print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
+                model_name = None
            
-            # Check for nan/inf in dimensions
-            if width != width or width == float('inf'):  # NaN or inf check
-                width = 512
-            if height != height or height == float('inf'):  # NaN or inf check
-                height = 512
-            
-            # Import torch for generation
-            import torch
-            
-            # Ensure model is on correct device
-            backend = getattr(global_args, 'backend', 'auto')
-            image_backend = getattr(global_args, 'image_backend', 'auto')
-            use_vulkan = (backend == 'vulkan') or (image_backend == 'vulkan') or (image_backend == 'auto' and backend == 'auto')
-            
-            if use_vulkan and not torch.cuda.is_available():
-                # CPU mode - try to reduce memory usage
-                try:
-                    if hasattr(pipeline, 'enable_attention_slicing'):
-                        pipeline.enable_attention_slicing(slice_size="auto")
-                    if hasattr(pipeline, 'enable_vae_slicing'):
-                        pipeline.enable_vae_slicing()
-                except Exception as e:
-                    print(f"Warning: Could not enable memory optimizations: {e}")
-            elif torch.cuda.is_available():
-                # Try to enable memory optimizations for CUDA
-                try:
-                    if hasattr(pipeline, 'enable_attention_slicing'):
-                        pipeline.enable_attention_slicing(slice_size="auto")
-                    if hasattr(pipeline, 'enable_vae_slicing'):
-                        pipeline.enable_vae_slicing()
-                except Exception as e:
-                    print(f"Warning: Could not enable CUDA memory optimizations: {e}")
-            
-            # Get timestamp BEFORE calling diffusers (to avoid scope issues)
-            import time as time_module
-            timestamp = int(time_module.time())
-            
-            # Generate images
-            # Use request seed if provided, otherwise use CLI default seed
-            seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
-            generator = None
-            if seed is not None:
-                generator = torch.Generator(device=pipeline.device).manual_seed(seed)
-            
-            # Quality: "standard" or "hd"
-            quality = request.quality or "standard"
-            
-            # Use request parameters if provided, otherwise fall back to quality-based defaults
-            num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
-            cfg_scale = request.guidance_scale if request.guidance_scale else (
-                getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
-            )
-            
-            # Generate
-            result = pipeline(
-                prompt=request.prompt,
-                negative_prompt=None,
-                num_images_per_prompt=request.n,
-                height=height,
-                width=width,
-                generator=generator,
-                guidance_scale=cfg_scale,
-                num_inference_steps=num_steps,
+            if model_name:
+                model_key = f"image:{model_name}"
+        
+        # If still no image model configured, return an error
+        if not model_name:
+            raise HTTPException(
+                status_code=400,
+                detail="Image generation not configured. Use --image-model to specify a model."
            )
-            
-            # Extract images
-            images = []
+        
+        # =====================================================================
+        # Step 2: Check if model is a sd.cpp StableDiffusion instance
+        # =====================================================================
+        is_sdcpp = False
+        if pipeline is not None:
            try:
-                result_images = result.images
-            except Exception as img_err:
-                print(f"Warning: Could not access result.images: {img_err}")
-                # Try alternative: result might have 'image' or 'output'
-                result_images = getattr(result, 'image', None) or getattr(result, 'output', None)
-                if result_images is None:
-                    raise Exception(f"Could not extract images from diffusers result: {img_err}")
-            
-            for img in result_images:
-                # Convert to base64
-                import numpy as np
-                
-                # Debug: print image type and value range
-                print(f"DEBUG: Image type: {type(img)}")
-                if isinstance(img, np.ndarray):
-                    print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}")
-                    # Handle NaN/Inf values in image data - convert to valid values
-                    # Replace NaN and Inf with valid values
-                    img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
-                    # Clip to valid range [0, 1]
-                    img = np.clip(img, 0.0, 1.0)
-                    print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}")
+                from stable_diffusion_cpp import StableDiffusion
+                if isinstance(pipeline, StableDiffusion):
+                    is_sdcpp = True
+            except ImportError:
+                pass
+        
+        # =====================================================================
+        # Step 3: If already loaded, generate with appropriate backend
+        # =====================================================================
+        if pipeline is not None:
+            if is_sdcpp:
+                print(f"Using cached sd.cpp model for generation")
+                return await _generate_with_sdcpp(pipeline, request, global_args, http_request)
+            else:
+                # Assume it's a diffusers pipeline
+                print(f"Using cached diffusers pipeline for generation")
+                return _generate_with_diffusers(pipeline, request, global_args, http_request)
+        
+        # =====================================================================
+        # Step 4: Model not loaded - try to load it
+        # =====================================================================
+        is_gguf = _is_gguf_model(model_name)
+        diffusers_error = None
+        sdcpp_error = None
+        
+        # Try diffusers first (for non-GGUF models)
+        if not is_gguf:
+            try:
+                print(f"Loading diffusers model: {model_name}")
+                pipeline = _load_diffusers_pipeline(model_name, global_args)
                
-                # Use helper function to save and get response
-                img_data = save_image_response(img, request.response_format, http_request)
-                images.append(img_data)
-            
-            return {
-                "created": timestamp,
-                "data": images
-            }
-            
-        except ImportError as e:
-            # diffusers/torch not installed - record error and try sd.cpp
-            diffusers_error = str(e)
-            print(f"diffusers not available: {diffusers_error}, trying stable-diffusion-cpp-python...")
-        except Exception as e:
-            # Other error with diffusers - record and try sd.cpp
-            import traceback
-            diffusers_error = str(e)
-            print(f"diffusers error: {diffusers_error}")
-            print(f"Traceback: {traceback.format_exc()}")
-            print(f"Trying stable-diffusion-cpp-python...")
-    
-    # Try stable-diffusion-cpp-python (sd.cpp) as fallback when diffusers fails
-    # sd.cpp works with GGUF models, but some HF models may be GGUF even without "gguf" in name
-    # Let sd.cpp attempt loading and fail gracefully if it's not compatible
-
-    # Try stable-diffusion-cpp-python (sd.cpp) as fallback
-    # First, check all available image models to find one loaded via sd.cpp
-    # Always check for cached models - allows dynamically loaded models to be reused across requests
-    sd_model = None
-    for key in multi_model_manager.models:
-        if key.startswith("image:"):
-            potential_model = multi_model_manager.get_model(key)
-            if potential_model is not None:
-                # Check if it's a stable-diffusion-cpp model
-                try:
-                    from stable_diffusion_cpp import StableDiffusion
-                    if isinstance(potential_model, StableDiffusion):
-                        sd_model = potential_model
-                        print(f"Found cached stable-diffusion-cpp model with key: {key}")
-                        break
-                except ImportError:
-                    pass
-
-    # If no cached image model found, need to load one - first cleanup any existing models
-    if sd_model is None:
-        # In ondemand mode, check if we need to unload before loading sd.cpp model
-        from codai.models.manager import model_manager
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
-
-        if mode == "ondemand" and has_any_model:
-            # Resolve both the requested image model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-
-            # Compare: if they're different models, unload first
-            already_loaded = (requested_canonical and loaded_canonical and
-                            requested_canonical == loaded_canonical)
-
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading sd.cpp model...")
-                multi_model_manager.unload_all_models()
-                if model_manager.backend is not None:
-                    try:
-                        model_manager.cleanup()
-                    except:
-                        pass
-    
-    if sd_model is not None:
-        # Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
+                if pipeline is not None:
+                    # Cache the loaded pipeline in the manager
+                    multi_model_manager.add_model(model_key, pipeline)
+                    multi_model_manager.current_model_key = model_key
+                    print(f"Loaded diffusers model: {model_name}")
+                    
+                    return _generate_with_diffusers(pipeline, request, global_args, http_request)
+                    
+            except ImportError as e:
+                diffusers_error = str(e)
+                print(f"diffusers not available: {diffusers_error}")
+            except Exception as e:
+                import traceback
+                diffusers_error = str(e)
+                print(f"diffusers error: {diffusers_error}")
+                print(f"Traceback: {traceback.format_exc()}")
+        
+        # Try stable-diffusion-cpp-python (for GGUF models or as fallback)
        try:
-            from stable_diffusion_cpp import StableDiffusion
-            if isinstance(sd_model, StableDiffusion):
-                print(f"Using stable-diffusion-cpp-python for image generation")
-                # Use sd.cpp for generation
-                # Parse size
-                width, height = 512, 512
-                if request.size:
-                    parts = request.size.split("x")
-                    if len(parts) == 2:
-                        try:
-                            width = int(parts[0])
-                            height = int(parts[1])
-                        except ValueError:
-                            pass
-                
-                # Use default steps for Z-Image Turbo (very fast)
-                steps = 4  # Default for fast generation
-                
-                # Generate images using sd.cpp (run in thread to not block event loop)
-                # Use request seed if provided, otherwise use CLI default seed
-                seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
-                
-                result = await asyncio.to_thread(
-                    sd_model.generate_image,
-                    prompt=request.prompt,
-                    negative_prompt='',
-                    width=width,
-                    height=height,
-                    cfg_scale=get_cfg_scale(),
-                    sample_steps=steps,
-                    seed=seed if seed is not None else 42,
-                    batch_count=request.n if request.n else 1,
-                )
-                
-                # Small delay to let Vulkan driver settle after generation
-                import time
-                time.sleep(0.1)
-                
-                # Convert results to response format
-                images = []
-                
-                for img in result:
-                    # Use helper function to save and get response
-                    img_data = save_image_response(img, http_request=http_request)
-                    images.append(img_data)
+            # For GGUF models or URLs, resolve the model path through the cache
+            resolved_path = model_name
+            if is_gguf or model_name.startswith('http://') or model_name.startswith('https://'):
+                resolved_path = multi_model_manager.load_model(model_name)
+                if not resolved_path:
+                    raise Exception(f"Failed to resolve model path: {model_name}")
+            
+            # Only use sd.cpp if we have a local file path
+            if resolved_path and os.path.isfile(resolved_path):
+                sd_model = _load_sdcpp_model(resolved_path, global_args)
                
-                return {
-                    "created": int(time.time()),
-                    "data": images
-                }
-        except ImportError as e:
-            # stable-diffusion-cpp not available
-            sd_cpp_error = str(e)
-            print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
-        except Exception as e:
-            print(f"sd.cpp generation error: {e}")
-            sd_cpp_error = str(e)
-    else:
-        # No sd.cpp model pre-loaded, try to load dynamically
-        print("No pre-loaded sd.cpp model found, trying to load...")
-        try:
-            from stable_diffusion_cpp import StableDiffusion
-
-            # Use model manager to resolve and load the model
-            model_path = multi_model_manager.load_model(model_to_use)
-
-            # For diffusers models, model_path will be the identifier string
-            # For GGUF models, it will be the file path
-            if model_path is not None and not os.path.isfile(model_path):
-                # This is a diffusers model identifier (not a file path)
-                # Skip sd.cpp and let diffusers handle it
-                print(f"Model '{model_path}' is handled by diffusers library, skipping sd.cpp")
-                model_path = None
-
-            if model_path is not None:
-                # Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
-                try:
-                    from stable_diffusion_cpp import StableDiffusion
-                    if isinstance(sd_model, StableDiffusion):
-                        print(f"Using stable-diffusion-cpp-python for image generation")
-                        # Use sd.cpp for generation
-                        # Parse size
-                        width, height = 512, 512
-                        if request.size:
-                            parts = request.size.split("x")
-                            if len(parts) == 2:
-                                try:
-                                    width = int(parts[0])
-                                    height = int(parts[1])
-                                except ValueError:
-                                    pass
-
-                        # Use default steps for Z-Image Turbo (very fast)
-                        steps = 4  # Default for fast generation
-
-                        # Generate images using sd.cpp (run in thread to not block event loop)
-                        # Use request seed if provided, otherwise use CLI default seed
-                        seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
-
-                        result = await asyncio.to_thread(
-                            sd_model.generate_image,
-                            prompt=request.prompt,
-                            negative_prompt='',
-                            width=width,
-                            height=height,
-                            cfg_scale=get_cfg_scale(),
-                            sample_steps=steps,
-                            seed=seed if seed is not None else 42,
-                            batch_count=request.n if request.n else 1,
-                        )
-
-                        # Small delay to let Vulkan driver settle after generation
-                        import time
-                        time.sleep(0.1)
-
-                        # Convert results to response format
-                        images = []
-
-                        for img in result:
-                            # Use helper function to save and get response
-                            img_data = save_image_response(img, http_request=http_request)
-                            images.append(img_data)
-
-                        return {
-                            "created": int(time.time()),
-                            "data": images
-                        }
-                except ImportError as e:
-                    # stable-diffusion-cpp not available
-                    sd_cpp_error = str(e)
-                    print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
-                except Exception as e:
-                    print(f"sd.cpp generation error: {e}")
-                    sd_cpp_error = str(e)
+                if sd_model is not None:
+                    # Cache the loaded model in the manager
+                    multi_model_manager.add_model(model_key, sd_model)
+                    multi_model_manager.current_model_key = model_key
+                    print(f"Loaded sd.cpp model: {model_name}")
+                    
+                    return await _generate_with_sdcpp(sd_model, request, global_args, http_request)
            else:
-                # model_path is None - likely a diffusers model handled above
-                print("Model handled by diffusers library")
-                sd_cpp_error = "Model handled by diffusers"
+                sdcpp_error = f"Model '{model_name}' is not a local file, cannot use sd.cpp"
+                print(sdcpp_error)
+                
        except ImportError as e:
-            sd_cpp_error = str(e)
-            print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
+            sdcpp_error = str(e)
+            print(f"stable-diffusion-cpp-python not available: {sdcpp_error}")
        except Exception as e:
-            sd_cpp_error = str(e)
-            print(f"sd.cpp error: {sd_cpp_error}")
-    
-    # Both backends failed - return error with installation instructions
-    raise HTTPException(
-        status_code=400,
-        detail=f"Model '{model_to_use}' does not support image generation"
-    )
+            sdcpp_error = str(e)
+            print(f"sd.cpp error: {sdcpp_error}")
+        
+        # =====================================================================
+        # Step 5: Both backends failed - raise HTTP 400 with the collected errors
+        # =====================================================================
+        error_details = []
+        if diffusers_error:
+            error_details.append(f"diffusers: {diffusers_error}")
+        if sdcpp_error:
+            error_details.append(f"sd.cpp: {sdcpp_error}")
+        
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to load image model '{model_name}'. Errors: {'; '.join(error_details) if error_details else 'No compatible backend found'}"
+        )
--- a/codai/api/text.py
+++ b/codai/api/text.py
@@ -295,46 +295,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
    # Get the model for this request
    requested_model = request.model
    
-    # Get load mode to determine if we need to unload other models first
-    from codai.api.state import get_load_mode
-    load_mode = get_load_mode()
+    # Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
+    model_info = multi_model_manager.request_model(
+        requested_model=requested_model,
+        model_type="text"
+    )
    
-    # In ondemand mode (no --load-all or --loadswap), if ANY model is loaded in VRAM
-    # and it's different from what we need, fully unload it first to free VRAM
-    if load_mode == "ondemand":
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
-        
-        if has_any_model:
-            # Resolve both the requested model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(requested_model)
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-            
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-            
-            # Compare: if they're different models (even if same type), unload first
-            already_loaded = (requested_canonical and loaded_canonical and 
-                            requested_canonical == loaded_canonical)
-            
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading new model...")
-                
-                # Use centralized unload method
-                multi_model_manager.unload_all_models()
-                
-                # Also cleanup legacy model_manager
-                if model_manager.backend is not None:
-                    print("Unloading legacy model_manager from VRAM...")
-                    try:
-                        model_manager.cleanup()
-                    except Exception as e:
-                        print(f"Warning during legacy model cleanup: {e}")
-    
-    # Try to get the appropriate model
+    # Try to get the appropriate model (request_model handles VRAM cleanup)
    mm = multi_model_manager.get_model_for_request(requested_model)
    
    if mm is None:
@@ -1727,40 +1694,13 @@ async def completions(request: CompletionRequest):
    # Get the model for this request
    requested_model = request.model
    
-    # Get load mode to determine if we need to unload other models first
-    from codai.api.state import get_load_mode
-    load_mode = get_load_mode()
-    
-    # In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
-    if load_mode == "ondemand":
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
-        
-        if has_any_model:
-            # Resolve both the requested model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(requested_model)
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-            
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-            
-            # Compare: if they're different models (even if same type), unload first
-            already_loaded = (requested_canonical and loaded_canonical and 
-                            requested_canonical == loaded_canonical)
-            
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading new model...")
-                multi_model_manager.unload_all_models()
-                if model_manager.backend is not None:
-                    try:
-                        model_manager.cleanup()
-                    except:
-                        pass
+    # Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
+    model_info = multi_model_manager.request_model(
+        requested_model=requested_model,
+        model_type="text"
+    )
    
-    # Try to get the appropriate model
+    # Try to get the appropriate model (request_model handles VRAM cleanup)
    mm = multi_model_manager.get_model_for_request(requested_model)
    
    if mm is None:

--- a/codai/api/transcriptions.py
+++ b/codai/api/transcriptions.py
@@ -54,52 +54,22 @@ async def create_transcription(
            raise HTTPException(status_code=500, detail=result["error"])
        return {"text": result.get("text", "")}
    
-    audio_model = multi_model_manager.audio_models[0] if multi_model_manager.audio_models else None
-    if not audio_model:
+    # Use the manager to resolve the model and manage VRAM
+    model_info = multi_model_manager.request_model(
+        requested_model=model,
+        model_type="audio"
+    )
+    
+    model_name = model_info['model_name']
+    model_key = model_info['model_key']
+    whisper_model = model_info['model_object']
+    
+    if not model_name:
        raise HTTPException(
            status_code=400,
            detail="Audio transcription not configured. Use --audio-model or --whisper-server."
        )
    
-    # Get load mode to determine if we need to unload other models first
-    from codai.api.state import get_load_mode
-    from codai.models.manager import model_manager
-    load_mode = get_load_mode()
-    
-    # In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
-    if load_mode == "ondemand":
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
-        
-        if has_any_model:
-            # Resolve both the requested audio model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(f"audio:{audio_model}")
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-            
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-            
-            # Compare: if they're different models, unload first
-            already_loaded = (requested_canonical and loaded_canonical and 
-                            requested_canonical == loaded_canonical)
-            
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: 'audio:{audio_model}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading audio model...")
-                multi_model_manager.unload_all_models()
-                if model_manager.backend is not None:
-                    try:
-                        model_manager.cleanup()
-                    except:
-                        pass
-    
-    # Determine model to use
-    model_to_use = model
-    if model_to_use.startswith("whisper:") or model_to_use.startswith("audio:"):
-        model_to_use = audio_model
-    
    # Read the uploaded file
    file_content = await file.read()
    
@@ -113,26 +83,23 @@ async def create_transcription(
        try:
            from faster_whisper import WhisperModel
            
-            # Determine model key
-            model_key = f"audio:{model_to_use}"
-            whisper_model = multi_model_manager.get_model(model_key)
-            
            if whisper_model is None:
-                print(f"Loading faster-whisper model: {model_to_use}")
+                print(f"Loading faster-whisper model: {model_name}")
                
                # Determine compute type - always use int8 for CPU
                compute_type = "int8"
                
                # Load the model
                whisper_model = WhisperModel(
-                    model_to_use,
+                    model_name,
                    device="cpu",  # Always use CPU - faster-whisper CUDA doesn't work with AMD
                    compute_type=compute_type,
                )
                
                # Cache the model
                multi_model_manager.add_model(model_key, whisper_model)
-                print(f"Loaded faster-whisper model: {model_to_use}")
+                multi_model_manager.current_model_key = model_key
+                print(f"Loaded faster-whisper model: {model_name}")
            
            # Run transcription
            segments, info = whisper_model.transcribe(
@@ -160,24 +127,21 @@ async def create_transcription(
        try:
            import whispercpp
            
-            # Determine model key
-            model_key = f"audio:{model_to_use}"
-            whisper_model = multi_model_manager.get_model(model_key)
-            
            if whisper_model is None:
-                print(f"Loading whispercpp model: {model_to_use}")
+                print(f"Loading whispercpp model: {model_name}")
                
                # Check if it's a built-in model name
-                if model_to_use in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']:
+                if model_name in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']:
                    # It's a built-in model name
-                    whisper_model = whispercpp.Whisper.from_pretrained(model_to_use)
+                    whisper_model = whispercpp.Whisper.from_pretrained(model_name)
                else:
                    # It's a path to a GGUF file
-                    whisper_model = whispercpp.Whisper.from_pretrained(model_to_use)
+                    whisper_model = whispercpp.Whisper.from_pretrained(model_name)
                
                # Cache the model
                multi_model_manager.add_model(model_key, whisper_model)
-                print(f"Loaded whispercpp model: {model_to_use}")
+                multi_model_manager.current_model_key = model_key
+                print(f"Loaded whispercpp model: {model_name}")
            
            # Run transcription
            result = whisper_model.transcribe(tmp_path)

--- a/codai/api/tts.py
+++ b/codai/api/tts.py
@@ -16,18 +16,6 @@ from codai.models.manager import multi_model_manager
 global_args = None


-def get_cached_model_path(url: str) -> str:
-    """Get cached model path if available."""
-    from codai.models.cache import get_cached_model_path as cache_get_cached_model_path
-    return cache_get_cached_model_path(url)
-
-
-def get_model_cache_dir() -> str:
-    """Get model cache directory."""
-    from codai.models.cache import get_model_cache_dir
-    return get_model_cache_dir()
-
-
 def set_global_args(args):
    """Set global args from coderai."""
    global global_args
@@ -65,80 +53,46 @@ async def create_speech(request: TTSRequest):
    Supports:
    - Kokoro TTS models (when --tts-model is specified)
    """
-    tts_model = multi_model_manager.tts_model
+    # Use the manager to resolve the model and manage VRAM
+    model_info = multi_model_manager.request_model(
+        requested_model=request.model,
+        model_type="tts"
+    )
+    
+    model_name = model_info['model_name']
+    model_key = model_info['model_key']
+    kokoro_model = model_info['model_object']
    
    # If no TTS model configured, return an error
-    if not tts_model:
+    if not model_name:
        raise HTTPException(
            status_code=400,
            detail="TTS not configured. Use --tts-model to specify a model."
        )
    
-    # Get load mode to determine if we need to unload other models first
-    from codai.api.state import get_load_mode
-    from codai.models.manager import model_manager
-    load_mode = get_load_mode()
-    
-    # In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
-    if load_mode == "ondemand":
-        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
-        
-        if has_any_model:
-            # Resolve both the requested TTS model and currently loaded model to their canonical names
-            requested_canonical = multi_model_manager.resolve_model_name(f"tts:{tts_model}")
-            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
-            
-            # Also check legacy model_manager
-            if not loaded_canonical and model_manager.backend is not None:
-                loaded_canonical = "legacy_model_manager"
-            
-            # Compare: if they're different models, unload first
-            already_loaded = (requested_canonical and loaded_canonical and 
-                            requested_canonical == loaded_canonical)
-            
-            if not already_loaded:
-                print(f"In ondemand mode - model switch detected:")
-                print(f"  Requested: 'tts:{tts_model}' (resolved to: '{requested_canonical}')")
-                print(f"  Loaded: '{loaded_canonical}'")
-                print(f"  -> Fully unloading current model(s) before loading TTS model...")
-                multi_model_manager.unload_all_models()
-                if model_manager.backend is not None:
-                    try:
-                        model_manager.cleanup()
-                    except:
-                        pass
-    
-    # Determine model to use
-    model_to_use = request.model
-    if model_to_use.startswith("tts:"):
-        model_to_use = tts_model
-    
    # Try to use kokoro if available
    try:
        from kokoro import Kokoro
        
-        # Determine model key
-        model_key = f"tts:{model_to_use}"
-        kokoro_model = multi_model_manager.get_model(model_key)
-        
        if kokoro_model is None:
-            print(f"Loading Kokoro TTS model: {model_to_use}")
+            print(f"Loading Kokoro TTS model: {model_name}")
            
-            # Check if model_to_use is a URL - download it (with caching)
+            # Check if model_name is a URL - download it (with caching)
            model_path = None
-            if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
-                print(f"Loading model from URL: {model_to_use}")
+            if model_name.startswith('http://') or model_name.startswith('https://'):
+                print(f"Loading model from URL: {model_name}")
                from codai.models.cache import load_model
-                model_path = load_model(model_to_use)
+                model_path = load_model(model_name)
                if not model_path:
-                    raise Exception(f"Failed to load model from {model_to_use}")
+                    raise Exception(f"Failed to load model from {model_name}")
            else:
                # Use local path or model name
-                model_path = model_to_use
+                model_path = model_name
            
            # Load the Kokoro model
-            kokoro_model = Kokoro(model_path if model_path else model_to_use)
+            kokoro_model = Kokoro(model_path if model_path else model_name)
            multi_model_manager.add_model(model_key, kokoro_model)
+            multi_model_manager.current_model_key = model_key
        
        # Generate speech
        voice = request.voice or "af_sarah"

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -883,6 +883,133 @@ class MultiModelManager:
        return load_model(model_path, cache_dir, file_pattern)

    
+    def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]:
+        """
+        Central method for API modules to request a model.
+
+        Handles:
+        1. Alias resolution (e.g., "image" -> "Tongyi-MAI/Z-Image-Turbo")
+        2. VRAM management (unloading previous models in ondemand mode)
+        3. Checking if model is already loaded (and marking it as the current model)
+
+        Args:
+            requested_model: The model name/alias from the API request
+            model_type: The type of model being requested ("image", "text", "audio", "tts", "vision")
+                       Used to resolve empty/None model names and to prefix the model key.
+
+        Returns:
+            Dict (model_key/model_name are None when no model name could be resolved) with:
+                - 'model_key': The key used to store/retrieve the model in self.models
+                - 'model_name': The resolved model name/path/HF ID
+                - 'model_object': The loaded model object if already loaded, None otherwise
+                - 'config': The stored configuration for this model
+                - 'already_loaded': True if the model is already present in self.models
+        """
+        from codai.api.state import get_load_mode
+        mode = get_load_mode()
+
+        # Step 1: Resolve the model name from aliases
+        resolved_name = None
+        model_key = None
+
+        # If no model specified (or it just names its own type), use that type's default
+        if not requested_model or requested_model == model_type:
+            if model_type == "image":
+                resolved_name = self.image_models[0] if self.image_models else None
+            elif model_type == "audio":
+                resolved_name = self.audio_models[0] if self.audio_models else None
+            elif model_type == "tts":
+                resolved_name = self.tts_model
+            elif model_type == "vision":
+                resolved_name = self.vision_models[0] if self.vision_models else None
+            else:
+                resolved_name = self.default_model
+        else:
+            # Resolve custom aliases (one level only; the result goes through the checks below)
+            if requested_model in self.model_aliases:
+                requested_model = self.model_aliases[requested_model]
+
+            # Handle "default" alias
+            if requested_model == "default":
+                resolved_name = self.default_model
+            # Handle type-specific aliases
+            elif requested_model == "image":
+                resolved_name = self.image_models[0] if self.image_models else None
+            elif requested_model == "audio":
+                resolved_name = self.audio_models[0] if self.audio_models else None
+            elif requested_model == "tts":
+                resolved_name = self.tts_model
+            elif requested_model == "vision":
+                resolved_name = self.vision_models[0] if self.vision_models else None
+            # Handle prefixed models (e.g., "image:model_name") by stripping the prefix
+            elif requested_model.startswith("image:"):
+                resolved_name = requested_model[6:]
+            elif requested_model.startswith("audio:"):
+                resolved_name = requested_model[6:]
+            elif requested_model.startswith("tts:"):
+                resolved_name = requested_model[4:]
+            elif requested_model.startswith("vision:"):
+                resolved_name = requested_model[7:]
+            else:
+                resolved_name = requested_model
+
+        if not resolved_name:
+            return {
+                'model_key': None,
+                'model_name': None,
+                'model_object': None,
+                'config': {},
+                'already_loaded': False,
+            }
+
+        # Step 2: Build the model key (prefixed with type, except for text/untyped requests)
+        if model_type and model_type != "text":
+            model_key = f"{model_type}:{resolved_name}"
+        else:
+            model_key = resolved_name
+
+        # Step 3: Check if already loaded
+        existing_model = self.models.get(model_key)
+        if existing_model is not None:
+            self.current_model_key = model_key
+            return {
+                'model_key': model_key,
+                'model_name': resolved_name,
+                'model_object': existing_model,
+                'config': self.config.get(model_key, {}),
+                'already_loaded': True,
+            }
+
+        # Step 4: In ondemand mode, unload any currently loaded model
+        if mode == "ondemand":
+            has_any_model = len(self.models) > 0 or model_manager.backend is not None
+
+            if has_any_model:
+                loaded_canonical = self.get_currently_loaded_model_name()
+                if not loaded_canonical and model_manager.backend is not None:
+                    loaded_canonical = "legacy_model_manager"
+
+                if loaded_canonical and loaded_canonical != model_key:
+                    print("Ondemand mode - model switch detected:")
+                    print(f"  Requested: '{model_key}' (resolved: '{resolved_name}')")
+                    print(f"  Currently loaded: '{loaded_canonical}'")
+                    print("  -> Unloading current model(s) before loading new model...")
+                    self.unload_all_models()
+                    if model_manager.backend is not None:
+                        try:
+                            model_manager.cleanup()
+                        except Exception as exc:
+                            print(f"  -> Warning: legacy model_manager cleanup failed (ignored): {exc}")
+
+        # Step 5: Return info for the caller to load the model
+        return {
+            'model_key': model_key,
+            'model_name': resolved_name,
+            'model_object': None,
+            'config': self.config.get(model_key, {}),
+            'already_loaded': False,
+        }
+    
    def unload_all_models(self):
        """
        Fully unload ALL models from VRAM. Used in ondemand mode when switching