Fix: Proper model resolution for ondemand mode - unload when switching between ANY different models

- Added resolve_model_name() to MultiModelManager to properly resolve model aliases
- Added get_currently_loaded_model_name() to track what's actually in VRAM
- Updated /v1/chat/completions, /v1/completions, and /v1/images/generations
- Now correctly compares resolved canonical names before deciding to unload
- Handles all aliases (default, image, audio, tts) and custom aliases
- Works across ALL model types: text->text2, image->image2, text->image, etc.

Fix: Proper model resolution for ondemand mode - unload when switching between ANY different models
- Added resolve_model_name() to MultiModelManager to properly resolve model aliases
- Added get_currently_loaded_model_name() to track what's actually in VRAM
- Updated /v1/chat/completions, /v1/completions, and /v1/images/generations
- Now correctly compares resolved canonical names before deciding to unload
- Handles all aliases (default, image, audio, tts) and custom aliases
- Works across ALL model types: text->text2, image->image2, text->image, etc.
a37085b4 · Your Name · 00775972 · a37085b4 · a37085b4 · a37085b4
Commit a37085b4 authored Mar 19, 2026 by Your Name
Show whitespace changes
Inline Side-by-side

Showing with 163 additions and 38 deletions

images.py codai/api/images.py +48 -17

text.py codai/api/text.py +30 -21

manager.py codai/models/manager.py +85 -0

No files found.
--- a/codai/api/images.py
+++ b/codai/api/images.py
@@ -278,9 +278,24 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
        from codai.models.manager import model_manager
        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
        
-        if has_any_model and pipeline is None:
-            # The image model we need is NOT loaded - unload everything
-            print(f"In ondemand mode - fully unloading current model(s) before loading image model '{model_to_use}'...")
+        if has_any_model:
+            # Resolve both the requested image model and currently loaded model to their canonical names
+            requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
+            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
+            
+            # Also check legacy model_manager
+            if not loaded_canonical and model_manager.backend is not None:
+                loaded_canonical = "legacy_model_manager"
+            
+            # Compare: if they're different models (even if both are image models), unload first
+            already_loaded = (requested_canonical and loaded_canonical and 
+                            requested_canonical == loaded_canonical)
+            
+            if not already_loaded:
+                print(f"In ondemand mode - model switch detected:")
+                print(f"  Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
+                print(f"  Loaded: '{loaded_canonical}'")
+                print(f"  -> Fully unloading current model(s) before loading new model...")
                multi_model_manager.unload_all_models()
                if model_manager.backend is not None:
                    try:
@@ -539,12 +554,28 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
    
    # If no cached image model found, need to load one - first cleanup any existing models
    if sd_model is None:
-        # In ondemand mode, fully unload everything before loading sd.cpp model
+        # In ondemand mode, check if we need to unload before loading sd.cpp model
        from codai.models.manager import model_manager
        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
        
        if mode == "ondemand" and has_any_model:
-            print(f"In ondemand mode - fully unloading current model(s) before loading sd.cpp image model...")
+            # Resolve both the requested image model and currently loaded model to their canonical names
+            requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
+            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
+            
+            # Also check legacy model_manager
+            if not loaded_canonical and model_manager.backend is not None:
+                loaded_canonical = "legacy_model_manager"
+            
+            # Compare: if they're different models, unload first
+            already_loaded = (requested_canonical and loaded_canonical and 
+                            requested_canonical == loaded_canonical)
+            
+            if not already_loaded:
+                print(f"In ondemand mode - model switch detected:")
+                print(f"  Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
+                print(f"  Loaded: '{loaded_canonical}'")
+                print(f"  -> Fully unloading current model(s) before loading sd.cpp model...")
                multi_model_manager.unload_all_models()
                if model_manager.backend is not None:
                    try:

--- a/codai/api/text.py
+++ b/codai/api/text.py
@@ -305,19 +305,23 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
        
        if has_any_model:
-            # Check if the requested model is already loaded (no need to unload)
-            already_loaded = False
-            if requested_model and requested_model in multi_model_manager.models:
-                already_loaded = True
-            elif multi_model_manager.default_model and (
-                not requested_model or requested_model == "default" or 
-                requested_model == multi_model_manager.default_model
-            ):
-                if multi_model_manager.default_model in multi_model_manager.models:
-                    already_loaded = True
+            # Resolve both the requested model and currently loaded model to their canonical names
+            requested_canonical = multi_model_manager.resolve_model_name(requested_model)
+            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
+            
+            # Also check legacy model_manager
+            if not loaded_canonical and model_manager.backend is not None:
+                loaded_canonical = "legacy_model_manager"
+            
+            # Compare: if they're different models (even if same type), unload first
+            already_loaded = (requested_canonical and loaded_canonical and 
+                            requested_canonical == loaded_canonical)
            
            if not already_loaded:
-                print(f"In ondemand mode - fully unloading current model(s) before loading text model '{requested_model}'...")
+                print(f"In ondemand mode - model switch detected:")
+                print(f"  Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
+                print(f"  Loaded: '{loaded_canonical}'")
+                print(f"  -> Fully unloading current model(s) before loading new model...")
                
                # Use centralized unload method
                multi_model_manager.unload_all_models()
@@ -1732,18 +1736,23 @@ async def completions(request: CompletionRequest):
        has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
        
        if has_any_model:
-            already_loaded = False
-            if requested_model and requested_model in multi_model_manager.models:
-                already_loaded = True
-            elif multi_model_manager.default_model and (
-                not requested_model or requested_model == "default" or 
-                requested_model == multi_model_manager.default_model
-            ):
-                if multi_model_manager.default_model in multi_model_manager.models:
-                    already_loaded = True
+            # Resolve both the requested model and currently loaded model to their canonical names
+            requested_canonical = multi_model_manager.resolve_model_name(requested_model)
+            loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
+            
+            # Also check legacy model_manager
+            if not loaded_canonical and model_manager.backend is not None:
+                loaded_canonical = "legacy_model_manager"
+            
+            # Compare: if they're different models (even if same type), unload first
+            already_loaded = (requested_canonical and loaded_canonical and 
+                            requested_canonical == loaded_canonical)
            
            if not already_loaded:
-                print(f"In ondemand mode - fully unloading current model(s) before loading text model '{requested_model}'...")
+                print(f"In ondemand mode - model switch detected:")
+                print(f"  Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
+                print(f"  Loaded: '{loaded_canonical}'")
+                print(f"  -> Fully unloading current model(s) before loading new model...")
                multi_model_manager.unload_all_models()
                if model_manager.backend is not None:
                    try:

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -711,6 +711,91 @@ class MultiModelManager:
        # Model not found - try to load it as a new model
        return self._load_model_by_name(requested_model)
    
+    def resolve_model_name(self, requested_model: str) -> Optional[str]:
+        """
+        Resolve a model name to its canonical form.
+        
+        Handles:
+        - Built-in aliases ("default", "image", "audio", "tts", "vision")
+        - Custom aliases from the model_aliases dict (one lookup, applied first)
+        - Prefixed models ("image:", "audio:", "tts:"; "vision:" becomes "image:")
+        - Default-model and loaded-model key matching (exact, suffix, or substring)
+        
+        Returns the canonical name, the input unchanged if nothing matches, or None when a built-in alias has no configured model.
+        """
+        # None or empty input falls back to the default model
+        if not requested_model:
+            return self.default_model
+        
+        # Resolve custom aliases first (single lookup; the target is then processed by the rules below)
+        if requested_model in self.model_aliases:
+            requested_model = self.model_aliases[requested_model]
+        
+        # Handle "default" alias
+        if requested_model == "default":
+            return self.default_model
+        
+        # Handle "audio" alias (first configured audio model, or None if there is none)
+        if requested_model == "audio":
+            return f"audio:{self.audio_models[0]}" if self.audio_models else None
+        
+        # Handle "image" alias (first configured image model, or None if there is none)
+        if requested_model == "image":
+            return f"image:{self.image_models[0]}" if self.image_models else None
+        
+        # Handle "tts" alias
+        if requested_model == "tts":
+            return f"tts:{self.tts_model}" if self.tts_model else None
+        
+        # Handle "vision" alias (reported under the "image:" prefix, matching the "vision:" normalization below)
+        if requested_model == "vision":
+            return f"image:{self.vision_models[0]}" if self.vision_models else None
+        
+        # Prefixed models are returned unchanged, except "vision:" which is rewritten to "image:"
+        if requested_model.startswith("audio:"):
+            return requested_model
+        if requested_model.startswith("tts:"):
+            return requested_model
+        if requested_model.startswith("image:") or requested_model.startswith("vision:"):
+            # len("vision:") == 7, so [7:] drops the prefix before "image:" is prepended
+            if requested_model.startswith("vision:"):
+                return f"image:{requested_model[7:]}"
+            return requested_model
+        
+        # Check if it matches the default model (with or without path)
+        if self.default_model:
+            if requested_model == self.default_model:
+                return self.default_model
+            # Basename suffix match in either direction. NOTE(review): endswith is loose - e.g. "7b" matches any default ending in "7b"; confirm intended
+            if requested_model.endswith(self.default_model.split("/")[-1]) or \
+               self.default_model.endswith(requested_model.split("/")[-1]):
+                return self.default_model
+        
+        # Match loaded model keys by substring or basename suffix; the first matching key in dict order wins
+        for key in self.models.keys():
+            if requested_model in key or key.endswith(requested_model.split("/")[-1]):
+                return key
+        
+        # Return as-is if no resolution (so an unknown name yields itself, not None)
+        return requested_model
+    
+    def get_currently_loaded_model_name(self) -> Optional[str]:
+        """
+        Get the key of the model this manager currently tracks as loaded.
+        
+        Returns current_model_key when it is still a key of self.models, else the
+        first key of self.models, or None if no models are loaded.
+        """
+        if not self.models:
+            return None
+        
+        # Prefer the tracked current model, but only if it is still present in self.models
+        if self.current_model_key and self.current_model_key in self.models:
+            return self.current_model_key
+        
+        # Otherwise return the first loaded model (there should only be one in ondemand mode); the trailing guard is redundant after the early return above
+        return list(self.models.keys())[0] if self.models else None
+    
    def unload_all_models(self):
        """
        Fully unload ALL models from VRAM. Used in ondemand mode when switching