Commit a37085b4 authored by Your Name

Fix: Proper model resolution for ondemand mode - unload when switching between ANY different models

- Added resolve_model_name() to MultiModelManager to properly resolve model aliases
- Added get_currently_loaded_model_name() to track what's actually in VRAM
- Updated /v1/chat/completions, /v1/completions, and /v1/images/generations
- Now correctly compares resolved canonical names before deciding to unload
- Handles all aliases (default, image, audio, tts) and custom aliases
- Works across ALL model types: text->text2, image->image2, text->image, etc.
parent 00775972
...@@ -278,15 +278,30 @@ async def create_image_generation(request: ImageGenerationRequest, http_request: ...@@ -278,15 +278,30 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
from codai.models.manager import model_manager from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model and pipeline is None: if has_any_model:
# The image model we need is NOT loaded - unload everything # Resolve both the requested image model and currently loaded model to their canonical names
print(f"In ondemand mode - fully unloading current model(s) before loading image model '{model_to_use}'...") requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
multi_model_manager.unload_all_models() loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
if model_manager.backend is not None:
try: # Also check legacy model_manager
model_manager.cleanup() if not loaded_canonical and model_manager.backend is not None:
except: loaded_canonical = "legacy_model_manager"
pass
# Compare: if they're different models (even if both are image models), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Try diffusers first # Try diffusers first
try: try:
...@@ -539,18 +554,34 @@ async def create_image_generation(request: ImageGenerationRequest, http_request: ...@@ -539,18 +554,34 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
# If no cached image model found, need to load one - first cleanup any existing models # If no cached image model found, need to load one - first cleanup any existing models
if sd_model is None: if sd_model is None:
# In ondemand mode, fully unload everything before loading sd.cpp model # In ondemand mode, check if we need to unload before loading sd.cpp model
from codai.models.manager import model_manager from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if mode == "ondemand" and has_any_model: if mode == "ondemand" and has_any_model:
print(f"In ondemand mode - fully unloading current model(s) before loading sd.cpp image model...") # Resolve both the requested image model and currently loaded model to their canonical names
multi_model_manager.unload_all_models() requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
if model_manager.backend is not None: loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
try:
model_manager.cleanup() # Also check legacy model_manager
except: if not loaded_canonical and model_manager.backend is not None:
pass loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading sd.cpp model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
if sd_model is not None: if sd_model is not None:
# Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp) # Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
......
...@@ -305,19 +305,23 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -305,19 +305,23 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model: if has_any_model:
# Check if the requested model is already loaded (no need to unload) # Resolve both the requested model and currently loaded model to their canonical names
already_loaded = False requested_canonical = multi_model_manager.resolve_model_name(requested_model)
if requested_model and requested_model in multi_model_manager.models: loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
already_loaded = True
elif multi_model_manager.default_model and ( # Also check legacy model_manager
not requested_model or requested_model == "default" or if not loaded_canonical and model_manager.backend is not None:
requested_model == multi_model_manager.default_model loaded_canonical = "legacy_model_manager"
):
if multi_model_manager.default_model in multi_model_manager.models: # Compare: if they're different models (even if same type), unload first
already_loaded = True already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded: if not already_loaded:
print(f"In ondemand mode - fully unloading current model(s) before loading text model '{requested_model}'...") print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
# Use centralized unload method # Use centralized unload method
multi_model_manager.unload_all_models() multi_model_manager.unload_all_models()
...@@ -1732,18 +1736,23 @@ async def completions(request: CompletionRequest): ...@@ -1732,18 +1736,23 @@ async def completions(request: CompletionRequest):
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model: if has_any_model:
already_loaded = False # Resolve both the requested model and currently loaded model to their canonical names
if requested_model and requested_model in multi_model_manager.models: requested_canonical = multi_model_manager.resolve_model_name(requested_model)
already_loaded = True loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
elif multi_model_manager.default_model and (
not requested_model or requested_model == "default" or # Also check legacy model_manager
requested_model == multi_model_manager.default_model if not loaded_canonical and model_manager.backend is not None:
): loaded_canonical = "legacy_model_manager"
if multi_model_manager.default_model in multi_model_manager.models:
already_loaded = True # Compare: if they're different models (even if same type), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded: if not already_loaded:
print(f"In ondemand mode - fully unloading current model(s) before loading text model '{requested_model}'...") print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models() multi_model_manager.unload_all_models()
if model_manager.backend is not None: if model_manager.backend is not None:
try: try:
......
...@@ -711,6 +711,91 @@ class MultiModelManager: ...@@ -711,6 +711,91 @@ class MultiModelManager:
# Model not found - try to load it as a new model # Model not found - try to load it as a new model
return self._load_model_by_name(requested_model) return self._load_model_by_name(requested_model)
def resolve_model_name(self, requested_model: str) -> Optional[str]:
    """
    Resolve a requested model name to its canonical form.

    Resolution order:
      1. Empty/None request -> ``self.default_model``.
      2. Custom alias from ``self.model_aliases`` (applied once).
      3. Built-in aliases: "default", "audio", "image", "tts", "vision".
      4. Prefixed names: "audio:", "tts:" and "image:" are returned as-is;
         "vision:" is rewritten to "image:".
      5. Exact match against the default model, then against a loaded
         model key in ``self.models``.
      6. Same final path component ("/"-separated) as the default model,
         then as a loaded model key.
      7. Otherwise the name is returned unchanged.

    Matching in steps 5-6 is deliberately strict. Loose suffix/substring
    matching would resolve e.g. "codellama" to a default model named
    ".../llama", or any name ending in "/" to the default model, and a
    caller comparing canonical names would then wrongly conclude that the
    requested model is already loaded.

    Args:
        requested_model: Name, alias or prefixed name supplied by the caller.

    Returns:
        The canonical model name/key, or None when a built-in alias has no
        configured model behind it.
    """
    # Nothing requested: fall back to the default model.
    if not requested_model:
        return self.default_model

    # Custom aliases are resolved first so they may point at built-in aliases.
    requested_model = self.model_aliases.get(requested_model, requested_model)

    # Built-in aliases.
    if requested_model == "default":
        return self.default_model
    if requested_model == "audio":
        return f"audio:{self.audio_models[0]}" if self.audio_models else None
    if requested_model == "image":
        return f"image:{self.image_models[0]}" if self.image_models else None
    if requested_model == "tts":
        return f"tts:{self.tts_model}" if self.tts_model else None
    if requested_model == "vision":
        # Vision models are canonicalised under the "image:" prefix.
        return f"image:{self.vision_models[0]}" if self.vision_models else None

    # Prefixed names are already canonical, except "vision:" -> "image:".
    if requested_model.startswith(("audio:", "tts:", "image:")):
        return requested_model
    if requested_model.startswith("vision:"):
        return f"image:{requested_model[len('vision:'):]}"

    default_model = self.default_model

    # Exact matches always win over short-name matches.
    if default_model and requested_model == default_model:
        return default_model
    if requested_model in self.models:
        return requested_model

    # Short-name match: the last path component must be identical.
    # An empty component (name ending in "/") must never match anything.
    short_name = requested_model.split("/")[-1]
    if short_name:
        if default_model and default_model.split("/")[-1] == short_name:
            return default_model
        for key in self.models:
            if key.split("/")[-1] == short_name:
                return key

    # No resolution possible: return the name unchanged.
    return requested_model
def get_currently_loaded_model_name(self) -> Optional[str]:
    """
    Report which model key should be treated as the one currently loaded.

    Returns:
        None when ``self.models`` is empty. Otherwise
        ``self.current_model_key`` if it is set and present in
        ``self.models``, else the first key of ``self.models``.
    """
    loaded = self.models
    if not loaded:
        return None

    # Prefer the explicitly tracked key, but only while it is still loaded.
    tracked = self.current_model_key
    if tracked and tracked in loaded:
        return tracked

    # Fall back to the first loaded key (ondemand mode is expected to
    # hold a single model).
    return next(iter(loaded))
def unload_all_models(self): def unload_all_models(self):
""" """
Fully unload ALL models from VRAM. Used in ondemand mode when switching Fully unload ALL models from VRAM. Used in ondemand mode when switching
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment