Commit 7d838962 authored by Your Name

Fix: In ondemand mode, fully unload current model before loading new one

- In ondemand mode (no --load-all or --loadswap specified), when a new model
  is requested, the current model in VRAM is now fully unloaded before loading
  the new one. This ensures clean model switching.
- Added cleanup logic to both /v1/chat/completions and /v1/completions endpoints
- Added same logic to image generation endpoints (diffusers and sd.cpp paths)
- Cleanup calls each loaded backend's cleanup/unload hook, then runs gc.collect() and torch.cuda.empty_cache() (a minimal sketch of the sequence follows below, before the diff)
parent 9b3126d7
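
The unload sequence added to each endpoint boils down to the steps sketched below. This is a minimal sketch only: the helper name unload_all_models is illustrative and not part of the codebase, and multi_model_manager / model_manager are assumed to expose the same attributes used in the diff that follows. Either endpoint would run this before calling multi_model_manager.get_model_for_request(requested_model).

# Sketch of the ondemand full-unload sequence described above (assumed helper,
# not part of the codebase).
import gc
import time

def unload_all_models(multi_model_manager, model_manager) -> None:
    # Drop every tracked model, calling its cleanup hook when available.
    for key in list(multi_model_manager.models.keys()):
        backend = multi_model_manager.models.get(key)
        if backend is not None and callable(getattr(backend, "cleanup", None)):
            try:
                backend.cleanup()
            except Exception as exc:
                print(f"Warning during cleanup of '{key}': {exc}")
        del multi_model_manager.models[key]

    # Release the legacy single-model backend as well.
    if model_manager.backend is not None:
        for method_name in ("unload", "cleanup"):
            method = getattr(model_manager.backend, method_name, None)
            if callable(method):
                try:
                    method()
                except Exception as exc:
                    print(f"Warning during legacy model cleanup: {exc}")
                break
        model_manager.backend = None

    # Reclaim host memory, then free cached CUDA allocations if torch is present.
    gc.collect()
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
    except Exception:
        pass

    # Brief pause to let VRAM settle before the next model is loaded.
    time.sleep(1)
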
@@ -295,6 +295,68 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
    # Get the model for this request
    requested_model = request.model

    # Get the load mode to determine whether we need to unload other models first
    from codai.api.state import get_load_mode
    load_mode = get_load_mode()

    # Check whether an image model is already loaded in VRAM
    current_image_model = None
    for key in multi_model_manager.models.keys():
        if key.startswith("image:"):
            current_image_model = key
            break

    # Check whether the legacy model_manager has a model loaded
    has_legacy_model = model_manager.backend is not None

    # In ondemand mode, if any model (text, image, etc.) is already loaded and a
    # different model is requested, unload the current model first to free VRAM
    needs_full_unload = (
        load_mode == "ondemand"
        and (current_image_model is not None or has_legacy_model)
    )

    if needs_full_unload:
        print("In ondemand mode - fully unloading current model before loading text model...")

        # Full cleanup: remove all models from VRAM
        for key in list(multi_model_manager.models.keys()):
            model_to_cleanup = multi_model_manager.models.get(key)
            if model_to_cleanup is not None:
                print(f"Unloading '{key}' from VRAM...")
                try:
                    if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                        model_to_cleanup.cleanup()
                except Exception as e:
                    print(f"Warning during cleanup of '{key}': {e}")
            del multi_model_manager.models[key]

        # Also clean up the legacy model_manager
        if model_manager.backend is not None:
            print("Unloading legacy model_manager from VRAM...")
            try:
                if hasattr(model_manager.backend, 'unload'):
                    model_manager.backend.unload()
                elif hasattr(model_manager.backend, 'cleanup'):
                    model_manager.backend.cleanup()
            except Exception as e:
                print(f"Warning during legacy model cleanup: {e}")
            model_manager.backend = None

        # Force garbage collection and clear the CUDA cache
        import gc
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
                print("CUDA cache cleared")
        except Exception:
            pass

        # Short delay to let VRAM settle
        import time
        time.sleep(1)

    # Try to get the appropriate model
    mm = multi_model_manager.get_model_for_request(requested_model)
@@ -1688,6 +1750,58 @@ async def completions(request: CompletionRequest):
    # Get the model for this request
    requested_model = request.model

    # Get the load mode to determine whether we need to unload other models first
    from codai.api.state import get_load_mode
    load_mode = get_load_mode()

    # Check whether an image model is already loaded in VRAM
    current_image_model = None
    for key in multi_model_manager.models.keys():
        if key.startswith("image:"):
            current_image_model = key
            break

    # Check whether the legacy model_manager has a model loaded
    has_legacy_model = model_manager.backend is not None

    # In ondemand mode, if any model is already loaded, unload it first
    needs_full_unload = (
        load_mode == "ondemand"
        and (current_image_model is not None or has_legacy_model)
    )

    if needs_full_unload:
        print("In ondemand mode - fully unloading current model before loading text model...")

        # Full cleanup: remove all models from VRAM
        for key in list(multi_model_manager.models.keys()):
            model_to_cleanup = multi_model_manager.models.get(key)
            if model_to_cleanup is not None:
                print(f"Unloading '{key}' from VRAM...")
                try:
                    if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                        model_to_cleanup.cleanup()
                except Exception as e:
                    print(f"Warning during cleanup of '{key}': {e}")
            del multi_model_manager.models[key]

        # Also clean up the legacy model_manager
        if model_manager.backend is not None:
            print("Unloading legacy model_manager from VRAM...")
            try:
                if hasattr(model_manager.backend, 'unload'):
                    model_manager.backend.unload()
                elif hasattr(model_manager.backend, 'cleanup'):
                    model_manager.backend.cleanup()
            except Exception as e:
                print(f"Warning during legacy model cleanup: {e}")
            model_manager.backend = None

        # Force garbage collection and clear the CUDA cache
        import gc
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
        except Exception:
            pass

    # Try to get the appropriate model
    mm = multi_model_manager.get_model_for_request(requested_model)