Commit 7d838962 authored by Your Name

Fix: In ondemand mode, fully unload current model before loading new one

- In ondemand mode (no --load-all or --loadswap specified), when a new model
  is requested, the current model in VRAM is now fully unloaded before loading
  the new one. This ensures clean model switching.
- Added cleanup logic to both /v1/chat/completions and /v1/completions endpoints
- Added same logic to image generation endpoints (diffusers and sd.cpp paths)
- Cleanup calls each loaded backend's cleanup/unload hook, then runs gc.collect() and torch.cuda.empty_cache() (a minimal sketch of the sequence follows below, before the diff)
parent 9b3126d7
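
The unload sequence added to each endpoint boils down to the steps sketched below. This is a minimal sketch only: the helper name unload_all_models is illustrative and not part of the codebase, and multi_model_manager / model_manager are assumed to expose the same attributes used in the diff that follows. Either endpoint would run this before calling multi_model_manager.get_model_for_request(requested_model).

# Sketch of the ondemand full-unload sequence described above (assumed helper,
# not part of the codebase).
import gc
import time

def unload_all_models(multi_model_manager, model_manager) -> None:
    # Drop every tracked model, calling its cleanup hook when available.
    for key in list(multi_model_manager.models.keys()):
        backend = multi_model_manager.models.get(key)
        if backend is not None and callable(getattr(backend, "cleanup", None)):
            try:
                backend.cleanup()
            except Exception as exc:
                print(f"Warning during cleanup of '{key}': {exc}")
        del multi_model_manager.models[key]

    # Release the legacy single-model backend as well.
    if model_manager.backend is not None:
        for method_name in ("unload", "cleanup"):
            method = getattr(model_manager.backend, method_name, None)
            if callable(method):
                try:
                    method()
                except Exception as exc:
                    print(f"Warning during legacy model cleanup: {exc}")
                break
        model_manager.backend = None

    # Reclaim host memory, then free cached CUDA allocations if torch is present.
    gc.collect()
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
    except Exception:
        pass

    # Brief pause to let VRAM settle before the next model is loaded.
    time.sleep(1)
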
@@ -295,6 +295,68 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
    # Get the model for this request
    requested_model = request.model

    # Get the load mode to determine whether we need to unload other models first
    from codai.api.state import get_load_mode
    load_mode = get_load_mode()

    # Check whether an image model is already loaded in VRAM
    current_image_model = None
    for key in multi_model_manager.models.keys():
        if key.startswith("image:"):
            current_image_model = key
            break

    # Check whether the legacy model_manager has a model loaded
    has_legacy_model = model_manager.backend is not None

    # In ondemand mode, if any model (text, image, etc.) is already loaded and a
    # different model is requested, unload the current model first to free VRAM
    needs_full_unload = (
        load_mode == "ondemand"
        and (current_image_model is not None or has_legacy_model)
    )

    if needs_full_unload:
        print("In ondemand mode - fully unloading current model before loading text model...")

        # Full cleanup: remove all models from VRAM
        for key in list(multi_model_manager.models.keys()):
            model_to_cleanup = multi_model_manager.models.get(key)
            if model_to_cleanup is not None:
                print(f"Unloading '{key}' from VRAM...")
                try:
                    if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                        model_to_cleanup.cleanup()
                except Exception as e:
                    print(f"Warning during cleanup of '{key}': {e}")
            del multi_model_manager.models[key]

        # Also clean up the legacy model_manager
        if model_manager.backend is not None:
            print("Unloading legacy model_manager from VRAM...")
            try:
                if hasattr(model_manager.backend, 'unload'):
                    model_manager.backend.unload()
                elif hasattr(model_manager.backend, 'cleanup'):
                    model_manager.backend.cleanup()
            except Exception as e:
                print(f"Warning during legacy model cleanup: {e}")
            model_manager.backend = None

        # Force garbage collection and clear the CUDA cache
        import gc
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
                print("CUDA cache cleared")
        except Exception:
            pass

        # Short delay to let VRAM settle
        import time
        time.sleep(1)

    # Try to get the appropriate model
    mm = multi_model_manager.get_model_for_request(requested_model)
@@ -1688,6 +1750,58 @@ async def completions(request: CompletionRequest):
    # Get the model for this request
    requested_model = request.model

    # Get the load mode to determine whether we need to unload other models first
    from codai.api.state import get_load_mode
    load_mode = get_load_mode()

    # Check whether an image model is already loaded in VRAM
    current_image_model = None
    for key in multi_model_manager.models.keys():
        if key.startswith("image:"):
            current_image_model = key
            break

    # Check whether the legacy model_manager has a model loaded
    has_legacy_model = model_manager.backend is not None

    # In ondemand mode, if any model is already loaded, unload it first
    needs_full_unload = (
        load_mode == "ondemand"
        and (current_image_model is not None or has_legacy_model)
    )

    if needs_full_unload:
        print("In ondemand mode - fully unloading current model before loading text model...")

        # Full cleanup: remove all models from VRAM
        for key in list(multi_model_manager.models.keys()):
            model_to_cleanup = multi_model_manager.models.get(key)
            if model_to_cleanup is not None:
                print(f"Unloading '{key}' from VRAM...")
                try:
                    if hasattr(model_to_cleanup, 'cleanup') and callable(getattr(model_to_cleanup, 'cleanup')):
                        model_to_cleanup.cleanup()
                except Exception as e:
                    print(f"Warning during cleanup of '{key}': {e}")
            del multi_model_manager.models[key]

        # Also clean up the legacy model_manager
        if model_manager.backend is not None:
            print("Unloading legacy model_manager from VRAM...")
            try:
                if hasattr(model_manager.backend, 'unload'):
                    model_manager.backend.unload()
                elif hasattr(model_manager.backend, 'cleanup'):
                    model_manager.backend.cleanup()
            except Exception as e:
                print(f"Warning during legacy model cleanup: {e}")
            model_manager.backend = None

        # Force garbage collection and clear the CUDA cache
        import gc
        gc.collect()
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
        except Exception:
            pass

    # Try to get the appropriate model
    mm = multi_model_manager.get_model_for_request(requested_model)