Commit c08a5b4f authored by Your Name

Implement proper loadswap/loadall/ondemand model management modes

- Default mode changed to ondemand (pre-load first model, unload/load on switch)
- loadswap: load first model in VRAM, others in CPU RAM, swap on switch
- loadall: try to load all models in VRAM, offload to CPU RAM if OOM
- --nopreload: skip pre-loading in any mode, load on first request
- request_model() now properly handles all three modes
- Added _move_model_to_cpu() and _move_model_to_vram() for loadswap
- Fixed NameError in request_model(): the legacy model_manager singleton was referenced without being in scope; it is now imported locally (as _legacy_mm) before use
- Updated CLI help text for --loadall, --loadswap, --nopreload
parent e004541a
...@@ -280,17 +280,17 @@ def parse_args(): ...@@ -280,17 +280,17 @@ def parse_args():
parser.add_argument( parser.add_argument(
"--loadall", "--loadall",
action="store_true", action="store_true",
help="Pre-load all models (main, audio, image) at startup instead of on-demand", help="Load all models at startup. Tries VRAM first, offloads to CPU RAM if VRAM is full.",
) )
parser.add_argument( parser.add_argument(
"--loadswap", "--loadswap",
action="store_true", action="store_true",
help="Keep all models loaded, swapping active model between VRAM and RAM (only active model in VRAM)", help="Load first model in VRAM, others in CPU RAM. Swap active model between VRAM and CPU RAM on switch.",
) )
parser.add_argument( parser.add_argument(
"--nopreload", "--nopreload",
action="store_true", action="store_true",
help="Disable model preloading. Models will load on first request instead of at startup", help="Skip model pre-loading at startup. Models will load on first request using the active mode strategy (ondemand/loadswap/loadall).",
) )
parser.add_argument( parser.add_argument(
"--audio-ctx", "--audio-ctx",
......
...@@ -233,18 +233,29 @@ def main(): ...@@ -233,18 +233,29 @@ def main():
sys.exit(1) sys.exit(1)
# Determine load mode # Determine load mode
# Default is to preload (loadall) unless --nopreload is specified # Default is ondemand: pre-load only the first model, unload/load on switch
load_mode = "loadall" # Default: preload models # --loadswap: load first in VRAM, others in CPU RAM, swap on switch
# --loadall: try to load all models in VRAM, offload to CPU RAM if fails
# --nopreload: skip pre-loading in any mode, load on first request
load_mode = "ondemand" # Default: on-demand loading
if args.loadall: if args.loadall:
load_mode = "loadall" load_mode = "loadall"
elif args.loadswap: elif args.loadswap:
load_mode = "loadswap" load_mode = "loadswap"
elif args.nopreload:
load_mode = "ondemand" nopreload = args.nopreload
set_load_mode(load_mode) set_load_mode(load_mode)
multi_model_manager.set_load_mode(load_mode)
if load_mode == "ondemand": if load_mode == "ondemand":
print("Load mode: ondemand (load model on first request)") print("Load mode: ondemand (pre-load first model, unload/load on switch)")
elif load_mode == "loadswap":
print("Load mode: loadswap (first model in VRAM, others in CPU RAM, swap on switch)")
elif load_mode == "loadall":
print("Load mode: loadall (load all models, offload to CPU RAM if VRAM full)")
if nopreload:
print(" --nopreload: models will load on first request instead of at startup")
# Initialize model manager # Initialize model manager
print("\n=== Initializing Model Manager ===") print("\n=== Initializing Model Manager ===")
...@@ -278,28 +289,87 @@ def main(): ...@@ -278,28 +289,87 @@ def main():
# Load main text model(s) # Load main text model(s)
if model_names: if model_names:
print(f"\nLoading main text model(s): {model_names}") print(f"\nMain text model(s): {model_names}")
# Register models with multi_model_manager # Register models with multi_model_manager (set_default_model also resolves/caches)
for idx, model_name in enumerate(model_names): for idx, model_name in enumerate(model_names):
multi_model_manager.set_default_model(model_name, { multi_model_manager.set_default_model(model_name, {
'ctx': get_ctx_by_index(args.n_ctx, idx, 0), 'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
}) })
# Load first model (unless nopreload mode) # Pre-load models at startup (unless --nopreload)
if load_mode == "loadall": if nopreload:
print(f" --nopreload: text model(s) will load on first request")
elif load_mode == "ondemand":
# Ondemand: pre-load only the first model into VRAM
try: try:
print(f"Loading model: {model_names[0]}...") print(f"Preloading first model into VRAM: {model_names[0]}...")
mm = multi_model_manager._load_default_model() mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None: if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully: {model_names[0]}") print(f"Model loaded successfully: {model_names[0]}")
else: else:
print(f"Warning: Model {model_names[0]} failed to load") print(f"Warning: Model {model_names[0]} failed to load")
except Exception as e: except Exception as e:
print(f"Warning: Failed to load model: {e}") print(f"Warning: Failed to preload model: {e}")
print(f"Model will load on first request") print(f"Model will load on first request")
else: elif load_mode == "loadswap":
print(f"Load mode: ondemand (model will load on first request)") # Loadswap: load first model into VRAM, others into CPU RAM
try:
print(f"Preloading first model into VRAM: {model_names[0]}...")
mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_names[0]}")
else:
print(f"Warning: Model {model_names[0]} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model: {e}")
# Load remaining text models into CPU RAM
for idx, model_name in enumerate(model_names[1:], 1):
try:
print(f"Preloading model into CPU RAM: {model_name}...")
mm2 = multi_model_manager._load_model_by_name(model_name)
if mm2 is not None:
# Move to CPU immediately (it was loaded into VRAM by default)
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model {model_name}: {e}")
elif load_mode == "loadall":
# Loadall: try to load all models into VRAM, offload to CPU RAM if fails
for idx, model_name in enumerate(model_names):
try:
if idx == 0:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_default_model()
else:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_model_by_name(model_name)
if mm is not None and (not hasattr(mm, 'backend') or mm.backend is not None):
if idx == 0:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
error_msg = str(e).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
if is_oom:
print(f"VRAM full for {model_name}, offloading to CPU RAM...")
try:
mm = multi_model_manager._load_model_by_name(model_name)
if mm is not None:
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
except Exception as e2:
print(f"Warning: Failed to load model {model_name} even to CPU: {e2}")
else:
print(f"Warning: Failed to preload model {model_name}: {e}")
# Set up audio model if specified # Set up audio model if specified
if audio_models: if audio_models:
......
...@@ -374,7 +374,7 @@ class MultiModelManager: ...@@ -374,7 +374,7 @@ class MultiModelManager:
""" """
def __init__(self): def __init__(self):
self.models: Dict[str, ModelManager] = {} self.models: Dict[str, Any] = {} # Can hold ModelManager, diffusers pipelines, sd.cpp models, etc.
self.default_model: Optional[str] = None self.default_model: Optional[str] = None
self.audio_models: List[str] = [] self.audio_models: List[str] = []
self.tts_model: Optional[str] = None self.tts_model: Optional[str] = None
...@@ -883,14 +883,127 @@ class MultiModelManager: ...@@ -883,14 +883,127 @@ class MultiModelManager:
return load_model(model_path, cache_dir, file_pattern) return load_model(model_path, cache_dir, file_pattern)
def _move_model_to_cpu(self, model_key: str):
    """
    Offload the model registered under ``model_key`` to CPU RAM (loadswap).

    The entry is kept in ``self.models``; only the underlying weights are
    asked to move through ``.to('cpu')``. The object to move is chosen by
    the first matching rule:

    1. a ``ModelManager`` whose ``backend`` is not None -> ``backend.model``
    2. an object exposing a callable ``to``              -> the object itself
    3. an object with a non-None ``model`` attribute     -> that attribute

    Afterwards the CUDA cache is emptied when CUDA is available, and a
    garbage-collection pass is run. Every failure is reported with a
    printed warning; nothing is raised. Unknown keys are ignored silently.
    """
    entry = self.models.get(model_key)
    if entry is None:
        return

    print(f"Moving model '{model_key}' from VRAM to CPU RAM...")
    try:
        import torch

        # Work out *what* to move first, then perform the move once below.
        target = None
        ok_label = fail_label = ""

        if isinstance(entry, ModelManager) and entry.backend is not None:
            # Rule 1. A backend without a movable ``model`` is left where it
            # is (the original author cites llama-cpp-python models here);
            # rules 2 and 3 are deliberately NOT tried in that case.
            inner = getattr(entry.backend, 'model', None)
            if inner is not None and hasattr(inner, 'to'):
                target = inner
                ok_label = fail_label = "backend model"
        elif callable(getattr(entry, 'to', None)):
            # Rule 2: e.g. a diffusers pipeline.
            target = entry
            ok_label, fail_label = "diffusers pipeline", "pipeline"
        else:
            # Rule 3: wrapper object holding the real model in ``.model``.
            inner = getattr(entry, 'model', None)
            if inner is not None and hasattr(inner, 'to'):
                target = inner
                ok_label = fail_label = "inner model"

        if target is not None:
            try:
                target.to('cpu')
                print(f" Moved {ok_label} to CPU")
            except Exception as exc:
                print(f" Warning: Could not move {fail_label} to CPU: {exc}")

        # Hand the VRAM that was just vacated back to the driver.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
        gc.collect()
    except ImportError:
        print(" Warning: torch not available, cannot move model to CPU")
    except Exception as exc:
        print(f" Warning during CPU offload of '{model_key}': {exc}")
def _move_model_to_vram(self, model_key: str):
    """
    Bring the model registered under ``model_key`` back onto the GPU (loadswap).

    The destination is ``"cuda"`` when ``torch.cuda.is_available()`` and
    ``"cpu"`` otherwise. The object to move is chosen by the first matching
    rule:

    1. a ``ModelManager`` whose ``backend`` is not None -> ``backend.model``
    2. an object exposing a callable ``to``              -> the object itself
    3. an object with a non-None ``model`` attribute     -> that attribute

    Every failure is reported with a printed warning; nothing is raised.
    Unknown keys are ignored silently.
    """
    entry = self.models.get(model_key)
    if entry is None:
        return

    print(f"Moving model '{model_key}' from CPU RAM to VRAM...")
    try:
        import torch

        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        # Work out *what* to move first, then perform the move once below.
        target = None
        ok_label = fail_label = ""

        if isinstance(entry, ModelManager) and entry.backend is not None:
            # Rule 1. If the backend has no movable ``model`` nothing is
            # moved; rules 2 and 3 are deliberately NOT tried in that case.
            inner = getattr(entry.backend, 'model', None)
            if inner is not None and hasattr(inner, 'to'):
                target = inner
                ok_label = fail_label = "backend model"
        elif callable(getattr(entry, 'to', None)):
            # Rule 2: e.g. a diffusers pipeline.
            target = entry
            ok_label, fail_label = "diffusers pipeline", "pipeline"
        else:
            # Rule 3: wrapper object holding the real model in ``.model``.
            inner = getattr(entry, 'model', None)
            if inner is not None and hasattr(inner, 'to'):
                target = inner
                ok_label = fail_label = "inner model"

        if target is not None:
            try:
                target.to(device)
                print(f" Moved {ok_label} to {device}")
            except Exception as exc:
                print(f" Warning: Could not move {fail_label} to {device}: {exc}")
    except ImportError:
        print(" Warning: torch not available, cannot move model to VRAM")
    except Exception as exc:
        print(f" Warning during VRAM load of '{model_key}': {exc}")
def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]: def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]:
""" """
Central method for API modules to request a model. Central method for API modules to request a model.
Handles: Handles three load modes:
1. Alias resolution (e.g., "image" -> "Tongyi-MAI/Z-Image-Turbo")
2. VRAM management (unloading previous models in ondemand mode) **loadall**: All models are pre-loaded at startup. Just return the
3. Checking if model is already loaded already-loaded model. No VRAM management needed.
**loadswap**: All models stay loaded (in CPU RAM or VRAM). When a
different model is requested, the current VRAM model is moved to CPU
RAM and the requested model is moved from CPU RAM to VRAM.
**ondemand** (default when no flag specified): Only one model in memory
at a time. When a different model is requested, the current model is
fully unloaded (deleted) and the new one is loaded from scratch.
Args: Args:
requested_model: The model name/alias from the API request requested_model: The model name/alias from the API request
...@@ -903,7 +1016,7 @@ class MultiModelManager: ...@@ -903,7 +1016,7 @@ class MultiModelManager:
- 'model_name': The resolved model name/path/HF ID - 'model_name': The resolved model name/path/HF ID
- 'model_object': The loaded model object if already loaded, None otherwise - 'model_object': The loaded model object if already loaded, None otherwise
- 'config': The stored configuration for this model - 'config': The stored configuration for this model
- 'already_loaded': True if the model is already loaded in VRAM - 'already_loaded': True if the model is already loaded and ready in VRAM
""" """
from codai.api.state import get_load_mode from codai.api.state import get_load_mode
mode = get_load_mode() mode = get_load_mode()
...@@ -968,10 +1081,107 @@ class MultiModelManager: ...@@ -968,10 +1081,107 @@ class MultiModelManager:
else: else:
model_key = resolved_name model_key = resolved_name
# Step 3: Check if already loaded # Step 3: Check if already loaded in self.models
existing_model = self.models.get(model_key) existing_model = self.models.get(model_key)
# =====================================================================
# LOADALL MODE: All models should be pre-loaded. Just return it.
# =====================================================================
if mode == "loadall":
if existing_model is not None:
self.current_model_key = model_key
self.active_in_vram = model_key
return {
'model_key': model_key,
'model_name': resolved_name,
'model_object': existing_model,
'config': self.config.get(model_key, {}),
'already_loaded': True,
}
# Model not loaded yet in loadall mode - caller needs to load it
# (this happens for models not pre-loaded at startup, e.g., image models)
print(f"Loadall mode: Model '{model_key}' not pre-loaded, will load now")
return {
'model_key': model_key,
'model_name': resolved_name,
'model_object': None,
'config': self.config.get(model_key, {}),
'already_loaded': False,
}
# =====================================================================
# LOADSWAP MODE: Keep all models in memory. Swap active model between
# VRAM and CPU RAM. Only the active model should be in VRAM.
# =====================================================================
if mode == "loadswap":
if existing_model is not None:
# Model is loaded (either in VRAM or CPU RAM)
if self.active_in_vram == model_key:
# Already the active model in VRAM
self.current_model_key = model_key
return {
'model_key': model_key,
'model_name': resolved_name,
'model_object': existing_model,
'config': self.config.get(model_key, {}),
'already_loaded': True,
}
else:
# Model is in CPU RAM - need to swap
# First, move current VRAM model to CPU
if self.active_in_vram and self.active_in_vram in self.models:
print(f"Loadswap: Moving '{self.active_in_vram}' from VRAM to CPU RAM")
self._move_model_to_cpu(self.active_in_vram)
# Also check the legacy model_manager singleton
from codai.models.manager import model_manager as _legacy_mm
if _legacy_mm.backend is not None and self.active_in_vram is None:
print(f"Loadswap: Moving legacy model_manager model to CPU")
self._move_model_to_cpu_legacy(_legacy_mm)
# Now move the requested model to VRAM
print(f"Loadswap: Moving '{model_key}' from CPU RAM to VRAM")
self._move_model_to_vram(model_key)
self.active_in_vram = model_key
self.current_model_key = model_key
return {
'model_key': model_key,
'model_name': resolved_name,
'model_object': existing_model,
'config': self.config.get(model_key, {}),
'already_loaded': True,
}
else:
# Model not loaded at all - move current VRAM model to CPU first
if self.active_in_vram and self.active_in_vram in self.models:
print(f"Loadswap: Moving '{self.active_in_vram}' from VRAM to CPU RAM")
self._move_model_to_cpu(self.active_in_vram)
# Also check the legacy model_manager singleton
from codai.models.manager import model_manager as _legacy_mm
if _legacy_mm.backend is not None and self.active_in_vram is None:
print(f"Loadswap: Moving legacy model_manager model to CPU")
self._move_model_to_cpu_legacy(_legacy_mm)
# Caller needs to load the model fresh (into VRAM)
self.active_in_vram = model_key # Will be set once loaded
return {
'model_key': model_key,
'model_name': resolved_name,
'model_object': None,
'config': self.config.get(model_key, {}),
'already_loaded': False,
}
# =====================================================================
# ONDEMAND MODE (default): Only one model in memory at a time.
# Fully unload the current model before loading the new one.
# =====================================================================
if existing_model is not None: if existing_model is not None:
# Already loaded and it's the only model - return it
self.current_model_key = model_key self.current_model_key = model_key
self.active_in_vram = model_key
return { return {
'model_key': model_key, 'model_key': model_key,
'model_name': resolved_name, 'model_name': resolved_name,
...@@ -980,28 +1190,35 @@ class MultiModelManager: ...@@ -980,28 +1190,35 @@ class MultiModelManager:
'already_loaded': True, 'already_loaded': True,
} }
# Step 4: In ondemand mode, unload any currently loaded model # Model not loaded - need to unload whatever is currently loaded
if mode == "ondemand": # Check if there's anything to unload
has_any_model = len(self.models) > 0 or model_manager.backend is not None has_models_in_multi = len(self.models) > 0
# Also check the legacy model_manager singleton
from codai.models.manager import model_manager as _legacy_mm
has_legacy_model = _legacy_mm.backend is not None
if has_models_in_multi or has_legacy_model:
loaded_canonical = self.get_currently_loaded_model_name()
if not loaded_canonical and has_legacy_model:
loaded_canonical = "legacy_model_manager"
if has_any_model: if loaded_canonical and loaded_canonical != model_key:
loaded_canonical = self.get_currently_loaded_model_name() print(f"Ondemand mode - model switch detected:")
if not loaded_canonical and model_manager.backend is not None: print(f" Requested: '{model_key}' (resolved: '{resolved_name}')")
loaded_canonical = "legacy_model_manager" print(f" Currently loaded: '{loaded_canonical}'")
print(f" -> Unloading current model(s) before loading new model...")
self.unload_all_models()
if loaded_canonical and loaded_canonical != model_key: # Also cleanup the legacy singleton if it has a model
print(f"Ondemand mode - model switch detected:") if has_legacy_model:
print(f" Requested: '{model_key}' (resolved: '{resolved_name}')") try:
print(f" Currently loaded: '{loaded_canonical}'") print(f" -> Cleaning up legacy model_manager...")
print(f" -> Unloading current model(s) before loading new model...") _legacy_mm.cleanup()
self.unload_all_models() except Exception as e:
if model_manager.backend is not None: print(f" Warning: Error cleaning up legacy model_manager: {e}")
try:
model_manager.cleanup()
except:
pass
# Step 5: Return info for the caller to load the model # Return info for the caller to load the model
return { return {
'model_key': model_key, 'model_key': model_key,
'model_name': resolved_name, 'model_name': resolved_name,
...@@ -1010,6 +1227,26 @@ class MultiModelManager: ...@@ -1010,6 +1227,26 @@ class MultiModelManager:
'already_loaded': False, 'already_loaded': False,
} }
def _move_model_to_cpu_legacy(self, legacy_mm):
"""Move the legacy model_manager's model to CPU (for loadswap mode)."""
try:
import torch
if legacy_mm.backend is not None:
if hasattr(legacy_mm.backend, 'model') and legacy_mm.backend.model is not None:
if hasattr(legacy_mm.backend.model, 'to'):
try:
legacy_mm.backend.model.to('cpu')
print(f" Moved legacy model_manager model to CPU")
except Exception as e:
print(f" Warning: Could not move legacy model to CPU: {e}")
if torch.cuda.is_available():
torch.cuda.synchronize()
torch.cuda.empty_cache()
gc.collect()
except Exception as e:
print(f" Warning during legacy model CPU offload: {e}")
def unload_all_models(self): def unload_all_models(self):
""" """
Fully unload ALL models from VRAM. Used in ondemand mode when switching Fully unload ALL models from VRAM. Used in ondemand mode when switching
...@@ -1082,9 +1319,10 @@ class MultiModelManager: ...@@ -1082,9 +1319,10 @@ class MultiModelManager:
time.sleep(1) time.sleep(1)
print("=== FULL VRAM CLEANUP: Complete ===") print("=== FULL VRAM CLEANUP: Complete ===")
def add_model(self, key: str, manager: ModelManager): def add_model(self, key: str, manager):
"""Add a model manager for a specific key.""" """Add a model (ModelManager, diffusers pipeline, sd.cpp model, etc.) for a specific key."""
self.models[key] = manager self.models[key] = manager
self.active_in_vram = key
def get_model(self, key: str) -> Optional[ModelManager]: def get_model(self, key: str) -> Optional[ModelManager]:
"""Get a model manager by key.""" """Get a model manager by key."""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment