Implement proper loadswap/loadall/ondemand model management modes

- Default mode changed to ondemand (pre-load first model, unload/load on switch)
- loadswap: load first model in VRAM, others in CPU RAM, swap on switch
- loadall: try to load all models in VRAM, offload to CPU RAM if OOM
- --nopreload: skip pre-loading in any mode, load on first request
- request_model() now properly handles all three modes
- Added _move_model_to_cpu() and _move_model_to_vram() for loadswap
- Fixed NameError in request_model(): the bare model_manager name was not in scope; the legacy singleton is now imported inside the method as _legacy_mm
- Updated CLI help text for --loadall, --loadswap, --nopreload

Implement proper loadswap/loadall/ondemand model management modes
- Default mode changed to ondemand (pre-load first model, unload/load on switch)
- loadswap: load first model in VRAM, others in CPU RAM, swap on switch
- loadall: try to load all models in VRAM, offload to CPU RAM if OOM
- --nopreload: skip pre-loading in any mode, load on first request
- request_model() now properly handles all three modes
- Added _move_model_to_cpu() and _move_model_to_vram() for loadswap
- Fixed NameError in request_model(): the bare model_manager name was not in scope; the legacy singleton is now imported inside the method as _legacy_mm
- Updated CLI help text for --loadall, --loadswap, --nopreload
c08a5b4f · Your Name · e004541a · c08a5b4f · c08a5b4f · c08a5b4f
Commit c08a5b4f authored Mar 19, 2026 by Your Name
Hide whitespace changes
Inline Side-by-side

Showing with 352 additions and 44 deletions

cli.py codai/cli.py +3 -3

main.py codai/main.py +83 -13

manager.py codai/models/manager.py +266 -28

No files found.
--- a/codai/cli.py
+++ b/codai/cli.py
@@ -280,17 +280,17 @@ def parse_args():
    parser.add_argument(
        "--loadall",
        action="store_true",
-        help="Pre-load all models (main, audio, image) at startup instead of on-demand",
+        help="Load all models at startup. Tries VRAM first, offloads to CPU RAM if VRAM is full.",
    )
    parser.add_argument(
        "--loadswap",
        action="store_true",
-        help="Keep all models loaded, swapping active model between VRAM and RAM (only active model in VRAM)",
+        help="Load first model in VRAM, others in CPU RAM. Swap active model between VRAM and CPU RAM on switch.",
    )
    parser.add_argument(
        "--nopreload",
        action="store_true",
-        help="Disable model preloading. Models will load on first request instead of at startup",
+        help="Skip model pre-loading at startup. Models will load on first request using the active mode strategy (ondemand/loadswap/loadall).",
    )
    parser.add_argument(
        "--audio-ctx",

--- a/codai/main.py
+++ b/codai/main.py
@@ -233,18 +233,29 @@ def main():
        sys.exit(1)
    # Determine load mode
-    # Default is to preload (loadall) unless --nopreload is specified
+    # Default is ondemand: pre-load only the first model, unload/load on switch
-    load_mode = "loadall"  # Default: preload models
+    # --loadswap: load first in VRAM, others in CPU RAM, swap on switch
+    # --loadall: try to load all models in VRAM, offload to CPU RAM if fails
+    # --nopreload: skip pre-loading in any mode, load on first request
+    load_mode = "ondemand"  # Default: on-demand loading
    if args.loadall:
        load_mode = "loadall"
    elif args.loadswap:
        load_mode = "loadswap"
-    elif args.nopreload:
-        load_mode = "ondemand"
+    nopreload = args.nopreload
    set_load_mode(load_mode)
+    multi_model_manager.set_load_mode(load_mode)
    if load_mode == "ondemand":
-        print("Load mode: ondemand (load model on first request)")
+        print("Load mode: ondemand (pre-load first model, unload/load on switch)")
+    elif load_mode == "loadswap":
+        print("Load mode: loadswap (first model in VRAM, others in CPU RAM, swap on switch)")
+    elif load_mode == "loadall":
+        print("Load mode: loadall (load all models, offload to CPU RAM if VRAM full)")
+    if nopreload:
+        print("  --nopreload: models will load on first request instead of at startup")
    # Initialize model manager
    print("\n=== Initializing Model Manager ===")
@@ -278,28 +289,87 @@ def main():
    # Load main text model(s)
    if model_names:
-        print(f"\nLoading main text model(s): {model_names}")
+        print(f"\nMain text model(s): {model_names}")
-        # Register models with multi_model_manager
+        # Register models with multi_model_manager (set_default_model also resolves/caches)
        for idx, model_name in enumerate(model_names):
            multi_model_manager.set_default_model(model_name, {
                'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
            })
-        # Load first model (unless nopreload mode)
+        # Pre-load models at startup (unless --nopreload)
-        if load_mode == "loadall":
+        if nopreload:
+            print(f"  --nopreload: text model(s) will load on first request")
+        elif load_mode == "ondemand":
+            # Ondemand: pre-load only the first model into VRAM
            try:
-                print(f"Loading model: {model_names[0]}...")
+                print(f"Preloading first model into VRAM: {model_names[0]}...")
                mm = multi_model_manager._load_default_model()
                if mm is not None and mm.backend is not None:
+                    multi_model_manager.active_in_vram = multi_model_manager.default_model
                    print(f"Model loaded successfully: {model_names[0]}")
                else:
                    print(f"Warning: Model {model_names[0]} failed to load")
            except Exception as e:
-                print(f"Warning: Failed to load model: {e}")
+                print(f"Warning: Failed to preload model: {e}")
                print(f"Model will load on first request")
-        else:
+        elif load_mode == "loadswap":
-            print(f"Load mode: ondemand (model will load on first request)")
+            # Loadswap: load first model into VRAM, others into CPU RAM
+            try:
+                print(f"Preloading first model into VRAM: {model_names[0]}...")
+                mm = multi_model_manager._load_default_model()
+                if mm is not None and mm.backend is not None:
+                    multi_model_manager.active_in_vram = multi_model_manager.default_model
+                    print(f"Model loaded successfully (VRAM): {model_names[0]}")
+                else:
+                    print(f"Warning: Model {model_names[0]} failed to load")
+            except Exception as e:
+                print(f"Warning: Failed to preload model: {e}")
+            # Load remaining text models into CPU RAM
+            for idx, model_name in enumerate(model_names[1:], 1):
+                try:
+                    print(f"Preloading model into CPU RAM: {model_name}...")
+                    mm2 = multi_model_manager._load_model_by_name(model_name)
+                    if mm2 is not None:
+                        # Move to CPU immediately (it was loaded into VRAM by default)
+                        multi_model_manager._move_model_to_cpu(model_name)
+                        print(f"Model loaded successfully (CPU RAM): {model_name}")
+                    else:
+                        print(f"Warning: Model {model_name} failed to load")
+                except Exception as e:
+                    print(f"Warning: Failed to preload model {model_name}: {e}")
+        elif load_mode == "loadall":
+            # Loadall: try to load all models into VRAM, offload to CPU RAM if fails
+            for idx, model_name in enumerate(model_names):
+                try:
+                    if idx == 0:
+                        print(f"Preloading model into VRAM: {model_name}...")
+                        mm = multi_model_manager._load_default_model()
+                    else:
+                        print(f"Preloading model into VRAM: {model_name}...")
+                        mm = multi_model_manager._load_model_by_name(model_name)
+                    if mm is not None and (not hasattr(mm, 'backend') or mm.backend is not None):
+                        if idx == 0:
+                            multi_model_manager.active_in_vram = multi_model_manager.default_model
+                        print(f"Model loaded successfully (VRAM): {model_name}")
+                    else:
+                        print(f"Warning: Model {model_name} failed to load")
+                except Exception as e:
+                    error_msg = str(e).lower()
+                    is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
+                    if is_oom:
+                        print(f"VRAM full for {model_name}, offloading to CPU RAM...")
+                        try:
+                            mm = multi_model_manager._load_model_by_name(model_name)
+                            if mm is not None:
+                                multi_model_manager._move_model_to_cpu(model_name)
+                                print(f"Model loaded successfully (CPU RAM): {model_name}")
+                        except Exception as e2:
+                            print(f"Warning: Failed to load model {model_name} even to CPU: {e2}")
+                    else:
+                        print(f"Warning: Failed to preload model {model_name}: {e}")
    # Set up audio model if specified
    if audio_models:

--- a/codai/models/manager.py
+++ b/codai/models/manager.py
@@ -374,7 +374,7 @@ class MultiModelManager:
    """
    def __init__(self):
-        self.models: Dict[str, ModelManager] = {}
+        self.models: Dict[str, Any] = {}  # Can hold ModelManager, diffusers pipelines, sd.cpp models, etc.
        self.default_model: Optional[str] = None
        self.audio_models: List[str] = []
        self.tts_model: Optional[str] = None
@@ -883,14 +883,127 @@ class MultiModelManager:
        return load_model(model_path, cache_dir, file_pattern)
+    def _move_model_to_cpu(self, model_key: str):
+        """
+        Move a model from VRAM to CPU RAM (for loadswap mode).
+        The model stays in self.models but is moved to CPU so it doesn't
+        consume VRAM. It can be moved back to VRAM later.
+        """
+        model_obj = self.models.get(model_key)
+        if model_obj is None:
+            return
+        print(f"Moving model '{model_key}' from VRAM to CPU RAM...")
+        try:
+            import torch
+            # Case 1: ModelManager with a backend
+            if isinstance(model_obj, ModelManager) and model_obj.backend is not None:
+                backend = model_obj.backend
+                if hasattr(backend, 'model') and backend.model is not None:
+                    if hasattr(backend.model, 'to'):
+                        try:
+                            backend.model.to('cpu')
+                            print(f"  Moved backend model to CPU")
+                        except Exception as e:
+                            print(f"  Warning: Could not move backend model to CPU: {e}")
+                    # For llama-cpp-python models, we can't move to CPU easily
+                    # They stay in memory but we track them
+            # Case 2: Diffusers pipeline (has 'to' method)
+            elif hasattr(model_obj, 'to') and callable(getattr(model_obj, 'to')):
+                try:
+                    model_obj.to('cpu')
+                    print(f"  Moved diffusers pipeline to CPU")
+                except Exception as e:
+                    print(f"  Warning: Could not move pipeline to CPU: {e}")
+            # Case 3: Object with a model attribute
+            elif hasattr(model_obj, 'model') and model_obj.model is not None:
+                if hasattr(model_obj.model, 'to'):
+                    try:
+                        model_obj.model.to('cpu')
+                        print(f"  Moved inner model to CPU")
+                    except Exception as e:
+                        print(f"  Warning: Could not move inner model to CPU: {e}")
+            # Clear CUDA cache after moving to CPU
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+                torch.cuda.empty_cache()
+            gc.collect()
+        except ImportError:
+            print(f"  Warning: torch not available, cannot move model to CPU")
+        except Exception as e:
+            print(f"  Warning during CPU offload of '{model_key}': {e}")
+    def _move_model_to_vram(self, model_key: str):
+        """
+        Move a model from CPU RAM back to VRAM (for loadswap mode).
+        """
+        model_obj = self.models.get(model_key)
+        if model_obj is None:
+            return
+        print(f"Moving model '{model_key}' from CPU RAM to VRAM...")
+        try:
+            import torch
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            # Case 1: ModelManager with a backend
+            if isinstance(model_obj, ModelManager) and model_obj.backend is not None:
+                backend = model_obj.backend
+                if hasattr(backend, 'model') and backend.model is not None:
+                    if hasattr(backend.model, 'to'):
+                        try:
+                            backend.model.to(device)
+                            print(f"  Moved backend model to {device}")
+                        except Exception as e:
+                            print(f"  Warning: Could not move backend model to {device}: {e}")
+            # Case 2: Diffusers pipeline
+            elif hasattr(model_obj, 'to') and callable(getattr(model_obj, 'to')):
+                try:
+                    model_obj.to(device)
+                    print(f"  Moved diffusers pipeline to {device}")
+                except Exception as e:
+                    print(f"  Warning: Could not move pipeline to {device}: {e}")
+            # Case 3: Object with a model attribute
+            elif hasattr(model_obj, 'model') and model_obj.model is not None:
+                if hasattr(model_obj.model, 'to'):
+                    try:
+                        model_obj.model.to(device)
+                        print(f"  Moved inner model to {device}")
+                    except Exception as e:
+                        print(f"  Warning: Could not move inner model to {device}: {e}")
+        except ImportError:
+            print(f"  Warning: torch not available, cannot move model to VRAM")
+        except Exception as e:
+            print(f"  Warning during VRAM load of '{model_key}': {e}")
    def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]:
        """
        Central method for API modules to request a model.
-        Handles:
+        Handles three load modes:
-        1. Alias resolution (e.g., "image" -> "Tongyi-MAI/Z-Image-Turbo")
-        2. VRAM management (unloading previous models in ondemand mode)
+        **loadall**: All models are pre-loaded at startup. Just return the
-        3. Checking if model is already loaded
+        already-loaded model. No VRAM management needed.
+        **loadswap**: All models stay loaded (in CPU RAM or VRAM). When a
+        different model is requested, the current VRAM model is moved to CPU
+        RAM and the requested model is moved from CPU RAM to VRAM.
+        **ondemand** (default when no flag specified): Only one model in memory
+        at a time. When a different model is requested, the current model is
+        fully unloaded (deleted) and the new one is loaded from scratch.
        Args:
            requested_model: The model name/alias from the API request
@@ -903,7 +1016,7 @@ class MultiModelManager:
                - 'model_name': The resolved model name/path/HF ID
                - 'model_object': The loaded model object if already loaded, None otherwise
                - 'config': The stored configuration for this model
-                - 'already_loaded': True if the model is already loaded in VRAM
+                - 'already_loaded': True if the model is already loaded and ready in VRAM
        """
        from codai.api.state import get_load_mode
        mode = get_load_mode()
@@ -968,10 +1081,107 @@ class MultiModelManager:
        else:
            model_key = resolved_name
-        # Step 3: Check if already loaded
+        # Step 3: Check if already loaded in self.models
        existing_model = self.models.get(model_key)
+        # =====================================================================
+        # LOADALL MODE: All models should be pre-loaded. Just return it.
+        # =====================================================================
+        if mode == "loadall":
+            if existing_model is not None:
+                self.current_model_key = model_key
+                self.active_in_vram = model_key
+                return {
+                    'model_key': model_key,
+                    'model_name': resolved_name,
+                    'model_object': existing_model,
+                    'config': self.config.get(model_key, {}),
+                    'already_loaded': True,
+                }
+            # Model not loaded yet in loadall mode - caller needs to load it
+            # (this happens for models not pre-loaded at startup, e.g., image models)
+            print(f"Loadall mode: Model '{model_key}' not pre-loaded, will load now")
+            return {
+                'model_key': model_key,
+                'model_name': resolved_name,
+                'model_object': None,
+                'config': self.config.get(model_key, {}),
+                'already_loaded': False,
+            }
+        # =====================================================================
+        # LOADSWAP MODE: Keep all models in memory. Swap active model between
+        # VRAM and CPU RAM. Only the active model should be in VRAM.
+        # =====================================================================
+        if mode == "loadswap":
+            if existing_model is not None:
+                # Model is loaded (either in VRAM or CPU RAM)
+                if self.active_in_vram == model_key:
+                    # Already the active model in VRAM
+                    self.current_model_key = model_key
+                    return {
+                        'model_key': model_key,
+                        'model_name': resolved_name,
+                        'model_object': existing_model,
+                        'config': self.config.get(model_key, {}),
+                        'already_loaded': True,
+                    }
+                else:
+                    # Model is in CPU RAM - need to swap
+                    # First, move current VRAM model to CPU
+                    if self.active_in_vram and self.active_in_vram in self.models:
+                        print(f"Loadswap: Moving '{self.active_in_vram}' from VRAM to CPU RAM")
+                        self._move_model_to_cpu(self.active_in_vram)
+                    # Also check the legacy model_manager singleton
+                    from codai.models.manager import model_manager as _legacy_mm
+                    if _legacy_mm.backend is not None and self.active_in_vram is None:
+                        print(f"Loadswap: Moving legacy model_manager model to CPU")
+                        self._move_model_to_cpu_legacy(_legacy_mm)
+                    # Now move the requested model to VRAM
+                    print(f"Loadswap: Moving '{model_key}' from CPU RAM to VRAM")
+                    self._move_model_to_vram(model_key)
+                    self.active_in_vram = model_key
+                    self.current_model_key = model_key
+                    return {
+                        'model_key': model_key,
+                        'model_name': resolved_name,
+                        'model_object': existing_model,
+                        'config': self.config.get(model_key, {}),
+                        'already_loaded': True,
+                    }
+            else:
+                # Model not loaded at all - move current VRAM model to CPU first
+                if self.active_in_vram and self.active_in_vram in self.models:
+                    print(f"Loadswap: Moving '{self.active_in_vram}' from VRAM to CPU RAM")
+                    self._move_model_to_cpu(self.active_in_vram)
+                # Also check the legacy model_manager singleton
+                from codai.models.manager import model_manager as _legacy_mm
+                if _legacy_mm.backend is not None and self.active_in_vram is None:
+                    print(f"Loadswap: Moving legacy model_manager model to CPU")
+                    self._move_model_to_cpu_legacy(_legacy_mm)
+                # Caller needs to load the model fresh (into VRAM)
+                self.active_in_vram = model_key  # Will be set once loaded
+                return {
+                    'model_key': model_key,
+                    'model_name': resolved_name,
+                    'model_object': None,
+                    'config': self.config.get(model_key, {}),
+                    'already_loaded': False,
+                }
+        # =====================================================================
+        # ONDEMAND MODE (default): Only one model in memory at a time.
+        # Fully unload the current model before loading the new one.
+        # =====================================================================
        if existing_model is not None:
+            # Already loaded and it's the only model - return it
            self.current_model_key = model_key
+            self.active_in_vram = model_key
            return {
                'model_key': model_key,
                'model_name': resolved_name,
@@ -980,28 +1190,35 @@ class MultiModelManager:
                'already_loaded': True,
            }
-        # Step 4: In ondemand mode, unload any currently loaded model
+        # Model not loaded - need to unload whatever is currently loaded
-        if mode == "ondemand":
+        # Check if there's anything to unload
-            has_any_model = len(self.models) > 0 or model_manager.backend is not None
+        has_models_in_multi = len(self.models) > 0
+        # Also check the legacy model_manager singleton
+        from codai.models.manager import model_manager as _legacy_mm
+        has_legacy_model = _legacy_mm.backend is not None
+        if has_models_in_multi or has_legacy_model:
+            loaded_canonical = self.get_currently_loaded_model_name()
+            if not loaded_canonical and has_legacy_model:
+                loaded_canonical = "legacy_model_manager"
-            if has_any_model:
+            if loaded_canonical and loaded_canonical != model_key:
-                loaded_canonical = self.get_currently_loaded_model_name()
+                print(f"Ondemand mode - model switch detected:")
-                if not loaded_canonical and model_manager.backend is not None:
+                print(f"  Requested: '{model_key}' (resolved: '{resolved_name}')")
-                    loaded_canonical = "legacy_model_manager"
+                print(f"  Currently loaded: '{loaded_canonical}'")
+                print(f"  -> Unloading current model(s) before loading new model...")
+                self.unload_all_models()
-                if loaded_canonical and loaded_canonical != model_key:
+                # Also cleanup the legacy singleton if it has a model
-                    print(f"Ondemand mode - model switch detected:")
+                if has_legacy_model:
-                    print(f"  Requested: '{model_key}' (resolved: '{resolved_name}')")
+                    try:
-                    print(f"  Currently loaded: '{loaded_canonical}'")
+                        print(f"  -> Cleaning up legacy model_manager...")
-                    print(f"  -> Unloading current model(s) before loading new model...")
+                        _legacy_mm.cleanup()
-                    self.unload_all_models()
+                    except Exception as e:
-                    if model_manager.backend is not None:
+                        print(f"  Warning: Error cleaning up legacy model_manager: {e}")
-                        try:
-                            model_manager.cleanup()
-                        except:
-                            pass
-        # Step 5: Return info for the caller to load the model
+        # Return info for the caller to load the model
        return {
            'model_key': model_key,
            'model_name': resolved_name,
@@ -1010,6 +1227,26 @@ class MultiModelManager:
            'already_loaded': False,
        }
+    def _move_model_to_cpu_legacy(self, legacy_mm):
+        """Move the legacy model_manager's model to CPU (for loadswap mode)."""
+        try:
+            import torch
+            if legacy_mm.backend is not None:
+                if hasattr(legacy_mm.backend, 'model') and legacy_mm.backend.model is not None:
+                    if hasattr(legacy_mm.backend.model, 'to'):
+                        try:
+                            legacy_mm.backend.model.to('cpu')
+                            print(f"  Moved legacy model_manager model to CPU")
+                        except Exception as e:
+                            print(f"  Warning: Could not move legacy model to CPU: {e}")
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+                torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            print(f"  Warning during legacy model CPU offload: {e}")
    def unload_all_models(self):
        """
        Fully unload ALL models from VRAM. Used in ondemand mode when switching
@@ -1082,9 +1319,10 @@ class MultiModelManager:
        time.sleep(1)
        print("=== FULL VRAM CLEANUP: Complete ===")
-    def add_model(self, key: str, manager: ModelManager):
+    def add_model(self, key: str, manager):
-        """Add a model manager for a specific key."""
+        """Add a model (ModelManager, diffusers pipeline, sd.cpp model, etc.) for a specific key."""
        self.models[key] = manager
+        self.active_in_vram = key
    def get_model(self, key: str) -> Optional[ModelManager]:
        """Get a model manager by key."""