Add --offload-strategy none to disable CPU offloading and VRAM auto-detection

- Add 'none' to --offload-strategy choices in cli.py
- In cuda.py backend:
  - _get_vram_percentages_for_strategy() returns None for 'none' strategy
  - _get_vram_percentages_for_gpu() skips VRAM detection for 'none'
  - load_model() loads directly on GPU without max_memory constraints
- Add startup status message in main.py for --offload-strategy none

Add --offload-strategy none to disable CPU offloading and VRAM auto-detection
- Add 'none' to --offload-strategy choices in cli.py
- In cuda.py backend:
  - _get_vram_percentages_for_strategy() returns None for 'none' strategy
  - _get_vram_percentages_for_gpu() skips VRAM detection for 'none'
  - load_model() loads directly on GPU without max_memory constraints
- Add startup status message in main.py for --offload-strategy none
beded066 · Your Name · b782a092 · beded066 · beded066 · beded066
Commit beded066 authored Mar 20, 2026 by Your Name
Expand all Show whitespace changes
Inline Side-by-side

Showing with 714 additions and 35 deletions

cuda.py codai/backends/cuda.py +61 -33

cli.py codai/cli.py +2 -2

main.py codai/main.py +5 -0

main.py~ codai/main.py~ +646 -0

No files found.
--- a/codai/backends/cuda.py
+++ b/codai/backends/cuda.py
@@ -185,6 +185,9 @@ class NvidiaBackend(ModelBackend):
    
    def _get_vram_percentages_for_strategy(self, strategy: str, is_moe: bool, total_vram_gb: float) -> list:
        """Get VRAM percentage steps based on offload strategy."""
+        if strategy == "none":
+            print(f"  Offload strategy 'none': disabling CPU offload and VRAM auto-detection")
+            return None  # Signal to skip offloading entirely
        if strategy == "conservative":
            print(f"  Using conservative offload strategy")
            if is_moe:
@@ -221,9 +224,15 @@ class NvidiaBackend(ModelBackend):
                    return [0.93, 0.85, 0.75, 0.65, 0.50, 0.35, 0.20, 0.0]
    
    def _get_vram_percentages_for_gpu(self, model_name: str = "", strategy: str = "auto", max_gpu_percent: float = None) -> list:
-        """Get VRAM percentage steps based on GPU memory size."""
+        """Get VRAM percentage steps based on GPU memory size.
+        
+        Returns None when strategy is 'none' (no offloading).
+        """
        import torch
        
+        if strategy == "none":
+            return None  # Signal to skip offloading entirely
+        
        if not torch.cuda.is_available():
            return [0.0]
        
@@ -397,6 +406,25 @@ class NvidiaBackend(ModelBackend):
        
        model = None
        vram_percentages = self._get_vram_percentages_for_gpu(model_name, offload_strategy, max_gpu_percent)
+        
+        # --offload-strategy none: load directly on GPU without offloading or VRAM limits
+        if vram_percentages is None:
+            cuda_device = self._derive_cuda_device()
+            print(f"\nOffload strategy 'none': loading model directly on {cuda_device} (no CPU offload, no VRAM limits)")
+            load_kwargs['device_map'] = cuda_device
+            load_kwargs['low_cpu_mem_usage'] = True
+            load_kwargs['torch_dtype'] = "auto"
+            # Remove dtype set earlier since torch_dtype=auto takes precedence
+            load_kwargs.pop('dtype', None)
+            
+            try:
+                model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
+            except Exception as e:
+                raise RuntimeError(
+                    f"--offload-strategy none: Failed to load model entirely on GPU ({cuda_device}). "
+                    f"The model may be too large for available VRAM. Error: {e}"
+                )
+        else:
            first_vram_pct = vram_percentages[0] if vram_percentages else 0.93
            
            for vram_pct in vram_percentages:

--- a/codai/cli.py
+++ b/codai/cli.py
@@ -117,9 +117,9 @@ def parse_args():
    parser.add_argument(
        "--offload-strategy",
        type=str,
-        choices=["auto", "conservative", "balanced", "aggressive", "sequential"],
+        choices=["auto", "conservative", "balanced", "aggressive", "sequential", "none"],
        default="auto",
-        help="Offload strategy for NVIDIA backend (default: auto)",
+        help="Offload strategy for NVIDIA backend (default: auto). Use 'none' to disable CPU offloading and VRAM auto-detection entirely.",
    )
    parser.add_argument(
        "--max-gpu-percent",

--- a/codai/main.py
+++ b/codai/main.py
@@ -187,6 +187,11 @@ def main():
    if grammar_guided_gen:
        print("Grammar-guided generation enabled (--grammar-guided-gen)")
    
+    # Print --offload-strategy none status
+    if args.offload_strategy == "none":
+        print("Offload strategy 'none': CPU offloading and VRAM auto-detection disabled")
+        print("  Model will be loaded directly on GPU without memory limits")
+    
    # Print --no-ram mode status
    if args.no_ram:
        print("No-RAM mode enabled (--no-ram): maximizing VRAM usage, no CPU RAM spilling")

--- a/codai/main.py~
+++ b/codai/main.py~