Commit b782a092 authored by Your Name

Add --no-ram option to maximize VRAM usage

- Add --no-ram CLI option to force model loading without CPU RAM spilling
- Implement --no-ram behavior for the following backends (loader-level sketch below):
  - llama-cpp-python: n_gpu_layers=-1, use_mmap=False, ignore --n-ctx
  - HuggingFace transformers: device_map='cuda:0', low_cpu_mem_usage=True
  - Diffusers: force full GPU loading
  - sd.cpp: maximize GPU usage
- Propagate flag through model manager
- Add startup banner message
parent ef949827
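For orientation, the per-backend behavior listed above boils down to a handful of loader arguments. A minimal sketch of the two text-model paths follows; the model paths/ids are placeholders, not values from this commit, and the real call sites are in the diff below:

# Sketch only: how --no-ram maps onto the two main text-model loader APIs.
from llama_cpp import Llama
from transformers import AutoModelForCausalLM

# llama-cpp-python path: all layers on the GPU, no mmap, model-default context.
llm = Llama(
    model_path="model.gguf",  # placeholder path
    n_gpu_layers=-1,          # offload every layer to the GPU
    use_mmap=False,           # avoid memory-mapped file pages sitting in CPU RAM
    n_ctx=0,                  # 0 = use the context length stored in the GGUF
)

# HuggingFace transformers path: whole model on one CUDA device, minimal CPU staging.
model = AutoModelForCausalLM.from_pretrained(
    "org/model",              # placeholder model id
    device_map="cuda:0",
    low_cpu_mem_usage=True,
    torch_dtype="auto",
)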
@@ -187,6 +187,24 @@ def _is_gguf_model(model_name: str) -> bool:
(model_name.startswith('http') and '.gguf' in model_name))
def _derive_diffusers_device(global_args) -> str:
"""Derive the CUDA device string for diffusers from global args.
Checks --image-vulkan-device then --vulkan-device to determine
which CUDA device to target. Defaults to 'cuda:0'.
"""
if global_args:
# Check image-specific device first
image_device = getattr(global_args, 'image_vulkan_device', None)
if image_device is not None:
return f"cuda:{image_device}"
# Fall back to general device
device_id = getattr(global_args, 'vulkan_device', 0)
if device_id is not None and device_id != 0:
return f"cuda:{device_id}"
return "cuda:0"
def _load_diffusers_pipeline(model_name: str, global_args):
"""
Try to load a model using the diffusers library.
@@ -197,6 +215,9 @@ def _load_diffusers_pipeline(model_name: str, global_args):
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
import torch
# Check for --no-ram mode
no_ram = getattr(global_args, 'no_ram', False) if global_args else False
# Determine precision from CLI argument (--image-precision)
precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
precision_map = {
@@ -207,11 +228,55 @@ def _load_diffusers_pipeline(model_name: str, global_args):
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
# --no-ram mode: force fp16 to maximize VRAM efficiency
if no_ram:
dtype = torch.float16  # fp16 halves the footprint of fp32 weights
print("--no-ram mode: using fp16 precision for maximum VRAM efficiency")
else:
print(f"Using precision: {precision} ({dtype})")
# Check if CPU offload is requested via CLI
use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
# --no-ram mode: never use CPU offload
if no_ram and use_sequential_offload:
print("--no-ram mode: ignoring --image-cpu-offload, forcing full GPU loading")
use_sequential_offload = False
# =====================================================================
# --no-ram mode: load directly on GPU, no CPU RAM fallback
# =====================================================================
if no_ram and torch.cuda.is_available():
cuda_device = _derive_diffusers_device(global_args)
print(f"--no-ram mode: loading diffusers model directly on {cuda_device}")
try:
try:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_name,
torch_dtype=dtype,
use_safetensors=True,
)
except Exception:
pipeline = DiffusionPipeline.from_pretrained(
model_name,
torch_dtype=dtype,
use_safetensors=True,
)
pipeline = pipeline.to(cuda_device)
print(f"--no-ram: Diffusers model loaded on {cuda_device}")
return pipeline
except Exception as e:
raise RuntimeError(
f"--no-ram: Failed to load diffusers model entirely on GPU ({cuda_device}). "
f"The model may be too large for available VRAM. Error: {e}"
)
# =====================================================================
# Standard loading path (with OOM fallback)
# =====================================================================
# Track loading attempts for OOM handling
pipeline = None
load_attempt = 0
@@ -419,6 +484,9 @@ def _load_sdcpp_model(model_path: str, global_args):
"""
from stable_diffusion_cpp import StableDiffusion
# Check for --no-ram mode
no_ram = getattr(global_args, 'no_ram', False) if global_args else False
print(f"Loading sd.cpp model from: {model_path}")
# Build sd.cpp constructor args from config
@@ -433,6 +501,15 @@ def _load_sdcpp_model(model_path: str, global_args):
if hasattr(global_args, 'llm_path') and global_args.llm_path:
kwargs['lora_model_dir'] = global_args.llm_path
# --no-ram mode: maximize GPU offloading for sd.cpp
if no_ram:
# stable-diffusion-cpp-python supports n_threads and gpu-related params
# Force full GPU offload by keeping all operations on GPU
kwargs['keep_clip_on_cpu'] = False # Don't offload CLIP to CPU
kwargs['keep_control_net_cpu'] = False # Don't offload ControlNet to CPU
kwargs['keep_vae_on_cpu'] = False # Don't offload VAE to CPU
print("--no-ram mode: sd.cpp maximizing GPU usage (no CPU offload for CLIP/VAE/ControlNet)")
sd_model = StableDiffusion(**kwargs)
return sd_model
@@ -254,6 +254,24 @@ class NvidiaBackend(ModelBackend):
return self._get_vram_percentages_for_strategy(strategy, is_moe, total_vram_gb)
def _derive_cuda_device(self) -> str:
"""Derive the CUDA device string from global args.
Checks --vulkan-device (reused as generic GPU device ID) to determine
which CUDA device to target. Defaults to 'cuda:0'.
"""
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args:
# Use vulkan-device as a generic GPU device selector
device_id = getattr(_global_args, 'vulkan_device', 0)
if device_id is not None and device_id != 0:
return f"cuda:{device_id}"
except Exception:
pass
return "cuda:0"
def load_model(self, model_name: str, **kwargs) -> None:
"""Load the model using HuggingFace Transformers with automatic OOM handling."""
import torch
@@ -267,6 +285,17 @@ class NvidiaBackend(ModelBackend):
offload_strategy = kwargs.get('offload_strategy', 'auto')
max_gpu_percent = kwargs.get('max_gpu_percent', None)
# Check for --no-ram mode
no_ram = kwargs.get('no_ram', False)
if not no_ram:
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args and getattr(_global_args, 'no_ram', False):
no_ram = True
except Exception:
pass
self._pending_ram_gb = manual_ram_gb
print(f"Loading HuggingFace model: {model_name}")
@@ -285,6 +314,60 @@ class NvidiaBackend(ModelBackend):
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# =====================================================================
# --no-ram mode: maximize VRAM, no CPU RAM spilling
# =====================================================================
if no_ram and self.device == "cuda":
cuda_device = self._derive_cuda_device()
print(f"--no-ram mode: loading model directly on {cuda_device}")
print(f" device_map={cuda_device}, low_cpu_mem_usage=True, torch_dtype=auto")
load_kwargs = {
'trust_remote_code': True,
'device_map': cuda_device,
'low_cpu_mem_usage': True,
'torch_dtype': "auto",
}
if self.use_flash_attn and self.flash_attn_available:
load_kwargs['attn_implementation'] = "flash_attention_2"
print(" Using Flash Attention 2")
# Still allow quantization in no-ram mode (reduces VRAM usage)
if load_in_4bit or load_in_8bit:
if 'qwen3.5' in model_name.lower() and ('a3b' in model_name.lower() or 'moe' in model_name.lower()):
print(f" Warning: {model_name} does not support bitsandbytes quantization")
else:
try:
import bitsandbytes as bnb
print(f" Using {4 if load_in_4bit else 8}-bit quantization")
load_kwargs['load_in_4bit'] = load_in_4bit
load_kwargs['load_in_8bit'] = load_in_8bit
except ImportError:
print(" Warning: bitsandbytes not installed. Quantization disabled.")
try:
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
self.model = model
self.model.eval()
self.model_name = model_name
print(f"\n--no-ram: Model loaded successfully on {cuda_device}")
print(f"Model device: {next(self.model.parameters()).device}")
caps = detect_model_capabilities(model_name)
print(f"Model capabilities: {caps}")
return
except Exception as e:
print(f"--no-ram: Failed to load model on {cuda_device}: {e}")
raise RuntimeError(
f"--no-ram: Failed to load model entirely on GPU ({cuda_device}). "
f"The model may be too large for available VRAM. Error: {e}"
)
# =====================================================================
# Standard loading path (with OOM fallback)
# =====================================================================
load_kwargs = {'trust_remote_code': True}
if load_in_4bit or load_in_8bit:
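A quick way to confirm that the --no-ram HuggingFace path really left nothing behind in CPU RAM is to scan parameter devices after loading. A hypothetical helper (not part of this commit; the function name is invented here):

# Hypothetical post-load check: raise if any parameter was spilled off the GPU.
import torch

def assert_fully_on_gpu(model):
    cpu_params = [name for name, p in model.named_parameters() if p.device.type != "cuda"]
    if cpu_params:
        raise RuntimeError(f"{len(cpu_params)} parameters are off-GPU, e.g. {cpu_params[:3]}")
    print(f"All parameters on GPU; VRAM allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")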
@@ -476,12 +476,31 @@ class VulkanBackend(ModelBackend):
# Determine model type
is_image = model_type == "image" or model_path.startswith("image:")
# Check for --no-ram mode from global args
no_ram = kwargs.get('no_ram', False)
if not no_ram:
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args and getattr(_global_args, 'no_ram', False):
no_ram = True
except Exception:
pass
# Configure GPU layers
n_gpu_layers = kwargs.get('n_gpu_layers', -1)
if no_ram:
# --no-ram: force all layers onto the GPU
self.n_gpu_layers = -1
elif n_gpu_layers != -1:
self.n_gpu_layers = n_gpu_layers
# Configure context size
if no_ram:
# --no-ram: ignore --n-ctx, let the model use its own default
self.n_ctx = 0 # 0 means use model's built-in default in llama.cpp
print("DEBUG: --no-ram mode: ignoring --n-ctx, using model default context size")
else:
n_ctx = kwargs.get('n_ctx', 2048)
self.n_ctx = n_ctx
@@ -500,6 +519,11 @@ class VulkanBackend(ModelBackend):
'main_gpu': self.main_gpu,
}
# --no-ram: disable mmap to prevent CPU RAM usage for memory-mapped files
if no_ram:
llama_kwargs['use_mmap'] = False
print("DEBUG: --no-ram mode: use_mmap=False, n_gpu_layers=-1")
# Add optional parameters
if 'n_threads' in kwargs:
llama_kwargs['n_threads'] = kwargs['n_threads']
@@ -523,7 +547,7 @@ class VulkanBackend(ModelBackend):
self._finalize_chat_template_detection()
print(f"DEBUG: VulkanBackend loaded model: {model_path}")
print(f"DEBUG: n_gpu_layers={self.n_gpu_layers}, n_ctx={self.n_ctx}")
print(f"DEBUG: n_gpu_layers={self.n_gpu_layers}, n_ctx={self.n_ctx}, no_ram={no_ram}")
print(f"DEBUG: chat_template={self.chat_template}")
except Exception as e:
print(f"Error loading GGUF model: {e}")
@@ -433,4 +433,14 @@ def parse_args():
default=False,
help="Enable prompt distillation: place tool definitions right before the user's latest request instead of in the system prompt. This can improve tool call accuracy.",
)
parser.add_argument(
"--no-ram",
action="store_true",
default=False,
help="Force model loading to maximize VRAM usage without CPU RAM spilling. "
"For llama-cpp-python: sets n_gpu_layers=-1, use_mmap=False, ignores --n-ctx. "
"For HuggingFace transformers: sets device_map='cuda:0', low_cpu_mem_usage=True, torch_dtype='auto'. "
"For diffusers: forces full GPU loading without CPU offload. "
"For sd.cpp: maximizes GPU layer offloading.",
)
return parser.parse_args()
@@ -187,6 +187,14 @@ def main():
if grammar_guided_gen:
print("Grammar-guided generation enabled (--grammar-guided-gen)")
# Print --no-ram mode status
if args.no_ram:
print("No-RAM mode enabled (--no-ram): maximizing VRAM usage, no CPU RAM spilling")
print(" llama-cpp-python: n_gpu_layers=-1, use_mmap=False, --n-ctx ignored")
print(" HuggingFace: device_map=cuda, low_cpu_mem_usage=True, torch_dtype=auto")
print(" Diffusers: forced full GPU loading")
print(" sd.cpp: maximizing GPU offload")
# Set global system prompt from --system-prompt flag
global_system_prompt = args.system_prompt
set_global_system_prompt(global_system_prompt)
@@ -530,6 +530,8 @@ class MultiModelManager:
kwargs['ram'] = global_args.ram
if hasattr(global_args, 'flash_attn'):
kwargs['flash_attn'] = global_args.flash_attn
if hasattr(global_args, 'no_ram'):
kwargs['no_ram'] = global_args.no_ram
print(f"Loading default model on demand: {self.default_model}")
model_manager.load_model(self.default_model, backend_type=backend_type, **kwargs)
@@ -579,6 +581,8 @@ class MultiModelManager:
kwargs['ram'] = global_args.ram
if hasattr(global_args, 'flash_attn'):
kwargs['flash_attn'] = global_args.flash_attn
if hasattr(global_args, 'no_ram'):
kwargs['no_ram'] = global_args.no_ram
print(f"Loading model on demand: {model_name}")
model_manager.load_model(model_name, backend_type=backend_type, **kwargs)