Commit b782a092 authored by Your Name

Add --no-ram option to maximize VRAM usage

- Add --no-ram CLI option to force model loading without CPU RAM spilling
- Implement --no-ram behavior for the following backends (loader-level sketch below):
  - llama-cpp-python: n_gpu_layers=-1, use_mmap=False, ignore --n-ctx
  - HuggingFace transformers: device_map='cuda:0', low_cpu_mem_usage=True
  - Diffusers: force full GPU loading
  - sd.cpp: maximize GPU usage
- Propagate flag through model manager
- Add startup banner message
parent ef949827
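For orientation, the per-backend behavior listed above boils down to a handful of loader arguments. A minimal sketch of the two text-model paths follows; the model paths/ids are placeholders, not values from this commit, and the real call sites are in the diff below:

# Sketch only: how --no-ram maps onto the two main text-model loader APIs.
from llama_cpp import Llama
from transformers import AutoModelForCausalLM

# llama-cpp-python path: all layers on the GPU, no mmap, model-default context.
llm = Llama(
    model_path="model.gguf",  # placeholder path
    n_gpu_layers=-1,          # offload every layer to the GPU
    use_mmap=False,           # avoid memory-mapped file pages sitting in CPU RAM
    n_ctx=0,                  # 0 = use the context length stored in the GGUF
)

# HuggingFace transformers path: whole model on one CUDA device, minimal CPU staging.
model = AutoModelForCausalLM.from_pretrained(
    "org/model",              # placeholder model id
    device_map="cuda:0",
    low_cpu_mem_usage=True,
    torch_dtype="auto",
)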
@@ -187,6 +187,24 @@ def _is_gguf_model(model_name: str) -> bool:
(model_name.startswith('http') and '.gguf' in model_name))
def _derive_diffusers_device(global_args) -> str:
"""Derive the CUDA device string for diffusers from global args.
Checks --image-vulkan-device then --vulkan-device to determine
which CUDA device to target. Defaults to 'cuda:0'.
"""
if global_args:
# Check image-specific device first
image_device = getattr(global_args, 'image_vulkan_device', None)
if image_device is not None:
return f"cuda:{image_device}"
# Fall back to general device
device_id = getattr(global_args, 'vulkan_device', 0)
if device_id is not None and device_id != 0:
return f"cuda:{device_id}"
return "cuda:0"
def _load_diffusers_pipeline(model_name: str, global_args):
"""
Try to load a model using the diffusers library.
@@ -197,6 +215,9 @@ def _load_diffusers_pipeline(model_name: str, global_args):
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
import torch
# Check for --no-ram mode
no_ram = getattr(global_args, 'no_ram', False) if global_args else False
# Determine precision from CLI argument (--image-precision)
precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
precision_map = {
@@ -207,11 +228,55 @@ def _load_diffusers_pipeline(model_name: str, global_args):
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
# --no-ram mode: force fp16 to maximize VRAM efficiency
if no_ram:
dtype = torch.float16  # fp16 halves the footprint of fp32 weights
print("--no-ram mode: using fp16 precision for maximum VRAM efficiency")
else:
print(f"Using precision: {precision} ({dtype})")
# Check if CPU offload is requested via CLI
use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
# --no-ram mode: never use CPU offload
if no_ram and use_sequential_offload:
print("--no-ram mode: ignoring --image-cpu-offload, forcing full GPU loading")
use_sequential_offload = False
# =====================================================================
# --no-ram mode: load directly on GPU, no CPU RAM fallback
# =====================================================================
if no_ram and torch.cuda.is_available():
cuda_device = _derive_diffusers_device(global_args)
print(f"--no-ram mode: loading diffusers model directly on {cuda_device}")
try:
try:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_name,
torch_dtype=dtype,
use_safetensors=True,
)
except Exception:
pipeline = DiffusionPipeline.from_pretrained(
model_name,
torch_dtype=dtype,
use_safetensors=True,
)
pipeline = pipeline.to(cuda_device)
print(f"--no-ram: Diffusers model loaded on {cuda_device}")
return pipeline
except Exception as e:
raise RuntimeError(
f"--no-ram: Failed to load diffusers model entirely on GPU ({cuda_device}). "
f"The model may be too large for available VRAM. Error: {e}"
)
# =====================================================================
# Standard loading path (with OOM fallback)
# =====================================================================
# Track loading attempts for OOM handling
pipeline = None
load_attempt = 0
@@ -419,6 +484,9 @@ def _load_sdcpp_model(model_path: str, global_args):
"""
from stable_diffusion_cpp import StableDiffusion
# Check for --no-ram mode
no_ram = getattr(global_args, 'no_ram', False) if global_args else False
print(f"Loading sd.cpp model from: {model_path}")
# Build sd.cpp constructor args from config
@@ -433,6 +501,15 @@ def _load_sdcpp_model(model_path: str, global_args):
if hasattr(global_args, 'llm_path') and global_args.llm_path:
kwargs['lora_model_dir'] = global_args.llm_path
# --no-ram mode: maximize GPU offloading for sd.cpp
if no_ram:
# stable-diffusion-cpp-python supports n_threads and gpu-related params
# Force full GPU offload by keeping all operations on GPU
kwargs['keep_clip_on_cpu'] = False # Don't offload CLIP to CPU
kwargs['keep_control_net_cpu'] = False # Don't offload ControlNet to CPU
kwargs['keep_vae_on_cpu'] = False # Don't offload VAE to CPU
print("--no-ram mode: sd.cpp maximizing GPU usage (no CPU offload for CLIP/VAE/ControlNet)")
sd_model = StableDiffusion(**kwargs)
return sd_model
@@ -254,6 +254,24 @@ class NvidiaBackend(ModelBackend):
return self._get_vram_percentages_for_strategy(strategy, is_moe, total_vram_gb)
def _derive_cuda_device(self) -> str:
"""Derive the CUDA device string from global args.
Checks --vulkan-device (reused as generic GPU device ID) to determine
which CUDA device to target. Defaults to 'cuda:0'.
"""
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args:
# Use vulkan-device as a generic GPU device selector
device_id = getattr(_global_args, 'vulkan_device', 0)
if device_id is not None and device_id != 0:
return f"cuda:{device_id}"
except Exception:
pass
return "cuda:0"
def load_model(self, model_name: str, **kwargs) -> None:
"""Load the model using HuggingFace Transformers with automatic OOM handling."""
import torch
@@ -267,6 +285,17 @@ class NvidiaBackend(ModelBackend):
offload_strategy = kwargs.get('offload_strategy', 'auto')
max_gpu_percent = kwargs.get('max_gpu_percent', None)
# Check for --no-ram mode
no_ram = kwargs.get('no_ram', False)
if not no_ram:
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args and getattr(_global_args, 'no_ram', False):
no_ram = True
except Exception:
pass
self._pending_ram_gb = manual_ram_gb
print(f"Loading HuggingFace model: {model_name}")
@@ -285,6 +314,60 @@ class NvidiaBackend(ModelBackend):
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# =====================================================================
# --no-ram mode: maximize VRAM, no CPU RAM spilling
# =====================================================================
if no_ram and self.device == "cuda":
cuda_device = self._derive_cuda_device()
print(f"--no-ram mode: loading model directly on {cuda_device}")
print(f" device_map={cuda_device}, low_cpu_mem_usage=True, torch_dtype=auto")
load_kwargs = {
'trust_remote_code': True,
'device_map': cuda_device,
'low_cpu_mem_usage': True,
'torch_dtype': "auto",
}
if self.use_flash_attn and self.flash_attn_available:
load_kwargs['attn_implementation'] = "flash_attention_2"
print(" Using Flash Attention 2")
# Still allow quantization in no-ram mode (reduces VRAM usage)
if load_in_4bit or load_in_8bit:
if 'qwen3.5' in model_name.lower() and ('a3b' in model_name.lower() or 'moe' in model_name.lower()):
print(f" Warning: {model_name} does not support bitsandbytes quantization")
else:
try:
import bitsandbytes as bnb
print(f" Using {4 if load_in_4bit else 8}-bit quantization")
load_kwargs['load_in_4bit'] = load_in_4bit
load_kwargs['load_in_8bit'] = load_in_8bit
except ImportError:
print(" Warning: bitsandbytes not installed. Quantization disabled.")
try:
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
self.model = model
self.model.eval()
self.model_name = model_name
print(f"\n--no-ram: Model loaded successfully on {cuda_device}")
print(f"Model device: {next(self.model.parameters()).device}")
caps = detect_model_capabilities(model_name)
print(f"Model capabilities: {caps}")
return
except Exception as e:
print(f"--no-ram: Failed to load model on {cuda_device}: {e}")
raise RuntimeError(
f"--no-ram: Failed to load model entirely on GPU ({cuda_device}). "
f"The model may be too large for available VRAM. Error: {e}"
)
# =====================================================================
# Standard loading path (with OOM fallback)
# =====================================================================
load_kwargs = {'trust_remote_code': True}
if load_in_4bit or load_in_8bit:
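A quick way to confirm that the --no-ram HuggingFace path really left nothing behind in CPU RAM is to scan parameter devices after loading. A hypothetical helper (not part of this commit; the function name is invented here):

# Hypothetical post-load check: raise if any parameter was spilled off the GPU.
import torch

def assert_fully_on_gpu(model):
    cpu_params = [name for name, p in model.named_parameters() if p.device.type != "cuda"]
    if cpu_params:
        raise RuntimeError(f"{len(cpu_params)} parameters are off-GPU, e.g. {cpu_params[:3]}")
    print(f"All parameters on GPU; VRAM allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")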
@@ -476,12 +476,31 @@ class VulkanBackend(ModelBackend):
# Determine model type
is_image = model_type == "image" or model_path.startswith("image:")
# Check for --no-ram mode from global args
no_ram = kwargs.get('no_ram', False)
if not no_ram:
try:
from codai.api.state import get_global_args
_global_args = get_global_args()
if _global_args and getattr(_global_args, 'no_ram', False):
no_ram = True
except Exception:
pass
# Configure GPU layers
n_gpu_layers = kwargs.get('n_gpu_layers', -1)
if no_ram:
# --no-ram: force all layers onto the GPU
self.n_gpu_layers = -1
elif n_gpu_layers != -1:
self.n_gpu_layers = n_gpu_layers
# Configure context size
if no_ram:
# --no-ram: ignore --n-ctx, let the model use its own default
self.n_ctx = 0 # 0 means use model's built-in default in llama.cpp
print("DEBUG: --no-ram mode: ignoring --n-ctx, using model default context size")
else:
n_ctx = kwargs.get('n_ctx', 2048)
self.n_ctx = n_ctx
@@ -500,6 +519,11 @@ class VulkanBackend(ModelBackend):
'main_gpu': self.main_gpu,
}
# --no-ram: disable mmap to prevent CPU RAM usage for memory-mapped files
if no_ram:
llama_kwargs['use_mmap'] = False
print("DEBUG: --no-ram mode: use_mmap=False, n_gpu_layers=-1")
# Add optional parameters
if 'n_threads' in kwargs:
llama_kwargs['n_threads'] = kwargs['n_threads']
@@ -523,7 +547,7 @@ class VulkanBackend(ModelBackend):
self._finalize_chat_template_detection()
print(f"DEBUG: VulkanBackend loaded model: {model_path}")
print(f"DEBUG: n_gpu_layers={self.n_gpu_layers}, n_ctx={self.n_ctx}")
print(f"DEBUG: n_gpu_layers={self.n_gpu_layers}, n_ctx={self.n_ctx}, no_ram={no_ram}")
print(f"DEBUG: chat_template={self.chat_template}")
except Exception as e:
print(f"Error loading GGUF model: {e}")
@@ -433,4 +433,14 @@ def parse_args():
default=False,
help="Enable prompt distillation: place tool definitions right before the user's latest request instead of in the system prompt. This can improve tool call accuracy.",
)
parser.add_argument(
"--no-ram",
action="store_true",
default=False,
help="Force model loading to maximize VRAM usage without CPU RAM spilling. "
"For llama-cpp-python: sets n_gpu_layers=-1, use_mmap=False, ignores --n-ctx. "
"For HuggingFace transformers: sets device_map='cuda:0', low_cpu_mem_usage=True, torch_dtype='auto'. "
"For diffusers: forces full GPU loading without CPU offload. "
"For sd.cpp: maximizes GPU layer offloading.",
)
return parser.parse_args()
@@ -187,6 +187,14 @@ def main():
if grammar_guided_gen:
print("Grammar-guided generation enabled (--grammar-guided-gen)")
# Print --no-ram mode status
if args.no_ram:
print("No-RAM mode enabled (--no-ram): maximizing VRAM usage, no CPU RAM spilling")
print(" llama-cpp-python: n_gpu_layers=-1, use_mmap=False, --n-ctx ignored")
print(" HuggingFace: device_map=cuda, low_cpu_mem_usage=True, torch_dtype=auto")
print(" Diffusers: forced full GPU loading")
print(" sd.cpp: maximizing GPU offload")
# Set global system prompt from --system-prompt flag
global_system_prompt = args.system_prompt
set_global_system_prompt(global_system_prompt)
@@ -530,6 +530,8 @@ class MultiModelManager:
kwargs['ram'] = global_args.ram
if hasattr(global_args, 'flash_attn'):
kwargs['flash_attn'] = global_args.flash_attn
if hasattr(global_args, 'no_ram'):
kwargs['no_ram'] = global_args.no_ram
print(f"Loading default model on demand: {self.default_model}")
model_manager.load_model(self.default_model, backend_type=backend_type, **kwargs)
@@ -579,6 +581,8 @@ class MultiModelManager:
kwargs['ram'] = global_args.ram
if hasattr(global_args, 'flash_attn'):
kwargs['flash_attn'] = global_args.flash_attn
if hasattr(global_args, 'no_ram'):
kwargs['no_ram'] = global_args.no_ram
print(f"Loading model on demand: {model_name}")
model_manager.load_model(model_name, backend_type=backend_type, **kwargs)