Commit beded066 authored by Your Name

Add --offload-strategy none to disable CPU offloading and VRAM auto-detection

- Add 'none' to --offload-strategy choices in cli.py
- In cuda.py backend:
  - _get_vram_percentages_for_strategy() returns None for 'none' strategy
  - _get_vram_percentages_for_gpu() skips VRAM detection for 'none'
  - load_model() loads directly on GPU without max_memory constraints
- Add startup status message in main.py for --offload-strategy none
parent b782a092
......@@ -185,6 +185,9 @@ class NvidiaBackend(ModelBackend):
def _get_vram_percentages_for_strategy(self, strategy: str, is_moe: bool, total_vram_gb: float) -> list:
"""Get VRAM percentage steps based on offload strategy."""
if strategy == "none":
print(f" Offload strategy 'none': disabling CPU offload and VRAM auto-detection")
return None # Signal to skip offloading entirely
if strategy == "conservative":
print(f" Using conservative offload strategy")
if is_moe:
......@@ -221,9 +224,15 @@ class NvidiaBackend(ModelBackend):
return [0.93, 0.85, 0.75, 0.65, 0.50, 0.35, 0.20, 0.0]
def _get_vram_percentages_for_gpu(self, model_name: str = "", strategy: str = "auto", max_gpu_percent: float = None) -> list:
"""Get VRAM percentage steps based on GPU memory size."""
"""Get VRAM percentage steps based on GPU memory size.
Returns None when strategy is 'none' (no offloading).
"""
import torch
if strategy == "none":
return None # Signal to skip offloading entirely
if not torch.cuda.is_available():
return [0.0]
......@@ -397,6 +406,25 @@ class NvidiaBackend(ModelBackend):
model = None
vram_percentages = self._get_vram_percentages_for_gpu(model_name, offload_strategy, max_gpu_percent)
# --offload-strategy none: load directly on GPU without offloading or VRAM limits
if vram_percentages is None:
cuda_device = self._derive_cuda_device()
print(f"\nOffload strategy 'none': loading model directly on {cuda_device} (no CPU offload, no VRAM limits)")
load_kwargs['device_map'] = cuda_device
load_kwargs['low_cpu_mem_usage'] = True
load_kwargs['torch_dtype'] = "auto"
# Remove dtype set earlier since torch_dtype=auto takes precedence
load_kwargs.pop('dtype', None)
try:
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
except Exception as e:
raise RuntimeError(
f"--offload-strategy none: Failed to load model entirely on GPU ({cuda_device}). "
f"The model may be too large for available VRAM. Error: {e}"
)
else:
first_vram_pct = vram_percentages[0] if vram_percentages else 0.93
for vram_pct in vram_percentages:
......
......@@ -117,9 +117,9 @@ def parse_args():
parser.add_argument(
"--offload-strategy",
type=str,
choices=["auto", "conservative", "balanced", "aggressive", "sequential"],
choices=["auto", "conservative", "balanced", "aggressive", "sequential", "none"],
default="auto",
help="Offload strategy for NVIDIA backend (default: auto)",
help="Offload strategy for NVIDIA backend (default: auto). Use 'none' to disable CPU offloading and VRAM auto-detection entirely.",
)
parser.add_argument(
"--max-gpu-percent",
......
......@@ -187,6 +187,11 @@ def main():
if grammar_guided_gen:
print("Grammar-guided generation enabled (--grammar-guided-gen)")
# Print --offload-strategy none status
if args.offload_strategy == "none":
print("Offload strategy 'none': CPU offloading and VRAM auto-detection disabled")
print(" Model will be loaded directly on GPU without memory limits")
# Print --no-ram mode status
if args.no_ram:
print("No-RAM mode enabled (--no-ram): maximizing VRAM usage, no CPU RAM spilling")
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment