Add --max-gpu-percent parameter for fine-grained GPU memory control

This new parameter allows users to specify the maximum percentage of GPU VRAM
to use, overriding the offload-strategy. When specified, the model will:
1. Use up to max-gpu-percent of VRAM
2. Offload remaining weights to CPU RAM (--ram)
3. Overflow to disk (--offload-dir) if RAM exhausted
4. Automatically fall back in decreasing steps (5%, then 3%, then 2% at lower levels) if OOM occurs

Example usage for RTX 3090 with Qwen3.5-35B-A3B:
  coderai --model Qwen/Qwen3.5-35B-A3B --max-gpu-percent 50 --ram 64

This ensures MoE models with high VRAM requirements during generation
can run without OOM by using CPU RAM as the primary offload target.
parent e23c3f7f
...@@ -631,13 +631,33 @@ class NvidiaBackend(ModelBackend): ...@@ -631,13 +631,33 @@ class NvidiaBackend(ModelBackend):
print(f" Detected large GPU ({total_vram_gb:.1f}GB), using conservative VRAM usage (93% start)") print(f" Detected large GPU ({total_vram_gb:.1f}GB), using conservative VRAM usage (93% start)")
return [0.93, 0.85, 0.75, 0.65, 0.50, 0.35, 0.20, 0.0] return [0.93, 0.85, 0.75, 0.65, 0.50, 0.35, 0.20, 0.0]
def _get_vram_percentages_for_gpu(self, model_name: str = "", strategy: str = "auto") -> list: def _get_vram_percentages_for_gpu(self, model_name: str = "", strategy: str = "auto", max_gpu_percent: float = None) -> list:
"""Get VRAM percentage steps based on GPU memory size, model type, and offload strategy.""" """Get VRAM percentage steps based on GPU memory size, model type, and offload strategy."""
import torch import torch
if not torch.cuda.is_available(): if not torch.cuda.is_available():
return [0.0] # CPU only return [0.0] # CPU only
# If max_gpu_percent is specified, use it to create custom percentage steps
if max_gpu_percent is not None:
# Clamp to valid range (5-100%)
max_pct = max(0.05, min(1.0, max_gpu_percent / 100.0))
print(f" Using custom max GPU percent: {max_pct*100:.0f}%")
# Create a descending series from max_pct down to 0
steps = []
current = max_pct
while current > 0.05:
steps.append(current)
# Reduce by 5% each step, or smaller steps near the end
if current > 0.3:
current -= 0.05
elif current > 0.15:
current -= 0.03
else:
current -= 0.02
steps.append(0.0)
return steps
# Get total VRAM of the first GPU # Get total VRAM of the first GPU
total_vram_gb = 0 total_vram_gb = 0
for i in range(torch.cuda.device_count()): for i in range(torch.cuda.device_count()):
...@@ -662,6 +682,7 @@ class NvidiaBackend(ModelBackend): ...@@ -662,6 +682,7 @@ class NvidiaBackend(ModelBackend):
manual_ram_gb = kwargs.get('manual_ram_gb') manual_ram_gb = kwargs.get('manual_ram_gb')
flash_attn = kwargs.get('flash_attn', False) flash_attn = kwargs.get('flash_attn', False)
offload_strategy = kwargs.get('offload_strategy', 'auto') offload_strategy = kwargs.get('offload_strategy', 'auto')
max_gpu_percent = kwargs.get('max_gpu_percent', None)
# Store RAM limit for use in _get_gpu_memory_map # Store RAM limit for use in _get_gpu_memory_map
self._pending_ram_gb = manual_ram_gb self._pending_ram_gb = manual_ram_gb
...@@ -720,7 +741,7 @@ class NvidiaBackend(ModelBackend): ...@@ -720,7 +741,7 @@ class NvidiaBackend(ModelBackend):
# Try loading with automatic fallback on OOM # Try loading with automatic fallback on OOM
model = None model = None
vram_percentages = self._get_vram_percentages_for_gpu(model_name, offload_strategy) vram_percentages = self._get_vram_percentages_for_gpu(model_name, offload_strategy, max_gpu_percent)
first_vram_pct = vram_percentages[0] if vram_percentages else 0.93 first_vram_pct = vram_percentages[0] if vram_percentages else 0.93
for vram_pct in vram_percentages: for vram_pct in vram_percentages:
...@@ -2046,6 +2067,12 @@ def parse_args(): ...@@ -2046,6 +2067,12 @@ def parse_args():
default="auto", default="auto",
help="Offload strategy for NVIDIA backend (default: auto)", help="Offload strategy for NVIDIA backend (default: auto)",
) )
parser.add_argument(
"--max-gpu-percent",
type=float,
default=None,
help="Maximum GPU VRAM to use as percentage (0-100). Overrides offload-strategy. Lower values offload more to CPU/RAM (default: None = use offload-strategy)",
)
parser.add_argument( parser.add_argument(
"--n-gpu-layers", "--n-gpu-layers",
type=int, type=int,
...@@ -2149,6 +2176,7 @@ def main(): ...@@ -2149,6 +2176,7 @@ def main():
'manual_ram_gb': args.ram, 'manual_ram_gb': args.ram,
'flash_attn': args.flash_attn, 'flash_attn': args.flash_attn,
'offload_strategy': args.offload_strategy, 'offload_strategy': args.offload_strategy,
'max_gpu_percent': args.max_gpu_percent,
'n_gpu_layers': args.n_gpu_layers, 'n_gpu_layers': args.n_gpu_layers,
'n_ctx': args.n_ctx, 'n_ctx': args.n_ctx,
'main_gpu': args.vulkan_device, 'main_gpu': args.vulkan_device,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment