Change NVIDIA backend VRAM limit from 99.9% to 93% to leave more headroom for CUDA overhead

parent 2ca7368f
...@@ -541,17 +541,17 @@ class NvidiaBackend(ModelBackend): ...@@ -541,17 +541,17 @@ class NvidiaBackend(ModelBackend):
return None return None
def _get_gpu_memory_map(self) -> Dict: def _get_gpu_memory_map(self) -> Dict:
"""Get max_memory dict for Accelerate with 99.9% GPU limit, then CPU, then disk.""" """Get max_memory dict for Accelerate with 93% GPU limit, then CPU, then disk."""
import torch import torch
max_memory = {} max_memory = {}
# GPU memory: 99.9% of available VRAM per GPU # GPU memory: 93% of available VRAM per GPU
if torch.cuda.is_available(): if torch.cuda.is_available():
for i in range(torch.cuda.device_count()): for i in range(torch.cuda.device_count()):
props = torch.cuda.get_device_properties(i) props = torch.cuda.get_device_properties(i)
total_vram = props.total_memory total_vram = props.total_memory
# Leave 0.1% headroom for CUDA overhead # Leave 7% headroom for CUDA overhead (changed from 0.1% to 7%)
usable_vram = int(total_vram * 0.999) usable_vram = int(total_vram * 0.93)
max_memory[i] = usable_vram max_memory[i] = usable_vram
print(f" GPU {i}: {total_vram / 1e9:.1f}GB total, {usable_vram / 1e9:.1f}GB usable") print(f" GPU {i}: {total_vram / 1e9:.1f}GB total, {usable_vram / 1e9:.1f}GB usable")
...@@ -606,12 +606,12 @@ class NvidiaBackend(ModelBackend): ...@@ -606,12 +606,12 @@ class NvidiaBackend(ModelBackend):
# Prepare model loading arguments # Prepare model loading arguments
load_kwargs = {'trust_remote_code': True} load_kwargs = {'trust_remote_code': True}
# Setup memory management: GPU (95%) → CPU (limit) → Disk # Setup memory management: GPU (93%) → CPU (limit) → Disk
if self.device == "cuda": if self.device == "cuda":
max_memory = self._get_gpu_memory_map() max_memory = self._get_gpu_memory_map()
load_kwargs['max_memory'] = max_memory load_kwargs['max_memory'] = max_memory
load_kwargs['device_map'] = 'auto' load_kwargs['device_map'] = 'auto'
print(f" Memory strategy: GPU (99.9% VRAM) → CPU → Disk") print(f" Memory strategy: GPU (93% VRAM) → CPU → Disk")
else: else:
# CPU-only mode # CPU-only mode
load_kwargs['device_map'] = None load_kwargs['device_map'] = None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment