Fix balanced offload strategy VRAM estimation

- Account for LoRA overhead (~4GB) in VRAM calculations
- Add 30% inference overhead for activation memory
- Use more conservative 70% threshold (was 85%)
- Add OOM fallback to model CPU offload if GPU loading fails
- Switch fallback from sequential to model offload for better performance
parent 5b724e07
...@@ -10235,18 +10235,39 @@ def main(args): ...@@ -10235,18 +10235,39 @@ def main(args):
vram_available = vram_total - vram_allocated vram_available = vram_total - vram_allocated
# Estimate model size from VRAM requirements # Estimate model size from VRAM requirements
# Add overhead for LoRA, inference, and model components not in estimate
model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB")) model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
# If model fits comfortably in available VRAM (with 15% buffer), load fully # Account for various overheads:
if model_vram_est < vram_available * 0.85: # - LoRA weights add ~2-4GB
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)") # - Inference activation memory needs ~20-30% extra
# - Text encoder, VAE, scheduler not always in estimate
is_lora = m_info.get("is_lora", False)
lora_overhead = 4.0 if is_lora else 0.0 # LoRA adds significant overhead
inference_overhead = model_vram_est * 0.3 # 30% for activations during inference
total_vram_needed = model_vram_est + lora_overhead + inference_overhead
# Use conservative 70% threshold (30% safety buffer) for "balanced"
# This ensures we don't OOM during inference
vram_threshold = vram_available * 0.70
if total_vram_needed < vram_threshold:
print(f" 📦 Balanced mode: Model (~{total_vram_needed:.1f}GB needed) fits in VRAM ({vram_available:.1f}GB available)")
print(f" Loading fully to GPU (no offloading)") print(f" Loading fully to GPU (no offloading)")
pipe = pipe.to("cuda") try:
pipe = pipe.to("cuda")
except torch.cuda.OutOfMemoryError:
# Fallback if moving to GPU fails
print(f" ⚠️ OOM when loading to GPU, falling back to model CPU offload")
torch.cuda.empty_cache()
gc.collect()
pipe.enable_model_cpu_offload()
else: else:
# Model too large, use sequential offloading but only for necessary layers # Model too large, use model CPU offload (better than sequential for most cases)
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)") print(f" 📦 Balanced mode: Model (~{total_vram_needed:.1f}GB needed) exceeds safe VRAM ({vram_available:.1f}GB available)")
print(f" Using selective offloading to maximize VRAM usage") print(f" Using model CPU offload to prevent OOM")
pipe.enable_sequential_cpu_offload() pipe.enable_model_cpu_offload()
else: else:
pipe.to("cuda" if torch.cuda.is_available() else "cpu") pipe.to("cuda" if torch.cuda.is_available() else "cpu")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment