Fix balanced offload strategy VRAM estimation

- Account for LoRA overhead (~4GB) in VRAM calculations
- Add 30% inference overhead for activation memory
- Use more conservative 70% threshold (was 85%)
- Add OOM fallback to model CPU offload if GPU loading fails
- Switch fallback from sequential to model offload for better performance
parent 5b724e07
...@@ -10235,18 +10235,39 @@ def main(args): ...@@ -10235,18 +10235,39 @@ def main(args):
vram_available = vram_total - vram_allocated vram_available = vram_total - vram_allocated
# Estimate model size from VRAM requirements # Estimate model size from VRAM requirements
# Add overhead for LoRA, inference, and model components not in estimate
model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB")) model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
# If model fits comfortably in available VRAM (with 15% buffer), load fully # Account for various overheads:
if model_vram_est < vram_available * 0.85: # - LoRA weights add ~2-4GB
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)") # - Inference activation memory needs ~20-30% extra
# - Text encoder, VAE, scheduler not always in estimate
is_lora = m_info.get("is_lora", False)
lora_overhead = 4.0 if is_lora else 0.0 # LoRA adds significant overhead
inference_overhead = model_vram_est * 0.3 # 30% for activations during inference
total_vram_needed = model_vram_est + lora_overhead + inference_overhead
# Use conservative 70% threshold (30% safety buffer) for "balanced"
# This ensures we don't OOM during inference
vram_threshold = vram_available * 0.70
if total_vram_needed < vram_threshold:
print(f" 📦 Balanced mode: Model (~{total_vram_needed:.1f}GB needed) fits in VRAM ({vram_available:.1f}GB available)")
print(f" Loading fully to GPU (no offloading)") print(f" Loading fully to GPU (no offloading)")
pipe = pipe.to("cuda") try:
pipe = pipe.to("cuda")
except torch.cuda.OutOfMemoryError:
# Fallback if moving to GPU fails
print(f" ⚠️ OOM when loading to GPU, falling back to model CPU offload")
torch.cuda.empty_cache()
gc.collect()
pipe.enable_model_cpu_offload()
else: else:
# Model too large, use sequential offloading but only for necessary layers # Model too large, use model CPU offload (better than sequential for most cases)
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)") print(f" 📦 Balanced mode: Model (~{total_vram_needed:.1f}GB needed) exceeds safe VRAM ({vram_available:.1f}GB available)")
print(f" Using selective offloading to maximize VRAM usage") print(f" Using model CPU offload to prevent OOM")
pipe.enable_sequential_cpu_offload() pipe.enable_model_cpu_offload()
else: else:
pipe.to("cuda" if torch.cuda.is_available() else "cpu") pipe.to("cuda" if torch.cuda.is_available() else "cpu")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment