nexlab / videogen / Commits

Commit 698b8f7d, authored Feb 28, 2026 by Stefy Lanza (nextime / spora)

Merge branch 'experimental'

Parents: 1326bbbf, e5c12b7f

Changes: 1 changed file, with 50 additions and 1 deletion

videogen.py (+50, -1)
...
@@ -9226,6 +9226,31 @@ def main(args):
             pipe.enable_model_cpu_offload()
         elif off == "model":
             pipe.enable_model_cpu_offload()
+        elif off == "balanced":
+            # Smart offloading: use VRAM fully, only offload if needed
+            import gc
+            torch.cuda.empty_cache()
+            gc.collect()
+            # Get available VRAM
+            vram_total = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
+            vram_allocated = torch.cuda.memory_allocated() / (1024 ** 3)
+            vram_reserved = torch.cuda.memory_reserved() / (1024 ** 3)
+            vram_available = vram_total - vram_allocated
+            # Estimate model size from VRAM requirements
+            model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
+            # If model fits comfortably in available VRAM (with 15% buffer), load fully
+            if model_vram_est < vram_available * 0.85:
+                print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)")
+                print(f" Loading fully to GPU (no offloading)")
+                pipe = pipe.to("cuda")
+            else:
+                # Model too large, use sequential offloading but only for necessary layers
+                print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)")
+                print(f" Using selective offloading to maximize VRAM usage")
+                pipe.enable_sequential_cpu_offload()
         else:
             pipe.to("cuda" if torch.cuda.is_available() else "cpu")
...
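A note on the helper: parse_vram_estimate() is called above, but its definition sits outside this diff. From the call site it takes strings like "~10 GB" (the default passed to m_info.get) and returns a number of gigabytes. A minimal sketch of such a parser; the regex and the fallback value are assumptions, not the project's actual code:

import re

def parse_vram_estimate(vram_str, default_gb=10.0):
    # Extract the first number from strings like "~10 GB" or "12.5GB";
    # fall back to default_gb when nothing parses. (Hypothetical sketch,
    # not the implementation shipped in videogen.py.)
    match = re.search(r"(\d+(?:\.\d+)?)", str(vram_str))
    return float(match.group(1)) if match else default_gb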
@@ -10043,6 +10068,30 @@ def main(args):
             pipe.enable_model_cpu_offload()
         elif off == "model":
             pipe.enable_model_cpu_offload()
+        elif off == "balanced":
+            # Smart offloading: use VRAM fully, only offload if needed
+            import gc
+            torch.cuda.empty_cache()
+            gc.collect()
+            # Get available VRAM
+            vram_total = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
+            vram_allocated = torch.cuda.memory_allocated() / (1024 ** 3)
+            vram_available = vram_total - vram_allocated
+            # Estimate model size from VRAM requirements
+            model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
+            # If model fits comfortably in available VRAM (with 15% buffer), load fully
+            if model_vram_est < vram_available * 0.85:
+                print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)")
+                print(f" Loading fully to GPU (no offloading)")
+                pipe = pipe.to("cuda")
+            else:
+                # Model too large, use sequential offloading but only for necessary layers
+                print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)")
+                print(f" Using selective offloading to maximize VRAM usage")
+                pipe.enable_sequential_cpu_offload()
         else:
             pipe.to("cuda" if torch.cuda.is_available() else "cpu")
...
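The two hunks above carry the same balanced-mode logic verbatim, except that the second omits the unused vram_reserved line. A follow-up could hoist the block into one shared helper; a sketch, where the function name and signature are suggestions rather than anything in this commit:

import gc
import torch

def apply_balanced_offload(pipe, m_info, headroom=0.85):
    # Drop cached allocator blocks so the availability reading is meaningful.
    torch.cuda.empty_cache()
    gc.collect()
    gib = 1024 ** 3
    vram_total = torch.cuda.get_device_properties(0).total_memory / gib
    vram_available = vram_total - torch.cuda.memory_allocated() / gib
    model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
    if model_vram_est < vram_available * headroom:
        return pipe.to("cuda")  # fits with a 15% buffer: load fully on GPU
    pipe.enable_sequential_cpu_offload()  # too large: offload layer by layer
    return pipe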
@@ -10415,7 +10464,7 @@ List TTS voices:
     parser.add_argument("--distribute", action="store_true")
     parser.add_argument("--interface", type=str, default="eth0")
-    parser.add_argument("--offload_strategy", choices=["none", "model", "sequential", "group", "auto_map"], default="model")
+    parser.add_argument("--offload_strategy", choices=["none", "model", "sequential", "group", "auto_map", "balanced"], default="model")
     parser.add_argument("--offload_group_size", type=int, default=8)
     parser.add_argument("--low_ram_mode", action="store_true")
     parser.add_argument("--vram_limit", type=int, default=22)
...
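With the expanded choices list in place, the new strategy is opted into from the command line; a minimal invocation (any other arguments videogen.py requires are omitted here):

python videogen.py --offload_strategy balanced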