Commit 698b8f7d authored by Stefy Lanza (nextime / spora)

Merge branch 'experimental'

parents 1326bbbf e5c12b7f
......@@ -9226,6 +9226,31 @@ def main(args):
pipe.enable_model_cpu_offload()
# NOTE(review): the branch ending above calls the same
# enable_model_cpu_offload() as the "model" branch below — confirm the
# preceding condition (outside this hunk) wasn't meant to use
# enable_sequential_cpu_offload() instead.
elif off == "model":
pipe.enable_model_cpu_offload()
elif off == "balanced":
# Smart offloading: use VRAM fully, only offload if needed
import gc
# Drop cached allocator blocks and collect garbage first so the VRAM
# numbers read below reflect actual availability.
torch.cuda.empty_cache()
gc.collect()
# Get available VRAM
vram_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
vram_allocated = torch.cuda.memory_allocated() / (1024**3)
# NOTE(review): vram_reserved is computed but never used below (and the
# twin copy of this logic later in the file omits it) — either factor it
# into the availability estimate or drop the line.
vram_reserved = torch.cuda.memory_reserved() / (1024**3)
# NOTE(review): subtracting only memory_allocated() ignores memory the
# caching allocator has reserved (memory_reserved() >= memory_allocated()),
# so vram_available may overestimate what is actually free — confirm.
vram_available = vram_total - vram_allocated
# Estimate model size from VRAM requirements
# Falls back to "~10 GB" when the model registry gives no estimate;
# parse_vram_estimate is defined elsewhere in this file.
model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
# If model fits comfortably in available VRAM (with 15% buffer), load fully
if model_vram_est < vram_available * 0.85:
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)")
print(f" Loading fully to GPU (no offloading)")
pipe = pipe.to("cuda")
else:
# Model too large, use sequential offloading but only for necessary layers
# NOTE(review): enable_sequential_cpu_offload() offloads every submodule
# (the slowest strategy) — it is not "selective" as the log line claims;
# consider rewording the message or switching strategies.
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)")
print(f" Using selective offloading to maximize VRAM usage")
pipe.enable_sequential_cpu_offload()
else:
# Default: move the whole pipeline to GPU when available, else CPU.
pipe.to("cuda" if torch.cuda.is_available() else "cpu")
......@@ -10043,6 +10068,30 @@ def main(args):
pipe.enable_model_cpu_offload()
elif off == "model":
pipe.enable_model_cpu_offload()
elif off == "balanced":
# NOTE(review): this block is a near-verbatim copy of the "balanced"
# branch earlier in this file (it only drops the unused vram_reserved
# line) — consider extracting a shared helper so the two copies cannot
# drift apart.
# Smart offloading: use VRAM fully, only offload if needed
import gc
torch.cuda.empty_cache()
gc.collect()
# Get available VRAM
vram_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)
vram_allocated = torch.cuda.memory_allocated() / (1024**3)
# NOTE(review): ignores torch.cuda.memory_reserved(), so this may
# overestimate free VRAM — confirm intended.
vram_available = vram_total - vram_allocated
# Estimate model size from VRAM requirements
model_vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
# If model fits comfortably in available VRAM (with 15% buffer), load fully
if model_vram_est < vram_available * 0.85:
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) fits in VRAM ({vram_available:.1f}GB available)")
print(f" Loading fully to GPU (no offloading)")
pipe = pipe.to("cuda")
else:
# Model too large, use sequential offloading but only for necessary layers
# NOTE(review): enable_sequential_cpu_offload() offloads all submodules,
# not a selection — the log message below overstates what happens.
print(f" 📦 Balanced mode: Model (~{model_vram_est:.1f}GB) exceeds VRAM ({vram_available:.1f}GB available)")
print(f" Using selective offloading to maximize VRAM usage")
pipe.enable_sequential_cpu_offload()
else:
pipe.to("cuda" if torch.cuda.is_available() else "cpu")
......@@ -10415,7 +10464,7 @@ List TTS voices:
parser.add_argument("--distribute", action="store_true")
parser.add_argument("--interface", type=str, default="eth0")
# NOTE(review): the two --offload_strategy lines below are the pre-merge and
# post-merge versions shown by the diff; only the second (which adds the new
# "balanced" choice) should exist in the merged file — argparse raises
# ArgumentError if the same option string is registered twice.
parser.add_argument("--offload_strategy", choices=["none", "model", "sequential", "group", "auto_map"], default="model")
parser.add_argument("--offload_strategy", choices=["none", "model", "sequential", "group", "auto_map", "balanced"], default="model")
parser.add_argument("--offload_group_size", type=int, default=8)
parser.add_argument("--low_ram_mode", action="store_true")
parser.add_argument("--vram_limit", type=int, default=22)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment