Fix time estimation to be more realistic

- Increased base time per frame for all models (roughly 2-4x higher, for more realistic estimates)
- Added LTXVideoPipeline specific estimate (4.0s/frame)
- Increased model loading times (90s-10min based on model size)
- Added realistic image model loading times for I2V mode
- Added image generation time based on model type (Flux, SDXL, SD3)
- Added 30% overhead for I/O and memory operations
- Added 20% extra time for I2V processing
- Increased resolution scaling exponent to 1.3 (superlinear relationship)
- Increased download time estimate to 15s/GB with 2min cap

The previous estimates were too optimistic and didn't account for:
- Full diffusion process (multiple denoising steps)
- Model loading from disk/download
- Memory management overhead
- I2V-specific processing time
- Image model loading for I2V mode
parent 5291deb2
...@@ -1977,45 +1977,51 @@ class TimingTracker: ...@@ -1977,45 +1977,51 @@ class TimingTracker:
# Slight speedup for memory-intensive operations # Slight speedup for memory-intensive operations
perf_multiplier *= 0.9 # 10% faster due to better memory distribution perf_multiplier *= 0.9 # 10% faster due to better memory distribution
# Base time per frame (empirical estimates for RTX 4090) # Base time per frame (REALISTIC estimates for RTX 4090)
# These are OPTIMISTIC estimates for top-tier hardware # These account for the FULL diffusion process, not just one step
# Video generation involves multiple denoising steps per frame
model_class = m_info.get("class", "") model_class = m_info.get("class", "")
model_id = m_info.get("id", "").lower()
# Time per frame estimates (seconds) for RTX 4090 # Time per frame estimates (seconds) for RTX 4090
# These are REALISTIC estimates including diffusion steps # These are REALISTIC estimates for the ENTIRE generation process
# including all diffusion steps, VAE decoding, etc.
if "WanPipeline" in model_class: if "WanPipeline" in model_class:
base_time_per_frame = 1.5 # Wan 14B is compute heavy base_time_per_frame = 3.0 # Wan 14B is compute heavy, ~25 steps
elif "MochiPipeline" in model_class: elif "MochiPipeline" in model_class:
base_time_per_frame = 3.0 # Mochi is very slow base_time_per_frame = 5.0 # Mochi is very slow
elif "StableVideoDiffusionPipeline" in model_class: elif "StableVideoDiffusionPipeline" in model_class:
base_time_per_frame = 0.8 # SVD is relatively fast base_time_per_frame = 1.5 # SVD is relatively fast but still ~25 steps
elif "CogVideoXPipeline" in model_class: elif "CogVideoXPipeline" in model_class:
base_time_per_frame = 2.0 # CogVideoX 5B base_time_per_frame = 4.0 # CogVideoX 5B is slow
elif "LTXVideoPipeline" in model_class: elif "LTXVideoPipeline" in model_class or "ltx" in model_id:
base_time_per_frame = 2.5 # LTX is moderate base_time_per_frame = 4.0 # LTX is moderate-slow, ~25 steps
elif "FluxPipeline" in model_class: elif "FluxPipeline" in model_class:
base_time_per_frame = 4.0 # Flux is slow for images base_time_per_frame = 8.0 # Flux is slow for images (~20-30 steps)
elif "StableDiffusionXLPipeline" in model_class: elif "StableDiffusionXLPipeline" in model_class:
base_time_per_frame = 0.5 # SDXL is fast for images base_time_per_frame = 1.0 # SDXL is fast for images
elif "StableDiffusion3Pipeline" in model_class: elif "StableDiffusion3Pipeline" in model_class:
base_time_per_frame = 1.0 # SD3 is moderate base_time_per_frame = 2.0 # SD3 is moderate
elif "AllegroPipeline" in model_class: elif "AllegroPipeline" in model_class:
base_time_per_frame = 5.0 # Allegro is very slow base_time_per_frame = 8.0 # Allegro is very slow
elif "HunyuanDiTPipeline" in model_class: elif "HunyuanDiTPipeline" in model_class:
base_time_per_frame = 6.0 # Hunyuan is very slow base_time_per_frame = 10.0 # Hunyuan is very slow
elif "OpenSoraPipeline" in model_class: elif "OpenSoraPipeline" in model_class:
base_time_per_frame = 4.0 # OpenSora is slow base_time_per_frame = 6.0 # OpenSora is slow
elif "MochiPipeline" in model_class: elif "I2VGenXLPipeline" in model_class:
base_time_per_frame = 3.0 # Mochi is slow base_time_per_frame = 3.0 # I2VGenXL
elif "AnimateDiffPipeline" in model_class:
base_time_per_frame = 2.0 # AnimateDiff
else: else:
base_time_per_frame = 2.0 # Default estimate # Default - be conservative for unknown models
base_time_per_frame = 4.0
# Apply GPU performance multiplier # Apply GPU performance multiplier
time_per_frame = base_time_per_frame * perf_multiplier time_per_frame = base_time_per_frame * perf_multiplier
# Adjust for resolution (higher res = more time, quadratic relationship) # Adjust for resolution (higher res = more time, quadratic relationship)
resolution_factor = (args.width * args.height) / (832 * 480) resolution_factor = (args.width * args.height) / (832 * 480)
time_per_frame *= (resolution_factor ** 1.2) # Slightly more than linear time_per_frame *= (resolution_factor ** 1.3) # More than linear - memory bandwidth
# VRAM constraint adjustment # VRAM constraint adjustment
# If model VRAM requirement > available VRAM, will need offloading # If model VRAM requirement > available VRAM, will need offloading
...@@ -2030,34 +2036,61 @@ class TimingTracker: ...@@ -2030,34 +2036,61 @@ class TimingTracker:
# Model loading time estimate (REALISTIC) # Model loading time estimate (REALISTIC)
# Large models take MINUTES to load, not seconds # Large models take MINUTES to load, not seconds
# This includes: download, weight loading, CUDA initialization, warmup
if model_vram_req > 50: if model_vram_req > 50:
load_time = 300 # 5 minutes for huge models load_time = 600 # 10 minutes for huge models (100GB+)
elif model_vram_req > 30: elif model_vram_req > 30:
load_time = 180 # 3 minutes for large models load_time = 300 # 5 minutes for large models
elif model_vram_req > 16: elif model_vram_req > 16:
load_time = 90 # 1.5 minutes for medium models load_time = 180 # 3 minutes for medium models
else: else:
load_time = 45 # 45 seconds for small models load_time = 90 # 1.5 minutes for small models
# Add network download time estimate (if model not cached) # Add network download time estimate (if model not cached)
# This is a rough estimate - actual time depends on connection # This is a rough estimate - actual time depends on connection
model_size_gb = model_vram_req * 1.5 # Models are usually larger than VRAM requirement model_size_gb = model_vram_req * 1.5 # Models are usually larger than VRAM requirement
download_time = model_size_gb * 10 # ~10 seconds per GB on average connection download_time = model_size_gb * 15 # ~15 seconds per GB on average connection
# Only add if model might not be cached (first run) # Only add if model might not be cached (first run)
# We'll be conservative and include partial download time # We'll be conservative and include partial download time
load_time += min(download_time, 60) # Cap at 60s extra for potential download load_time += min(download_time, 120) # Cap at 2min extra for potential download
estimates["model_loading"] = load_time estimates["model_loading"] = load_time
# Image generation for I2V (REALISTIC estimates) # Image generation for I2V (REALISTIC estimates)
if has_i2v and not args.image: if has_i2v and not args.image:
# Image generation is typically 10-30 seconds for quality models # Image generation for Flux/SDXL takes significant time
img_time = 15 + (args.width * args.height) / (1024 * 1024) * 5 # Flux: ~20-40 steps, SDXL: ~20-30 steps
img_model_class = ""
if hasattr(args, 'image_model') and args.image_model:
img_model_info = MODELS.get(args.image_model, {})
img_model_class = img_model_info.get("class", "")
# Base image generation time (seconds)
if "FluxPipeline" in img_model_class:
img_time = 45 # Flux is slow
elif "StableDiffusion3Pipeline" in img_model_class:
img_time = 20 # SD3 is moderate
elif "StableDiffusionXLPipeline" in img_model_class:
img_time = 15 # SDXL is faster
else:
img_time = 30 # Default for unknown models
# Scale by resolution
img_time *= (args.width * args.height) / (1024 * 1024)
img_time *= perf_multiplier # Apply GPU performance img_time *= perf_multiplier # Apply GPU performance
estimates["image_generation"] = img_time estimates["image_generation"] = img_time
# Add image model loading time # Add image model loading time (REALISTIC)
estimates["image_model_loading"] = 30 # Image models also need to be loaded from disk/downloaded
img_model_vram = parse_vram_estimate(MODELS.get(args.image_model, {}).get("vram", "~10 GB"))
if img_model_vram > 20:
img_load_time = 180 # 3 minutes for large image models
elif img_model_vram > 10:
img_load_time = 90 # 1.5 minutes for medium
else:
img_load_time = 45 # 45 seconds for small
estimates["image_model_loading"] = img_load_time
# Audio generation # Audio generation
if has_audio: if has_audio:
...@@ -2069,14 +2102,28 @@ class TimingTracker: ...@@ -2069,14 +2102,28 @@ class TimingTracker:
# Video generation (REALISTIC) # Video generation (REALISTIC)
num_frames = int(args.length * args.fps) num_frames = int(args.length * args.fps)
inference_steps = 50 if "wan" in args.model.lower() else 28
# Total video time = frames * time_per_frame * step_factor # Get actual inference steps for the model
step_factor = inference_steps / 50 # Normalize to 50 steps # Most video models use 20-50 steps
video_time = num_frames * time_per_frame * step_factor if "wan" in args.model.lower():
inference_steps = 50
elif "svd" in args.model.lower() or "stable-video" in args.model.lower():
inference_steps = 25
elif "ltx" in args.model.lower():
inference_steps = 25
else:
inference_steps = 30 # Default
# Total video time = frames * time_per_frame
# time_per_frame already accounts for diffusion steps
video_time = num_frames * time_per_frame
# Add overhead for memory management, saving, etc. # Add overhead for memory management, saving, etc.
video_time *= 1.2 # 20% overhead video_time *= 1.3 # 30% overhead for I/O, memory ops
# For I2V, add extra time for image encoding and conditioning
if has_i2v:
video_time *= 1.2 # 20% extra for I2V processing
estimates["video_generation"] = video_time estimates["video_generation"] = video_time
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment