Fix time estimation for T2I models and suppress log output for JSON

- Move T2I detection before time estimation
- Add has_t2i parameter to estimate_total_time
- T2I models now show image_generation time instead of video_generation
- Add Lumina pipelines to T2I model detection
- Suppress 'Loaded models' message when --json flag is used
parent 6bd2bbd4
...@@ -2140,14 +2140,19 @@ def print_search_results(results, args): ...@@ -2140,14 +2140,19 @@ def print_search_results(results, args):
# Initialize MODELS from external config only
MODELS = {}

# Check if JSON output is requested (for suppressing log messages).
# NOTE(review): this scans raw argv because MODELS is built at import time,
# before argparse runs — presumably intentional; confirm against the CLI setup.
_json_output = "--json" in sys.argv

# Load external models config
_external_models = load_models_config()
if _external_models:
    MODELS = _external_models
    if not _json_output:
        print(f"📁 Loaded {len(_external_models)} models from {MODELS_CONFIG_FILE}")
elif not _json_output:
    # No config found: point the user at the two ways to populate it.
    # Plain strings here — no interpolation needed, so no f-prefix.
    print("⚠️  No models configured. Run: videogen --update-models")
    print("   Or add a model: videogen --add-model <model_id> --name <name>")
# ────────────────────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────────────────────
# TTS VOICE REGISTRY # TTS VOICE REGISTRY
...@@ -2379,7 +2384,7 @@ class TimingTracker: ...@@ -2379,7 +2384,7 @@ class TimingTracker:
return load_factor return load_factor
def estimate_total_time(self, args, m_info, has_i2v=False, has_audio=False, has_lipsync=False, has_upscale=False): def estimate_total_time(self, args, m_info, has_i2v=False, has_audio=False, has_lipsync=False, has_upscale=False, has_t2i=False):
"""Estimate total generation time based on parameters and hardware """Estimate total generation time based on parameters and hardware
This provides MORE REALISTIC estimates that account for: This provides MORE REALISTIC estimates that account for:
...@@ -2551,36 +2556,58 @@ class TimingTracker: ...@@ -2551,36 +2556,58 @@ class TimingTracker:
audio_time = args.length * 2 + 10 # MusicGen takes time audio_time = args.length * 2 + 10 # MusicGen takes time
estimates["audio_generation"] = audio_time estimates["audio_generation"] = audio_time
# Video generation (REALISTIC) # Video generation (REALISTIC) - only for video models, not T2I
num_frames = int(args.length * args.fps) if has_t2i:
# For T2I models, estimate image generation time instead of video
# Get actual inference steps for the model # Get base time per image from model class
# Most video models use 20-50 steps model_class = m_info.get("class", "")
if "wan" in args.model.lower(): if "FluxPipeline" in model_class:
inference_steps = 50 img_time = 45 # Flux is slow (~20-30 steps)
elif "svd" in args.model.lower() or "stable-video" in args.model.lower(): elif "StableDiffusion3Pipeline" in model_class:
inference_steps = 25 img_time = 20 # SD3 is moderate
elif "ltx" in args.model.lower(): elif "StableDiffusionXLPipeline" in model_class:
inference_steps = 25 img_time = 15 # SDXL is faster
elif "LuminaText2ImgPipeline" in model_class or "Lumina2Text2ImgPipeline" in model_class:
img_time = 25 # Lumina is moderate
else:
img_time = 30 # Default for unknown models
# Scale by resolution
img_time *= (args.width * args.height) / (1024 * 1024)
img_time *= perf_multiplier # Apply GPU performance
estimates["image_generation"] = img_time
else: else:
inference_steps = 30 # Default # Video generation
num_frames = int(args.length * args.fps)
# Total video time = frames * time_per_frame
# time_per_frame already accounts for diffusion steps # Get actual inference steps for the model
video_time = num_frames * time_per_frame # Most video models use 20-50 steps
if "wan" in args.model.lower():
# Add overhead for memory management, saving, etc. inference_steps = 50
video_time *= 1.5 # 50% overhead for I/O, memory ops, unexpected delays elif "svd" in args.model.lower() or "stable-video" in args.model.lower():
inference_steps = 25
# For I2V, add extra time for image encoding and conditioning elif "ltx" in args.model.lower():
if has_i2v: inference_steps = 25
video_time *= 1.3 # 30% extra for I2V processing else:
inference_steps = 30 # Default
# Add safety margin for unpredictable factors
# This accounts for: thermal throttling, other processes, disk I/O, etc. # Total video time = frames * time_per_frame
video_time *= 1.2 # 20% safety margin # time_per_frame already accounts for diffusion steps
video_time = num_frames * time_per_frame
estimates["video_generation"] = video_time
# Add overhead for memory management, saving, etc.
video_time *= 1.5 # 50% overhead for I/O, memory ops, unexpected delays
# For I2V, add extra time for image encoding and conditioning
if has_i2v:
video_time *= 1.3 # 30% extra for I2V processing
# Add safety margin for unpredictable factors
# This accounts for: thermal throttling, other processes, disk I/O, etc.
video_time *= 1.2 # 20% safety margin
estimates["video_generation"] = video_time
# Upscaling (REALISTIC - can be slow for high-res) # Upscaling (REALISTIC - can be slow for high-res)
if has_upscale: if has_upscale:
...@@ -8001,29 +8028,33 @@ def main(args): ...@@ -8001,29 +8028,33 @@ def main(args):
main_prompt = ", ".join(args.prompt) main_prompt = ", ".join(args.prompt)
init_image = None init_image = None
# Detect if we should generate a static image (T2I mode)
# Conditions: T2I model, OR output ends with image extension, OR only prompt_image specified
is_t2i_model = m_info.get("class") in ["StableDiffusionXLPipeline", "FluxPipeline",
"StableDiffusion3Pipeline", "LuminaText2ImgPipeline",
"Lumina2Text2ImgPipeline"]
output_ext = os.path.splitext(args.output)[1].lower()
is_image_output = output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]
only_prompt_image = args.prompt_image and not args.prompt
generate_static_image = is_t2i_model or is_image_output or only_prompt_image
# Calculate and print time estimate # Calculate and print time estimate
has_i2v = args.image_to_video or args.image has_i2v = args.image_to_video or args.image
has_audio = args.generate_audio or args.audio_file has_audio = args.generate_audio or args.audio_file
has_lipsync = args.lip_sync has_lipsync = args.lip_sync
has_upscale = args.upscale has_upscale = args.upscale
has_t2i = generate_static_image and not has_i2v # T2I mode, not I2V
estimates = timing.estimate_total_time( estimates = timing.estimate_total_time(
args, m_info, args, m_info,
has_i2v=has_i2v, has_i2v=has_i2v,
has_audio=has_audio, has_audio=has_audio,
has_lipsync=has_lipsync, has_lipsync=has_lipsync,
has_upscale=has_upscale has_upscale=has_upscale,
has_t2i=has_t2i
) )
timing.print_estimate(estimates) timing.print_estimate(estimates)
# Detect if we should generate a static image (T2I mode)
# Conditions: T2I model, OR output ends with image extension, OR only prompt_image specified
is_t2i_model = m_info.get("class") in ["StableDiffusionXLPipeline", "FluxPipeline"]
output_ext = os.path.splitext(args.output)[1].lower()
is_image_output = output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]
only_prompt_image = args.prompt_image and not args.prompt
generate_static_image = is_t2i_model or is_image_output or only_prompt_image
# ─── T+I2I (Text + Image-to-Image) Mode ───────────────────────────────────── # ─── T+I2I (Text + Image-to-Image) Mode ─────────────────────────────────────
# Use existing image with T2I model to create modified image # Use existing image with T2I model to create modified image
if args.image_to_image and args.image: if args.image_to_image and args.image:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment