Fix time estimation for T2I models and suppress log output for JSON

- Move T2I detection before time estimation
- Add has_t2i parameter to estimate_total_time
- T2I models now show image_generation time instead of video_generation
- Add Lumina pipelines to T2I model detection
- Suppress 'Loaded models' message when --json flag is used
parent 6bd2bbd4
...@@ -2140,14 +2140,19 @@ def print_search_results(results, args): ...@@ -2140,14 +2140,19 @@ def print_search_results(results, args):
# Initialize MODELS from external config only
MODELS = {}

# Check if JSON output is requested (for suppressing log messages).
# NOTE(review): this scans raw argv because MODELS is built at import time,
# before argparse runs — presumably intentional; confirm against the CLI setup.
_json_output = "--json" in sys.argv

# Load external models config
_external_models = load_models_config()
if _external_models:
    MODELS = _external_models
    if not _json_output:
        print(f"📁 Loaded {len(_external_models)} models from {MODELS_CONFIG_FILE}")
elif not _json_output:
    # No config found: point the user at the two ways to populate it.
    # Plain strings here — no interpolation needed, so no f-prefix.
    print("⚠️  No models configured. Run: videogen --update-models")
    print("   Or add a model: videogen --add-model <model_id> --name <name>")
# ────────────────────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────────────────────
# TTS VOICE REGISTRY # TTS VOICE REGISTRY
...@@ -2379,7 +2384,7 @@ class TimingTracker: ...@@ -2379,7 +2384,7 @@ class TimingTracker:
return load_factor return load_factor
def estimate_total_time(self, args, m_info, has_i2v=False, has_audio=False, has_lipsync=False, has_upscale=False): def estimate_total_time(self, args, m_info, has_i2v=False, has_audio=False, has_lipsync=False, has_upscale=False, has_t2i=False):
"""Estimate total generation time based on parameters and hardware """Estimate total generation time based on parameters and hardware
This provides MORE REALISTIC estimates that account for: This provides MORE REALISTIC estimates that account for:
...@@ -2551,36 +2556,58 @@ class TimingTracker: ...@@ -2551,36 +2556,58 @@ class TimingTracker:
audio_time = args.length * 2 + 10 # MusicGen takes time audio_time = args.length * 2 + 10 # MusicGen takes time
estimates["audio_generation"] = audio_time estimates["audio_generation"] = audio_time
# Video generation (REALISTIC) # Video generation (REALISTIC) - only for video models, not T2I
num_frames = int(args.length * args.fps) if has_t2i:
# For T2I models, estimate image generation time instead of video
# Get actual inference steps for the model # Get base time per image from model class
# Most video models use 20-50 steps model_class = m_info.get("class", "")
if "wan" in args.model.lower(): if "FluxPipeline" in model_class:
inference_steps = 50 img_time = 45 # Flux is slow (~20-30 steps)
elif "svd" in args.model.lower() or "stable-video" in args.model.lower(): elif "StableDiffusion3Pipeline" in model_class:
inference_steps = 25 img_time = 20 # SD3 is moderate
elif "ltx" in args.model.lower(): elif "StableDiffusionXLPipeline" in model_class:
inference_steps = 25 img_time = 15 # SDXL is faster
elif "LuminaText2ImgPipeline" in model_class or "Lumina2Text2ImgPipeline" in model_class:
img_time = 25 # Lumina is moderate
else:
img_time = 30 # Default for unknown models
# Scale by resolution
img_time *= (args.width * args.height) / (1024 * 1024)
img_time *= perf_multiplier # Apply GPU performance
estimates["image_generation"] = img_time
else: else:
inference_steps = 30 # Default # Video generation
num_frames = int(args.length * args.fps)
# Total video time = frames * time_per_frame
# time_per_frame already accounts for diffusion steps # Get actual inference steps for the model
video_time = num_frames * time_per_frame # Most video models use 20-50 steps
if "wan" in args.model.lower():
# Add overhead for memory management, saving, etc. inference_steps = 50
video_time *= 1.5 # 50% overhead for I/O, memory ops, unexpected delays elif "svd" in args.model.lower() or "stable-video" in args.model.lower():
inference_steps = 25
# For I2V, add extra time for image encoding and conditioning elif "ltx" in args.model.lower():
if has_i2v: inference_steps = 25
video_time *= 1.3 # 30% extra for I2V processing else:
inference_steps = 30 # Default
# Add safety margin for unpredictable factors
# This accounts for: thermal throttling, other processes, disk I/O, etc. # Total video time = frames * time_per_frame
video_time *= 1.2 # 20% safety margin # time_per_frame already accounts for diffusion steps
video_time = num_frames * time_per_frame
estimates["video_generation"] = video_time
# Add overhead for memory management, saving, etc.
video_time *= 1.5 # 50% overhead for I/O, memory ops, unexpected delays
# For I2V, add extra time for image encoding and conditioning
if has_i2v:
video_time *= 1.3 # 30% extra for I2V processing
# Add safety margin for unpredictable factors
# This accounts for: thermal throttling, other processes, disk I/O, etc.
video_time *= 1.2 # 20% safety margin
estimates["video_generation"] = video_time
# Upscaling (REALISTIC - can be slow for high-res) # Upscaling (REALISTIC - can be slow for high-res)
if has_upscale: if has_upscale:
...@@ -8001,29 +8028,33 @@ def main(args): ...@@ -8001,29 +8028,33 @@ def main(args):
main_prompt = ", ".join(args.prompt) main_prompt = ", ".join(args.prompt)
init_image = None init_image = None
# Detect if we should generate a static image (T2I mode)
# Conditions: T2I model, OR output ends with image extension, OR only prompt_image specified
is_t2i_model = m_info.get("class") in ["StableDiffusionXLPipeline", "FluxPipeline",
"StableDiffusion3Pipeline", "LuminaText2ImgPipeline",
"Lumina2Text2ImgPipeline"]
output_ext = os.path.splitext(args.output)[1].lower()
is_image_output = output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]
only_prompt_image = args.prompt_image and not args.prompt
generate_static_image = is_t2i_model or is_image_output or only_prompt_image
# Calculate and print time estimate # Calculate and print time estimate
has_i2v = args.image_to_video or args.image has_i2v = args.image_to_video or args.image
has_audio = args.generate_audio or args.audio_file has_audio = args.generate_audio or args.audio_file
has_lipsync = args.lip_sync has_lipsync = args.lip_sync
has_upscale = args.upscale has_upscale = args.upscale
has_t2i = generate_static_image and not has_i2v # T2I mode, not I2V
estimates = timing.estimate_total_time( estimates = timing.estimate_total_time(
args, m_info, args, m_info,
has_i2v=has_i2v, has_i2v=has_i2v,
has_audio=has_audio, has_audio=has_audio,
has_lipsync=has_lipsync, has_lipsync=has_lipsync,
has_upscale=has_upscale has_upscale=has_upscale,
has_t2i=has_t2i
) )
timing.print_estimate(estimates) timing.print_estimate(estimates)
# Detect if we should generate a static image (T2I mode)
# Conditions: T2I model, OR output ends with image extension, OR only prompt_image specified
is_t2i_model = m_info.get("class") in ["StableDiffusionXLPipeline", "FluxPipeline"]
output_ext = os.path.splitext(args.output)[1].lower()
is_image_output = output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]
only_prompt_image = args.prompt_image and not args.prompt
generate_static_image = is_t2i_model or is_image_output or only_prompt_image
# ─── T+I2I (Text + Image-to-Image) Mode ───────────────────────────────────── # ─── T+I2I (Text + Image-to-Image) Mode ─────────────────────────────────────
# Use existing image with T2I model to create modified image # Use existing image with T2I model to create modified image
if args.image_to_image and args.image: if args.image_to_image and args.image:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment