#!/usr/bin/env python3
"""
VideoGen - Universal Video Generation Toolkit (2026 Edition)
============================================================

Copyleft © 2026 Stefy <stefy@nexlab.net>
Licensed under GNU General Public License v3.0 or later

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Supports T2V, I2V chaining, post-upscale, multi-GPU distribute, many offload strategies
PLUS: Audio generation, audio sync, lip sync, and audio prompting

INSTALLATION:
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 --break-system-packages
pip install git+https://github.com/huggingface/diffusers.git --break-system-packages
pip install git+https://github.com/huggingface/transformers.git --break-system-packages
pip install --upgrade accelerate xformers spandrel psutil ffmpeg-python ftfy --break-system-packages

AUDIO FEATURES (optional):
pip install scipy soundfile librosa --break-system-packages
pip install git+https://github.com/suno-ai/bark.git --break-system-packages
pip install edge-tts --break-system-packages  # Lightweight TTS alternative
pip install audiocraft  # For MusicGen

LIP SYNC (optional):
pip install opencv-python face-recognition dlib --break-system-packages
# Wav2Lip needs to be cloned: git clone https://github.com/Rudrabha/Wav2Lip.git
"""

import warnings
warnings.filterwarnings("ignore", message="The `local_dir_use_symlinks` argument is deprecated")

import torch
import argparse
import os
import math
import random
import sys
import psutil
import re
import subprocess
import tempfile
import json
import urllib.request
import urllib.error
import time
from datetime import datetime, timedelta
from pathlib import Path
from PIL import Image

try:
    from diffusers.utils import export_to_video, load_image
    from diffusers import (
        AutoencoderKLWan,
        UniPCMultistepScheduler,
        StableDiffusionUpscalePipeline,
    )
except ImportError as e:
    print(f"Critical import error: {e}")
    sys.exit(1)

# ──────────────────────────────────────────────────────────────────────────────
#                                 AUDIO IMPORTS
# ──────────────────────────────────────────────────────────────────────────────

AUDIO_AVAILABLE = False
BARK_AVAILABLE = False
EDGE_TTS_AVAILABLE = False
MUSICGEN_AVAILABLE = False
LIBROSA_AVAILABLE = False
SCIPY_AVAILABLE = False

try:
    import scipy
    import soundfile as sf
    SCIPY_AVAILABLE = True
except ImportError:
    pass

try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    pass

try:
    from bark import SAMPLE_RATE as BARK_SAMPLE_RATE
    from bark.generation import generate_audio_semantic, preload_models
    from bark.api import semantic_to_waveform
    from bark.api import generate_audio as bark_generate_audio
    BARK_AVAILABLE = True
    AUDIO_AVAILABLE = True
except ImportError:
    pass

try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
    AUDIO_AVAILABLE = True
except ImportError:
    pass

try:
    from audiocraft.models import MusicGen
    from audiocraft.data.audio import audio_write
    MUSICGEN_AVAILABLE = True
    AUDIO_AVAILABLE = True
except ImportError:
    pass

# NSFW text classification
TRANSFORMERS_AVAILABLE = False
NSFW_CLASSIFIER = None

try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    pass

# ──────────────────────────────────────────────────────────────────────────────
#                                 CONFIG & MODEL MANAGEMENT
# ──────────────────────────────────────────────────────────────────────────────

# Per-user configuration lives under ~/.config/videogen
CONFIG_DIR = Path.home() / ".config" / "videogen"
# User-editable model registry, read/written by load_models_config / save_models_config
MODELS_CONFIG_FILE = CONFIG_DIR / "models.json"
# HF cache file (name suggests cached HuggingFace API data — usage not shown in this chunk)
CACHE_FILE = CONFIG_DIR / "hf_cache.json"

# Pipeline class to model type mapping
# Maps a diffusers pipeline class name to its generation type
# ("t2v" text-to-video, "i2v" image-to-video, "video" generic video,
# "image" text-to-image, "auto" detected at load time) plus a rough
# VRAM estimate string used as a display default when adding models.
PIPELINE_CLASS_MAP = {
    "StableVideoDiffusionPipeline": {"type": "i2v", "default_vram": "~14-18 GB"},
    "WanPipeline": {"type": "video", "default_vram": "~10-24 GB"},
    "I2VGenXLPipeline": {"type": "i2v", "default_vram": "~18-24 GB"},
    "LTXVideoPipeline": {"type": "i2v", "default_vram": "~12-16 GB"},
    "AnimateDiffPipeline": {"type": "video", "default_vram": "~10-14 GB"},
    "TextToVideoSDPipeline": {"type": "t2v", "default_vram": "~7-9 GB"},
    "TextToVideoZeroPipeline": {"type": "t2v", "default_vram": "~6-8 GB"},
    "MochiPipeline": {"type": "t2v", "default_vram": "~18-22 GB"},
    "StableDiffusionXLPipeline": {"type": "image", "default_vram": "~10-16 GB"},
    "FluxPipeline": {"type": "image", "default_vram": "~20-25 GB"},
    "AllegroPipeline": {"type": "t2v", "default_vram": "~35-45 GB"},
    "HunyuanDiTPipeline": {"type": "t2v", "default_vram": "~40-55 GB"},
    "OpenSoraPipeline": {"type": "video", "default_vram": "~45-65 GB"},
    "LuminaVideoPipeline": {"type": "t2v", "default_vram": "~100 GB"},
    "StepVideoPipeline": {"type": "t2v", "default_vram": "~90-140 GB"},
    "CogVideoXPipeline": {"type": "t2v", "default_vram": "~20-30 GB"},
    "HotshotXLPipeline": {"type": "video", "default_vram": "~8-12 GB"},
    # Generic pipeline - auto-detects model type from loaded model
    "DiffusionPipeline": {"type": "auto", "default_vram": "~10-30 GB"},
}


def ensure_config_dir():
    """Create the user config directory (and any missing parents)."""
    os.makedirs(CONFIG_DIR, exist_ok=True)


def load_models_config():
    """Load the user's model registry from the external config file.

    Returns:
        The ``"models"`` dict from the config file (empty dict if the key
        is absent), or ``None`` when the file does not exist or cannot be
        read/parsed.
    """
    ensure_config_dir()
    
    if MODELS_CONFIG_FILE.exists():
        try:
            # Explicit utf-8: the file is JSON written by save_models_config;
            # the platform default encoding (e.g. cp1252 on Windows) could
            # otherwise mis-decode it.
            with open(MODELS_CONFIG_FILE, 'r', encoding='utf-8') as f:
                config = json.load(f)
                return config.get("models", {})
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError; failure is non-fatal —
            # callers fall back to defaults.
            print(f"⚠️  Could not load models config: {e}")
    
    return None


def save_models_config(models):
    """Persist the model registry to the external config file.

    Args:
        models: dict of model-name -> model-entry to write under the
            ``"models"`` key. A ``"version"`` field is stored alongside.
    """
    ensure_config_dir()
    
    try:
        # Explicit utf-8 so the file round-trips with load_models_config
        # regardless of the platform default encoding.
        with open(MODELS_CONFIG_FILE, 'w', encoding='utf-8') as f:
            json.dump({"models": models, "version": "1.0"}, f, indent=2)
        print(f"✅ Saved models config to {MODELS_CONFIG_FILE}")
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: non-serializable content; OSError: disk/permissions.
        print(f"❌ Could not save models config: {e}")


def validate_hf_model(model_id, hf_token=None, debug=False):
    """Validate that a HuggingFace model exists and fetch its API metadata.

    Args:
        model_id: HF model ID ("org/model-name").
        hf_token: Optional bearer token for gated/private models.
        debug: When True, print verbose request/response diagnostics.

    Returns:
        The decoded model-info dict from the HF API on success, or None on
        any failure (auth required, not found, network error).
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    
    if debug:
        print(f"\n🔍 [DEBUG] Validating model: {model_id}")
        # Only the last 4 token characters are shown, to avoid leaking it
        print(f"   [DEBUG] HF Token: {'***' + hf_token[-4:] if hf_token else 'Not set'}")
    
    try:
        url = f"https://huggingface.co/api/models/{model_id}"
        if debug:
            print(f"   [DEBUG] API URL: {url}")
        
        req = urllib.request.Request(url, headers=headers)
        
        if debug:
            print(f"   [DEBUG] Sending request...")
        
        with urllib.request.urlopen(req, timeout=10) as response:
            if debug:
                print(f"   [DEBUG] Response status: {response.status}")
            data = json.loads(response.read().decode())
            if debug:
                print(f"   [DEBUG] Model found! Tags: {data.get('tags', [])[:5]}")
            return data
    except urllib.error.HTTPError as e:
        if debug:
            print(f"   [DEBUG] HTTP Error: {e.code} - {e.reason}")
            print(f"   [DEBUG] Response headers: {dict(e.headers)}")
            # Best-effort dump of the error body; the stream may be unreadable.
            # BUGFIX: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit.
            try:
                error_body = e.read().decode()
                print(f"   [DEBUG] Response body: {error_body[:500]}")
            except Exception:
                pass
        
        if e.code == 401:
            print(f"❌ Model {model_id} requires authentication. Set HF_TOKEN environment variable.")
        elif e.code == 404:
            print(f"❌ Model {model_id} not found on HuggingFace.")
            if debug:
                print(f"   [DEBUG] The model ID may be incorrect or the model may have been removed.")
                print(f"   [DEBUG] Check the URL: https://huggingface.co/{model_id}")
        else:
            print(f"❌ HTTP error {e.code} for {model_id}")
        return None
    except urllib.error.URLError as e:
        # DNS failure, refused connection, timeout, etc.
        if debug:
            print(f"   [DEBUG] URL Error: {e.reason}")
        print(f"❌ Network error validating {model_id}: {e.reason}")
        return None
    except Exception as e:
        if debug:
            print(f"   [DEBUG] Unexpected error: {type(e).__name__}: {e}")
        print(f"❌ Error validating {model_id}: {e}")
        return None


def detect_pipeline_class(model_info):
    """Detect the diffusers pipeline class from HF model-info metadata.

    Checks, in order: known substrings in the (lowercased) model ID with
    the most specific patterns first, then the model tags, then the
    library name as a generic fallback.

    Args:
        model_info: dict as returned by the HF models API ("id", "tags",
            "library_name" keys are consulted).

    Returns:
        The pipeline class name as a string, or None when nothing matches.
    """
    tags = model_info.get("tags", [])
    library_name = model_info.get("library_name", "")
    model_id = model_info.get("id", "").lower()
    
    # ── Model-ID substring checks (most specific first) ──
    if "svd" in model_id or "stable-video-diffusion" in model_id:
        return "StableVideoDiffusionPipeline"
    if "wan" in model_id and ("i2v" in model_id or "t2v" in model_id):
        return "WanPipeline"
    if "wan2" in model_id:
        return "WanPipeline"
    if "i2vgen" in model_id:
        return "I2VGenXLPipeline"
    if "ltx-video" in model_id or "ltxvideo" in model_id:
        return "LTXVideoPipeline"
    if "animatediff" in model_id:
        return "AnimateDiffPipeline"
    if "mochi" in model_id:
        return "MochiPipeline"
    if "flux" in model_id:
        return "FluxPipeline"
    if "pony" in model_id or "animagine" in model_id:
        return "StableDiffusionXLPipeline"
    if "sdxl" in model_id:
        return "StableDiffusionXLPipeline"
    if "allegro" in model_id:
        return "AllegroPipeline"
    if "hunyuan" in model_id:
        return "HunyuanDiTPipeline"
    if "open-sora" in model_id or "opensora" in model_id:
        return "OpenSoraPipeline"
    if "cogvideox" in model_id:
        return "CogVideoXPipeline"
    if "hotshot" in model_id:
        return "HotshotXLPipeline"
    if "zeroscope" in model_id:
        return "TextToVideoZeroPipeline"
    if "modelscope" in model_id or "text-to-video-ms" in model_id:
        return "TextToVideoSDPipeline"
    # BUGFIX: the bare "xl" check previously ran before the hotshot /
    # zeroscope / modelscope checks, so ids like "hotshot-xl" were
    # misclassified as SDXL. It now runs only after every more specific
    # ID pattern has had a chance to match.
    if "xl" in model_id:
        return "StableDiffusionXLPipeline"
    # Check for qwen or other diffusers models that use generic DiffusionPipeline
    if "qwen" in model_id or "diffusers" in model_id:
        return "DiffusionPipeline"
    
    # ── Tag-based checks ──
    if "video" in tags:
        if "image-to-video" in tags:
            return "StableVideoDiffusionPipeline"
        return "WanPipeline"
    if "text-to-image" in tags:
        return "StableDiffusionXLPipeline"
    
    # ── Library fallback: any diffusers-compatible model ──
    if library_name == "diffusers":
        # Use generic DiffusionPipeline for diffusers models
        # This allows loading any diffusers-compatible model
        return "DiffusionPipeline"
    
    return None


def parse_hf_url_or_id(input_str):
    """Normalize a HuggingFace model reference to a bare model ID.
    
    Accepts:
    - lopi999/Wan2.2-I2V_General-NSFW-LoRA
    - https://huggingface.co/lopi999/Wan2.2-I2V_General-NSFW-LoRA
    - https://huggingface.co/lopi999/Wan2.2-I2V_General-NSFW-LoRA/tree/main
    - https://huggingface.co/lopi999/Wan2.2-I2V_General-NSFW-LoRA?some=params
    """
    ref = input_str.strip()
    
    # Already a bare "org/model-name" ID — nothing to parse.
    if "/" in ref and not ref.startswith("http"):
        return ref
    
    if ref.startswith("http") and "huggingface.co/" in ref:
        # Take everything after the domain, minus query string and fragment.
        _, _, tail = ref.partition("huggingface.co/")
        tail = tail.split("?", 1)[0].split("#", 1)[0]
        segments = tail.split("/")
        if len(segments) >= 2:
            # The first two path segments form the "org/model" ID; this also
            # drops trailing /tree/main, /blob/main, etc.
            return f"{segments[0]}/{segments[1]}"
    
    # Could not parse — hand back the (stripped) input unchanged.
    return ref


def add_model_from_hf(model_id_or_url, name=None, hf_token=None, debug=False):
    """Add a model from HuggingFace to the config.
    
    Accepts both model IDs (org/model-name) and HuggingFace URLs.

    Args:
        model_id_or_url: HF model ID or full huggingface.co URL.
        name: Optional registry name; derived from the model ID when omitted.
        hf_token: Optional HF API token for gated/private models.
        debug: When True, print verbose validation diagnostics.

    Returns:
        (name, model_entry) tuple on success, or None if validation failed.
    """
    # Parse URL or model ID
    model_id = parse_hf_url_or_id(model_id_or_url)
    
    print(f"🔍 Validating model: {model_id}")
    if model_id != model_id_or_url:
        print(f"   (parsed from URL: {model_id_or_url})")
    
    model_info = validate_hf_model(model_id, hf_token, debug=debug)
    if not model_info:
        return None
    
    # Detect pipeline class
    pipeline_class = detect_pipeline_class(model_info)
    if not pipeline_class:
        print(f"⚠️  Could not auto-detect pipeline class for {model_id}")
        print(f"   Available classes: {', '.join(PIPELINE_CLASS_MAP.keys())}")
        pipeline_class = "WanPipeline"  # Default fallback
    
    # Derive a registry-safe name from the last model-ID segment when not given
    if not name:
        name = model_id.split("/")[-1].lower().replace("-", "_").replace(".", "_")
    
    # Determine if I2V
    tags = model_info.get("tags", [])
    is_i2v = any(t in tags for t in ["image-to-video", "i2v"]) or "i2v" in model_id.lower()
    
    # Get VRAM estimate
    vram_est = PIPELINE_CLASS_MAP.get(pipeline_class, {}).get("default_vram", "~10-20 GB")
    
    # Check for NSFW indicators.
    # Robustness: "description" may be missing or explicitly None in the API
    # payload — normalize to "" so .lower()/slicing never crash.
    description = model_info.get("description") or ""
    nsfw_keywords = ["nsfw", "adult", "uncensored", "porn", "explicit", "nude", "erotic"]
    is_nsfw = any(kw in model_id.lower() or kw in description.lower() for kw in nsfw_keywords)
    
    # Detect if this is a LoRA
    is_lora = "lora" in model_id.lower() or any(t in tags for t in ["lora", "LoRA"])
    base_model = None
    
    if is_lora:
        # Try to detect base model for LoRA
        if "wan" in model_id.lower():
            base_model = "Wan-AI/Wan2.1-I2V-14B-Diffusers" if is_i2v else "Wan-AI/Wan2.1-T2V-14B-Diffusers"
        elif "svd" in model_id.lower() or "stable-video" in model_id.lower():
            base_model = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
        elif "sdxl" in model_id.lower() or "xl" in model_id.lower():
            base_model = "stabilityai/stable-diffusion-xl-base-1.0"
    
    # Build model entry
    model_entry = {
        "id": model_id,
        "vram": vram_est,
        "class": pipeline_class,
        "desc": (description or f"Model from {model_id}")[:100],
        "supports_i2v": is_i2v,
        "tags": tags[:10] if tags else [],
        "validated": True,
        # BUGFIX: previously recorded str(Path.cwd()) — the current working
        # directory — instead of an actual timestamp.
        "added_date": datetime.now().isoformat(),
        "is_lora": is_lora,
    }
    
    if base_model:
        model_entry["base_model"] = base_model
    
    # Add extra config for Wan models
    if "WanPipeline" in pipeline_class and not is_lora:
        model_entry["extra"] = {"use_custom_vae": True}
    
    print(f"✅ Model validated: {name}")
    print(f"   Pipeline: {pipeline_class}")
    print(f"   VRAM: {vram_est}")
    print(f"   I2V: {is_i2v}")
    print(f"   NSFW-friendly: {is_nsfw}")
    
    return name, model_entry


def search_hf_models(query, limit=20, hf_token=None):
    """Query the HuggingFace models API for diffusers models matching *query*.

    Returns a list of summary dicts (id, downloads, likes, tags, type flags
    and detected pipeline class); returns an empty list on any failure.
    """
    print(f"🔍 Searching HuggingFace for: {query}")
    
    headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
    
    try:
        # The free-text query must be URL-encoded before insertion in the URL.
        import urllib.parse
        search_url = (
            "https://huggingface.co/api/models"
            f"?search={urllib.parse.quote(query)}&limit={limit}&filter=diffusers"
        )
        request = urllib.request.Request(search_url, headers=headers)
        
        with urllib.request.urlopen(request, timeout=30) as response:
            models = json.loads(response.read().decode())
        
        nsfw_keywords = ["nsfw", "adult", "uncensored", "porn", "explicit"]
        results = []
        for entry in models:
            entry_id = entry.get("id", "")
            entry_tags = entry.get("tags", [])
            lowered = entry_id.lower()
            
            results.append({
                "id": entry_id,
                "downloads": entry.get("downloads", 0),
                "likes": entry.get("likes", 0),
                "tags": entry_tags,
                # Type flags derived from the repo tags
                "is_i2v": any(t in entry_tags for t in ["image-to-video", "i2v"]),
                "is_video": "video" in entry_tags or "text-to-video" in entry_tags,
                "is_image": "text-to-image" in entry_tags,
                # NSFW flag derived from the model ID only
                "is_nsfw": any(kw in lowered for kw in nsfw_keywords),
                "pipeline_class": detect_pipeline_class(entry) or "Unknown",
            })
        
        return results
    except Exception as e:
        print(f"❌ Search failed: {e}")
        return []


def search_hf_safetensors(query, limit=20, hf_token=None):
    """Search HuggingFace for repos that ship .safetensors weight files.

    Unlike search_hf_models, this does not filter on the diffusers library,
    so community fine-tunes and merged checkpoints are included. Returns a
    list of summary dicts, or an empty list on failure.
    """
    print(f"🔍 Searching HuggingFace safetensors for: {query}")
    
    headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}
    
    try:
        # URL-encode the free-text query before building the API URL.
        import urllib.parse
        search_url = (
            "https://huggingface.co/api/models"
            f"?search={urllib.parse.quote(query)}&limit={limit}"
        )
        request = urllib.request.Request(search_url, headers=headers)
        
        with urllib.request.urlopen(request, timeout=30) as response:
            models = json.loads(response.read().decode())
        
        results = []
        for entry in models:
            entry_id = entry.get("id", "")
            
            # Keep only repos that actually contain .safetensors files.
            weight_files = [
                s.get("rfilename", "")
                for s in entry.get("siblings", [])
                if s.get("rfilename", "").endswith(".safetensors")
            ]
            if not weight_files:
                continue
            
            entry_tags = entry.get("tags", [])
            lowered = entry_id.lower()
            
            # Map well-known name fragments to a pipeline class (first match wins).
            pipeline_class = "Unknown"
            for fragments, klass in (
                (("wan",), "WanPipeline"),
                (("svd", "stable-video"), "StableVideoDiffusionPipeline"),
                (("mochi",), "MochiPipeline"),
                (("flux",), "FluxPipeline"),
                (("sdxl",), "StableDiffusionXLPipeline"),
            ):
                if any(fragment in lowered for fragment in fragments):
                    pipeline_class = klass
                    break
            
            results.append({
                "id": entry_id,
                "safetensor_files": weight_files,
                "downloads": entry.get("downloads", 0),
                "likes": entry.get("likes", 0),
                "tags": entry_tags,
                "is_i2v": (any(t in entry_tags for t in ["image-to-video", "i2v"])
                           or "i2v" in lowered),
                "is_video": ("video" in entry_tags or "text-to-video" in entry_tags
                             or any(x in lowered for x in ["wan", "svd", "video", "mochi", "cogvideo"])),
                "is_nsfw": any(kw in lowered for kw in ["nsfw", "adult", "uncensored", "porn", "explicit", "xxx"]),
                "pipeline_class": pipeline_class,
                "is_safetensors": True,
            })
        
        return results
    except Exception as e:
        print(f"❌ Safetensors search failed: {e}")
        return []


def update_all_models(hf_token=None):
    """Search and update model list with I2V, T2V, and NSFW models from HuggingFace
    
    Preserves existing local/cached models even if not found online.
    Includes both diffusers models and safetensors files.
    """
    print("🔄 Updating model database from HuggingFace...")
    print("=" * 60)
    
    # Load existing models to preserve them
    existing_models = load_models_config() or {}
    print(f"📁 Preserving {len(existing_models)} existing local models")
    
    # Search queries for different model types
    search_queries = [
        # ═══════════════════════════════════════════════════════════════
        # I2V (Image-to-Video) Models
        # ═══════════════════════════════════════════════════════════════
        ("image-to-video", 50),
        ("i2v", 50),
        ("i2v video", 30),
        ("stable video diffusion", 30),
        ("svd", 30),
        ("svd xt", 20),
        ("svd 1.1", 20),
        ("wan i2v", 30),
        ("wan2 i2v", 30),
        ("wan2.1 i2v", 30),
        ("wan2.2 i2v", 30),
        ("ltx video", 30),
        ("ltxvideo", 30),
        ("i2vgen", 30),
        ("i2vgen xl", 20),
        ("animate diff i2v", 20),
        ("animatediff i2v", 20),
        ("img2vid", 30),
        ("image to video", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # T2V (Text-to-Video) Models - Small/Medium
        # ═══════════════════════════════════════════════════════════════
        ("text-to-video", 50),
        ("t2v", 50),
        ("video generation", 40),
        ("video diffusion", 40),
        ("wan t2v", 30),
        ("wan2 t2v", 30),
        ("wan2.1 t2v", 30),
        ("wan2.2 t2v", 30),
        ("zeroscope", 30),
        ("modelscope video", 30),
        ("cogvideo", 30),
        ("cogvideox", 30),
        ("hotshot xl", 20),
        ("hotshot video", 20),
        ("animatediff", 40),
        ("animate diff", 30),
        ("modelscope", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # T2V (Text-to-Video) Models - Large/Huge (40GB+)
        # ═══════════════════════════════════════════════════════════════
        ("mochi", 30),
        ("mochi 1", 20),
        ("mochi video", 20),
        ("hunyuan video", 30),
        ("hunyuanvideo", 30),
        ("open sora", 30),
        ("opensora", 30),
        ("open-sora", 30),
        ("sora", 20),
        ("allegro video", 20),
        ("allegro", 20),
        ("step video", 20),
        ("stepvideo", 20),
        ("lumina video", 20),
        ("luminavideo", 20),
        ("cogvideox 5b", 20),
        ("cogvideox 2b", 20),
        ("latte video", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # T2I (Text-to-Image) Models - SD/SDXL
        # ═══════════════════════════════════════════════════════════════
        ("stable diffusion xl", 40),
        ("sdxl", 50),
        ("sdxl base", 30),
        ("stable diffusion 1.5", 30),
        ("sd 1.5", 30),
        ("sd2.1", 30),
        ("stable diffusion 2", 30),
        ("dreamshaper", 30),
        ("deliberate", 30),
        ("realistic vision", 30),
        ("juggernaut xl", 30),
        ("cyberrealistic", 30),
        ("epic realism", 30),
        ("majicmix", 30),
        ("realcartoon", 30),
        ("anything v5", 20),
        ("anything v4", 20),
        ("counterfeit", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # T2I (Text-to-Image) Models - Flux
        # ═══════════════════════════════════════════════════════════════
        ("flux", 50),
        ("flux.1", 40),
        ("flux dev", 30),
        ("flux schnell", 30),
        ("flux fill", 20),
        ("flux realism", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # T2I (Text-to-Image) Models - Pony/Anime
        # ═══════════════════════════════════════════════════════════════
        ("pony diffusion", 40),
        ("pony xl", 40),
        ("pony v6", 30),
        ("pony v5", 20),
        ("pony realism", 30),
        ("animagine", 30),
        ("animagine xl", 30),
        ("novelai", 20),
        ("nai diffusion", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # NSFW Models - General
        # ═══════════════════════════════════════════════════════════════
        ("nsfw", 50),
        ("nsfw diffusers", 40),
        ("uncensored", 50),
        ("uncensored model", 40),
        ("adult", 50),
        ("adult diffusion", 40),
        ("porn", 50),
        ("porn diffusion", 40),
        ("xxx", 40),
        ("explicit", 40),
        ("nude", 40),
        ("erotic", 40),
        ("hentai", 40),
        ("hentai diffusion", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # NSFW Models - Flux
        # ═══════════════════════════════════════════════════════════════
        ("flux nsfw", 30),
        ("flux uncensored", 30),
        ("flux adult", 30),
        ("flux porn", 20),
        ("flux realistic nsfw", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # NSFW Models - SDXL
        # ═══════════════════════════════════════════════════════════════
        ("sdxl nsfw", 30),
        ("sdxl uncensored", 30),
        ("sdxl adult", 30),
        ("sdxl porn", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # NSFW Models - Pony
        # ═══════════════════════════════════════════════════════════════
        ("pony nsfw", 30),
        ("pony uncensored", 30),
        ("pony adult", 30),
        ("pony porn", 30),
        ("pony xxx", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # NSFW Models - Video
        # ═══════════════════════════════════════════════════════════════
        ("video nsfw", 30),
        ("i2v nsfw", 30),
        ("t2v nsfw", 30),
        ("svd nsfw", 30),
        ("wan nsfw", 30),
        ("mochi nsfw", 20),
        ("animatediff nsfw", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # Audio Models - TTS
        # ═══════════════════════════════════════════════════════════════
        ("tts", 40),
        ("text to speech", 40),
        ("speech synthesis", 30),
        ("bark", 30),
        ("vits", 30),
        ("tortoise tts", 20),
        ("coqui tts", 20),
        ("styletts", 20),
        ("f5 tts", 20),
        ("cosyvoice", 20),
        ("chat tts", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # Audio Models - Music/Sound
        # ═══════════════════════════════════════════════════════════════
        ("music generation", 40),
        ("musicgen", 40),
        ("audio generation", 40),
        ("audio diffusion", 30),
        ("audioldm", 30),
        ("riffusion", 20),
        ("stable audio", 30),
        ("audio lcm", 20),
        ("sound generation", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # Audio Models - Voice/Speech
        # ═══════════════════════════════════════════════════════════════
        ("voice cloning", 30),
        ("voice conversion", 30),
        ("rvc", 30),
        ("so-vits", 30),
        ("sovits", 30),
        ("open voice", 20),
        ("xtts", 20),
        ("whisper", 30),
        ("speech to text", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # LoRA Adapters
        # ═══════════════════════════════════════════════════════════════
        ("lora", 50),
        ("lora video", 30),
        ("lora i2v", 30),
        ("lora t2v", 30),
        ("lora nsfw", 30),
        ("wan lora", 30),
        ("svd lora", 30),
        ("flux lora", 30),
        ("sdxl lora", 30),
        
        # ═══════════════════════════════════════════════════════════════
        # Community Safetensors
        # ═══════════════════════════════════════════════════════════════
        ("wan2.2", 30),
        ("wan rapid", 20),
        ("wan aio", 20),
        ("wan finetune", 20),
        ("svd finetune", 20),
        ("video finetune", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # Small/Lightweight Models (<10GB)
        # ═══════════════════════════════════════════════════════════════
        ("tiny model", 30),
        ("small model", 30),
        ("lightweight", 30),
        ("mobile diffusion", 20),
        ("sd turbo", 20),
        ("sdxl turbo", 20),
        ("latent consistency", 20),
        ("lcm", 30),
        ("lcm video", 20),
        
        # ═══════════════════════════════════════════════════════════════
        # Upscale/Enhancement
        # ═══════════════════════════════════════════════════════════════
        ("upscale", 30),
        ("upscaler", 30),
        ("super resolution", 30),
        ("video upscale", 20),
        ("esrgan", 20),
        ("real esrgan", 20),
        ("swinir", 20),
    ]
    
    # Known large/huge models to explicitly include (without requiring validation)
    # These are added regardless of HuggingFace search results
    known_large_models = [
        # ═══════════════════════════════════════════════════════════════
        # 100GB+ models - Ultra High VRAM
        # ═══════════════════════════════════════════════════════════════
        ("Alpha-VLLM/Lumina-Next-SFT", "LuminaVideoPipeline", "~100 GB", "Lumina Next SFT - Ultra high quality T2I/T2V"),
        ("Alpha-VLLM/Lumina-T2X", "LuminaVideoPipeline", "~100 GB", "Lumina T2X - Text to any"),
        
        # ═══════════════════════════════════════════════════════════════
        # 90-140GB models - Extreme VRAM
        # ═══════════════════════════════════════════════════════════════
        ("stepvideo/Step-Video-T2V", "StepVideoPipeline", "~90-140 GB", "Step Video T2V - Extreme quality"),
        
        # ═══════════════════════════════════════════════════════════════
        # 45-65GB models - Very High VRAM
        # ═══════════════════════════════════════════════════════════════
        ("hpcai-tech/Open-Sora", "OpenSoraPipeline", "~45-65 GB", "Open Sora - Open source Sora alternative"),
        ("hpcai-tech/OpenSora-STDiT-v2", "OpenSoraPipeline", "~45-65 GB", "OpenSora STDiT v2"),
        ("hpcai-tech/Open-Sora-1.2", "OpenSoraPipeline", "~45-65 GB", "OpenSora 1.2"),
        ("hpcai-tech/Open-Sora-plan-v1.2.0", "OpenSoraPipeline", "~45-65 GB", "OpenSora Plan v1.2.0"),
        
        # ═══════════════════════════════════════════════════════════════
        # 40-55GB models - High VRAM
        # ═══════════════════════════════════════════════════════════════
        ("tencent/HunyuanVideo", "HunyuanDiTPipeline", "~40-55 GB", "Tencent HunyuanVideo"),
        ("Tencent-Hunyuan/HunyuanVideo", "HunyuanDiTPipeline", "~40-55 GB", "Tencent HunyuanVideo (alt)"),
        
        # ═══════════════════════════════════════════════════════════════
        # 35-45GB models
        # ═══════════════════════════════════════════════════════════════
        ("rhymes-ai/Allegro", "AllegroPipeline", "~35-45 GB", "Allegro - High quality video gen"),
        ("rhymes-ai/Allegro-Medium", "AllegroPipeline", "~35-45 GB", "Allegro Medium"),
        
        # ═══════════════════════════════════════════════════════════════
        # 20-30GB models
        # ═══════════════════════════════════════════════════════════════
        ("THUDM/CogVideoX-5b", "CogVideoXPipeline", "~20-30 GB", "CogVideoX 5B parameter model"),
        ("THUDM/CogVideoX-2b", "CogVideoXPipeline", "~20-30 GB", "CogVideoX 2B parameter model"),
        ("THUDM/CogVideoX-5b-I2V", "CogVideoXPipeline", "~20-30 GB", "CogVideoX 5B I2V"),
        
        # ═══════════════════════════════════════════════════════════════
        # 18-22GB models
        # ═══════════════════════════════════════════════════════════════
        ("genmo/mochi-1-preview", "MochiPipeline", "~18-22 GB", "Mochi 1 Preview - High quality T2V"),
        ("genmo/mochi", "MochiPipeline", "~18-22 GB", "Mochi - Latest version"),
    ]
    
    all_models = {}
    seen_ids = set()
    
    # First, add known large models explicitly (without requiring validation)
    print("\n📦 Adding known large/high-VRAM models...")
    print("   (These models may require significant VRAM - 40GB to 140GB)")
    print()
    
    for model_id, pipeline_class, vram_est, description in known_large_models:
        if model_id in seen_ids:
            continue
        seen_ids.add(model_id)
        
        # Generate name
        name = model_id.split("/")[-1].lower()
        name = name.replace("-", "_").replace(".", "_")
        name = re.sub(r'[^a-z0-9_]', '', name)
        
        # Ensure unique name
        base_name = name
        counter = 1
        while name in all_models:
            name = f"{base_name}_{counter}"
            counter += 1
        
        # Try to validate (but don't skip if it fails)
        model_info = validate_hf_model(model_id, hf_token=hf_token)
        if model_info:
            tags = model_info.get("tags", [])
            downloads = model_info.get("downloads", 0)
            likes = model_info.get("likes", 0)
            is_i2v = any(t in tags for t in ["image-to-video", "i2v"]) or "i2v" in model_id.lower()
        else:
            # Add anyway with defaults
            tags = ["video", "text-to-video", "large-model"]
            downloads = 0
            likes = 0
            is_i2v = "i2v" in model_id.lower()
            print(f"  ⚠️  Could not validate {model_id} - adding with defaults")
        
        # Build entry
        model_entry = {
            "id": model_id,
            "vram": vram_est,
            "class": pipeline_class,
            "desc": description,
            "supports_i2v": is_i2v,
            "tags": tags[:10] if isinstance(tags, list) else list(tags)[:10],
            "downloads": downloads,
            "likes": likes,
            "auto_added": True,
            "is_large": True,
        }
        
        all_models[name] = model_entry
        print(f"  ✅ {name}: {model_id} ({vram_est})")
    
    for query, limit in search_queries:
        print(f"\n🔍 Searching: '{query}' (limit: {limit})")
        results = search_hf_models(query, limit=limit, hf_token=hf_token)
        
        for m in results:
            model_id = m["id"]
            
            # Skip duplicates
            if model_id in seen_ids:
                continue
            seen_ids.add(model_id)
            
            # Filter: include video models, NSFW models, OR models with known video pipeline classes
            is_video_model = m["is_i2v"] or m["is_video"]
            is_nsfw_model = m["is_nsfw"]
            is_known_pipeline = m["pipeline_class"] in ["WanPipeline", "MochiPipeline", "CogVideoXPipeline",
                                                        "StableVideoDiffusionPipeline", "I2VGenXLPipeline",
                                                        "LTXVideoPipeline", "AnimateDiffPipeline",
                                                        "TextToVideoSDPipeline", "TextToVideoZeroPipeline",
                                                        "HotshotXLPipeline", "AllegroPipeline",
                                                        "HunyuanDiTPipeline", "OpenSoraPipeline",
                                                        "LuminaVideoPipeline", "StepVideoPipeline"]
            
            if not (is_video_model or is_nsfw_model or is_known_pipeline):
                continue
            
            # Generate model name
            name = model_id.split("/")[-1].lower()
            name = name.replace("-", "_").replace(".", "_")
            name = re.sub(r'[^a-z0-9_]', '', name)
            
            # Ensure unique name
            base_name = name
            counter = 1
            while name in all_models:
                name = f"{base_name}_{counter}"
                counter += 1
            
            # Determine pipeline class
            pipeline_class = m["pipeline_class"]
            if pipeline_class == "Unknown":
                pipeline_class = "WanPipeline" if m["is_video"] else "StableDiffusionXLPipeline"
            
            # Determine VRAM estimate
            vram_est = PIPELINE_CLASS_MAP.get(pipeline_class, {}).get("default_vram", "~10-20 GB")
            
            # Detect if LoRA
            is_lora = "lora" in model_id.lower() or any(t in m.get("tags", []) for t in ["lora", "LoRA"])
            base_model = None
            
            if is_lora:
                if "wan" in model_id.lower():
                    base_model = "Wan-AI/Wan2.1-I2V-14B-Diffusers" if m["is_i2v"] else "Wan-AI/Wan2.1-T2V-14B-Diffusers"
                elif "svd" in model_id.lower() or "stable-video" in model_id.lower():
                    base_model = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
            
            # Build model entry
            model_entry = {
                "id": model_id,
                "vram": vram_est,
                "class": pipeline_class,
                "desc": f"{'[LoRA] ' if is_lora else ''}{model_id}",
                "supports_i2v": m["is_i2v"],
                "tags": m.get("tags", [])[:10],
                "downloads": m.get("downloads", 0),
                "likes": m.get("likes", 0),
                "is_lora": is_lora,
                "auto_added": True,
            }
            
            if base_model:
                model_entry["base_model"] = base_model
            
            all_models[name] = model_entry
            print(f"  ✅ {name}: {model_id}")
    
    # Also search for safetensors files (community models)
    print(f"\n" + "-" * 60)
    print("🔍 Searching for community safetensors models...")
    
    safetensors_queries = [
        ("wan nsfw", 20),
        ("wan2.2", 20),
        ("wan i2v rapid", 15),
        ("svd nsfw", 15),
        ("video nsfw", 15),
        ("mochi nsfw", 10),
        ("pony safetensors", 20),
        ("flux safetensors nsfw", 15),
        ("sdxl safetensors nsfw", 15),
        ("realistic vision safetensors", 10),
    ]
    
    for query, limit in safetensors_queries:
        results = search_hf_safetensors(query, limit=limit, hf_token=hf_token)
        
        for m in results:
            model_id = m["id"]
            
            # Skip duplicates
            if model_id in seen_ids:
                continue
            seen_ids.add(model_id)
            
            # Filter: only include video models or NSFW models
            is_video_model = m["is_i2v"] or m["is_video"]
            is_nsfw_model = m["is_nsfw"]
            
            if not (is_video_model or is_nsfw_model):
                continue
            
            # Generate model name
            name = model_id.split("/")[-1].lower()
            name = name.replace("-", "_").replace(".", "_")
            name = re.sub(r'[^a-z0-9_]', '', name)
            
            # Ensure unique name
            base_name = name
            counter = 1
            while name in all_models:
                name = f"{base_name}_{counter}"
                counter += 1
            
            # Determine pipeline class
            pipeline_class = m["pipeline_class"]
            if pipeline_class == "Unknown":
                pipeline_class = "WanPipeline" if m["is_video"] else "StableDiffusionXLPipeline"
            
            # Determine VRAM estimate
            vram_est = PIPELINE_CLASS_MAP.get(pipeline_class, {}).get("default_vram", "~10-20 GB")
            
            # Get safetensors files
            safetensor_files = m.get("safetensor_files", [])
            primary_file = safetensor_files[0] if safetensor_files else None
            
            # Build model entry for safetensors
            model_entry = {
                "id": model_id,
                "vram": vram_est,
                "class": pipeline_class,
                "desc": f"[Safetensors] {model_id}",
                "supports_i2v": m["is_i2v"],
                "tags": m.get("tags", [])[:10],
                "downloads": m.get("downloads", 0),
                "likes": m.get("likes", 0),
                "is_safetensors": True,
                "safetensor_files": safetensor_files,
                "primary_safetensor": primary_file,
                "auto_added": True,
            }
            
            # For safetensors, we need to use from_single_file
            if primary_file:
                model_entry["load_method"] = "from_single_file"
                model_entry["file_url"] = f"https://huggingface.co/{model_id}/blob/main/{primary_file}"
            
            all_models[name] = model_entry
            print(f"  ✅ [safetensors] {name}: {model_id} ({len(safetensor_files)} files)")
    
    print(f"\n" + "=" * 60)
    print(f"📊 Found {len(all_models)} new models from HuggingFace")
    
    # Merge with existing models (existing take precedence to preserve local configs)
    final_models = existing_models.copy()
    new_count = 0
    for name, entry in all_models.items():
        if name not in final_models:
            final_models[name] = entry
            new_count += 1
    
    # Save to config
    save_models_config(final_models)
    
    print(f"✅ Model database updated!")
    print(f"   Preserved: {len(existing_models)} existing models")
    print(f"   Added: {new_count} new models")
    print(f"   Total models: {len(final_models)}")
    print(f"   Config saved to: {MODELS_CONFIG_FILE}")
    
    return final_models


def print_search_results(results, args):
    """Render HuggingFace search results as a capability table on stdout.

    Args:
        results: list of model-info dicts (keys like "id", "is_i2v",
            "is_video", "is_image", "is_nsfw", "pipeline_class").
        args: parsed CLI namespace; honours args.i2v_only and
            args.nsfw_friendly as post-search filters.
    """
    if not results:
        print("No models found.")
        return

    # Apply CLI capability filters before rendering.
    if args.i2v_only:
        results = [entry for entry in results if entry.get("is_i2v")]
    if args.nsfw_friendly:
        results = [entry for entry in results if entry.get("is_nsfw")]

    print(f"\nFound {len(results)} models:\n")
    print(f"{'Model ID':<48} {'I2V':<4} {'T2V':<4} {'T2I':<4} {'I2I':<4} {'NSFW':<5} {'Pipeline':<25}")
    print("-" * 110)

    for entry in results:
        def flag(cond):
            return "Yes" if cond else "-"

        col_i2v = flag(entry.get("is_i2v"))
        # T2V = video model that is not image-conditioned.
        col_t2v = flag(entry.get("is_video") and not entry.get("is_i2v"))
        col_t2i = flag(entry.get("is_image"))
        # Any T2I (or I2V) model is treated as I2I-capable.
        col_i2i = flag(entry.get("is_image") or entry.get("is_i2v"))
        col_nsfw = flag(entry.get("is_nsfw"))
        pipeline = entry.get("pipeline_class", "Unknown")[:23]

        full_id = entry['id']
        shown_id = full_id[:46] + ".." if len(full_id) > 48 else full_id
        print(f"{shown_id:<48} {col_i2v:<4} {col_t2v:<4} {col_t2i:<4} {col_i2i:<4} {col_nsfw:<5} {pipeline:<25}")

    print(f"\nTo add a model: videogen --add-model <model_id> --name <short_name>")
    print(f"Example: videogen --add-model stabilityai/stable-video-diffusion-img2vid-xt-1-1 --name svd_xt")


# ──────────────────────────────────────────────────────────────────────────────
#                                 MODEL REGISTRY
# ──────────────────────────────────────────────────────────────────────────────

# Model registry — populated exclusively from the external JSON config;
# no models are hard-coded in this script.
MODELS = {}

# Pull in whatever the user has configured (written by --update-models / --add-model).
_external_models = load_models_config()
if not _external_models:
    print(f"⚠️  No models configured. Run: videogen --update-models")
    print(f"   Or add a model: videogen --add-model <model_id> --name <name>")
else:
    MODELS = _external_models
    print(f"📁 Loaded {len(_external_models)} models from {MODELS_CONFIG_FILE}")

# ──────────────────────────────────────────────────────────────────────────────
#                                 TTS VOICE REGISTRY
# ──────────────────────────────────────────────────────────────────────────────

# Named TTS voice presets, keyed by the value accepted by --tts_voice.
# Each entry selects an engine ("bark" or "edge") plus that engine's voice ID.
TTS_VOICES = {
    # Bark voices (Suno AI) — local neural TTS; heavier but offline.
    "bark_male": {"engine": "bark", "voice": "v2/en_speaker_6"},
    "bark_female": {"engine": "bark", "voice": "v2/en_speaker_9"},
    "bark_narrator": {"engine": "bark", "voice": "v2/en_speaker_3"},
    "bark_custom": {"engine": "bark", "voice": None},  # voice=None: user supplies ID via --tts_voice_id
    
    # Edge-TTS voices (Microsoft Azure - high quality, lightweight, needs network)
    "edge_male_us": {"engine": "edge", "voice": "en-US-GuyNeural"},
    "edge_female_us": {"engine": "edge", "voice": "en-US-JennyNeural"},
    "edge_male_uk": {"engine": "edge", "voice": "en-GB-RyanNeural"},
    "edge_female_uk": {"engine": "edge", "voice": "en-GB-SoniaNeural"},
    "edge_male_au": {"engine": "edge", "voice": "en-AU-WilliamNeural"},
    "edge_female_au": {"engine": "edge", "voice": "en-AU-NatashaNeural"},
}

# ──────────────────────────────────────────────────────────────────────────────
#                                 UTILITY FUNCTIONS
# ──────────────────────────────────────────────────────────────────────────────

def get_pipeline_class(class_name):
    """Resolve a diffusers pipeline class from its (string) name.

    Resolution order:
      1. exact attribute lookup on the diffusers package;
      2. a small table of known alternative class names;
      3. the generic DiffusionPipeline as a catch-all loader.
    Returns the class, or None when nothing can be resolved.
    """
    import diffusers

    # 1. Exact match wins.
    try:
        return getattr(diffusers, class_name)
    except AttributeError:
        pass

    # 2. Known alternative spellings for pipelines that were renamed or
    #    split across diffusers releases.
    alternative_names = {
        "LTXVideoPipeline": ["LTXLatentVideoToVideoPipeline", "LTXImageToVideoPipeline"],
        "StableVideoDiffusionPipeline": ["StableVideoDiffusionImg2VidPipeline"],
        "CogVideoXPipeline": ["CogVideoXImageToVideoPipeline", "CogVideoXVideoToVideoPipeline"],
        "MochiPipeline": ["Mochi1Pipeline", "MochiVideoPipeline"],
        "DiffusionPipeline": [],  # generic class — no alternatives needed
    }.get(class_name, [])

    for candidate in alternative_names:
        try:
            resolved = getattr(diffusers, candidate)
        except AttributeError:
            continue
        print(f"  ℹ️  Using alternative pipeline class: {candidate}")
        return resolved

    # 3. Fall back to the generic DiffusionPipeline, which can load any
    #    diffusers-compatible model directory.
    if class_name not in ["Unknown", None]:
        try:
            print(f"  ℹ️  Trying generic DiffusionPipeline for: {class_name}")
            return diffusers.DiffusionPipeline
        except AttributeError:
            pass

    return None


def log_memory():
    """Print current RAM utilisation (%) and reserved CUDA VRAM (GB)."""
    vram_gb = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0
    ram_pct = psutil.virtual_memory().percent
    print(f"📊 RAM: {ram_pct:5.1f}%   VRAM: {vram_gb:5.1f} GB")


# ──────────────────────────────────────────────────────────────────────────────
#                                 TIMING UTILITIES
# ──────────────────────────────────────────────────────────────────────────────

class TimingTracker:
    """Track timing for each generation step and provide estimates.

    Lifecycle: call start() once, then begin_step()/end_step() around each
    pipeline phase. estimate_total_time() gives a rough up-front prediction
    from the CLI args; print_summary() reports the measured breakdown.

    NOTE(review): relies on a module-level `time` import that is not visible
    in this chunk — confirm it exists at the top of the file.
    """
    
    def __init__(self):
        self.steps = {}                 # step name -> elapsed seconds (filled by end_step)
        self.start_time = None          # epoch seconds when start() was called
        self.current_step = None        # name of the step currently open
        self.current_step_start = None  # epoch seconds when the open step began
    
    def start(self):
        """Start overall timing"""
        self.start_time = time.time()
    
    def begin_step(self, step_name):
        """Begin timing a specific step"""
        # Auto-close any still-open step so back-to-back begin_step calls
        # never lose a measurement.
        if self.current_step:
            self.end_step()
        self.current_step = step_name
        self.current_step_start = time.time()
        print(f"⏱️  Starting: {step_name}...")
    
    def end_step(self):
        """End current step timing"""
        if self.current_step and self.current_step_start:
            elapsed = time.time() - self.current_step_start
            # A repeated step name overwrites the earlier measurement.
            self.steps[self.current_step] = elapsed
            print(f"✅ Completed: {self.current_step} ({self._format_time(elapsed)})")
        self.current_step = None
        self.current_step_start = None
    
    def get_elapsed(self):
        """Get total elapsed seconds since start(); 0 if start() was never called"""
        if self.start_time:
            return time.time() - self.start_time
        return 0
    
    def _format_time(self, seconds):
        """Format seconds into human readable string (e.g. '12.3s', '2m 5s', '1h 4m')"""
        if seconds < 60:
            return f"{seconds:.1f}s"
        elif seconds < 3600:
            mins = int(seconds // 60)
            secs = int(seconds % 60)
            return f"{mins}m {secs}s"
        else:
            hours = int(seconds // 3600)
            mins = int((seconds % 3600) // 60)
            return f"{hours}h {mins}m"
    
    def estimate_total_time(self, args, m_info, has_i2v=False, has_audio=False, has_lipsync=False, has_upscale=False):
        """Estimate total generation time based on parameters.

        Args:
            args: CLI namespace; reads width, height, length, fps, model,
                image, audio_type, audio_text and sync_audio.
            m_info: model registry entry; reads "class" and "vram".
            has_*: which optional pipeline stages are enabled.

        Returns:
            dict mapping step name -> estimated seconds. All figures are
            empirical ballparks (per the comments, tuned for RTX 3090/4090),
            not measurements.
        """
        estimates = {}
        
        # Base time per frame (empirical estimates for RTX 3090/4090)
        # These are rough estimates and will vary by model
        model_class = m_info.get("class", "")
        
        # Time per frame estimates (seconds)
        if "WanPipeline" in model_class:
            time_per_frame = 0.8  # Wan is relatively fast
        elif "MochiPipeline" in model_class:
            time_per_frame = 1.5  # Mochi is slower
        elif "StableVideoDiffusionPipeline" in model_class:
            time_per_frame = 0.5  # SVD is fast
        elif "CogVideoXPipeline" in model_class:
            time_per_frame = 1.2
        elif "FluxPipeline" in model_class:
            time_per_frame = 2.0  # Flux is slower for images
        elif "StableDiffusionXLPipeline" in model_class:
            time_per_frame = 0.3  # SDXL is fast for images
        else:
            time_per_frame = 1.0  # Default estimate
        
        # Adjust for resolution (higher res = more time); scales linearly
        # with pixel count relative to the 832x480 default.
        resolution_factor = (args.width * args.height) / (832 * 480)  # Normalized to default
        time_per_frame *= resolution_factor
        
        # Model loading time estimate, bucketed by the model's VRAM estimate
        # (parse_vram_estimate is defined elsewhere in this module).
        vram_est = parse_vram_estimate(m_info.get("vram", "~10 GB"))
        if vram_est > 50:
            load_time = 120  # Large models take longer to load
        elif vram_est > 30:
            load_time = 60
        elif vram_est > 16:
            load_time = 30
        else:
            load_time = 15
        
        estimates["model_loading"] = load_time
        
        # Image generation for I2V (only when no source image was supplied)
        if has_i2v and not args.image:
            img_model_class = "StableDiffusionXLPipeline"  # Default image model (currently unused)
            img_time = 5 + (args.width * args.height) / (1024 * 1024) * 2
            estimates["image_generation"] = img_time
        
        # Audio generation
        if has_audio:
            if args.audio_type == "tts":
                audio_time = 10 + len(args.audio_text or "") / 50  # Rough estimate
            else:  # music
                audio_time = args.length * 0.5 + 5  # MusicGen is relatively fast
            estimates["audio_generation"] = audio_time
        
        # Video generation: frames = seconds * fps, scaled by assumed step count
        num_frames = int(args.length * args.fps)
        inference_steps = 50 if "wan" in args.model.lower() else 28
        video_time = num_frames * time_per_frame * (inference_steps / 50)  # Normalized to 50 steps
        estimates["video_generation"] = video_time
        
        # Upscaling
        if has_upscale:
            upscale_time = num_frames * 0.3  # Upscaling is relatively fast per frame
            estimates["upscaling"] = upscale_time
        
        # Audio sync
        if has_audio and args.sync_audio:
            estimates["audio_sync"] = 5
        
        # Lip sync
        if has_lipsync:
            lipsync_time = num_frames * 0.2  # Wav2Lip processes frames
            estimates["lip_sync"] = lipsync_time
        
        return estimates
    
    def print_estimate(self, estimates):
        """Print time estimate breakdown (input order preserved)"""
        total = sum(estimates.values())
        
        print(f"\n⏱️  ESTIMATED GENERATION TIME")
        print("=" * 50)
        
        for step, seconds in estimates.items():
            pct = (seconds / total) * 100 if total > 0 else 0
            print(f"  {step.replace('_', ' ').title():<25} {self._format_time(seconds):>10}  ({pct:>5.1f}%)")
        
        print("-" * 50)
        print(f"  {'TOTAL ESTIMATED':<25} {self._format_time(total):>10}")
        print("=" * 50)
        print()
    
    def print_summary(self):
        """Print timing summary after generation (no-op if nothing was recorded)"""
        if not self.steps:
            return
        
        total = sum(self.steps.values())
        
        print(f"\n⏱️  GENERATION TIME BREAKDOWN")
        print("=" * 50)
        
        # Sort by time (longest first)
        sorted_steps = sorted(self.steps.items(), key=lambda x: x[1], reverse=True)
        
        for step, seconds in sorted_steps:
            pct = (seconds / total) * 100 if total > 0 else 0
            print(f"  {step.replace('_', ' ').title():<25} {self._format_time(seconds):>10}  ({pct:>5.1f}%)")
        
        print("-" * 50)
        print(f"  {'TOTAL TIME':<25} {self._format_time(total):>10}")
        print("=" * 50)
        
        # Calculate efficiency
        if total > 0:
            avg_step = total / len(self.steps)
            print(f"\n  Average step time: {self._format_time(avg_step)}")
            print(f"  Steps completed: {len(self.steps)}")


def detect_vram_gb():
    """Return total VRAM of CUDA device 0 in GB (decimal), or 0 if unavailable.

    Any CUDA query failure (e.g. broken driver, no device 0) is treated as
    "no VRAM" rather than propagating, since callers use the value only for
    memory heuristics.
    """
    if torch.cuda.is_available():
        try:
            return torch.cuda.get_device_properties(0).total_memory / 1e9
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        except Exception:
            return 0
    return 0


def parse_vram_estimate(vram_str):
    """Extract the largest number (GB) from a VRAM estimate string.

    E.g. "~90-140 GB" -> 140.0, "~10 GB" -> 10.0. Returns 0.0 when the
    string contains no number.

    Bug fix: the previous `float(max(numbers))` compared the matched
    strings lexicographically, so "~90-140 GB" yielded 90.0 ("90" > "140"
    as strings). The maximum is now taken numerically.
    """
    numbers = re.findall(r'\d+\.?\d*', vram_str)
    if not numbers:
        return 0.0
    return max(float(n) for n in numbers)


def should_use_low_mem(args, m_info, effective_vram_gb):
    """Decide whether low-memory offload mode should be enabled.

    Returns (use_low_mem, reason). Precedence: explicit --low_ram_mode
    override > Wan-model workaround > 90% VRAM headroom check.
    """
    estimated_gb = parse_vram_estimate(m_info["vram"])
    headroom_exceeded = estimated_gb > 0.90 * effective_vram_gb

    # Explicit user override always wins.
    if args.low_ram_mode:
        return True, "User forced with --low_ram_mode"

    # Wan models break with offloading due to an fp32 dtype conflict.
    if "wan" in args.model.lower():
        return False, "Known Wan fp32 conflict – forcing False"

    if headroom_exceeded:
        return True, f"Model est {estimated_gb:.1f} GB > 90% of effective VRAM {effective_vram_gb:.1f} GB"

    return False, f"Safe default (model {estimated_gb:.1f} GB < 90%)"


def detect_model_type(info):
    """Classify a model registry entry's capabilities.

    Inspects the entry's id, description, tags and pipeline class and
    returns a dict of flags: i2v, t2v, t2i, i2i, audio, nsfw, lora.
    """
    model_id = info.get("id", "").lower()
    desc = info.get("desc", "").lower()
    tags = info.get("tags", [])
    pipeline_class = info.get("class", "")
    tags_text = str(tags).lower()

    # Image-to-video: explicit flag, id hint, or tag.
    i2v = info.get("supports_i2v", False) or "i2v" in model_id or "image-to-video" in tags

    # Text-to-video: video-tagged or a known video pipeline, minus I2V models.
    video_tagged = "video" in tags or "text-to-video" in tags
    video_pipelines = ("WanPipeline", "MochiPipeline", "TextToVideoSDPipeline",
                       "TextToVideoZeroPipeline", "CogVideoXPipeline",
                       "HotshotXLPipeline", "AnimateDiffPipeline")
    t2v = (video_tagged or pipeline_class in video_pipelines) and not i2v

    # Text-to-image: image-tagged or a known image pipeline, minus video models.
    image_capable = "text-to-image" in tags or pipeline_class in ("StableDiffusionXLPipeline", "FluxPipeline")
    t2i = image_capable and not (i2v or t2v)

    # Image-to-image: any T2I model can run img2img; also catch dedicated pipelines.
    i2i = (t2i or "Img2Img" in pipeline_class or "img2img" in model_id
           or any(marker in tags for marker in ("image-to-image", "img2img")))

    # Audio: keyword match in id/description, then in tags.
    audio = any(m in model_id or m in desc for m in ("tts", "bark", "musicgen", "audio", "voice", "speech"))
    if not audio:
        audio = any(m in tags for m in ("tts", "audio", "speech", "music"))

    # NSFW: keyword match across id, description and stringified tags.
    nsfw_markers = ("nsfw", "adult", "uncensored", "porn", "explicit", "xxx", "erotic", "nude")
    nsfw = (any(kw in model_id or kw in desc for kw in nsfw_markers)
            or any(kw in tags_text for kw in nsfw_markers))

    # LoRA adapter: explicit flag or "lora" in id/tags.
    lora = info.get("is_lora", False) or "lora" in model_id or "lora" in tags_text

    return {
        "i2v": i2v,
        "t2v": t2v,
        "t2i": t2i,
        "i2i": i2i,
        "audio": audio,
        "nsfw": nsfw,
        "lora": lora
    }


def show_model_details(model_id_or_name, args):
    """Show full details for a specific model by numeric ID or name.

    Args:
        model_id_or_name: either a 1-based numeric index into the filtered
            model list (same ordering and filters as print_model_list), or
            a short name / HuggingFace repo id.
        args: parsed CLI namespace; the filter flags (i2v_only, t2v_only,
            nsfw_friendly, low_vram, high_vram, huge_vram) must match the
            ones used when the list was printed, or numeric IDs won't line up.

    Exits the process: 0 after printing details, 1 when not found.
    """
    model = None
    model_name = None
    
    # Try to parse as numeric ID
    try:
        model_idx = int(model_id_or_name)
        # Build sorted list matching print_model_list order
        sorted_models = sorted(MODELS.items())
        
        # Apply same filters
        # NOTE(review): this filter block duplicates print_model_list's —
        # the two must stay in sync for the numeric IDs to be meaningful.
        filtered = []
        for name, info in sorted_models:
            if args.i2v_only and not info.get("supports_i2v", False):
                continue
            if args.t2v_only and info.get("supports_i2v", False):
                continue
            if args.nsfw_friendly and not any(word in name.lower() or word in info.get("desc", "").lower()
                                              for word in ["uncensored", "nsfw", "adult", "realism", "erotic", "explicit"]):
                continue
            if args.low_vram:
                # est == 0 means "unknown VRAM" — always excluded by the brackets
                est = parse_vram_estimate(info["vram"])
                if est == 0 or est > 16:
                    continue
            if args.high_vram:
                est = parse_vram_estimate(info["vram"])
                if est == 0 or est <= 30:
                    continue
            if args.huge_vram:
                est = parse_vram_estimate(info["vram"])
                if est == 0 or est <= 55:
                    continue
            filtered.append((name, info))
        
        if 1 <= model_idx <= len(filtered):
            model_name, model = filtered[model_idx - 1]
        else:
            print(f"❌ Model ID {model_idx} out of range (1-{len(filtered)})")
            sys.exit(1)
    except ValueError:
        # Not a number, search by name (short name or full HF repo id)
        for name, info in MODELS.items():
            if name == model_id_or_name or info.get("id") == model_id_or_name:
                model = info
                model_name = name
                break
        
        if not model:
            print(f"❌ Model not found: {model_id_or_name}")
            sys.exit(1)
    
    # Display full details
    print(f"\n{'='*60}")
    print(f"MODEL DETAILS: {model_name}")
    print(f"{'='*60}\n")
    
    print(f"  Full ID:       {model.get('id', 'N/A')}")
    print(f"  Short Name:    {model_name}")
    print(f"  Pipeline:      {model.get('class', 'Unknown')}")
    print(f"  VRAM:          {model.get('vram', 'Unknown')}")
    
    # Capabilities (derived heuristically from id/tags/pipeline class)
    caps = detect_model_type(model)
    print(f"\n  Capabilities:")
    print(f"    I2V (Image-to-Video):  {'✅ Yes' if caps['i2v'] else '❌ No'}")
    print(f"    T2V (Text-to-Video):   {'✅ Yes' if caps['t2v'] else '❌ No'}")
    print(f"    T2I (Text-to-Image):   {'✅ Yes' if caps['t2i'] else '❌ No'}")
    print(f"    I2I (Image-to-Image):  {'✅ Yes' if caps['i2i'] else '❌ No'}")
    print(f"    Audio:                 {'✅ Yes' if caps['audio'] else '❌ No'}")
    print(f"    NSFW-friendly:         {'✅ Yes' if caps['nsfw'] else '❌ No'}")
    print(f"    LoRA Adapter:          {'✅ Yes' if caps['lora'] else '❌ No'}")
    
    # Additional info
    if model.get("is_lora") and model.get("base_model"):
        print(f"\n  Base Model:    {model['base_model']}")
    
    if model.get("is_safetensors"):
        print(f"\n  Format:        Safetensors")
        if model.get("safetensor_files"):
            # Show at most the first five files to keep output compact
            print(f"  Files:")
            for f in model["safetensor_files"][:5]:
                print(f"    - {f}")
            if len(model.get("safetensor_files", [])) > 5:
                print(f"    ... and {len(model['safetensor_files']) - 5} more")
    
    if model.get("tags"):
        print(f"\n  Tags:          {', '.join(model['tags'][:10])}")
    
    if model.get("downloads"):
        print(f"  Downloads:     {model['downloads']:,}")
    if model.get("likes"):
        print(f"  Likes:         {model['likes']:,}")
    
    print(f"\n  Description:")
    desc = model.get("desc", "No description available")
    # Word wrap description
    import textwrap
    for line in textwrap.wrap(desc, width=60):
        print(f"    {line}")
    
    print(f"\n{'='*60}")
    print(f"Usage: --model {model_name}")
    print(f"{'='*60}\n")
    sys.exit(0)


def print_model_list(args):
    """Print a capability table of all registered models (after CLI filters) and exit(0)."""
    print("\nAvailable models (filtered):\n")

    nsfw_words = ["uncensored", "nsfw", "adult", "realism", "erotic", "explicit"]

    def passes_filters(name, info):
        # Capability filters from the CLI.
        if args.i2v_only and not info.get("supports_i2v", False):
            return False
        if args.t2v_only and info.get("supports_i2v", False):
            return False
        if args.nsfw_friendly:
            searchable = name.lower() + " " + info.get("desc", "").lower()
            if not any(word in searchable for word in nsfw_words):
                return False
        # VRAM bracket filters: an unknown estimate (0) is always excluded.
        if args.low_vram or args.high_vram or args.huge_vram:
            est = parse_vram_estimate(info["vram"])
            if args.low_vram and (est == 0 or est > 16):
                return False
            if args.high_vram and (est == 0 or est <= 30):
                return False
            if args.huge_vram and (est == 0 or est <= 55):
                return False
        return True

    rows = [(name, info, detect_model_type(info))
            for name, info in sorted(MODELS.items())
            if passes_filters(name, info)]
    shown = len(rows)

    if shown == 0:
        print("No models match the selected filters.")
    else:
        # Table header
        print(f"{'ID':>4}  {'Name':<28} {'VRAM':<11} {'I2V':<4} {'T2V':<4} {'T2I':<4} {'I2I':<4} {'NSFW':<5} {'LoRA':<5}")
        print("-" * 95)

        for idx, (name, info, caps) in enumerate(rows, 1):
            # Truncate overlong names / VRAM strings to keep columns aligned.
            display_name = name if len(name) <= 28 else name[:26] + ".."
            vram = info["vram"] if len(info["vram"]) <= 9 else info["vram"][:9]

            i2v, t2v, t2i, i2i, nsfw, lora = (
                "Yes" if caps[key] else "-"
                for key in ("i2v", "t2v", "t2i", "i2i", "nsfw", "lora"))

            print(f"{idx:>4}  {display_name:<28} {vram:<11} {i2v:<4} {t2v:<4} {t2i:<4} {i2i:<4} {nsfw:<5} {lora:<5}")

        print("-" * 95)
        print(f"Total shown: {shown} / {len(MODELS)} available")

    print("\nUse --model <name> to select a model.")
    print("Use --show-model <ID|name> to see full model details.")
    sys.exit(0)


def print_tts_voices():
    """Print the TTS voice registry as a table, then exit(0)."""
    print("\nAvailable TTS Voices:\n")
    print(f"{'Voice name':<20} {'Engine':<8} {'Voice ID':<30}")
    print("-" * 60)

    for voice_name in sorted(TTS_VOICES):
        entry = TTS_VOICES[voice_name]
        # A falsy voice ID means the user supplies one on the command line.
        identifier = entry["voice"] if entry["voice"] else "(custom via --tts_voice)"
        print(f"{voice_name:<20} {entry['engine']:<8} {identifier:<30}")

    print("\nUsage: --tts_voice <name> or --tts_voice bark_custom --tts_voice_id v2/en_speaker_1")
    print("\nFor Edge-TTS, you can also use any Azure voice name directly with --tts_voice_id")
    sys.exit(0)


# ──────────────────────────────────────────────────────────────────────────────
#                                 AUDIO GENERATION FUNCTIONS
# ──────────────────────────────────────────────────────────────────────────────

def check_audio_dependencies():
    """Report which optional audio backends were successfully imported.

    Prints a checklist driven by the module-level availability flags and,
    when no audio engine at all is usable, shows the pip commands needed
    to install the optional audio dependencies.
    """
    print("\n📦 Audio Dependency Status:")
    backends = [
        ("scipy/soundfile", SCIPY_AVAILABLE),
        ("librosa", LIBROSA_AVAILABLE),
        ("Bark TTS", BARK_AVAILABLE),
        ("Edge-TTS", EDGE_TTS_AVAILABLE),
        ("MusicGen", MUSICGEN_AVAILABLE),
    ]
    for label, available in backends:
        print(f"  {label}: {'✅' if available else '❌'}")

    if not AUDIO_AVAILABLE:
        print("\n⚠️  No audio engines available. Install audio dependencies:")
        print("    pip install scipy soundfile librosa edge-tts")
        print("    pip install git+https://github.com/suno-ai/bark.git")
        print("    pip install audiocraft")


# ──────────────────────────────────────────────────────────────────────────────
#                                 AUTO MODE FUNCTIONS
# ──────────────────────────────────────────────────────────────────────────────

# Global NSFW classifier cache: holds either a transformers pipeline
# object or the string sentinel "keyword" once resolved (lazy-loaded
# by get_nsfw_classifier()).
_nsfw_classifier = None

def get_nsfw_classifier():
    """Get or load the NSFW text classifier.

    Resolution order:
      1. Return the cached value if already resolved (pipeline object
         or the "keyword" sentinel).
      2. Try the `unitary/toxic-bert` transformers pipeline on CPU.
      3. Fall back to the string sentinel "keyword", meaning callers
         should use keyword-based detection instead.

    Returns:
        A transformers pipeline, the string "keyword", or None when
        transformers is unavailable or loading fails entirely.
    """
    global _nsfw_classifier
    
    # Fast path: classifier (or fallback sentinel) already resolved.
    if _nsfw_classifier is not None:
        return _nsfw_classifier
    
    if not TRANSFORMERS_AVAILABLE:
        print("⚠️  transformers not available for NSFW detection")
        print("   Install with: pip install transformers")
        return None
    
    try:
        # Use a small, fast text classification model for NSFW detection
        print("🔄 Loading NSFW text classifier...")
        
        # Try to use a toxicity classifier
        try:
            _nsfw_classifier = pipeline(
                "text-classification",
                model="unitary/toxic-bert",
                device=-1  # CPU for fast inference
            )
            print("  ✅ Loaded toxic-bert classifier")
            return _nsfw_classifier
        except Exception:
            # Model download/initialization failed — degrade gracefully
            # to the keyword fallback below.
            pass
        
        # Fallback: use keyword-based detection
        print("  ℹ️  Using keyword-based NSFW detection")
        _nsfw_classifier = "keyword"
        return _nsfw_classifier
        
    except Exception as e:
        print(f"⚠️  Could not load NSFW classifier: {e}")
        return None


def detect_nsfw_text(text, classifier=None):
    """Classify *text* as NSFW or safe.

    Uses the supplied classifier (or lazily loads one) when possible,
    and falls back to keyword matching in every failure mode.

    Returns: (is_nsfw, confidence, reason)
    """
    active = classifier if classifier is not None else get_nsfw_classifier()

    # No classifier available, or the keyword sentinel: keyword matching.
    if active is None or active == "keyword":
        return detect_nsfw_keywords(text)

    try:
        # Models have input-length limits; only the first 512 chars go in.
        prediction = active(text[:512])[0]
        label = prediction['label']
        score = prediction['score']

        # toxic-bert labels: 'toxic' or 'non-toxic'
        flagged = label.lower() in ['toxic', 'nsfw', 'positive']
        return flagged, score, f"Model classification: {label}"

    except Exception as e:
        print(f"⚠️  Classifier error: {e}")
        return detect_nsfw_keywords(text)


def detect_nsfw_keywords(text):
    """Keyword-based NSFW detection as fallback.

    Matches a curated keyword list against *text* using word-boundary
    regexes, so innocent words that merely contain a keyword as a
    substring (e.g. "class" -> "ass", "orchard" -> "hard",
    "assistant" -> "ass") are no longer flagged.  A few entries are
    intentional stems ("masturbat", "mutilat", "decapitat") and match
    any word starting with them.

    Returns:
        (is_nsfw, confidence, reason) — confidence grows with the
        number of distinct keywords found, capped at 0.9.
    """
    text_lower = text.lower()
    
    # NSFW keywords (comprehensive list)
    nsfw_keywords = [
        # Explicit sexual content
        "nsfw", "porn", "xxx", "sex", "nude", "naked", "nudity",
        "erotic", "explicit", "adult", "uncensored", "18+",
        "penis", "vagina", "breasts", "boobs", "tits", "ass", "butt",
        "fuck", "fucking", "fucked", "hardcore", "softcore",
        "blowjob", "oral", "anal", "cumshot", "cum", "sperm",
        "masturbat", "orgasm", "climax", "moan", "groan",
        "dildo", "vibrator", "toy", "fetish", "kink", "bdsm",
        "dominatrix", "submissive", "bondage", "spank",
        "hentai", "anime porn", "cartoon porn",
        "threesome", "orgy", "gangbang", "group sex",
        "interracial", "lesbian", "gay", "bisexual",
        "strip", "stripper", "lap dance", "pole dance",
        "lingerie", "underwear", "panties", "bra", "thong",
        "seduce", "seductive", "sensual", "provocative",
        "aroused", "horny", "wet", "hard", "erection",
        "deepthroat", "riding", "cowgirl", "doggy", "missionary",
        "creampie", "facial", "swallow", "bukkake",
        
        # Violence/gore (also NSFW)
        "gore", "blood", "violent", "brutal", "torture",
        "mutilat", "dismember", "decapitat", "kill", "murder",
        "massacre", "slaughter", "carnage",
    ]
    
    # Word stems: match any word that *starts* with them
    # (e.g. "masturbat" covers "masturbate"/"masturbating"/"masturbation").
    stems = {"masturbat", "mutilat", "decapitat"}
    
    found_keywords = []
    for kw in nsfw_keywords:
        pattern = r"\b" + re.escape(kw)
        # Require a full-word match unless the keyword is a stem or ends
        # in a non-word character ("18+", where a trailing \b would not
        # match against the following space/punctuation).
        if kw not in stems and kw[-1].isalnum():
            pattern += r"\b"
        if re.search(pattern, text_lower):
            found_keywords.append(kw)
    
    if found_keywords:
        confidence = min(0.9, 0.5 + len(found_keywords) * 0.1)
        return True, confidence, f"Keywords found: {', '.join(found_keywords[:5])}"
    
    return False, 0.8, "No NSFW keywords detected"


def detect_generation_type(prompt, prompt_image=None, prompt_animation=None, args=None):
    """Detect what type of generation is needed from prompts.

    Heuristically inspects the combined prompt text (plus the output
    file extension, when *args* carries one) and returns a dict:
      type          "t2v" | "i2v" | "t2i" | "i2i"
      needs_image / needs_video / needs_audio / audio_type
      is_nsfw / nsfw_confidence / nsfw_reason (via detect_nsfw_text)
      motion_type   "standard" | "slow" | "fast" | "subtle"
      subject_type  "general" | "female" | "male" | "landscape" | "animal"

    Returns: dict with generation parameters
    """
    full_prompt = " ".join(prompt) if prompt else ""
    image_prompt = " ".join(prompt_image) if prompt_image else ""
    animation_prompt = " ".join(prompt_animation) if prompt_animation else ""
    
    all_text = f"{full_prompt} {image_prompt} {animation_prompt}".lower()
    
    def _has_word(words):
        # Whole-word matching. Plain substring tests are wrong for short
        # pronouns: "he" occurs inside "the"/"scene", "she" inside
        # "finished", "man" inside "romantic"/"human", etc., which made
        # almost every prompt classify as "male".
        return any(re.search(r"\b" + re.escape(w) + r"\b", all_text)
                   for w in words)
    
    result = {
        "type": "t2v",  # Default: text-to-video
        "needs_image": False,
        "needs_video": True,
        "needs_audio": False,
        "audio_type": None,
        "is_nsfw": False,
        "nsfw_confidence": 0.0,
        "nsfw_reason": "",
        "motion_type": "standard",
        "subject_type": "general",
    }
    
    # Detect NSFW
    is_nsfw, confidence, reason = detect_nsfw_text(all_text)
    result["is_nsfw"] = is_nsfw
    result["nsfw_confidence"] = confidence
    result["nsfw_reason"] = reason
    
    # Intent keywords deliberately use substring matching: it lets
    # "image" also hit "images" and keeps multi-word phrases working.
    image_keywords = ["portrait", "photo", "picture", "image", "still", "static",
                      "painting", "artwork", "illustration", "drawing", "render"]
    video_keywords = ["video", "animation", "motion", "moving", "walking", "running",
                      "dancing", "flying", "flowing", "cinematic", "scene", "clip"]
    
    has_image_intent = any(kw in all_text for kw in image_keywords)
    has_video_intent = any(kw in all_text for kw in video_keywords)
    
    # A still-image output extension forces T2I regardless of wording
    if args and hasattr(args, 'output'):
        output_ext = os.path.splitext(args.output)[1].lower() if args.output else ""
        if output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]:
            result["type"] = "t2i"
            result["needs_video"] = False
            result["needs_image"] = True
            return result
    
    # Detect I2V (image-to-video) intent
    i2v_keywords = ["animate", "bring to life", "make it move", "add motion",
                    "from image", "starting from", "beginning with"]
    if any(kw in all_text for kw in i2v_keywords) or (prompt_image and prompt_animation):
        result["type"] = "i2v"
        result["needs_image"] = True
        result["needs_video"] = True
    
    # Detect T2I (static image) intent
    elif has_image_intent and not has_video_intent:
        result["type"] = "t2i"
        result["needs_video"] = False
        result["needs_image"] = True
    
    # Detect I2I (image-to-image) intent.
    # NOTE: this check runs after (not as part of) the chain above, so
    # explicit transformation wording overrides an earlier decision.
    i2i_keywords = ["transform", "modify", "change", "alter", "convert",
                    "style transfer", "make it look like", "turn into"]
    if any(kw in all_text for kw in i2i_keywords):
        result["type"] = "i2i"
        result["needs_image"] = True
        result["needs_video"] = False
    
    # Detect audio needs (speech takes priority over music)
    audio_keywords = ["narration", "voiceover", "speech", "talking", "speaking",
                      "saying", "dialogue", "monologue", "story"]
    music_keywords = ["music", "soundtrack", "background music", "score",
                      "orchestral", "ambient sound", "audio"]
    
    if any(kw in all_text for kw in audio_keywords):
        result["needs_audio"] = True
        result["audio_type"] = "tts"
    elif any(kw in all_text for kw in music_keywords):
        result["needs_audio"] = True
        result["audio_type"] = "music"
    
    # Detect motion type (substring on purpose: "slow" matches "slowly")
    if "slow" in all_text or "gentle" in all_text:
        result["motion_type"] = "slow"
    elif "fast" in all_text or "dynamic" in all_text or "action" in all_text:
        result["motion_type"] = "fast"
    elif "subtle" in all_text or "minimal" in all_text:
        result["motion_type"] = "subtle"
    
    # Detect subject type — whole words only (bug fix, see _has_word)
    if _has_word(["woman", "girl", "female", "lady", "she"]):
        result["subject_type"] = "female"
    elif _has_word(["man", "boy", "male", "guy", "he"]):
        result["subject_type"] = "male"
    elif any(kw in all_text for kw in ["landscape", "scenery", "nature", "environment"]):
        result["subject_type"] = "landscape"
    elif any(kw in all_text for kw in ["animal", "cat", "dog", "bird", "wildlife"]):
        result["subject_type"] = "animal"
    
    return result


def select_best_model(gen_type, models, vram_gb=24, prefer_quality=True, return_all=False):
    """Select the best model based on generation type and constraints.

    Args:
        gen_type: Dict from detect_generation_type()
        models: Available models dict
        vram_gb: Available VRAM in GB
        prefer_quality: Prefer quality over speed
        return_all: If True, return all candidates sorted by score

    Returns: (model_name, model_info, reason), or a list of such tuples
             when return_all=True.
    """
    wanted = gen_type.get("type", "t2v")
    nsfw_wanted = gen_type.get("is_nsfw", False)
    type_labels = {"t2v": "T2V capable", "i2v": "I2V capable",
                   "t2i": "T2I capable", "i2i": "I2I capable"}
    scored = []

    for name, info in models.items():
        # LoRA adapters cannot run standalone — they need a base model.
        if info.get("is_lora"):
            continue

        # Discard models that clearly exceed available VRAM (10% slack).
        estimated = parse_vram_estimate(info.get("vram", "~10 GB"))
        if estimated > vram_gb * 1.1:
            continue

        caps = detect_model_type(info)
        score = 0
        reasons = []

        # Strong bonus when the model supports the requested mode.
        if wanted in type_labels and caps[wanted]:
            score += 100
            reasons.append(type_labels[wanted])

        # NSFW prompts prefer uncensored models; penalize filtering ones.
        if nsfw_wanted:
            if caps["nsfw"]:
                score += 50
                reasons.append("NSFW-friendly")
            else:
                score -= 30
                reasons.append("May filter NSFW")

        if prefer_quality:
            # Bigger models (higher VRAM estimate) tend to look better.
            score += min(estimated, 30)
        else:
            # Smaller models load and run faster.
            score += max(0, 20 - estimated)

        # Popular/reliable models get a bonus.
        downloads = info.get("downloads", 0)
        if downloads > 10000:
            score += 20
            reasons.append(f"Popular ({downloads:,} downloads)")

        if score > 0:
            scored.append((name, info, score, reasons))

    if not scored:
        # No positive-scoring candidate: fall back to the first non-LoRA
        # model, or report that nothing is available at all.
        for name, info in models.items():
            if not info.get("is_lora"):
                entry = (name, info, "Fallback (no ideal match)")
                return [entry] if return_all else entry
        return [] if return_all else (None, None, "No models available")

    # Best score first.
    scored.sort(key=lambda item: item[2], reverse=True)

    if return_all:
        return [(name, info, f"Score: {score} - {', '.join(reasons)}")
                for name, info, score, reasons in scored]

    top_name, top_info, top_score, top_reasons = scored[0]
    return top_name, top_info, f"Score: {top_score} - {', '.join(top_reasons)}"


def split_prompt_for_i2v(full_prompt):
    """Split a prompt into image and animation prompts for I2V mode.

    Classifies each comma/conjunction-separated fragment of the prompt
    as either scene description (image) or motion description
    (animation).  When no motion fragment is found, the whole prompt
    becomes the image prompt and a generic animation prompt is
    synthesized from its subject matter.

    Returns: (image_prompt, animation_prompt)
    """
    # Keywords that typically indicate motion/action
    motion_keywords = [
        "moving", "walking", "running", "dancing", "flying", "jumping",
        "swimming", "crawling", "climbing", "falling", "rising",
        "turning", "spinning", "rotating", "swinging", "swaying",
        "flowing", "streaming", "blowing", "floating", "drifting",
        "animating", "breathing", "blinking", "talking", "speaking",
        "looking", "glancing", "nodding", "shaking", "trembling",
        "slowly", "quickly", "gently", "rapidly", "smoothly",
        "motion", "movement", "action", "animate", "dynamic"
    ]
    
    # Keywords that typically indicate static scene
    static_keywords = [
        "wearing", "dressed", "clothed", "standing", "sitting", "lying",
        "portrait", "photo", "picture", "scene", "setting", "background",
        "environment", "landscape", "indoor", "outdoor", "room", "street",
        "beautiful", "handsome", "detailed", "realistic", "cinematic",
        "lighting", "illuminated", "lit", "shadow", "atmosphere", "mood"
    ]
    
    # Break the prompt into fragments at common separators.
    fragments = [full_prompt]
    for separator in [", ", " and ", " while ", " as ", " with "]:
        fragments = [piece for frag in fragments for piece in frag.split(separator)]
    
    scene_bits = []
    motion_bits = []
    for fragment in fragments:
        fragment = fragment.strip()
        if not fragment:
            continue
        lowered = fragment.lower()
        moving = sum(1 for kw in motion_keywords if kw in lowered)
        still = sum(1 for kw in static_keywords if kw in lowered)
        # Ties count as scene description.
        (motion_bits if moving > still else scene_bits).append(fragment)
    
    if motion_bits:
        image_text = ", ".join(scene_bits) if scene_bits else full_prompt
        return image_text, ", ".join(motion_bits)
    
    # No explicit motion found: keep the full prompt for the image and
    # pick a canned animation prompt matching the subject matter.
    lowered_full = full_prompt.lower()
    canned = [
        (["woman", "girl", "female", "lady"],
         "subtle natural movement, gentle breathing, soft motion"),
        (["man", "boy", "male", "guy"],
         "subtle natural movement, gentle breathing, soft motion"),
        (["water", "ocean", "river", "stream"],
         "flowing water, gentle waves, natural movement"),
        (["fire", "flame", "burn"],
         "flickering flames, dancing fire, dynamic movement"),
        (["tree", "leaf", "forest", "grass"],
         "swaying in the wind, gentle movement, natural motion"),
        (["cloud", "sky", "sunset", "sunrise"],
         "slow moving clouds, gradual change, atmospheric motion"),
    ]
    for triggers, animation in canned:
        if any(kw in lowered_full for kw in triggers):
            return full_prompt, animation
    return full_prompt, "subtle natural motion, gentle movement"


def generate_command_line(args):
    """Generate a command line string that reproduces the current configuration.

    Builds a command equivalent to the (possibly auto-configured) *args*
    so the same result can be reproduced without --auto.  Options left
    at their default values are omitted.
    """
    parts = ["python3", "videogen"]

    def _joined(value):
        # Prompts may arrive as lists of words; collapse to one string.
        return " ".join(value) if isinstance(value, list) else value

    # Model
    if args.model:
        parts += ["--model", args.model]

    # Mode flags
    if getattr(args, 'image_to_video', False):
        parts.append("--image_to_video")
    if getattr(args, 'image_to_image', False):
        parts.append("--image-to-image")

    # Image model for I2V
    if getattr(args, 'image_model', None) and getattr(args, 'image_to_video', False):
        parts += ["--image_model", args.image_model]

    # Image file if provided
    if getattr(args, 'image', None):
        parts += ["--image", args.image]

    # Prompts (quoted so multi-word prompts survive the shell)
    if args.prompt:
        parts += ["--prompt", f'"{_joined(args.prompt)}"']
    if getattr(args, 'prompt_image', None):
        parts += ["--prompt_image", f'"{_joined(args.prompt_image)}"']
    if getattr(args, 'prompt_animation', None):
        parts += ["--prompt_animation", f'"{_joined(args.prompt_animation)}"']

    # Resolution (only when different from the 832x480 default)
    if args.width != 832:
        parts += ["--width", str(args.width)]
    if args.height != 480:
        parts += ["--height", str(args.height)]

    # Duration and FPS
    if args.length != 5.0:
        parts += ["--length", str(args.length)]
    if args.fps != 15:
        parts += ["--fps", str(args.fps)]

    # Output
    if args.output != "output":
        parts += ["--output", args.output]

    # Seed
    if args.seed != -1:
        parts += ["--seed", str(args.seed)]

    # Filter
    if args.no_filter:
        parts.append("--no_filter")

    # Upscale
    if args.upscale:
        parts.append("--upscale")
        if args.upscale_factor != 2.0:
            parts += ["--upscale_factor", str(args.upscale_factor)]

    # Audio
    if getattr(args, 'generate_audio', False):
        parts.append("--generate_audio")
        parts += ["--audio_type", args.audio_type]
        if getattr(args, 'audio_text', None):
            parts += ["--audio_text", f'"{args.audio_text}"']
        if getattr(args, 'tts_voice', None) and args.tts_voice != "edge_female_us":
            parts += ["--tts_voice", args.tts_voice]
        if getattr(args, 'music_model', None) and args.music_model != "medium":
            parts += ["--music_model", args.music_model]

    if getattr(args, 'sync_audio', False):
        parts.append("--sync_audio")
        if args.sync_mode != "stretch":
            parts += ["--sync_mode", args.sync_mode]

    if getattr(args, 'lip_sync', False):
        parts.append("--lip_sync")
        if args.lip_sync_method != "auto":
            parts += ["--lip_sync_method", args.lip_sync_method]

    # Offloading / memory options
    if args.offload_strategy != "model":
        parts += ["--offload_strategy", args.offload_strategy]
    if args.vram_limit != 22:
        parts += ["--vram_limit", str(args.vram_limit)]
    if args.low_ram_mode:
        parts.append("--low_ram_mode")

    # Use shell line continuations once the command gets long.
    return " \\\n  ".join(parts) if len(parts) > 3 else " ".join(parts)


def run_auto_mode(args, models):
    """Run automatic mode: detect and generate
    
    This function:
    1. Analyzes prompts to detect generation type
    2. Detects NSFW content
    3. Selects appropriate models
    4. Splits prompts for I2V mode if needed
    5. Configures and runs generation
    
    IMPORTANT: User-specified settings are ALWAYS preserved.
    Auto mode only sets values that weren't explicitly provided.
    
    Returns:
        The mutated ``args`` namespace, ready for generation, or None
        when no suitable model could be found.
    """
    print("\n" + "=" * 60)
    print("🤖 AUTO MODE - Analyzing prompts and selecting models")
    print("=" * 60)
    
    # Track which settings were explicitly provided by user
    # These are settings that have non-default values
    # NOTE(review): the literals compared against here (832, 480, 15,
    # 5.0, "output", 22, ...) must mirror the argparse defaults defined
    # elsewhere in the file — keep them in sync if defaults change.
    user_provided = {
        'model': args.model is not None and args.model != (list(MODELS.keys())[0] if MODELS else None),
        'image_model': args.image_model is not None and args.image_model != (list(MODELS.keys())[0] if MODELS else None),
        'width': args.width != 832,
        'height': args.height != 480,
        'fps': args.fps != 15,
        'length': args.length != 5.0,
        'upscale': args.upscale,
        'upscale_factor': args.upscale_factor != 2.0,
        'output': args.output != "output",
        'no_filter': args.no_filter,
        'generate_audio': args.generate_audio,
        'audio_type': args.audio_type != "tts",
        'sync_audio': args.sync_audio,
        'lip_sync': args.lip_sync,
        'seed': args.seed != -1,
        'offload_strategy': args.offload_strategy != "model",
        'vram_limit': args.vram_limit != 22,
        'prompt_image': getattr(args, 'prompt_image', None) is not None,
        'prompt_animation': getattr(args, 'prompt_animation', None) is not None,
        'image': getattr(args, 'image', None) is not None,
    }
    
    # Store alternative models for retry in auto mode
    # (consumed by the retry logic elsewhere in the file)
    args._auto_alternative_models = []
    args._auto_alternative_image_models = []
    
    # Detect generation type
    print("\n📊 Analyzing prompts...")
    gen_type = detect_generation_type(
        args.prompt,
        getattr(args, 'prompt_image', None),
        getattr(args, 'prompt_animation', None),
        args
    )
    
    # Print detection results
    print(f"\n🔍 Detection Results:")
    print(f"  Generation Type: {gen_type['type'].upper()}")
    print(f"  NSFW Content: {'⚠️ YES' if gen_type['is_nsfw'] else '✅ NO'} ({gen_type['nsfw_confidence']:.0%} confidence)")
    if gen_type['is_nsfw']:
        print(f"    Reason: {gen_type['nsfw_reason']}")
    print(f"  Motion Type: {gen_type['motion_type']}")
    print(f"  Subject Type: {gen_type['subject_type']}")
    print(f"  Needs Audio: {'Yes (' + gen_type['audio_type'] + ')' if gen_type['needs_audio'] else 'No'}")
    
    # Detect VRAM (detect_vram_gb() returns 0 when no GPU is visible,
    # in which case we trust the user-configured --vram_limit)
    vram_gb = detect_vram_gb()
    if vram_gb == 0:
        vram_gb = args.vram_limit
    print(f"\n💻 Detected VRAM: {vram_gb:.1f} GB")
    
    # Select main model (only if user didn't specify one)
    print(f"\n🎯 Selecting model for {gen_type['type'].upper()}...")
    prefer_quality = not getattr(args, 'prefer_speed', False)
    
    if not user_provided['model']:
        # Get all candidate models for retry support
        all_candidates = select_best_model(gen_type, models, vram_gb, prefer_quality, return_all=True)
        
        if not all_candidates:
            print("❌ Could not find a suitable model!")
            print("   Try running --update-models to update the model database")
            return None
        
        # Use the best candidate
        model_name, model_info, reason = all_candidates[0]
        print(f"  ✅ Selected: {model_name}")
        print(f"     {model_info.get('id', 'Unknown')}")
        print(f"     {reason}")
        args.model = model_name
        
        # Store alternatives for retry (excluding the selected one)
        args._auto_alternative_models = all_candidates[1:]
        if args._auto_alternative_models:
            print(f"  📋 {len(args._auto_alternative_models)} alternative models available for retry")
    else:
        # User specified a model - use it
        model_name = args.model
        model_info = models.get(model_name)
        print(f"  ✅ Using user-specified model: {model_name}")
        if model_info:
            print(f"     {model_info.get('id', 'Unknown')}")
    
    # Select image model for I2V (only if user didn't specify one, and
    # only when no starting image was supplied on the command line)
    image_model_name = None
    if gen_type['type'] == 'i2v' and not getattr(args, 'image', None):
        if not user_provided['image_model']:
            print(f"\n🎯 Selecting image model for I2V...")
            # Reuse the same scorer, but ask for a T2I-capable model.
            img_gen_type = gen_type.copy()
            img_gen_type['type'] = 't2i'
            
            # Get all image model candidates
            all_img_candidates = select_best_model(
                img_gen_type, models, vram_gb, prefer_quality=True, return_all=True
            )
            
            if all_img_candidates:
                image_model_name, image_model_info, img_reason = all_img_candidates[0]
                print(f"  ✅ Selected: {image_model_name}")
                print(f"     {image_model_info.get('id', 'Unknown')}")
                args.image_model = image_model_name
                
                # Store alternatives for retry
                args._auto_alternative_image_models = all_img_candidates[1:]
                if args._auto_alternative_image_models:
                    print(f"  📋 {len(args._auto_alternative_image_models)} alternative image models available")
        else:
            print(f"\n🎯 Using user-specified image model: {args.image_model}")
    
    # Configure args for generation
    print(f"\n⚙️  Configuring generation...")
    
    # Set I2V mode if needed (only if not already set by user)
    if gen_type['type'] == 'i2v' and not args.image_to_video:
        args.image_to_video = True
        print("  📹 I2V mode enabled")
    
    # Set I2I mode if needed (only if not already set by user)
    if gen_type['type'] == 'i2i' and not args.image_to_image:
        args.image_to_image = True
        print("  🎨 I2I mode enabled")
    
    # Split prompts for I2V mode if user didn't provide separate prompts
    if gen_type['type'] == 'i2v' and not user_provided['prompt_image'] and not user_provided['prompt_animation']:
        full_prompt = " ".join(args.prompt) if args.prompt else ""
        image_prompt, animation_prompt = split_prompt_for_i2v(full_prompt)
        # Downstream code expects list-of-strings prompts, hence the wrap.
        args.prompt_image = [image_prompt]
        args.prompt_animation = [animation_prompt]
        print(f"  ✂️  Split prompt for I2V:")
        print(f"     Image: {image_prompt[:60]}{'...' if len(image_prompt) > 60 else ''}")
        print(f"     Animation: {animation_prompt[:60]}{'...' if len(animation_prompt) > 60 else ''}")
    
    # Configure NSFW mode (only if user didn't explicitly set --no_filter or didn't set it)
    if gen_type['is_nsfw'] and not user_provided['no_filter']:
        args.no_filter = True
        print("  🔓 NSFW mode enabled (--no-filter)")
    
    # Configure audio if needed (only if user didn't explicitly set it)
    if gen_type['needs_audio'] and not user_provided['generate_audio']:
        args.generate_audio = True
        args.audio_type = gen_type['audio_type']
        if not user_provided['sync_audio']:
            args.sync_audio = True
        print(f"  🎵 Audio enabled: {gen_type['audio_type']}")
    
    # Adjust FPS based on motion type (only if user didn't specify FPS)
    if not user_provided['fps']:
        if gen_type['motion_type'] == 'slow':
            args.fps = max(12, args.fps - 3)
            print(f"  🎬 FPS adjusted for slow motion: {args.fps}")
        elif gen_type['motion_type'] == 'fast':
            args.fps = min(30, args.fps + 3)
            print(f"  🎬 FPS adjusted for fast motion: {args.fps}")
    
    # Print final configuration
    print(f"\n📋 Final Configuration:")
    print(f"  Model: {args.model}")
    if gen_type['type'] == 'i2v':
        print(f"  Image Model: {args.image_model}")
        print(f"  Mode: Image-to-Video (I2V)")
    elif gen_type['type'] == 't2i':
        print(f"  Mode: Text-to-Image (T2I)")
    elif gen_type['type'] == 'i2i':
        print(f"  Mode: Image-to-Image (I2I)")
    else:
        print(f"  Mode: Text-to-Video (T2V)")
    print(f"  Resolution: {args.width}x{args.height}")
    print(f"  Duration: {args.length}s @ {args.fps} fps")
    print(f"  Output: {args.output}")
    if args.no_filter:
        print(f"  NSFW Filter: Disabled")
    if args.upscale:
        print(f"  Upscale: {args.upscale_factor}x")
    if args.generate_audio:
        print(f"  Audio: {args.audio_type}")
    if args.seed != -1:
        print(f"  Seed: {args.seed}")
    
    # Show which settings were preserved from user
    preserved = [k for k, v in user_provided.items() if v]
    if preserved:
        print(f"\n  ✅ Preserved user settings: {', '.join(preserved)}")
    
    # Generate and print command line for reproduction
    print("\n" + "=" * 60)
    print("📝 COMMAND LINE (to reproduce without --auto):")
    print("=" * 60)
    cmd_line = generate_command_line(args)
    print(f"\n{cmd_line}\n")
    print("=" * 60)
    
    print("\n🚀 Starting generation...")
    print("=" * 60 + "\n")
    
    return args


def generate_tts_bark(text, output_path, voice="v2/en_speaker_6", args=None):
    """Generate TTS audio using Bark (Suno AI).

    Args:
        text: Text to synthesize.
        output_path: Destination WAV file path.
        voice: Bark history prompt (speaker preset).
        args: Unused; kept for signature parity with other TTS backends.

    Returns:
        The output path on success, None on failure.
    """
    if not BARK_AVAILABLE:
        print("❌ Bark not available. Install with: pip install git+https://github.com/suno-ai/bark.git")
        return None
    
    print(f"🎤 Generating TTS with Bark (voice: {voice})...")
    
    # Preload models to GPU if available
    if torch.cuda.is_available():
        preload_models()
    
    try:
        # Generate audio
        audio_array = bark_generate_audio(text, history_prompt=voice)
        
        # Save to file — prefer scipy, fall back to soundfile.
        # (The previous version also imported numpy here, unused.)
        if SCIPY_AVAILABLE:
            scipy.io.wavfile.write(output_path, BARK_SAMPLE_RATE, audio_array)
        else:
            sf.write(output_path, audio_array, BARK_SAMPLE_RATE)
        
        print(f"  ✅ Saved TTS audio: {output_path}")
        return output_path
    except Exception as e:
        print(f"❌ Bark TTS failed: {e}")
        return None


async def generate_tts_edge(text, output_path, voice="en-US-JennyNeural"):
    """Generate TTS audio using Edge-TTS (Microsoft Azure).

    Coroutine — must be awaited (or driven via asyncio.run).

    Returns:
        The output path on success, None when Edge-TTS is missing or
        synthesis fails.
    """
    if not EDGE_TTS_AVAILABLE:
        print("❌ Edge-TTS not available. Install with: pip install edge-tts")
        return None

    print(f"🎤 Generating TTS with Edge-TTS (voice: {voice})...")

    try:
        synthesizer = edge_tts.Communicate(text, voice)
        await synthesizer.save(output_path)
    except Exception as e:
        print(f"❌ Edge-TTS failed: {e}")
        return None
    print(f"  ✅ Saved TTS audio: {output_path}")
    return output_path


def generate_tts(text, output_path, voice_name="edge_female_us", custom_voice_id=None, args=None):
    """Generate TTS audio using the specified voice/engine.

    Looks up *voice_name* in TTS_VOICES (defaulting to the Edge-TTS
    Jenny voice), optionally swaps in a custom engine voice ID, and
    dispatches to the matching backend.

    Returns:
        The output path on success, None on failure or unknown engine.
    """
    if not AUDIO_AVAILABLE:
        print("❌ No TTS engines available")
        return None

    config = TTS_VOICES.get(voice_name, {"engine": "edge", "voice": "en-US-JennyNeural"})
    engine = config["engine"]
    # A custom voice ID keeps the engine but overrides the speaker.
    voice_id = custom_voice_id if custom_voice_id else config["voice"]

    if engine == "bark":
        return generate_tts_bark(text, output_path, voice=voice_id, args=args)
    if engine == "edge":
        import asyncio
        # Edge-TTS is async-only; run it to completion here.
        return asyncio.run(generate_tts_edge(text, output_path, voice=voice_id))

    print(f"❌ Unknown TTS engine: {engine}")
    return None


def generate_music(prompt, output_path, duration_seconds=10, model_size="medium", args=None):
    """Generate music from a text prompt using MusicGen.

    Parameters:
        prompt: text description of the desired music.
        output_path: destination file path (.wav or .mp3 suffix expected).
        duration_seconds: length of the generated clip.
        model_size: MusicGen checkpoint size (small/medium/large).
        args: unused here; kept for signature consistency with other generators.

    Returns output_path on success, None on failure.
    """
    if not MUSICGEN_AVAILABLE:
        print("❌ MusicGen not available. Install with: pip install audiocraft")
        return None
    
    print(f"🎵 Generating music with MusicGen ({model_size})...")
    print(f"  Prompt: {prompt}")
    print(f"  Duration: {duration_seconds}s")
    
    try:
        # Load model
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = MusicGen.get_pretrained(f"facebook/musicgen-{model_size}")
        model.to(device)
        
        # Generate
        model.set_generation_params(duration=duration_seconds)
        wav = model.generate([prompt])
        
        # audio_write appends '.wav' itself, so give it a stem without the
        # audio extension. Use splitext instead of str.replace so a path
        # like 'my.wav.dir/track.mp3' is not mangled mid-string.
        base, ext = os.path.splitext(output_path)
        stem = base if ext.lower() in ('.wav', '.mp3') else output_path
        
        audio_write(
            stem,
            wav[0].cpu(),
            model.sample_rate,
            strategy="loudness",
            loudness_compressor=True
        )
        
        # Rename the file audio_write produced to the requested output path
        generated_path = stem + '.wav'
        if generated_path != output_path and os.path.exists(generated_path):
            os.rename(generated_path, output_path)
        
        print(f"  ✅ Saved music: {output_path}")
        return output_path
    except Exception as e:
        print(f"❌ MusicGen failed: {e}")
        return None


def get_audio_duration(audio_path):
    """Return the duration of an audio file in seconds, or None on failure.

    Prefers librosa (any format), then scipy (WAV only), and finally falls
    back to probing the file with ffprobe.
    """
    if LIBROSA_AVAILABLE:
        y, sr = librosa.load(audio_path, sr=None)
        return len(y) / sr
    elif SCIPY_AVAILABLE:
        # wavfile.read returns (sample_rate, samples); len() gives frame count
        # for both mono and multi-channel arrays
        sr, data = scipy.io.wavfile.read(audio_path)
        return len(data) / sr
    else:
        # Fallback to ffprobe
        try:
            result = subprocess.run(
                ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', 
                 '-of', 'default=noprint_wrappers=1:nokey=1', audio_path],
                capture_output=True, text=True
            )
            return float(result.stdout.strip())
        except (ValueError, OSError):
            # ValueError: ffprobe produced no parsable duration
            # OSError: ffprobe binary not installed / not on PATH
            return None


def sync_audio_to_video(audio_path, video_path, output_path, mode="stretch", args=None):
    """
    Sync audio duration to match video duration, then mux them together.
    
    Modes:
    - stretch: Time-stretch audio to match video (needs librosa)
    - trim: Trim audio to video length
    - pad: Pad with silence if audio is shorter
    - loop: Loop audio if shorter than video
    
    Returns the merged output path on success, None on failure.
    """
    if not os.path.exists(audio_path):
        print(f"❌ Audio file not found: {audio_path}")
        return None
    
    if not os.path.exists(video_path):
        print(f"❌ Video file not found: {video_path}")
        return None
    
    audio_duration = get_audio_duration(audio_path)
    video_duration = get_video_duration(video_path)
    
    if audio_duration is None or video_duration is None:
        print("❌ Could not determine audio/video duration")
        return None
    
    print(f"🔄 Syncing audio ({audio_duration:.2f}s) to video ({video_duration:.2f}s)...")
    print(f"  Mode: {mode}")
    
    if abs(audio_duration - video_duration) < 0.1:
        # Already synced (within 100 ms) — just mux as-is
        print("  ✅ Audio already matches video duration")
        return merge_audio_video(audio_path, video_path, output_path)
    
    # mkstemp instead of the deprecated, race-prone mktemp; ffmpeg overwrites
    # the pre-created file via -y, librosa/sf write over it directly.
    fd, temp_audio = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    
    try:
        if mode == "stretch" and LIBROSA_AVAILABLE:
            # Time-stretch using librosa; rate > 1 shortens audio, so invert
            # the video/audio ratio to land on the video duration
            y, sr = librosa.load(audio_path, sr=None)
            rate = video_duration / audio_duration
            y_stretched = librosa.effects.time_stretch(y, rate=1/rate)
            sf.write(temp_audio, y_stretched, sr)
        
        elif mode == "trim":
            # Trim audio to video length
            subprocess.run([
                'ffmpeg', '-y', '-i', audio_path, '-t', str(video_duration),
                '-c', 'copy', temp_audio
            ], capture_output=True)
        
        elif mode == "pad":
            # Pad with silence; clamp at zero so a longer audio track does not
            # produce a negative (invalid) pad duration
            silence_duration = max(0.0, video_duration - audio_duration)
            subprocess.run([
                'ffmpeg', '-y', '-i', audio_path,
                '-filter_complex', f'[0:a]apad=pad_dur={silence_duration}[a]',
                '-map', '[a]', temp_audio
            ], capture_output=True)
        
        elif mode == "loop":
            # Loop audio to fill video
            subprocess.run([
                'ffmpeg', '-y', '-stream_loop', '-1', '-i', audio_path,
                '-t', str(video_duration), '-c', 'copy', temp_audio
            ], capture_output=True)
        
        else:
            # Fallback: simple ffmpeg tempo adjustment (atempo only supports
            # factors in [0.5, 2.0]); outside that range, loop instead
            tempo = video_duration / audio_duration
            if 0.5 <= tempo <= 2.0:
                subprocess.run([
                    'ffmpeg', '-y', '-i', audio_path,
                    '-filter:a', f'atempo={tempo}', temp_audio
                ], capture_output=True)
            else:
                print("  ⚠️ Tempo adjustment out of range, using loop mode")
                subprocess.run([
                    'ffmpeg', '-y', '-stream_loop', '-1', '-i', audio_path,
                    '-t', str(video_duration), '-c', 'copy', temp_audio
                ], capture_output=True)
        
        # mkstemp pre-creates the file, so check for actual content to know
        # whether the conversion step really produced output
        if os.path.exists(temp_audio) and os.path.getsize(temp_audio) > 0:
            return merge_audio_video(temp_audio, video_path, output_path)
        
        return None
    finally:
        # Always clean up the temp file, even when merging fails or raises
        if os.path.exists(temp_audio):
            os.remove(temp_audio)


def get_video_duration(video_path):
    """Return the duration of a video file in seconds via ffprobe, or None.

    Returns None when ffprobe is not installed, the file cannot be probed,
    or the probe output cannot be parsed as a float.
    """
    try:
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', video_path],
            capture_output=True, text=True
        )
        return float(result.stdout.strip())
    except (ValueError, OSError):
        # ValueError: no parsable duration; OSError: ffprobe binary missing
        return None


def merge_audio_video(audio_path, video_path, output_path):
    """Mux an audio track onto a video (video stream copied, audio to AAC).

    Uses -shortest so the output ends with the shorter stream.
    Returns output_path on success, None on any ffmpeg failure.
    """
    print(f"  🔀 Merging audio and video...")
    
    try:
        result = subprocess.run([
            'ffmpeg', '-y',
            '-i', video_path,
            '-i', audio_path,
            '-c:v', 'copy',
            '-c:a', 'aac',
            '-map', '0:v:0',
            '-map', '1:a:0',
            '-shortest',
            output_path
        ], capture_output=True, text=True)
    except OSError as e:
        # ffmpeg binary not installed / not on PATH — report instead of crashing
        print(f"  ❌ FFmpeg error: {e}")
        return None
    
    if result.returncode == 0:
        print(f"  ✅ Saved synced video: {output_path}")
        return output_path
    else:
        print(f"  ❌ FFmpeg error: {result.stderr}")
        return None


# ──────────────────────────────────────────────────────────────────────────────
#                                 LIP SYNC FUNCTIONS
# ──────────────────────────────────────────────────────────────────────────────

def check_lipsync_dependencies():
    """Locate optional lip-sync backends on disk.

    Returns a tuple (wav2lip, sadtalker); each element is the repository
    path when found, otherwise False.
    """
    def _locate(repo_name, marker):
        # Probe the usual clone locations for the backend's marker script
        candidates = [
            os.path.expanduser(f"~/{repo_name}"),
            os.path.join(os.path.dirname(__file__), repo_name),
            f"./{repo_name}",
        ]
        for candidate in candidates:
            if os.path.exists(os.path.join(candidate, marker)):
                return candidate
        return False

    wav2lip_available = _locate("Wav2Lip", "wav2lip.py")
    sadtalker_available = _locate("SadTalker", "inference.py")
    return wav2lip_available, sadtalker_available


def apply_lip_sync_wav2lip(video_path, audio_path, output_path, wav2lip_path, args=None):
    """Lip-sync *video_path* to *audio_path* via a local Wav2Lip checkout.

    Returns output_path on success, None when Wav2Lip or its checkpoint is
    missing or inference fails.
    """
    print(f"👄 Applying lip sync with Wav2Lip...")
    
    if not wav2lip_path:
        print("❌ Wav2Lip not found. Clone with: git clone https://github.com/Rudrabha/Wav2Lip.git")
        return None
    
    # Prefer the GAN-trained checkpoint; fall back to the plain one
    gan_ckpt = os.path.join(wav2lip_path, "checkpoints", "wav2lip_gan.pth")
    plain_ckpt = os.path.join(wav2lip_path, "checkpoints", "wav2lip.pth")
    checkpoint_path = gan_ckpt if os.path.exists(gan_ckpt) else plain_ckpt
    
    if not os.path.exists(checkpoint_path):
        print(f"❌ Wav2Lip checkpoint not found. Download from: https://github.com/Rudrabha/Wav2Lip/releases")
        return None
    
    try:
        # Run Wav2Lip inference as a subprocess from its own repo directory
        cmd = [
            sys.executable,
            os.path.join(wav2lip_path, "inference.py"),
            "--checkpoint_path", checkpoint_path,
            "--face", video_path,
            "--audio", audio_path,
            "--outfile", output_path,
            "--fps", str(args.fps if args else 25),
        ]
        
        proc = subprocess.run(cmd, capture_output=True, text=True, cwd=wav2lip_path)
        
        if proc.returncode == 0 and os.path.exists(output_path):
            print(f"  ✅ Saved lip-synced video: {output_path}")
            return output_path
        
        print(f"  ❌ Wav2Lip error: {proc.stderr}")
        return None
    except Exception as e:
        print(f"❌ Lip sync failed: {e}")
        return None


def apply_lip_sync_sadtalker(video_path, audio_path, output_path, sadtalker_path, args=None):
    """Lip-sync via a local SadTalker checkout.

    Returns output_path when the subprocess exits cleanly, otherwise None.
    """
    print(f"👄 Applying lip sync with SadTalker...")
    
    if not sadtalker_path:
        print("❌ SadTalker not found. Clone with: git clone https://github.com/OpenTalker/SadTalker.git")
        return None
    
    try:
        # Run SadTalker inference as a subprocess from its repo directory
        cmd = [
            sys.executable,
            os.path.join(sadtalker_path, "inference.py"),
            "--driven_audio", audio_path,
            "--source_image", video_path,  # Note: SadTalker expects image, not video
            "--result_dir", os.path.dirname(output_path),
            "--still",
            "--preprocess", "crop",
        ]
        
        proc = subprocess.run(cmd, capture_output=True, text=True, cwd=sadtalker_path)
        
        if proc.returncode != 0:
            print(f"  ❌ SadTalker error: {proc.stderr}")
            return None
        
        print(f"  ✅ SadTalker processing complete")
        return output_path
    except Exception as e:
        print(f"❌ Lip sync failed: {e}")
        return None


def apply_lip_sync(video_path, audio_path, output_path, method="auto", args=None):
    """Dispatch lip sync to the best available backend.

    method may be "auto" (prefer Wav2Lip, then SadTalker), "wav2lip", or
    "sadtalker". Returns the backend's result path, or None on failure.
    """
    wav2lip_path, sadtalker_path = check_lipsync_dependencies()
    
    if method == "auto":
        if wav2lip_path:
            method = "wav2lip"
        elif sadtalker_path:
            method = "sadtalker"
        else:
            print("❌ No lip sync method available")
            print("  Install Wav2Lip: git clone https://github.com/Rudrabha/Wav2Lip.git")
            print("  Or SadTalker: git clone https://github.com/OpenTalker/SadTalker.git")
            return None
    
    # Dispatch table keyed on the resolved method name
    backends = {
        "wav2lip": lambda: apply_lip_sync_wav2lip(video_path, audio_path, output_path, wav2lip_path, args),
        "sadtalker": lambda: apply_lip_sync_sadtalker(video_path, audio_path, output_path, sadtalker_path, args),
    }
    
    backend = backends.get(method)
    if backend is None:
        print(f"❌ Unknown lip sync method: {method}")
        return None
    return backend()


# ──────────────────────────────────────────────────────────────────────────────
#                                 MAIN PIPELINE
# ──────────────────────────────────────────────────────────────────────────────

def main(args):
    global MODELS
    
    # Initialize timing tracker
    timing = TimingTracker()
    
    # Handle model database update
    if args.update_models:
        hf_token = os.environ.get("HF_TOKEN")
        MODELS = update_all_models(hf_token=hf_token)
        sys.exit(0)
    
    # Handle model search
    if args.search_models:
        hf_token = os.environ.get("HF_TOKEN")
        results = search_hf_models(args.search_models, limit=args.search_limit, hf_token=hf_token)
        print_search_results(results, args)
        sys.exit(0)
    
    # Handle model addition
    if args.add_model:
        hf_token = os.environ.get("HF_TOKEN")
        result = add_model_from_hf(args.add_model, name=args.name, hf_token=hf_token, debug=getattr(args, 'debug', False))
        if result:
            name, model_entry = result
            MODELS[name] = model_entry
            save_models_config(MODELS)
            print(f"\n✅ Model '{name}' added successfully!")
            print(f"   Use with: --model {name}")
        sys.exit(0)
    
    # Handle model validation
    if args.validate_model:
        hf_token = os.environ.get("HF_TOKEN")
        model_info = validate_hf_model(args.validate_model, hf_token=hf_token, debug=getattr(args, 'debug', False))
        if model_info:
            print(f"✅ Model {args.validate_model} is valid")
            print(f"   Tags: {', '.join(model_info.get('tags', [])[:10])}")
            print(f"   Downloads: {model_info.get('downloads', 'N/A')}")
            pipeline = detect_pipeline_class(model_info)
            print(f"   Detected pipeline: {pipeline or 'Unknown'}")
        sys.exit(0)
    
    # Handle model list
    if args.model_list:
        print_model_list(args)
    
    # Handle show-model
    if args.show_model:
        show_model_details(args.show_model, args)
    
    # Handle TTS list
    if args.tts_list:
        print_tts_voices()
    
    # Check audio dependencies if audio features requested
    if args.generate_audio or args.lip_sync or args.audio_file:
        check_audio_dependencies()
    
    # Require prompt only for actual generation (unless auto mode)
    if not getattr(args, 'auto', False) and not args.model_list and not args.tts_list and not args.search_models and not args.add_model and not args.validate_model and not args.prompt:
        parser.error("the following arguments are required: --prompt")
    
    # Handle auto mode with retry support
    # Only run auto mode if this is not a retry (retry count not set yet)
    if getattr(args, 'auto', False) and not hasattr(args, '_auto_mode'):
        if not args.prompt:
            parser.error("--auto requires --prompt to analyze")
        args = run_auto_mode(args, MODELS)
        if args is None:
            sys.exit(1)
        
        # Store original args for retry
        args._auto_mode = True
        args._retry_count = 0
        args._max_retries = 3  # Maximum number of model retries

    if args.distribute and args.interface:
        os.environ["NCCL_SOCKET_IFNAME"] = args.interface
        os.environ["GLOO_SOCKET_IFNAME"] = args.interface

    if args.distribute:
        from accelerate import Accelerator
        accelerator = Accelerator()
        is_main = accelerator.is_main_process
        device_map = "auto"
    else:
        is_main = True
        device_map = "auto" if (args.low_ram_mode or args.offload_dir) else None

    max_mem = {0: f"{args.vram_limit}GiB"}
    if args.system_ram_limit > 0:
        max_mem["cpu"] = f"{args.system_ram_limit}GiB"

    if is_main:
        print(f"--- Target: {args.model.upper()} ({MODELS[args.model]['vram']}) ---")
        log_memory()

    # Validate model exists
    if args.model not in MODELS:
        print(f"❌ Model '{args.model}' not found in database.")
        print(f"   Use --model-list to see available models")
        print(f"   Or use --search-models to find models on HuggingFace")
        sys.exit(1)
    
    m_info = MODELS[args.model]
    PipelineClass = get_pipeline_class(m_info["class"])

    if not PipelineClass:
        pipeline_class = m_info['class']
        print(f"❌ Pipeline class '{pipeline_class}' not found in your diffusers installation.")
        print(f"   Model: {args.model} ({m_info['id']})")
        
        # List available video-related pipelines
        import diffusers
        available_pipelines = [name for name in dir(diffusers) if 'Pipeline' in name and any(x in name.lower() for x in ['video', 'ltx', 'cog', 'mochi', 'image', 'diffusion'])]
        if available_pipelines:
            print(f"\n   📋 Available video/image pipelines in your diffusers:")
            for p in sorted(available_pipelines)[:15]:
                print(f"      - {p}")
            if len(available_pipelines) > 15:
                print(f"      ... and {len(available_pipelines) - 15} more")
        
        # Provide specific guidance for known pipelines
        if pipeline_class == "LTXVideoPipeline":
            print(f"\n   📦 LTX Video pipelines in diffusers:")
            ltx_pipelines = [name for name in dir(diffusers) if 'LTX' in name]
            if ltx_pipelines:
                for p in ltx_pipelines:
                    print(f"      - {p}")
            print(f"\n   Try updating the model config with the correct pipeline class:")
            print(f"   videogen --add-model {m_info['id']} --name {args.model}")
        elif pipeline_class == "CogVideoXPipeline":
            print(f"\n   📦 CogVideoX requires diffusers >= 0.30.0")
        elif pipeline_class == "MochiPipeline":
            print(f"\n   📦 Mochi requires diffusers >= 0.31.0")
        
        print(f"\n   💡 Your diffusers version: {diffusers.__version__}")
        print(f"   💡 To list all pipelines: python -c 'import diffusers; print([x for x in dir(diffusers) if \"Pipeline\" in x])'")
        sys.exit(1)

    # ─── VRAM & low_mem decision ───────────────────────────────────────────────
    detected_vram_gb = detect_vram_gb()
    configured_vram_gb = args.vram_limit
    effective_vram_gb = min(detected_vram_gb, configured_vram_gb) if detected_vram_gb > 0 else configured_vram_gb

    use_low_mem, reason = should_use_low_mem(args, m_info, effective_vram_gb)

    if is_main:
        print(f"  Detected GPU VRAM: {detected_vram_gb:.1f} GB (effective: {effective_vram_gb:.1f} GB)")
        print(f"  Model estimated VRAM: {parse_vram_estimate(m_info['vram']):.1f} GB")
        print(f"  low_cpu_mem_usage: {use_low_mem}  ({reason})")

    pipe_kwargs = {
        "torch_dtype": torch.bfloat16 if any(x in args.model for x in ["mochi", "wan", "flux"]) else torch.float16,
        "device_map": device_map,
        "max_memory": max_mem,
        "offload_folder": args.offload_dir,
    }

    if use_low_mem:
        pipe_kwargs["low_cpu_mem_usage"] = True

    extra = m_info.get("extra", {})
    if variant := extra.get("variant"):
        pipe_kwargs["variant"] = variant

    # Handle LoRA models - need to load base model first
    is_lora = m_info.get("is_lora", False)
    lora_id = None
    base_model_id = None
    
    if is_lora:
        lora_id = m_info["id"]
        base_model_id = m_info.get("base_model")
        
        # Allow manual override via --base-model
        if args.base_model:
            base_model_id = args.base_model
            print(f"  Using override base model: {base_model_id}")
        
        if not base_model_id:
            # Try to infer base model from LoRA name
            if "wan" in lora_id.lower():
                if m_info.get("supports_i2v"):
                    base_model_id = "Wan-AI/Wan2.1-I2V-14B-Diffusers"
                else:
                    base_model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
            elif "svd" in lora_id.lower():
                base_model_id = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
            else:
                print(f"❌ Cannot determine base model for LoRA: {lora_id}")
                print(f"   Please specify --base-model when using this LoRA")
                sys.exit(1)
        
        print(f"  LoRA detected: {lora_id}")
        print(f"  Base model: {base_model_id}")
        model_id_to_load = base_model_id
        
        # Set up custom VAE for Wan base models
        if "wan" in base_model_id.lower():
            extra["use_custom_vae"] = True
    else:
        model_id_to_load = m_info["id"]

    if extra.get("use_custom_vae"):
        try:
            vae_model_id = model_id_to_load if is_lora else m_info["id"]
            vae = AutoencoderKLWan.from_pretrained(vae_model_id, subfolder="vae", torch_dtype=pipe_kwargs["torch_dtype"])
            pipe_kwargs["vae"] = vae
        except Exception as e:
            print(f"Custom Wan VAE load failed: {e}")

    timing.start()
    timing.begin_step("model_loading")
    
    debug = getattr(args, 'debug', False)
    
    if debug:
        print(f"\n🔍 [DEBUG] Model Loading Details:")
        print(f"   [DEBUG] Model ID to load: {model_id_to_load}")
        print(f"   [DEBUG] Pipeline class: {m_info['class']}")
        print(f"   [DEBUG] Is LoRA: {is_lora}")
        if is_lora:
            print(f"   [DEBUG] LoRA ID: {lora_id}")
        print(f"   [DEBUG] Pipeline kwargs:")
        for k, v in pipe_kwargs.items():
            if k == "max_memory":
                print(f"      {k}: {v}")
            elif k == "device_map":
                print(f"      {k}: {v}")
            else:
                print(f"      {k}: {v}")
        print(f"   [DEBUG] HF Token: {'***' + os.environ.get('HF_TOKEN', '')[-4:] if os.environ.get('HF_TOKEN') else 'Not set'}")
        print(f"   [DEBUG] Cache dir: {os.environ.get('HF_HOME', 'default')}")
        print()
    
    try:
        pipe = PipelineClass.from_pretrained(model_id_to_load, **pipe_kwargs)
    except Exception as e:
        error_str = str(e)
        
        if debug:
            print(f"\n🔍 [DEBUG] Error Details:")
            print(f"   [DEBUG] Exception type: {type(e).__name__}")
            print(f"   [DEBUG] Error message: {error_str}")
            if hasattr(e, 'response'):
                print(f"   [DEBUG] Response: {e.response}")
            print()
        
        # Check if we should retry with an alternative model (auto mode)
        if getattr(args, '_auto_mode', False) and getattr(args, '_retry_count', 0) < getattr(args, '_max_retries', 3):
            alternative_models = getattr(args, '_auto_alternative_models', [])
            if alternative_models:
                args._retry_count += 1
                next_model_name, next_model_info, next_reason = alternative_models.pop(0)
                args._auto_alternative_models = alternative_models  # Update the list
                
                print(f"\n⚠️  Model loading failed: {model_id_to_load}")
                print(f"   Error: {error_str[:100]}...")
                print(f"\n🔄 Retrying with alternative model ({args._retry_count}/{args._max_retries})...")
                print(f"   New model: {next_model_name}")
                print(f"   {next_reason}")
                
                # Update args with new model and recurse
                args.model = next_model_name
                # Clean up any partial model loading
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                # Retry main() with the new model
                return main(args)
        
        # Check for common errors and provide helpful messages
        if "404" in error_str or "Entry Not Found" in error_str:
            print(f"❌ Model not found on HuggingFace: {model_id_to_load}")
            print(f"   This model may have been removed or the ID is incorrect.")
            if debug:
                print(f"\n   [DEBUG] Troubleshooting:")
                print(f"   - Check if the model exists: https://huggingface.co/{model_id_to_load}")
                print(f"   - Verify the model ID spelling")
                print(f"   - The model may have been renamed or moved")
            print(f"\n   💡 Try searching for an alternative:")
            print(f"   videogen --search-models ltxvideo")
            print(f"\n   💡 Or use the official LTX Video model:")
            print(f"   videogen --model ltx_video --prompt 'your prompt' ...")
        elif "401" in error_str or "Unauthorized" in error_str:
            print(f"❌ Model requires authentication: {model_id_to_load}")
            print(f"   Set your HuggingFace token:")
            print(f"   export HF_TOKEN=your_token_here")
            print(f"   huggingface-cli login")
            if debug:
                print(f"\n   [DEBUG] To get a token:")
                print(f"   1. Go to https://huggingface.co/settings/tokens")
                print(f"   2. Create a new token with 'read' permissions")
                print(f"   3. Export it: export HF_TOKEN=hf_xxx")
        elif "gated" in error_str.lower():
            print(f"❌ This is a gated model: {model_id_to_load}")
            print(f"   You need to accept the license on HuggingFace:")
            print(f"   https://huggingface.co/{model_id_to_load}")
            print(f"   Then set HF_TOKEN and run again.")
        elif "connection" in error_str.lower() or "timeout" in error_str.lower():
            print(f"❌ Network error loading model: {model_id_to_load}")
            print(f"   Check your internet connection and try again.")
            if debug:
                print(f"\n   [DEBUG] Network troubleshooting:")
                print(f"   - Check if you can access: https://huggingface.co/{model_id_to_load}")
                print(f"   - Try with a VPN if HuggingFace is blocked")
                print(f"   - Check if HF_ENDPOINT is set (for China mirror): {os.environ.get('HF_ENDPOINT', 'not set')}")
        elif "FrozenDict" in error_str or "scale_factor" in error_str or "has no attribute" in error_str:
            print(f"❌ Pipeline compatibility error: {model_id_to_load}")
            print(f"   This model uses a pipeline architecture incompatible with your diffusers version.")
            print(f"   The model may require a specific diffusers version or different pipeline class.")
            if debug:
                print(f"\n   [DEBUG] Compatibility troubleshooting:")
                print(f"   - Try updating diffusers: pip install --upgrade git+https://github.com/huggingface/diffusers.git")
                print(f"   - Check the model's documentation for required versions")
                print(f"   - The model may be incorrectly configured in models.json")
            print(f"\n   💡 Try a different model with --model <name>")
        else:
            print(f"Model loading failed: {e}")
            if debug:
                import traceback
                print(f"\n   [DEBUG] Full traceback:")
                traceback.print_exc()
        
        # If we've exhausted all retries, exit with error
        if getattr(args, '_auto_mode', False):
            retry_count = getattr(args, '_retry_count', 0)
            max_retries = getattr(args, '_max_retries', 3)
            alternative_models = getattr(args, '_auto_alternative_models', [])
            
            if retry_count >= max_retries or not alternative_models:
                print(f"\n❌ All model retries exhausted ({retry_count}/{max_retries} attempts)")
                print(f"   Try searching for alternative models: videogen --search-models <query>")
        
        sys.exit(1)
    
    timing.end_step()  # model_loading
    
    # Apply LoRA if this is a LoRA model
    if is_lora and lora_id:
        timing.begin_step("lora_loading")
        print(f"  Loading LoRA adapter: {lora_id}")
        try:
            # Load LoRA weights
            pipe.load_lora_weights(lora_id)
            print(f"  ✅ LoRA applied successfully")
        except Exception as e:
            print(f"  ⚠️ LoRA loading failed: {e}")
            print(f"     Continuing with base model...")
        timing.end_step()  # lora_loading

    if args.no_filter and hasattr(pipe, "safety_checker"):
        pipe.safety_checker = None

    # Offloading
    off = args.offload_strategy
    if off == "auto_map":
        pipe.enable_model_cpu_offload()
    elif off == "sequential":
        pipe.enable_sequential_cpu_offload()
    elif off == "group":
        try:
            pipe.enable_group_offload(group_size=args.offload_group_size)
        except:
            print("Group offload unavailable → model offload fallback")
            pipe.enable_model_cpu_offload()
    elif off == "model":
        pipe.enable_model_cpu_offload()
    else:
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")

    pipe.enable_attention_slicing("max")
    try:
        pipe.enable_vae_slicing()
        pipe.enable_vae_tiling()
    except:
        pass

    if torch.cuda.is_available():
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except:
            pass

    if "wan" in args.model and hasattr(pipe, "scheduler"):
        try:
            pipe.scheduler = UniPCMultistepScheduler.from_config(
                pipe.scheduler.config,
                prediction_type="flow_prediction",
                flow_shift=extra.get("flow_shift", 3.0)
            )
        except:
            pass

    # ─── Generation ────────────────────────────────────────────────────────────
    seed = args.seed if args.seed >= 0 else random.randint(0, 2**31 - 1)
    generator = torch.Generator("cuda" if torch.cuda.is_available() else "cpu").manual_seed(seed)

    main_prompt = ", ".join(args.prompt)
    init_image = None

    # Calculate and print time estimate
    has_i2v = args.image_to_video or args.image
    has_audio = args.generate_audio or args.audio_file
    has_lipsync = args.lip_sync
    has_upscale = args.upscale
    
    estimates = timing.estimate_total_time(
        args, m_info,
        has_i2v=has_i2v,
        has_audio=has_audio,
        has_lipsync=has_lipsync,
        has_upscale=has_upscale
    )
    timing.print_estimate(estimates)

    # Detect if we should generate a static image (T2I mode)
    # Conditions: T2I model, OR output ends with image extension, OR only prompt_image specified
    is_t2i_model = m_info.get("class") in ["StableDiffusionXLPipeline", "FluxPipeline"]
    output_ext = os.path.splitext(args.output)[1].lower()
    is_image_output = output_ext in [".png", ".jpg", ".jpeg", ".gif", ".webp"]
    only_prompt_image = args.prompt_image and not args.prompt
    generate_static_image = is_t2i_model or is_image_output or only_prompt_image

    # ─── T+I2I (Text + Image-to-Image) Mode ─────────────────────────────────────
    # Use existing image with T2I model to create modified image
    if args.image_to_image and args.image:
        if not is_t2i_model:
            # NOTE(review): these two prints are inside an `if` whose header is above
            # this chunk — presumably gated on is_main plus a non-T2I model check; confirm.
            print(f"⚠️  --image-to-image works best with T2I models (Flux, SDXL, etc.)")
            print(f"   Current model: {m_info.get('class', 'Unknown')}")
        
        # Hard-fail early if the user-supplied init image does not exist.
        # NOTE(review): unlike the status prints below, this error is printed on every
        # rank (not gated by is_main) and exits all processes — confirm intended.
        if not os.path.exists(args.image):
            print(f"❌ Image file not found: {args.image}")
            sys.exit(1)
        
        if is_main:
            print(f"  🎨 Image-to-Image mode (T+I2I)")
            print(f"     Input image: {args.image}")
            print(f"     Strength: {args.strength}")
        
        timing.begin_step("image_to_image")
        
        # Load input image and force it to the requested output resolution
        # (LANCZOS resample; aspect ratio is NOT preserved).
        init_image = Image.open(args.image).convert("RGB")
        init_image = init_image.resize((args.width, args.height), Image.LANCZOS)
        
        # Use prompt_image if specified, otherwise use main prompt
        image_prompt = ", ".join(args.prompt_image) if args.prompt_image else main_prompt
        
        with torch.no_grad():
            # Check if pipeline supports img2img
            # NOTE(review): most diffusers pipelines do not expose an `image_to_image`
            # method, so this branch is likely rare — verify against the pipeline class.
            if hasattr(pipe, 'image_to_image'):
                # Use img2img if available
                image = pipe.image_to_image(
                    image_prompt,
                    image=init_image,
                    strength=args.strength,
                    generator=generator,
                    num_inference_steps=args.image_steps,
                    guidance_scale=args.guidance_scale,
                ).images[0]
            else:
                # Try standard img2img call
                try:
                    image = pipe(
                        image_prompt,
                        image=init_image,
                        strength=args.strength,
                        generator=generator,
                        num_inference_steps=args.image_steps,
                        guidance_scale=args.guidance_scale,
                    ).images[0]
                # NOTE(review): a TypeError raised for any other reason inside the
                # pipeline call is also swallowed here and triggers the fallback.
                except TypeError:
                    # Pipeline doesn't support img2img - use img2img pipeline instead
                    if is_main:
                        print(f"  ⚠️ Pipeline doesn't support img2img directly, loading img2img variant...")
                    
                    # Try to load img2img pipeline
                    try:
                        # Pick the img2img variant by pipeline class name; anything
                        # that is not Flux falls back to the SDXL img2img pipeline.
                        if "FluxPipeline" in m_info.get("class", ""):
                            from diffusers import FluxImg2ImgPipeline
                            Img2ImgClass = FluxImg2ImgPipeline
                        else:
                            from diffusers import StableDiffusionXLImg2ImgPipeline
                            Img2ImgClass = StableDiffusionXLImg2ImgPipeline
                        
                        # Load img2img pipeline
                        # (reuses the dtype and device_map chosen for the main pipe)
                        img2img_pipe = Img2ImgClass.from_pretrained(
                            m_info["id"],
                            torch_dtype=pipe_kwargs["torch_dtype"],
                            device_map=device_map,
                        )
                        img2img_pipe.enable_model_cpu_offload()
                        
                        image = img2img_pipe(
                            image_prompt,
                            image=init_image,
                            strength=args.strength,
                            generator=generator,
                            num_inference_steps=args.image_steps,
                            guidance_scale=args.guidance_scale,
                        ).images[0]
                    except Exception as e:
                        print(f"❌ Failed to load img2img pipeline: {e}")
                        print(f"   Try using a model that supports img2img")
                        sys.exit(1)
        
        # Determine output filename
        # (is_image_output is computed above this chunk — presumably True when
        # args.output already carries an image extension; confirm.)
        if is_image_output:
            output_file = args.output
        else:
            output_file = f"{args.output}_img2img.png"
        
        # Save image
        image.save(output_file)
        timing.end_step()  # image_to_image
        
        # Img2img is terminal: skip video generation and audio post-processing.
        if is_main:
            print(f"  ✅ Saved img2img result: {output_file}")
            timing.print_summary()
            print(f"✨ Done! Seed: {seed}")
        return

    # If generating static image, use T2I pipeline
    # (generate_static_image is decided above this chunk; this branch is skipped
    # whenever any I2V input is requested.)
    if generate_static_image and not (args.image_to_video or args.image):
        if is_main:
            print(f"  🖼️  Generating static image (T2I mode)")
            print(f"     Model type: {m_info.get('class', 'Unknown')}")
        
        timing.begin_step("image_generation")
        
        # Use prompt_image if specified, otherwise use main prompt
        image_prompt = ", ".join(args.prompt_image) if args.prompt_image else main_prompt
        
        with torch.no_grad():
            # Generate image
            image = pipe(
                image_prompt,
                width=args.width,
                height=args.height,
                generator=generator,
                num_inference_steps=args.image_steps,
                guidance_scale=args.guidance_scale,
            ).images[0]
        
        # Determine output filename
        # (keep args.output verbatim when it already names an image file,
        # otherwise append a .png extension)
        if is_image_output:
            output_file = args.output
        else:
            output_file = f"{args.output}.png"
        
        # Save image
        image.save(output_file)
        timing.end_step()  # image_generation
        
        # T2I is terminal: no video/audio stages run after this.
        if is_main:
            print(f"  ✅ Saved image: {output_file}")
            timing.print_summary()
            print(f"✨ Done! Seed: {seed}")
        return

    # ─── I2V init-image preparation ────────────────────────────────────────────
    # Produce `init_image` for image-to-video: either load the user-supplied file
    # or synthesize one with a separate T2I model (--image_model).
    if args.image_to_video or args.image:
        if not m_info.get("supports_i2v"):
            print(f"Error: {args.model} does not support image-to-video.")
            sys.exit(1)

        # Use provided image file if specified
        if args.image:
            if not os.path.exists(args.image):
                print(f"❌ Image file not found: {args.image}")
                sys.exit(1)
            
            # NOTE(review): this status print is not gated by is_main, unlike most
            # other prints in this function — confirm whether that is intentional.
            print(f"  📷 Using provided image: {args.image}")
            try:
                init_image = Image.open(args.image).convert("RGB")
                # Resize to match requested dimensions
                init_image = init_image.resize((args.width, args.height), Image.LANCZOS)
                if is_main:
                    init_image.save(f"{args.output}_init.png")
                    print(f"  Saved initial image: {args.output}_init.png")
            except Exception as e:
                print(f"❌ Failed to load image: {e}")
                sys.exit(1)
        else:
            # Generate image using image_model
            timing.begin_step("image_generation")
            
            img_info = MODELS[args.image_model]
            ImgCls = get_pipeline_class(img_info["class"])
            if not ImgCls:
                print(f"❌ Pipeline class '{img_info['class']}' not found for image model.")
                print(f"   pip install --upgrade git+https://github.com/huggingface/diffusers.git")
                sys.exit(1)

            # NOTE(review): dtype is hard-coded to float16 here, whereas the main
            # pipeline uses pipe_kwargs["torch_dtype"] — confirm this divergence.
            img_kwargs = {
                "torch_dtype": torch.float16,
                "device_map": device_map,
                "max_memory": max_mem,
                "offload_folder": args.offload_dir,
            }

            if use_low_mem:
                img_kwargs["low_cpu_mem_usage"] = True

            try:
                img_pipe = ImgCls.from_pretrained(img_info["id"], **img_kwargs)
                img_pipe.enable_model_cpu_offload()

                # Image prompt falls back to the main prompt when --prompt_image absent.
                img_prompt = ", ".join(args.prompt_image) if args.prompt_image else main_prompt
                with torch.no_grad():
                    init_image = img_pipe(
                        img_prompt,
                        width=args.width,
                        height=args.height,
                        generator=generator,
                    ).images[0]

                # Persist the init frame for inspection/reuse (main process only).
                if is_main:
                    init_image.save(f"{args.output}_init.png")
                    print(f"  Saved initial image: {args.output}_init.png")
                
                timing.end_step()  # image_generation
            except Exception as e:
                print(f"Image generation failed: {e}")
                sys.exit(1)

    # ─── Audio Generation (Pre-video) ──────────────────────────────────────────
    # Optionally synthesize a TTS or music track up front; `audio_path` stays None
    # when generation is disabled (and presumably when the generator fails —
    # generate_tts/generate_music are defined elsewhere in this file; confirm).
    audio_path = None
    
    if args.generate_audio:
        timing.begin_step("audio_generation")
        # Audio text defaults to the video prompt when --audio_text is omitted.
        audio_text = args.audio_text if args.audio_text else main_prompt
        
        if args.audio_type == "tts":
            audio_path = generate_tts(
                audio_text,
                f"{args.output}_tts.wav",
                voice_name=args.tts_voice,
                custom_voice_id=args.tts_voice_id,
                args=args
            )
        elif args.audio_type == "music":
            # Music duration is matched to the requested video length (seconds).
            audio_path = generate_music(
                audio_text,
                f"{args.output}_music.wav",
                duration_seconds=args.length,
                model_size=args.music_model,
                args=args
            )
        
        timing.end_step()  # audio_generation
        
        if audio_path and is_main:
            print(f"  Generated audio: {audio_path}")

    # ─── Video (or image-only) generation ──────────────────────────────────────
    timing.begin_step("video_generation")
    
    with torch.no_grad():
        # Substring match on the model *name*: pony/flux models are treated as
        # image-only and produce a single PNG instead of a video.
        if "pony" in args.model or "flux" in args.model:
            image = pipe(main_prompt, width=args.width, height=args.height, generator=generator).images[0]
            if is_main:
                image.save(f"{args.output}.png")
            # NOTE(review): no .mp4 is written in this branch, yet the audio
            # post-processing below still assumes f"{args.output}.mp4" — verify.
            timing.end_step()  # video_generation
        else:
            # Animation prompt falls back to the main prompt.
            video_prompt = ", ".join(args.prompt_animation) if args.prompt_animation else main_prompt
            video_kwargs = {
                "prompt": video_prompt,
                "height": args.height,
                "width": args.width,
                # Frame count truncates (int()) length*fps rather than rounding.
                "num_frames": int(args.length * args.fps),
                "generator": generator,
                # Hard-coded per-family tuning: Wan models get 50 steps / cfg 5.0,
                # everything else 28 steps / cfg 7.0.
                "num_inference_steps": 50 if "wan" in args.model else 28,
                "guidance_scale": 5.0 if "wan" in args.model else 7.0,
            }

            # Attach the prepared init image only for I2V-capable models.
            if (args.image_to_video or args.image) and init_image is not None:
                if m_info.get("supports_i2v"):
                    video_kwargs["image"] = init_image
                else:
                    print(f"Warning: {args.model} does not support 'image' argument – running pure T2V")

            output = pipe(**video_kwargs)

            if is_main:
                # Normalize the pipeline output: diffusers pipelines expose either
                # `.frames` (possibly a per-prompt list) or `.videos`.
                if hasattr(output, "frames"):
                    frames = output.frames[0] if isinstance(output.frames, list) else output.frames
                elif hasattr(output, "videos"):
                    frames = output.videos[0]
                else:
                    # NOTE(review): this early return skips timing.end_step() for the
                    # video_generation step and all audio post-processing — confirm.
                    print("Unknown output format.")
                    return

                export_to_video(frames, f"{args.output}.mp4", fps=args.fps)
                timing.end_step()  # video_generation

                if args.upscale:
                    timing.begin_step("upscaling")
                    print(f"  Upscaling ×{args.upscale_factor:.2f}...")
                    try:
                        # SD x4 upscaler runs per frame, then each result is resized
                        # down/up to width*factor × height*factor with LANCZOS.
                        upscaler = StableDiffusionUpscalePipeline.from_pretrained(
                            "stabilityai/stable-diffusion-x4-upscaler",
                            torch_dtype=torch.float16
                        )
                        upscaler.enable_model_cpu_offload()

                        up_frames = []
                        target_size = (int(args.width * args.upscale_factor), int(args.height * args.upscale_factor))
                        for frame in frames:
                            if isinstance(frame, torch.Tensor):
                                # assumes CHW float tensor in [0, 1] — TODO confirm
                                frame = Image.fromarray((frame.permute(1, 2, 0).cpu().numpy() * 255).astype("uint8"))
                            up = upscaler(prompt=video_prompt, image=frame, num_inference_steps=20).images[0]
                            up = up.resize(target_size, Image.LANCZOS)
                            up_frames.append(up)

                        export_to_video(up_frames, f"{args.output}_upscaled.mp4", fps=args.fps)
                        timing.end_step()  # upscaling
                    except Exception as e:
                        # Best effort: a failed upscale leaves the base .mp4 intact.
                        print(f"Upscale failed: {e}")
                        timing.end_step()  # upscaling (failed)
            else:
                timing.end_step()  # video_generation (non-main process)

    # ─── Audio Post-Processing ──────────────────────────────────────────────────
    # Mux/sync the generated audio onto the final video; lip sync optionally runs
    # on top of the (possibly already synced) result. Main process only.
    if is_main and audio_path:
        # NOTE(review): for pony/flux model names only a PNG was written above, so
        # this .mp4 path may not exist — sync/lip-sync would then operate on a
        # missing file; confirm how sync_audio_to_video handles that.
        video_file = f"{args.output}.mp4"
        
        # Prefer the upscaled render when it was actually produced.
        if args.upscale:
            upscaled_file = f"{args.output}_upscaled.mp4"
            if os.path.exists(upscaled_file):
                video_file = upscaled_file
        
        # Sync audio to video
        if args.sync_audio:
            timing.begin_step("audio_sync")
            synced_output = f"{args.output}_synced.mp4"
            result = sync_audio_to_video(
                audio_path, video_file, synced_output,
                mode=args.sync_mode, args=args
            )
            timing.end_step()  # audio_sync
            # Falsy result (presumably None on failure) keeps the unsynced file.
            if result:
                video_file = result
        
        # Apply lip sync
        if args.lip_sync:
            timing.begin_step("lip_sync")
            lipsync_output = f"{args.output}_lipsync.mp4"
            result = apply_lip_sync(
                video_file, audio_path, lipsync_output,
                method=args.lip_sync_method, args=args
            )
            timing.end_step()  # lip_sync
            if result:
                print(f"  ✨ Final lip-synced video: {result}")
        elif args.sync_audio:
            print(f"  ✨ Final synced video: {video_file}")

    if is_main:
        timing.print_summary()
        print(f"✨ Done! Seed: {seed}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Universal Video Generation Toolkit with Audio Support",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:

Single GPU (simple T2V):
  python3 videogen --model wan_1.3b_t2v --prompt "a cat playing piano" --length 5.0 --output cat_piano

Single GPU with I2V and upscale:
  python3 videogen --image_to_video --model svd_xt_1.1 --image_model pony_uncensored_v6 --prompt "cinematic scene" --prompt_animation "dynamic motion" --length 10 --upscale --upscale_factor 2.0 --offload_strategy sequential --output scene

I2V with existing image:
  python3 videogen --image_to_video --model svd_xt_1.1 --image my_image.png --prompt "animate this scene" --length 5 --output animated

I2V with existing image (no --image_to_video needed when --image is provided):
  python3 videogen --model svd_xt_1.1 --image my_photo.jpg --prompt "add subtle motion" --length 3 --output photo_motion

Distributed (multi-GPU):
  python3 videogen --model wan_14b_t2v --prompt "epic space battle" --length 10.0 --output battle --distribute --interface eth0 --vram_limit 20

NSFW I2V example with Flux NSFW init:
  python3 videogen --image_to_video --model svd_xt_1.1 --image_model flux_nsfw_uncensored --prompt "create a cinematic and realistic blowjob scene" --prompt_animation "she is moving to a deepthroat" --no_filter --output test --length 10 --seed 42 --upscale --upscale_factor 2.0 --offload_strategy sequential

T2I (TEXT-TO-IMAGE) EXAMPLES:

Generate a static image with T2I model (auto-detected):
  python3 videogen --model flux_dev --prompt "a beautiful woman in a red dress" --output image.png

Generate image with SDXL model:
  python3 videogen --model sdxl_base --prompt "cyberpunk city at night" --width 1024 --height 1024 --output city.png

Generate image by specifying output extension:
  python3 videogen --model pony_v6 --prompt "anime girl with blue hair" --output anime_girl.jpg

T+I2I (IMAGE-TO-IMAGE) EXAMPLES:

Modify an existing image with text prompt:
  python3 videogen --model flux_dev --image-to-image --image photo.png --prompt "make it look like a painting" --output painted.png

Img2img with strength control (0.0-1.0):
  python3 videogen --model sdxl_base --image-to-image --image input.jpg --prompt "add a sunset background" --strength 0.5 --output sunset.png

Strong transformation with high strength:
  python3 videogen --model pony_v6 --image-to-image --image sketch.png --prompt "detailed anime art" --strength 0.9 --output detailed.png

AUDIO GENERATION EXAMPLES:

Generate video with TTS narration:
  python3 videogen --model wan_1.3b_t2v --prompt "a beautiful sunset over the ocean" --generate_audio --audio_type tts --audio_text "The sun sets slowly over the calm ocean waves" --tts_voice edge_male_us --sync_audio --output sunset

Generate video with background music:
  python3 videogen --model wan_14b_t2v --prompt "epic battle scene" --generate_audio --audio_type music --audio_text "epic orchestral battle music with drums and brass" --music_model medium --sync_audio --output battle

Generate video with TTS and lip sync:
  python3 videogen --image_to_video --model svd_xt_1.1 --image_model pony_uncensored_v6 --prompt "a woman speaking to camera" --generate_audio --audio_type tts --audio_text "Hello, welcome to my channel" --tts_voice edge_female_us --lip_sync --output speaker

List models:
  python3 videogen --model-list
  python3 videogen --model-list --i2v-only
  python3 videogen --model-list --nsfw-friendly
  python3 videogen --model-list --low-vram --i2v-only
  python3 videogen --model-list --high-vram

List TTS voices:
  python3 videogen --tts-list
"""
    )

    # Model listing arguments
    parser.add_argument("--model-list", action="store_true",
                        help="Print list of all available models and exit")
    parser.add_argument("--tts-list", action="store_true",
                        help="Print list of all available TTS voices and exit")

    parser.add_argument("--i2v-only", action="store_true",
                        help="When using --model-list: only show I2V-capable models")
    parser.add_argument("--t2v-only", action="store_true",
                        help="When using --model-list: only show T2V-only models")
    parser.add_argument("--nsfw-friendly", action="store_true",
                        help="When using --model-list: only show uncensored/NSFW-capable models")
    parser.add_argument("--low-vram", action="store_true",
                        help="When using --model-list: only show models ≤16GB est")
    parser.add_argument("--high-vram", action="store_true",
                        help="When using --model-list: only show models >30GB est")
    parser.add_argument("--huge-vram", action="store_true",
                        help="When using --model-list: only show models >55GB est (extreme VRAM)")

    # Video generation arguments
    if MODELS:
        default_model = list(MODELS.keys())[0]
        image_models = [k for k, v in MODELS.items() if not v.get("supports_i2v", False)]
        default_image_model = image_models[0] if image_models else default_model
        parser.add_argument("--model", type=str, default=default_model,
                            metavar="MODEL",
                            help=f"Model name (default: {default_model}). Use --model-list to see available models.")
        parser.add_argument("--image_model", type=str, default=default_image_model,
                            metavar="MODEL",
                            help=f"Image model for I2V (default: {default_image_model}). Use --model-list to see available models.")
    else:
        parser.add_argument("--model", type=str, default=None,
                            metavar="MODEL",
                            help="Model name (run --update-models first to populate model list)")
        parser.add_argument("--image_model", type=str, default=None,
                            metavar="MODEL",
                            help="Image model name (run --update-models first)")
    
    parser.add_argument("--base-model", type=str, default=None,
                        metavar="MODEL_ID",
                        help="Override base model for LoRA adapters (e.g., Wan-AI/Wan2.1-I2V-14B-Diffusers)")
    parser.add_argument("--prompt", nargs="+", required=False)
    parser.add_argument("--image_to_video", action="store_true",
                        help="Enable image-to-video mode (use --image to provide an image, or --image_model to generate one)")
    parser.add_argument("--image", type=str, default=None,
                        metavar="IMAGE_FILE",
                        help="Use existing image file for I2V (PNG, JPG, etc.) instead of generating one")
    parser.add_argument("--prompt_image", nargs="+", default=None)
    parser.add_argument("--prompt_animation", nargs="+", default=None)

    parser.add_argument("--distribute", action="store_true")
    parser.add_argument("--interface", type=str, default="eth0")

    parser.add_argument("--offload_strategy", choices=["none", "model", "sequential", "group", "auto_map"], default="model")
    parser.add_argument("--offload_group_size", type=int, default=8)
    parser.add_argument("--low_ram_mode", action="store_true")
    parser.add_argument("--vram_limit", type=int, default=22)
    parser.add_argument("--system_ram_limit", type=int, default=0)
    parser.add_argument("--offload_dir", default=None)

    parser.add_argument("--length", type=float, default=5.0)
    parser.add_argument("--width", type=int, default=832)
    parser.add_argument("--height", type=int, default=480)
    parser.add_argument("--fps", type=int, default=15)
    parser.add_argument("--output", default="output")
    parser.add_argument("--seed", type=int, default=-1)
    parser.add_argument("--no_filter", action="store_true")
    parser.add_argument("--upscale", action="store_true")
    parser.add_argument("--upscale_factor", type=float, default=2.0)

    # ─── T2I / IMAGE GENERATION ARGUMENTS ────────────────────────────────────────
    
    parser.add_argument("--image-to-image", action="store_true",
                        help="Enable image-to-image mode (T+I2I). Use with --image to modify an existing image")
    parser.add_argument("--strength", type=float, default=0.75,
                        help="Strength for img2img (0.0-1.0). Higher = more change from original")
    parser.add_argument("--image-steps", type=int, default=30,
                        help="Number of inference steps for image generation (default: 30)")
    parser.add_argument("--guidance-scale", type=float, default=7.5,
                        help="Guidance scale for image generation (default: 7.5)")

    # ─── AUDIO GENERATION ARGUMENTS ─────────────────────────────────────────────
    
    parser.add_argument("--generate_audio", action="store_true",
                        help="Generate audio for the video")
    parser.add_argument("--audio_type", choices=["tts", "music"], default="tts",
                        help="Type of audio to generate: tts (speech) or music")
    parser.add_argument("--audio_text", type=str, default=None,
                        help="Text for TTS or prompt for music generation (defaults to video prompt)")
    
    # TTS arguments
    parser.add_argument("--tts_voice", choices=list(TTS_VOICES.keys()), default="edge_female_us",
                        help="TTS voice to use (see --tts-list for options)")
    parser.add_argument("--tts_voice_id", type=str, default=None,
                        help="Custom voice ID for TTS engine (overrides --tts_voice)")
    
    # Music generation arguments
    parser.add_argument("--music_model", choices=["small", "medium", "large"], default="medium",
                        help="MusicGen model size (larger = better quality, slower)")
    
    # Audio sync arguments
    parser.add_argument("--sync_audio", action="store_true",
                        help="Sync generated audio to video duration")
    parser.add_argument("--sync_mode", choices=["stretch", "trim", "pad", "loop"], default="stretch",
                        help="How to sync audio to video: stretch, trim, pad with silence, or loop")
    
    # Lip sync arguments
    parser.add_argument("--lip_sync", action="store_true",
                        help="Apply lip sync to video using generated audio")
    parser.add_argument("--lip_sync_method", choices=["auto", "wav2lip", "sadtalker"], default="auto",
                        help="Lip sync method to use (auto selects best available)")
    
    # External audio file
    parser.add_argument("--audio_file", type=str, default=None,
                        help="Use external audio file instead of generating (for sync/lip sync)")
    
    # ─── MODEL DISCOVERY ARGUMENTS ───────────────────────────────────────────────
    
    parser.add_argument("--show-model", type=str, default=None,
                        metavar="ID_OR_NAME",
                        help="Show full details for a model by numeric ID (from --model-list) or name")
    parser.add_argument("--search-models", type=str, default=None,
                        metavar="QUERY",
                        help="Search HuggingFace for models matching query")
    parser.add_argument("--search-limit", type=int, default=20,
                        help="Maximum number of search results (default: 20)")
    parser.add_argument("--add-model", type=str, default=None,
                        metavar="MODEL_ID_OR_URL",
                        help="Add a HuggingFace model to config. Accepts model ID (stabilityai/svd) or URL (https://huggingface.co/stabilityai/svd)")
    parser.add_argument("--name", type=str, default=None,
                        metavar="NAME",
                        help="Short name for --add-model (auto-generated if not provided)")
    parser.add_argument("--validate-model", type=str, default=None,
                        metavar="MODEL_ID",
                        help="Validate if a HuggingFace model exists and get info")
    parser.add_argument("--update-models", action="store_true",
                        help="Search HuggingFace and update model database with I2V, T2V, and NSFW models")
    
    # Auto mode arguments
    parser.add_argument("--auto", action="store_true",
                        help="Automatic mode: detect generation type and NSFW from prompts, select best models automatically")
    parser.add_argument("--prefer-speed", action="store_true",
                        help="In auto mode, prefer faster models over higher quality")
    
    # Debug mode
    parser.add_argument("--debug", action="store_true",
                        help="Enable debug mode for detailed error messages and troubleshooting")

    args = parser.parse_args()
    main(args)
