Fix retry logic to skip LoRAs with failed base models

- Track failed base models in _failed_base_models set
- Skip LoRA adapters that depend on failed base models during retry
- Try non-LoRA alternatives when all LoRAs with same base fail
- Improve error detection for 'Repository Not Found' errors
- Show skipped LoRA count during retry process

Fix retry logic to skip LoRAs with failed base models
- Track failed base models in _failed_base_models set
- Skip LoRA adapters that depend on failed base models during retry
- Try non-LoRA alternatives when all LoRAs with same base fail
- Improve error detection for 'Repository Not Found' errors
- Show skipped LoRA count during retry process
be1e5b9d · Stefy Lanza (nextime / spora ) · 14ae5bdc · be1e5b9d
Commit be1e5b9d authored Feb 24, 2026 by Stefy Lanza (nextime / spora )
Show whitespace changes
Inline Side-by-side

Showing with 39 additions and 6 deletions

videogen videogen +39 -6

No files found.
--- a/videogen
+++ b/videogen
@@ -3327,6 +3327,8 @@ def main(args):
        args._retry_count = 0
        # Use --retry argument or default to 3
        args._max_retries = getattr(args, 'retry', 3)
+        # Track failed base models to avoid retrying LoRAs with same failed base
+        args._failed_base_models = set()

    if args.distribute and args.interface:
        os.environ["NCCL_SOCKET_IFNAME"] = args.interface
@@ -3517,15 +3519,46 @@ def main(args):
            retry_count = getattr(args, '_retry_count', 0)
            max_retries = getattr(args, '_max_retries', 3)
            alternative_models = getattr(args, '_auto_alternative_models', [])
+            failed_base_models = getattr(args, '_failed_base_models', set())
+            
+            # If this was a LoRA with a base model, track the failed base model
+            if is_lora and base_model_id:
+                failed_base_models.add(base_model_id)
+                args._failed_base_models = failed_base_models
+                print(f"   ⚠️  Base model failed: {base_model_id}")
+                print(f"   Will skip other LoRAs depending on this base model")
+            
+            # Find next valid alternative (skip LoRAs with failed base models)
+            next_model = None
+            skipped_loras = []
+            
+            while alternative_models:
+                candidate_name, candidate_info, candidate_reason = alternative_models.pop(0)
+                
+                # Check if this is a LoRA with a failed base model
+                if candidate_info.get("is_lora", False):
+                    candidate_base = candidate_info.get("base_model") or candidate_info.get("_inferred_base_model")
+                    if candidate_base and candidate_base in failed_base_models:
+                        skipped_loras.append((candidate_name, candidate_base))
+                        continue  # Skip this LoRA
+                
+                # Found a valid candidate
+                next_model = (candidate_name, candidate_info, candidate_reason)
+                break
+            
+            # Update the alternatives list
+            args._auto_alternative_models = alternative_models
            
-            if retry_count < max_retries and alternative_models:
-                # We have alternatives available - retry with next model
+            if skipped_loras:
+                print(f"   ⏭️  Skipped {len(skipped_loras)} LoRA(s) with failed base models")
+            
+            if retry_count < max_retries and next_model:
+                # We have a valid alternative - retry with it
                args._retry_count = retry_count + 1
-                next_model_name, next_model_info, next_reason = alternative_models.pop(0)
-                args._auto_alternative_models = alternative_models  # Update the list
+                next_model_name, next_model_info, next_reason = next_model
                
                # Print appropriate error message based on error type
-                if "404" in error_str or "Entry Not Found" in error_str:
+                if "404" in error_str or "Entry Not Found" in error_str or "Repository Not Found" in error_str:
                    print(f"❌ Model not found on HuggingFace: {model_id_to_load}")
                elif "401" in error_str or "Unauthorized" in error_str:
                    print(f"❌ Model requires authentication: {model_id_to_load}")
@@ -3548,7 +3581,7 @@ def main(args):
                # Retry main() with the new model
                return main(args)
            
-            # No more alternatives or retries exhausted
+            # No more valid alternatives or retries exhausted
            print(f"\n❌ All model retries exhausted ({retry_count}/{max_retries} attempts)")
        
        # Print detailed error message for the user