Fix retry logic to skip LoRAs with failed base models

- Track failed base models in _failed_base_models set
- Skip LoRA adapters that depend on failed base models during retry
- Try non-LoRA alternatives when all LoRAs that share the same base model fail
- Improve error detection for 'Repository Not Found' errors
- Show skipped LoRA count during retry process
parent 14ae5bdc
Pipeline #228 canceled with stages
......@@ -3327,6 +3327,8 @@ def main(args):
args._retry_count = 0
# Use --retry argument or default to 3
args._max_retries = getattr(args, 'retry', 3)
# Track failed base models to avoid retrying LoRAs with same failed base
args._failed_base_models = set()
if args.distribute and args.interface:
os.environ["NCCL_SOCKET_IFNAME"] = args.interface
......@@ -3517,15 +3519,46 @@ def main(args):
retry_count = getattr(args, '_retry_count', 0)
max_retries = getattr(args, '_max_retries', 3)
alternative_models = getattr(args, '_auto_alternative_models', [])
failed_base_models = getattr(args, '_failed_base_models', set())
# If this was a LoRA with a base model, track the failed base model
if is_lora and base_model_id:
failed_base_models.add(base_model_id)
args._failed_base_models = failed_base_models
print(f" ⚠️ Base model failed: {base_model_id}")
print(f" Will skip other LoRAs depending on this base model")
# Find next valid alternative (skip LoRAs with failed base models)
next_model = None
skipped_loras = []
while alternative_models:
candidate_name, candidate_info, candidate_reason = alternative_models.pop(0)
# Check if this is a LoRA with a failed base model
if candidate_info.get("is_lora", False):
candidate_base = candidate_info.get("base_model") or candidate_info.get("_inferred_base_model")
if candidate_base and candidate_base in failed_base_models:
skipped_loras.append((candidate_name, candidate_base))
continue # Skip this LoRA
# Found a valid candidate
next_model = (candidate_name, candidate_info, candidate_reason)
break
# Update the alternatives list
args._auto_alternative_models = alternative_models
if retry_count < max_retries and alternative_models:
# We have alternatives available - retry with next model
if skipped_loras:
print(f" ⏭️ Skipped {len(skipped_loras)} LoRA(s) with failed base models")
if retry_count < max_retries and next_model:
# We have a valid alternative - retry with it
args._retry_count = retry_count + 1
next_model_name, next_model_info, next_reason = alternative_models.pop(0)
args._auto_alternative_models = alternative_models # Update the list
next_model_name, next_model_info, next_reason = next_model
# Print appropriate error message based on error type
if "404" in error_str or "Entry Not Found" in error_str:
if "404" in error_str or "Entry Not Found" in error_str or "Repository Not Found" in error_str:
print(f"❌ Model not found on HuggingFace: {model_id_to_load}")
elif "401" in error_str or "Unauthorized" in error_str:
print(f"❌ Model requires authentication: {model_id_to_load}")
......@@ -3548,7 +3581,7 @@ def main(args):
# Retry main() with the new model
return main(args)
# No more alternatives or retries exhausted
# No more valid alternatives or retries exhausted
print(f"\n❌ All model retries exhausted ({retry_count}/{max_retries} attempts)")
# Print detailed error message for the user
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment