Commit 9b3126d7 authored by Your Name

Fix black image: use --image-precision from CLI args instead of hardcoded float16

Root cause: the refactored code hard-coded torch.float16 on CUDA and
ignored the --image-precision CLI argument (here, bf16). The Z-Image-Turbo
model requires bfloat16 precision; running it in float16 produces NaN values
in the image processor, which results in all-black images.

Also restored the original model loading logic with:
- GGUF model detection (skip diffusers for GGUF)
- OOM retry with progressive memory optimization
- use_safetensors=True
- Sequential CPU offload support
parent 553cdf07
...@@ -311,106 +311,96 @@ async def create_image_generation(request: ImageGenerationRequest, http_request: ...@@ -311,106 +311,96 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
# Try diffusers first # Try diffusers first
try: try:
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
import torch import torch
# Check if model is XL # Check if model is XL
is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower() is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower()
# Check if it's a GGUF model - skip diffusers for those
is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
(model_to_use.startswith('http') and '.gguf' in model_to_use))
if is_gguf_model:
print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
raise Exception("GGUF model - use stable-diffusion-cpp instead")
print(f"Loading diffusers model: {model_to_use}") print(f"Loading diffusers model: {model_to_use}")
# Determine compute type # Determine precision from CLI argument (--image-precision)
if torch.cuda.is_available(): precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
dtype = torch.float16 precision_map = {
else: 'bf16': torch.bfloat16,
dtype = torch.float32 'f32': torch.float32,
'f16': torch.float16,
}
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
print(f"Using precision: {precision} ({dtype})")
# Try to load the model # Check if CPU offload is requested via CLI
load_error = None use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
try:
# Use DiffusionPipeline which auto-detects the correct pipeline class from model_index.json
# This supports custom pipelines like ZImagePipeline (DiT-based) which use 'transformer' instead of 'unet'
from diffusers import DiffusionPipeline
print(f"Loading diffusers model: {model_to_use}") # Track loading attempts for OOM handling
load_attempt = 0
max_attempts = 3
# Determine compute type while pipeline is None and load_attempt < max_attempts:
if torch.cuda.is_available(): try:
dtype = torch.float16 load_attempt += 1
else: print(f"Loading attempt {load_attempt}/{max_attempts}...")
dtype = torch.float32
# Use DiffusionPipeline for auto-detection of pipeline class # Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
except Exception as load_error:
# Try with revised model resolution for custom models
print(f"Warning: First model load attempt failed: {load_error}")
print("Trying alternative loading method...")
# Check if it's a missing component error (incomplete model)
if "expected" in str(load_error) and "but only" in str(load_error):
# This is an incomplete model - don't keep retrying the same thing
print(f"Error: Model '{model_to_use}' is incomplete or missing required components (unet, image_encoder, etc.)")
print("This model cannot be loaded with diffusers. Trying stable-diffusion-cpp-python instead...")
# Skip the retry attempts and go directly to sd.cpp
raise Exception(f"Incomplete model: {load_error}")
# Try with default resolution
try: try:
from diffusers import DiffusionPipeline
if is_xl:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
else:
# Fall back to DiffusionPipeline for custom pipelines
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
except Exception as retry_error:
# If it still fails, try DiffusionPipeline (for custom pipelines like ZImagePipeline)
print(f"Warning: Retry failed: {retry_error}, trying DiffusionPipeline for custom pipelines...")
from diffusers import DiffusionPipeline
if is_xl:
pipeline = StableDiffusionXLPipeline.from_pretrained( pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use, model_to_use,
torch_dtype=dtype, torch_dtype=dtype,
safety_checker=None, use_safetensors=True,
) )
else: except Exception:
# Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
pipeline = DiffusionPipeline.from_pretrained( pipeline = DiffusionPipeline.from_pretrained(
model_to_use, model_to_use,
torch_dtype=dtype, torch_dtype=dtype,
safety_checker=None, use_safetensors=True,
) )
# Determine device # Apply memory optimizations based on attempt
backend = getattr(global_args, 'backend', 'auto') if torch.cuda.is_available():
image_backend = getattr(global_args, 'image_backend', 'auto') if load_attempt >= 2:
use_vulkan = (backend == 'vulkan') or (image_backend == 'vulkan') or (image_backend == 'auto' and backend == 'auto') # Second attempt: enable attention slicing
print("Enabling attention slicing for lower VRAM usage...")
if use_vulkan and not torch.cuda.is_available():
# Vulkan/CPU mode
try:
pipeline.to("cpu")
# Enable CPU offload if available
if hasattr(pipeline, 'enable_attention_slicing'): if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing() pipeline.enable_attention_slicing()
except Exception as e:
print(f"Warning: Could not move to CPU: {e}") if load_attempt >= 3 or use_sequential_offload:
elif torch.cuda.is_available(): # Third attempt or offload requested: enable sequential CPU offload
# CUDA mode print("Enabling sequential CPU offload for lower VRAM usage...")
try: if hasattr(pipeline, 'enable_sequential_cpu_offload'):
pipeline.to("cuda") pipeline.enable_sequential_cpu_offload()
except Exception as e: else:
print(f"Warning: Could not move to CUDA: {e}") # First attempt: try regular GPU
pipeline = pipeline.to("cuda")
else:
pipeline = pipeline.to("cpu")
except Exception as load_error:
error_msg = str(load_error).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
if is_oom and load_attempt < max_attempts:
print(f"OOM during model loading: {load_error}")
print(f"Retrying with more aggressive memory optimization...")
pipeline = None # Reset for retry
else:
print(f"Failed to load model (attempt {load_attempt}): {load_error}")
if load_attempt >= max_attempts:
raise
pipeline = None
# Cache the model # Cache the model
if pipeline is not None:
multi_model_manager.add_model(model_key, pipeline) multi_model_manager.add_model(model_key, pipeline)
print(f"Loaded diffusers model: {model_to_use}") print(f"Loaded diffusers model: {model_to_use}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment