Commit 9b3126d7 authored by Your Name

Fix black image: use --image-precision from CLI args instead of hardcoded float16

Root cause: The refactored code was hardcoding torch.float16 for CUDA,
ignoring the --image-precision bf16 CLI argument. The Z-Image-Turbo model
requires bfloat16 precision - using float16 causes NaN values in the
image processor, resulting in all-black images.

Also restored the original model loading logic with:
- GGUF model detection (skip diffusers for GGUF)
- OOM retry with progressive memory optimization
- use_safetensors=True
- Sequential CPU offload support
parent 553cdf07
......@@ -311,106 +311,96 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
# Try diffusers first
try:
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
import torch
# Check if model is XL
is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower()
# Check if it's a GGUF model - skip diffusers for those
is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
(model_to_use.startswith('http') and '.gguf' in model_to_use))
if is_gguf_model:
print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
raise Exception("GGUF model - use stable-diffusion-cpp instead")
print(f"Loading diffusers model: {model_to_use}")
# Determine compute type
if torch.cuda.is_available():
dtype = torch.float16
else:
dtype = torch.float32
# Determine precision from CLI argument (--image-precision)
precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
precision_map = {
'bf16': torch.bfloat16,
'f32': torch.float32,
'f16': torch.float16,
}
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
print(f"Using precision: {precision} ({dtype})")
# Try to load the model
load_error = None
try:
# Use DiffusionPipeline which auto-detects the correct pipeline class from model_index.json
# This supports custom pipelines like ZImagePipeline (DiT-based) which use 'transformer' instead of 'unet'
from diffusers import DiffusionPipeline
# Check if CPU offload is requested via CLI
use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
print(f"Loading diffusers model: {model_to_use}")
# Track loading attempts for OOM handling
load_attempt = 0
max_attempts = 3
# Determine compute type
if torch.cuda.is_available():
dtype = torch.float16
else:
dtype = torch.float32
while pipeline is None and load_attempt < max_attempts:
try:
load_attempt += 1
print(f"Loading attempt {load_attempt}/{max_attempts}...")
# Use DiffusionPipeline for auto-detection of pipeline class
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
except Exception as load_error:
# Try with revised model resolution for custom models
print(f"Warning: First model load attempt failed: {load_error}")
print("Trying alternative loading method...")
# Check if it's a missing component error (incomplete model)
if "expected" in str(load_error) and "but only" in str(load_error):
# This is an incomplete model - don't keep retrying the same thing
print(f"Error: Model '{model_to_use}' is incomplete or missing required components (unet, image_encoder, etc.)")
print("This model cannot be loaded with diffusers. Trying stable-diffusion-cpp-python instead...")
# Skip the retry attempts and go directly to sd.cpp
raise Exception(f"Incomplete model: {load_error}")
# Try with default resolution
# Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
try:
from diffusers import DiffusionPipeline
if is_xl:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
else:
# Fall back to DiffusionPipeline for custom pipelines
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
)
except Exception as retry_error:
# If it still fails, try DiffusionPipeline (for custom pipelines like ZImagePipeline)
print(f"Warning: Retry failed: {retry_error}, trying DiffusionPipeline for custom pipelines...")
from diffusers import DiffusionPipeline
if is_xl:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
safety_checker=None,
use_safetensors=True,
)
else:
except Exception:
# Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
safety_checker=None,
use_safetensors=True,
)
# Determine device
backend = getattr(global_args, 'backend', 'auto')
image_backend = getattr(global_args, 'image_backend', 'auto')
use_vulkan = (backend == 'vulkan') or (image_backend == 'vulkan') or (image_backend == 'auto' and backend == 'auto')
if use_vulkan and not torch.cuda.is_available():
# Vulkan/CPU mode
try:
pipeline.to("cpu")
# Enable CPU offload if available
# Apply memory optimizations based on attempt
if torch.cuda.is_available():
if load_attempt >= 2:
# Second attempt: enable attention slicing
print("Enabling attention slicing for lower VRAM usage...")
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing()
except Exception as e:
print(f"Warning: Could not move to CPU: {e}")
elif torch.cuda.is_available():
# CUDA mode
try:
pipeline.to("cuda")
except Exception as e:
print(f"Warning: Could not move to CUDA: {e}")
if load_attempt >= 3 or use_sequential_offload:
# Third attempt or offload requested: enable sequential CPU offload
print("Enabling sequential CPU offload for lower VRAM usage...")
if hasattr(pipeline, 'enable_sequential_cpu_offload'):
pipeline.enable_sequential_cpu_offload()
else:
# First attempt: try regular GPU
pipeline = pipeline.to("cuda")
else:
pipeline = pipeline.to("cpu")
except Exception as load_error:
error_msg = str(load_error).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
if is_oom and load_attempt < max_attempts:
print(f"OOM during model loading: {load_error}")
print(f"Retrying with more aggressive memory optimization...")
pipeline = None # Reset for retry
else:
print(f"Failed to load model (attempt {load_attempt}): {load_error}")
if load_attempt >= max_attempts:
raise
pipeline = None
# Cache the model
if pipeline is not None:
multi_model_manager.add_model(model_key, pipeline)
print(f"Loaded diffusers model: {model_to_use}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment