Commit e004541a authored by Your Name

Centralize model resolution and VRAM management in MultiModelManager.request_model()

- Added request_model() method to MultiModelManager that handles:
  1. Alias resolution (image, audio, tts, vision, default, custom aliases)
  2. VRAM management (unloading previous models in ondemand mode)
  3. Checking if model is already loaded

- Simplified codai/api/images.py:
  - Uses request_model() for model resolution and VRAM management
  - Extracted helper functions: _is_gguf_model(), _load_diffusers_pipeline(),
    _generate_with_diffusers(), _generate_with_sdcpp(), _load_sdcpp_model()
  - Removed duplicated sd.cpp generation code
  - Fixed semaphore scope (all generation now inside semaphore block)

- Simplified codai/api/tts.py:
  - Uses request_model() instead of duplicated VRAM management code
  - Removed duplicate get_cached_model_path() and get_model_cache_dir() wrappers

- Simplified codai/api/transcriptions.py:
  - Uses request_model() instead of duplicated VRAM management code

- Simplified codai/api/text.py:
  - Both /v1/chat/completions and /v1/completions use request_model()
  - Removed duplicated VRAM management blocks
parent a5b64c4c
......@@ -178,6 +178,265 @@ def set_queue_flags(flags):
queue_flags = flags
def _is_gguf_model(model_name: str) -> bool:
    """Tell whether a model name, path or URL refers to a GGUF model.

    Returns False for an empty or missing name; otherwise True when the
    text "gguf" appears anywhere in the name, ignoring case.
    """
    if not model_name:
        return False
    # A '.gguf' suffix and a '.gguf' inside an http(s) URL both contain the
    # substring 'gguf', so one case-insensitive containment test covers
    # every spelling this helper recognises.
    lowered = model_name.lower()
    return 'gguf' in lowered
def _load_diffusers_pipeline(model_name: str, global_args):
    """
    Load an image model with the diffusers library, retrying on failure.

    Up to three attempts are made. Each attempt first tries
    StableDiffusionXLPipeline and, if that raises, the generic
    DiffusionPipeline. On CUDA the attempts escalate memory savings:
    attempt 1 moves the pipeline to the GPU, attempt 2 adds attention
    slicing, attempt 3 (or any attempt when ``image_cpu_offload`` is set)
    uses sequential CPU offload instead of a full GPU move. Without CUDA
    the pipeline is moved to the CPU.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        global_args: Parsed CLI arguments; ``image_precision`` and
            ``image_cpu_offload`` are read from it when present.

    Returns:
        The loaded pipeline.

    Raises:
        ImportError: If diffusers or torch is not installed.
        Exception: The error of the final attempt when every attempt fails.
    """
    import gc

    from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
    import torch

    # Determine precision from CLI argument (--image-precision)
    precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
    precision_map = {
        'bf16': torch.bfloat16,
        'f32': torch.float32,
        'f16': torch.float16,
    }
    # float8 only exists in newer torch builds
    if hasattr(torch, 'float8_e4m3fn'):
        precision_map['f8'] = torch.float8_e4m3fn
    # Unknown precision names fall back to float32
    dtype = precision_map.get(precision, torch.float32)
    print(f"Using precision: {precision} ({dtype})")

    # Check if CPU offload is requested via CLI
    use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)

    pipeline = None
    load_attempt = 0
    max_attempts = 3
    while pipeline is None and load_attempt < max_attempts:
        load_attempt += 1
        print(f"Loading attempt {load_attempt}/{max_attempts}...")

        if load_attempt > 1:
            # The previous attempt failed; drop whatever it left behind
            # before loading again, otherwise the retry starts with the
            # failed attempt's weights still occupying memory. This runs
            # here (not in the except block) so the old exception and its
            # traceback no longer keep those objects alive.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        try:
            # Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
            try:
                pipeline = StableDiffusionXLPipeline.from_pretrained(
                    model_name,
                    torch_dtype=dtype,
                    use_safetensors=True,
                )
            except Exception:
                # Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
                pipeline = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=dtype,
                    use_safetensors=True,
                )

            # Apply memory optimizations based on attempt
            if torch.cuda.is_available():
                if load_attempt >= 2:
                    # Second attempt onwards: enable attention slicing
                    print("Enabling attention slicing for lower VRAM usage...")
                    if hasattr(pipeline, 'enable_attention_slicing'):
                        pipeline.enable_attention_slicing()
                if load_attempt >= 3 or use_sequential_offload:
                    # Third attempt or offload requested: enable sequential CPU offload
                    print("Enabling sequential CPU offload for lower VRAM usage...")
                    if hasattr(pipeline, 'enable_sequential_cpu_offload'):
                        pipeline.enable_sequential_cpu_offload()
                else:
                    # Regular path: place the whole pipeline on the GPU
                    pipeline = pipeline.to("cuda")
            else:
                pipeline = pipeline.to("cpu")
        except Exception as load_error:
            # Release the (possibly partially moved) pipeline so the loop
            # condition sees a failed attempt and the memory can be freed.
            pipeline = None
            error_msg = str(load_error).lower()
            is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
            if is_oom and load_attempt < max_attempts:
                print(f"OOM during model loading: {load_error}")
                print("Retrying with more aggressive memory optimization...")
            else:
                # Non-OOM failures are retried as well; only the final
                # attempt propagates the error to the caller.
                print(f"Failed to load model (attempt {load_attempt}): {load_error}")
                if load_attempt >= max_attempts:
                    raise

    return pipeline
def _generate_with_diffusers(pipeline, request, global_args, http_request=None):
    """Generate images using a diffusers pipeline.

    Args:
        pipeline: A loaded, callable diffusers pipeline.
        request: The image generation request; ``prompt``, ``size``, ``n``,
            ``seed``, ``quality``, ``steps``, ``guidance_scale`` and
            ``response_format`` are read from it.
        global_args: Parsed CLI arguments; ``image_seed`` and
            ``image_cfg_scale`` are used as fallbacks when present.
        http_request: Forwarded unchanged to ``save_image_response``.

    Returns:
        A dict with ``"created"`` (Unix timestamp taken before generation)
        and ``"data"`` (one ``save_image_response`` result per image).

    Raises:
        Exception: If no images can be extracted from the pipeline result.
    """
    import torch
    import numpy as np
    import time as time_module

    # Determine size. Both numbers are parsed before either is assigned so
    # a half-valid value such as "512xabc" leaves the defaults intact
    # instead of mixing a requested width with a default height.
    width, height = 1024, 1024
    if request.size:
        parts = request.size.split("x")
        if len(parts) == 2:
            try:
                width, height = int(parts[0]), int(parts[1])
            except ValueError:
                pass

    # Parsed values are ints, so NaN/inf cannot occur; the realistic bad
    # input is a zero or negative dimension, which is replaced by 512.
    if width <= 0:
        width = 512
    if height <= 0:
        height = 512

    # Enable memory optimizations (best effort; generation proceeds without them)
    try:
        if hasattr(pipeline, 'enable_attention_slicing'):
            pipeline.enable_attention_slicing(slice_size="auto")
        if hasattr(pipeline, 'enable_vae_slicing'):
            pipeline.enable_vae_slicing()
    except Exception as e:
        print(f"Warning: Could not enable memory optimizations: {e}")

    # Get timestamp BEFORE calling diffusers
    timestamp = int(time_module.time())

    # Use request seed if provided, otherwise use CLI default seed
    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
    generator = None
    if seed is not None:
        generator = torch.Generator(device=pipeline.device).manual_seed(seed)

    # Quality: "standard" or "hd"
    quality = request.quality or "standard"

    # Use request parameters if provided, otherwise fall back to quality-based defaults.
    # A step count of 0 is not usable, so any falsy value selects the default.
    num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
    # An explicit guidance scale of 0.0 is a deliberate request value, so
    # only a missing (None) value selects the default.
    if request.guidance_scale is not None:
        cfg_scale = request.guidance_scale
    elif quality == "standard":
        cfg_scale = getattr(global_args, 'image_cfg_scale', 7.5)
    else:
        cfg_scale = 9.0

    # Generate. A missing image count is treated as 1, matching the sd.cpp path.
    result = pipeline(
        prompt=request.prompt,
        negative_prompt=None,
        num_images_per_prompt=request.n or 1,
        height=height,
        width=width,
        generator=generator,
        guidance_scale=cfg_scale,
        num_inference_steps=num_steps,
    )

    # Extract images
    try:
        result_images = result.images
    except Exception as img_err:
        print(f"Warning: Could not access result.images: {img_err}")
        # Try alternatives: the result might expose 'image' or 'output'.
        # Compared against None explicitly because array-like values do
        # not support truthiness tests.
        result_images = getattr(result, 'image', None)
        if result_images is None:
            result_images = getattr(result, 'output', None)
        if result_images is None:
            raise Exception(f"Could not extract images from diffusers result: {img_err}") from img_err

    images = []
    for img in result_images:
        # Debug: print image type and value range
        print(f"DEBUG: Image type: {type(img)}")
        if isinstance(img, np.ndarray):
            print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}")
            # Replace NaN/Inf with valid values, then clip to [0, 1]
            img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
            img = np.clip(img, 0.0, 1.0)
            print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}")

        images.append(save_image_response(img, request.response_format, http_request))

    return {
        "created": timestamp,
        "data": images
    }
async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None):
    """Generate images using stable-diffusion-cpp-python.

    The blocking ``generate_image`` call runs in a worker thread so the
    event loop stays responsive.

    Args:
        sd_model: A loaded sd.cpp model exposing ``generate_image``.
        request: The image generation request; ``prompt``, ``size``, ``n``,
            ``seed`` and ``response_format`` are read from it.
        global_args: Parsed CLI arguments; ``image_seed`` is used as the
            fallback seed when present.
        http_request: Forwarded unchanged to ``save_image_response``.

    Returns:
        A dict with ``"created"`` (Unix timestamp taken after generation)
        and ``"data"`` (one ``save_image_response`` result per image).
    """
    import time

    # Parse size. Both numbers are parsed before either is assigned so a
    # half-valid value such as "512xabc" leaves the defaults intact.
    width, height = 512, 512
    if request.size:
        parts = request.size.split("x")
        if len(parts) == 2:
            try:
                width, height = int(parts[0]), int(parts[1])
            except ValueError:
                pass

    # Use default steps for fast generation
    steps = 4

    # Use request seed if provided, otherwise use CLI default seed
    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)

    result = await asyncio.to_thread(
        sd_model.generate_image,
        prompt=request.prompt,
        negative_prompt='',
        width=width,
        height=height,
        cfg_scale=get_cfg_scale(),
        sample_steps=steps,
        seed=seed if seed is not None else 42,
        batch_count=request.n if request.n else 1,
    )

    # Small delay to let Vulkan driver settle after generation. Awaited so
    # the pause does not stall every other request on the event loop.
    await asyncio.sleep(0.1)

    # Convert results to response format, honouring the requested
    # response_format the same way the diffusers path does.
    images = [
        save_image_response(img, request.response_format, http_request)
        for img in result
    ]

    return {
        "created": int(time.time()),
        "data": images
    }
def _load_sdcpp_model(model_path: str, global_args):
    """
    Construct a stable-diffusion-cpp-python model from a local model file.

    Args:
        model_path: Path passed to the constructor as ``model_path``.
        global_args: Parsed CLI arguments, or a falsy value to skip the
            optional settings; ``vae_path`` and ``llm_path`` are read from
            it when set.

    Returns:
        The constructed StableDiffusion instance.

    Raises:
        ImportError: If stable_diffusion_cpp is not installed.
    """
    from stable_diffusion_cpp import StableDiffusion

    print(f"Loading sd.cpp model from: {model_path}")

    # Constructor keyword arguments, starting with the mandatory model path
    ctor_args = {'model_path': model_path}

    # Optional paths from CLI args; missing or empty attributes are skipped
    if global_args:
        vae_path = getattr(global_args, 'vae_path', None)
        if vae_path:
            ctor_args['vae_path'] = vae_path

        # NOTE(review): llm_path is forwarded as 'lora_model_dir' — confirm
        # this mapping is intentional.
        llm_path = getattr(global_args, 'llm_path', None)
        if llm_path:
            ctor_args['lora_model_dir'] = llm_path

    return StableDiffusion(**ctor_args)
# =============================================================================
# Router and Endpoints
# =============================================================================
......@@ -225,518 +484,136 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
)
async with semaphore:
image_model = multi_model_manager.image_model
# If no image model configured, try to use main --model as fallback
if not image_model:
# Try to get the main model from args
main_model = getattr(global_args, 'model', None)
if main_model and isinstance(main_model, list) and len(main_model) > 0:
image_model = main_model[0]
elif main_model:
image_model = main_model
# Check if main model is a GGUF file - can't use for image generation
if image_model and ('.gguf' in image_model.lower() or 'gguf' in image_model.lower()):
print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
image_model = None # Can't use GGUF for images
# If still no image model configured, return an error
if not image_model:
raise HTTPException(
status_code=400,
detail="Image generation not configured. Use --image-model to specify a model."
# =====================================================================
# Step 1: Ask the manager to resolve the model and manage VRAM
# =====================================================================
model_info = multi_model_manager.request_model(
requested_model=request.model,
model_type="image"
)
# Determine model to use
# Priority: 1) model specified in request, 2) default image model from --image-model
model_to_use = request.model
if not model_to_use or model_to_use == "image":
# No model specified in request, use default
model_to_use = image_model
elif model_to_use.startswith("image:"):
# Legacy format - strip prefix and use default
model_to_use = image_model
else:
# Check if model_to_use is a valid model (URL, file, or known model)
# If not, fallback to the configured image model to avoid HF resolution errors
if image_model:
is_url = model_to_use.startswith('http://') or model_to_use.startswith('https://')
is_file = os.path.isfile(model_to_use) if model_to_use else False
if not is_url and not is_file:
# Unknown model name - use default instead of trying to resolve as HF
print(f"Warning: Unknown model '{model_to_use}' in image generation request, using configured --image-model")
model_to_use = image_model
# Check if model is loaded
model_key = f"image:{model_to_use}"
pipeline = multi_model_manager.get_model(model_key)
# In ondemand mode, if ANY model is loaded in VRAM and it's different from what we need,
# fully unload it first to free VRAM
if mode == "ondemand":
from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested image model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if both are image models), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
model_name = model_info['model_name']
model_key = model_info['model_key']
pipeline = model_info['model_object']
# Try diffusers first
try:
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline
import torch
# Check if model is XL
is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower()
# Check if it's a GGUF model - skip diffusers for those
is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
(model_to_use.startswith('http') and '.gguf' in model_to_use))
if is_gguf_model:
print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
raise Exception("GGUF model - use stable-diffusion-cpp instead")
print(f"Loading diffusers model: {model_to_use}")
# Determine precision from CLI argument (--image-precision)
precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
precision_map = {
'bf16': torch.bfloat16,
'f32': torch.float32,
'f16': torch.float16,
}
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
print(f"Using precision: {precision} ({dtype})")
# Check if CPU offload is requested via CLI
use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
# Track loading attempts for OOM handling
load_attempt = 0
max_attempts = 3
while pipeline is None and load_attempt < max_attempts:
try:
load_attempt += 1
print(f"Loading attempt {load_attempt}/{max_attempts}...")
# Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
try:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
use_safetensors=True,
)
except Exception:
# Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
use_safetensors=True,
)
# Apply memory optimizations based on attempt
if torch.cuda.is_available():
if load_attempt >= 2:
# Second attempt: enable attention slicing
print("Enabling attention slicing for lower VRAM usage...")
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing()
if load_attempt >= 3 or use_sequential_offload:
# Third attempt or offload requested: enable sequential CPU offload
print("Enabling sequential CPU offload for lower VRAM usage...")
if hasattr(pipeline, 'enable_sequential_cpu_offload'):
pipeline.enable_sequential_cpu_offload()
else:
# First attempt: try regular GPU
pipeline = pipeline.to("cuda")
else:
pipeline = pipeline.to("cpu")
except Exception as load_error:
error_msg = str(load_error).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
if is_oom and load_attempt < max_attempts:
print(f"OOM during model loading: {load_error}")
print(f"Retrying with more aggressive memory optimization...")
pipeline = None # Reset for retry
else:
print(f"Failed to load model (attempt {load_attempt}): {load_error}")
if load_attempt >= max_attempts:
raise
pipeline = None
# Cache the model
if pipeline is not None:
multi_model_manager.add_model(model_key, pipeline)
print(f"Loaded diffusers model: {model_to_use}")
# If no image model configured, try to use main --model as fallback
if not model_name:
main_model = getattr(global_args, 'model', None)
if main_model and isinstance(main_model, list) and len(main_model) > 0:
model_name = main_model[0]
elif main_model:
model_name = main_model
except ImportError as e:
# diffusers not installed
diffusers_error = str(e)
print(f"diffusers not available: {diffusers_error}")
except Exception as e:
import traceback
diffusers_error = str(e)
print(f"diffusers error: {diffusers_error}")
print(f"Traceback: {traceback.format_exc()}")
# Try diffusers if available
if pipeline is not None:
try:
# Determine size
width, height = 1024, 1024
if request.size:
parts = request.size.split("x")
if len(parts) == 2:
try:
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Check if main model is a GGUF file - can't use for image generation
if model_name and _is_gguf_model(model_name):
print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
model_name = None
# Check for nan/inf in dimensions
if width != width or width == float('inf'): # NaN or inf check
width = 512
if height != height or height == float('inf'): # NaN or inf check
height = 512
# Import torch for generation
import torch
# Ensure model is on correct device
backend = getattr(global_args, 'backend', 'auto')
image_backend = getattr(global_args, 'image_backend', 'auto')
use_vulkan = (backend == 'vulkan') or (image_backend == 'vulkan') or (image_backend == 'auto' and backend == 'auto')
if use_vulkan and not torch.cuda.is_available():
# CPU mode - try to reduce memory usage
try:
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing(slice_size="auto")
if hasattr(pipeline, 'enable_vae_slicing'):
pipeline.enable_vae_slicing()
except Exception as e:
print(f"Warning: Could not enable memory optimizations: {e}")
elif torch.cuda.is_available():
# Try to enable memory optimizations for CUDA
try:
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing(slice_size="auto")
if hasattr(pipeline, 'enable_vae_slicing'):
pipeline.enable_vae_slicing()
except Exception as e:
print(f"Warning: Could not enable CUDA memory optimizations: {e}")
# Get timestamp BEFORE calling diffusers (to avoid scope issues)
import time as time_module
timestamp = int(time_module.time())
# Generate images
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
generator = None
if seed is not None:
generator = torch.Generator(device=pipeline.device).manual_seed(seed)
# Quality: "standard" or "hd"
quality = request.quality or "standard"
# Use request parameters if provided, otherwise fall back to quality-based defaults
num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
cfg_scale = request.guidance_scale if request.guidance_scale else (
getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
)
# Generate
result = pipeline(
prompt=request.prompt,
negative_prompt=None,
num_images_per_prompt=request.n,
height=height,
width=width,
generator=generator,
guidance_scale=cfg_scale,
num_inference_steps=num_steps,
if model_name:
model_key = f"image:{model_name}"
# If still no image model configured, return an error
if not model_name:
raise HTTPException(
status_code=400,
detail="Image generation not configured. Use --image-model to specify a model."
)
# Extract images
images = []
# =====================================================================
# Step 2: Check if model is a sd.cpp StableDiffusion instance
# =====================================================================
is_sdcpp = False
if pipeline is not None:
try:
result_images = result.images
except Exception as img_err:
print(f"Warning: Could not access result.images: {img_err}")
# Try alternative: result might have 'image' or 'output'
result_images = getattr(result, 'image', None) or getattr(result, 'output', None)
if result_images is None:
raise Exception(f"Could not extract images from diffusers result: {img_err}")
for img in result_images:
# Convert to base64
import numpy as np
# Debug: print image type and value range
print(f"DEBUG: Image type: {type(img)}")
if isinstance(img, np.ndarray):
print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}")
# Handle NaN/Inf values in image data - convert to valid values
# Replace NaN and Inf with valid values
img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
# Clip to valid range [0, 1]
img = np.clip(img, 0.0, 1.0)
print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}")
from stable_diffusion_cpp import StableDiffusion
if isinstance(pipeline, StableDiffusion):
is_sdcpp = True
except ImportError:
pass
# =====================================================================
# Step 3: If already loaded, generate with appropriate backend
# =====================================================================
if pipeline is not None:
if is_sdcpp:
print(f"Using cached sd.cpp model for generation")
return await _generate_with_sdcpp(pipeline, request, global_args, http_request)
else:
# Assume it's a diffusers pipeline
print(f"Using cached diffusers pipeline for generation")
return _generate_with_diffusers(pipeline, request, global_args, http_request)
# =====================================================================
# Step 4: Model not loaded - try to load it
# =====================================================================
is_gguf = _is_gguf_model(model_name)
diffusers_error = None
sdcpp_error = None
# Try diffusers first (for non-GGUF models)
if not is_gguf:
try:
print(f"Loading diffusers model: {model_name}")
pipeline = _load_diffusers_pipeline(model_name, global_args)
# Use helper function to save and get response
img_data = save_image_response(img, request.response_format, http_request)
images.append(img_data)
return {
"created": timestamp,
"data": images
}
except ImportError as e:
# diffusers/torch not installed - record error and try sd.cpp
diffusers_error = str(e)
print(f"diffusers not available: {diffusers_error}, trying stable-diffusion-cpp-python...")
except Exception as e:
# Other error with diffusers - record and try sd.cpp
import traceback
diffusers_error = str(e)
print(f"diffusers error: {diffusers_error}")
print(f"Traceback: {traceback.format_exc()}")
print(f"Trying stable-diffusion-cpp-python...")
# Try stable-diffusion-cpp-python (sd.cpp) as fallback when diffusers fails
# sd.cpp works with GGUF models, but some HF models may be GGUF even without "gguf" in name
# Let sd.cpp attempt loading and fail gracefully if it's not compatible
# Try stable-diffusion-cpp-python (sd.cpp) as fallback
# First, check all available image models to find one loaded via sd.cpp
# Always check for cached models - allows dynamically loaded models to be reused across requests
sd_model = None
for key in multi_model_manager.models:
if key.startswith("image:"):
potential_model = multi_model_manager.get_model(key)
if potential_model is not None:
# Check if it's a stable-diffusion-cpp model
try:
from stable_diffusion_cpp import StableDiffusion
if isinstance(potential_model, StableDiffusion):
sd_model = potential_model
print(f"Found cached stable-diffusion-cpp model with key: {key}")
break
except ImportError:
pass
# If no cached image model found, need to load one - first cleanup any existing models
if sd_model is None:
# In ondemand mode, check if we need to unload before loading sd.cpp model
from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if mode == "ondemand" and has_any_model:
# Resolve both the requested image model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading sd.cpp model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
if sd_model is not None:
# Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
if pipeline is not None:
# Cache the loaded pipeline in the manager
multi_model_manager.add_model(model_key, pipeline)
multi_model_manager.current_model_key = model_key
print(f"Loaded diffusers model: {model_name}")
return _generate_with_diffusers(pipeline, request, global_args, http_request)
except ImportError as e:
diffusers_error = str(e)
print(f"diffusers not available: {diffusers_error}")
except Exception as e:
import traceback
diffusers_error = str(e)
print(f"diffusers error: {diffusers_error}")
print(f"Traceback: {traceback.format_exc()}")
# Try stable-diffusion-cpp-python (for GGUF models or as fallback)
try:
from stable_diffusion_cpp import StableDiffusion
if isinstance(sd_model, StableDiffusion):
print(f"Using stable-diffusion-cpp-python for image generation")
# Use sd.cpp for generation
# Parse size
width, height = 512, 512
if request.size:
parts = request.size.split("x")
if len(parts) == 2:
try:
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Use default steps for Z-Image Turbo (very fast)
steps = 4 # Default for fast generation
# Generate images using sd.cpp (run in thread to not block event loop)
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
)
# Small delay to let Vulkan driver settle after generation
import time
time.sleep(0.1)
# Convert results to response format
images = []
for img in result:
# Use helper function to save and get response
img_data = save_image_response(img, http_request=http_request)
images.append(img_data)
# For GGUF models or URLs, resolve the model path through the cache
resolved_path = model_name
if is_gguf or model_name.startswith('http://') or model_name.startswith('https://'):
resolved_path = multi_model_manager.load_model(model_name)
if not resolved_path:
raise Exception(f"Failed to resolve model path: {model_name}")
# Only use sd.cpp if we have a local file path
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, global_args)
return {
"created": int(time.time()),
"data": images
}
except ImportError as e:
# stable-diffusion-cpp not available
sd_cpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
except Exception as e:
print(f"sd.cpp generation error: {e}")
sd_cpp_error = str(e)
else:
# No sd.cpp model pre-loaded, try to load dynamically
print("No pre-loaded sd.cpp model found, trying to load...")
try:
from stable_diffusion_cpp import StableDiffusion
# Use model manager to resolve and load the model
model_path = multi_model_manager.load_model(model_to_use)
# For diffusers models, model_path will be the identifier string
# For GGUF models, it will be the file path
if model_path is not None and not os.path.isfile(model_path):
# This is a diffusers model identifier (not a file path)
# Skip sd.cpp and let diffusers handle it
print(f"Model '{model_path}' is handled by diffusers library, skipping sd.cpp")
model_path = None
if model_path is not None:
# Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
try:
from stable_diffusion_cpp import StableDiffusion
if isinstance(sd_model, StableDiffusion):
print(f"Using stable-diffusion-cpp-python for image generation")
# Use sd.cpp for generation
# Parse size
width, height = 512, 512
if request.size:
parts = request.size.split("x")
if len(parts) == 2:
try:
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Use default steps for Z-Image Turbo (very fast)
steps = 4 # Default for fast generation
# Generate images using sd.cpp (run in thread to not block event loop)
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
)
# Small delay to let Vulkan driver settle after generation
import time
time.sleep(0.1)
# Convert results to response format
images = []
for img in result:
# Use helper function to save and get response
img_data = save_image_response(img, http_request=http_request)
images.append(img_data)
return {
"created": int(time.time()),
"data": images
}
except ImportError as e:
# stable-diffusion-cpp not available
sd_cpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
except Exception as e:
print(f"sd.cpp generation error: {e}")
sd_cpp_error = str(e)
if sd_model is not None:
# Cache the loaded model in the manager
multi_model_manager.add_model(model_key, sd_model)
multi_model_manager.current_model_key = model_key
print(f"Loaded sd.cpp model: {model_name}")
return await _generate_with_sdcpp(sd_model, request, global_args, http_request)
else:
# model_path is None - likely a diffusers model handled above
print("Model handled by diffusers library")
sd_cpp_error = "Model handled by diffusers"
sdcpp_error = f"Model '{model_name}' is not a local file, cannot use sd.cpp"
print(sdcpp_error)
except ImportError as e:
sd_cpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
sdcpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sdcpp_error}")
except Exception as e:
sd_cpp_error = str(e)
print(f"sd.cpp error: {sd_cpp_error}")
# Both backends failed - return error with installation instructions
raise HTTPException(
status_code=400,
detail=f"Model '{model_to_use}' does not support image generation"
)
sdcpp_error = str(e)
print(f"sd.cpp error: {sdcpp_error}")
# =====================================================================
# Step 5: Both backends failed - return error
# =====================================================================
error_details = []
if diffusers_error:
error_details.append(f"diffusers: {diffusers_error}")
if sdcpp_error:
error_details.append(f"sd.cpp: {sdcpp_error}")
raise HTTPException(
status_code=400,
detail=f"Failed to load image model '{model_name}'. Errors: {'; '.join(error_details) if error_details else 'No compatible backend found'}"
)
......@@ -295,46 +295,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
# Get the model for this request
requested_model = request.model
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
load_mode = get_load_mode()
# Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
model_info = multi_model_manager.request_model(
requested_model=requested_model,
model_type="text"
)
# In ondemand mode (no --load-all or --loadswap), if ANY model is loaded in VRAM
# and it's different from what we need, fully unload it first to free VRAM
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(requested_model)
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if same type), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
# Use centralized unload method
multi_model_manager.unload_all_models()
# Also cleanup legacy model_manager
if model_manager.backend is not None:
print("Unloading legacy model_manager from VRAM...")
try:
model_manager.cleanup()
except Exception as e:
print(f"Warning during legacy model cleanup: {e}")
# Try to get the appropriate model
# Try to get the appropriate model (request_model handles VRAM cleanup)
mm = multi_model_manager.get_model_for_request(requested_model)
if mm is None:
......@@ -1727,40 +1694,13 @@ async def completions(request: CompletionRequest):
# Get the model for this request
requested_model = request.model
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
load_mode = get_load_mode()
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(requested_model)
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if same type), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
model_info = multi_model_manager.request_model(
requested_model=requested_model,
model_type="text"
)
# Try to get the appropriate model
# Try to get the appropriate model (request_model handles VRAM cleanup)
mm = multi_model_manager.get_model_for_request(requested_model)
if mm is None:
......
......@@ -54,52 +54,22 @@ async def create_transcription(
raise HTTPException(status_code=500, detail=result["error"])
return {"text": result.get("text", "")}
audio_model = multi_model_manager.audio_models[0] if multi_model_manager.audio_models else None
if not audio_model:
# Use the manager to resolve the model and manage VRAM
model_info = multi_model_manager.request_model(
requested_model=model,
model_type="audio"
)
model_name = model_info['model_name']
model_key = model_info['model_key']
whisper_model = model_info['model_object']
if not model_name:
raise HTTPException(
status_code=400,
detail="Audio transcription not configured. Use --audio-model or --whisper-server."
)
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
from codai.models.manager import model_manager
load_mode = get_load_mode()
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested audio model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"audio:{audio_model}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'audio:{audio_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading audio model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Determine model to use
model_to_use = model
if model_to_use.startswith("whisper:") or model_to_use.startswith("audio:"):
model_to_use = audio_model
# Read the uploaded file
file_content = await file.read()
......@@ -113,26 +83,23 @@ async def create_transcription(
try:
from faster_whisper import WhisperModel
# Determine model key
model_key = f"audio:{model_to_use}"
whisper_model = multi_model_manager.get_model(model_key)
if whisper_model is None:
print(f"Loading faster-whisper model: {model_to_use}")
print(f"Loading faster-whisper model: {model_name}")
# Determine compute type - always use int8 for CPU
compute_type = "int8"
# Load the model
whisper_model = WhisperModel(
model_to_use,
model_name,
device="cpu", # Always use CPU - faster-whisper CUDA doesn't work with AMD
compute_type=compute_type,
)
# Cache the model
multi_model_manager.add_model(model_key, whisper_model)
print(f"Loaded faster-whisper model: {model_to_use}")
multi_model_manager.current_model_key = model_key
print(f"Loaded faster-whisper model: {model_name}")
# Run transcription
segments, info = whisper_model.transcribe(
......@@ -160,24 +127,21 @@ async def create_transcription(
try:
import whispercpp
# Determine model key
model_key = f"audio:{model_to_use}"
whisper_model = multi_model_manager.get_model(model_key)
if whisper_model is None:
print(f"Loading whispercpp model: {model_to_use}")
print(f"Loading whispercpp model: {model_name}")
# Check if it's a built-in model name
if model_to_use in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']:
if model_name in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']:
# It's a built-in model name
whisper_model = whispercpp.Whisper.from_pretrained(model_to_use)
whisper_model = whispercpp.Whisper.from_pretrained(model_name)
else:
# It's a path to a GGUF file
whisper_model = whispercpp.Whisper.from_pretrained(model_to_use)
whisper_model = whispercpp.Whisper.from_pretrained(model_name)
# Cache the model
multi_model_manager.add_model(model_key, whisper_model)
print(f"Loaded whispercpp model: {model_to_use}")
multi_model_manager.current_model_key = model_key
print(f"Loaded whispercpp model: {model_name}")
# Run transcription
result = whisper_model.transcribe(tmp_path)
......
......@@ -16,18 +16,6 @@ from codai.models.manager import multi_model_manager
global_args = None
def get_cached_model_path(url: str) -> str:
    """Delegate the cache lookup for *url* to ``codai.models.cache``."""
    # Import the module (not the function) so the delegate cannot be
    # confused with this wrapper, which shares its name.
    from codai.models import cache as model_cache
    return model_cache.get_cached_model_path(url)
def get_model_cache_dir() -> str:
    """Delegate to ``codai.models.cache.get_model_cache_dir`` and return its result."""
    # Import the module (not the function) so the call below visibly targets
    # the cache module rather than looking like a recursive call.
    from codai.models import cache as model_cache
    return model_cache.get_model_cache_dir()
def set_global_args(args):
"""Set global args from coderai."""
global global_args
......@@ -65,80 +53,46 @@ async def create_speech(request: TTSRequest):
Supports:
- Kokoro TTS models (when --tts-model is specified)
"""
tts_model = multi_model_manager.tts_model
# Use the manager to resolve the model and manage VRAM
model_info = multi_model_manager.request_model(
requested_model=request.model,
model_type="tts"
)
model_name = model_info['model_name']
model_key = model_info['model_key']
kokoro_model = model_info['model_object']
# If no TTS model configured, return an error
if not tts_model:
if not model_name:
raise HTTPException(
status_code=400,
detail="TTS not configured. Use --tts-model to specify a model."
)
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
from codai.models.manager import model_manager
load_mode = get_load_mode()
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested TTS model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"tts:{tts_model}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'tts:{tts_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading TTS model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Determine model to use
model_to_use = request.model
if model_to_use.startswith("tts:"):
model_to_use = tts_model
# Try to use kokoro if available
try:
from kokoro import Kokoro
# Determine model key
model_key = f"tts:{model_to_use}"
kokoro_model = multi_model_manager.get_model(model_key)
if kokoro_model is None:
print(f"Loading Kokoro TTS model: {model_to_use}")
print(f"Loading Kokoro TTS model: {model_name}")
# Check if model_to_use is a URL - download it (with caching)
# Check if model_name is a URL - download it (with caching)
model_path = None
if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
print(f"Loading model from URL: {model_to_use}")
if model_name.startswith('http://') or model_name.startswith('https://'):
print(f"Loading model from URL: {model_name}")
from codai.models.cache import load_model
model_path = load_model(model_to_use)
model_path = load_model(model_name)
if not model_path:
raise Exception(f"Failed to load model from {model_to_use}")
raise Exception(f"Failed to load model from {model_name}")
else:
# Use local path or model name
model_path = model_to_use
model_path = model_name
# Load the Kokoro model
kokoro_model = Kokoro(model_path if model_path else model_to_use)
kokoro_model = Kokoro(model_path if model_path else model_name)
multi_model_manager.add_model(model_key, kokoro_model)
multi_model_manager.current_model_key = model_key
# Generate speech
voice = request.voice or "af_sarah"
......
......@@ -883,6 +883,133 @@ class MultiModelManager:
return load_model(model_path, cache_dir, file_pattern)
def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]:
    """
    Central method for API modules to request a model.

    Handles:
    1. Alias resolution (custom aliases, "default", the type aliases
       "image"/"audio"/"tts"/"vision", and "<type>:<name>" prefixes)
    2. Checking whether the resolved model is already stored in self.models
    3. VRAM management (unloading previous models in ondemand mode)

    This method never loads a model itself; when the model is not already
    present the caller is expected to load it using the returned info.

    Args:
        requested_model: The model name/alias from the API request. When it
            is empty/None, or equal to model_type, the default configured
            for model_type is used.
        model_type: The type of model being requested ("image", "text",
            "audio", "tts", "vision"). Any value other than None/"text" is
            used as the "<type>:" prefix of the model key.

    Returns:
        Dict with:
        - 'model_key': The key used to store/retrieve the model in self.models
        - 'model_name': The resolved model name (prefix stripped)
        - 'model_object': The object from self.models if present, else None
        - 'config': self.config entry for the model key ({} if missing)
        - 'already_loaded': True if the model was found in self.models

        All of 'model_key', 'model_name' and 'model_object' are None when
        no model name could be resolved (e.g. nothing configured).
    """
    def _default_for(kind):
        """Return the configured default model for a type alias, else the global default."""
        if kind == "image":
            return self.image_models[0] if self.image_models else None
        if kind == "audio":
            return self.audio_models[0] if self.audio_models else None
        if kind == "tts":
            return self.tts_model
        if kind == "vision":
            return self.vision_models[0] if self.vision_models else None
        return self.default_model

    # Step 1: Resolve the model name from aliases
    if not requested_model or requested_model == model_type:
        # No explicit model: fall back to the default for the requested type
        resolved_name = _default_for(model_type)
    else:
        # Custom aliases are applied first, so an alias may point at
        # "default", a type alias, or a prefixed name.
        if requested_model in self.model_aliases:
            requested_model = self.model_aliases[requested_model]

        if requested_model in ("default", "image", "audio", "tts", "vision"):
            resolved_name = _default_for(requested_model)
        else:
            # Strip a known "<type>:" prefix, otherwise use the name as given
            resolved_name = requested_model
            for prefix in ("image:", "audio:", "tts:", "vision:"):
                if requested_model.startswith(prefix):
                    resolved_name = requested_model[len(prefix):]
                    break

    if not resolved_name:
        return {
            'model_key': None,
            'model_name': None,
            'model_object': None,
            'config': {},
            'already_loaded': False,
        }

    # Step 2: Build the model key (prefixed with type, except for text models)
    if model_type and model_type != "text":
        model_key = f"{model_type}:{resolved_name}"
    else:
        model_key = resolved_name

    # Step 3: Check if already loaded
    existing_model = self.models.get(model_key)
    if existing_model is not None:
        self.current_model_key = model_key
        return {
            'model_key': model_key,
            'model_name': resolved_name,
            'model_object': existing_model,
            'config': self.config.get(model_key, {}),
            'already_loaded': True,
        }

    # Step 4: In ondemand mode, unload any currently loaded model.
    # The load mode is only needed here, so it is looked up lazily rather
    # than on every call (the early returns above never use it).
    from codai.api.state import get_load_mode
    if get_load_mode() == "ondemand":
        has_any_model = len(self.models) > 0 or model_manager.backend is not None
        if has_any_model:
            loaded_canonical = self.get_currently_loaded_model_name()
            if not loaded_canonical and model_manager.backend is not None:
                # Only the legacy single-model manager holds a model
                loaded_canonical = "legacy_model_manager"

            if loaded_canonical and loaded_canonical != model_key:
                print("Ondemand mode - model switch detected:")
                print(f"  Requested: '{model_key}' (resolved: '{resolved_name}')")
                print(f"  Currently loaded: '{loaded_canonical}'")
                print("  -> Unloading current model(s) before loading new model...")
                self.unload_all_models()
                if model_manager.backend is not None:
                    # Best-effort cleanup: a failure here must not block the
                    # request, but it should be visible in the logs.
                    try:
                        model_manager.cleanup()
                    except Exception as e:
                        print(f"Warning during legacy model cleanup: {e}")

    # Step 5: Return info for the caller to load the model
    return {
        'model_key': model_key,
        'model_name': resolved_name,
        'model_object': None,
        'config': self.config.get(model_key, {}),
        'already_loaded': False,
    }
def unload_all_models(self):
"""
Fully unload ALL models from VRAM. Used in ondemand mode when switching
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment