Commit e004541a authored by Your Name

Centralize model resolution and VRAM management in MultiModelManager.request_model()

- Added request_model() method to MultiModelManager that handles:
  1. Alias resolution (image, audio, tts, vision, default, custom aliases)
  2. VRAM management (unloading previous models in ondemand mode)
  3. Checking if model is already loaded

- Simplified codai/api/images.py:
  - Uses request_model() for model resolution and VRAM management
  - Extracted helper functions: _is_gguf_model(), _load_diffusers_pipeline(),
    _generate_with_diffusers(), _generate_with_sdcpp(), _load_sdcpp_model()
  - Removed duplicated sd.cpp generation code
  - Fixed semaphore scope (all generation now inside semaphore block)

- Simplified codai/api/tts.py:
  - Uses request_model() instead of duplicated VRAM management code
  - Removed duplicate get_cached_model_path() and get_model_cache_dir() wrappers

- Simplified codai/api/transcriptions.py:
  - Uses request_model() instead of duplicated VRAM management code

- Simplified codai/api/text.py:
  - Both /v1/chat/completions and /v1/completions use request_model()
  - Removed duplicated VRAM management blocks
parent a5b64c4c
...@@ -178,6 +178,265 @@ def set_queue_flags(flags): ...@@ -178,6 +178,265 @@ def set_queue_flags(flags):
queue_flags = flags queue_flags = flags
def _is_gguf_model(model_name: str) -> bool:
"""Check if a model name/path indicates a GGUF model."""
if not model_name:
return False
return (model_name.endswith('.gguf') or
'gguf' in model_name.lower() or
(model_name.startswith('http') and '.gguf' in model_name))
def _load_diffusers_pipeline(model_name: str, global_args):
    """
    Load a model with the diffusers library, retrying only on OOM-like errors.

    The model is first loaded as a Stable Diffusion XL pipeline; if that fails
    for any reason, the generic ``DiffusionPipeline`` loader is tried instead.
    Up to three attempts are made. When CUDA is available, attention slicing
    is enabled from the second attempt, and sequential CPU offload is enabled
    on the third attempt (or on every attempt when
    ``global_args.image_cpu_offload`` is set); otherwise the pipeline is moved
    to ``"cuda"``. Without CUDA the pipeline is moved to ``"cpu"``.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        global_args: Parsed CLI arguments. ``image_precision`` (``bf16``,
            ``f32``, ``f16`` or, when this torch build provides it, ``f8``)
            and ``image_cpu_offload`` are read if present.

    Returns:
        The loaded pipeline.

    Raises:
        ImportError: If diffusers or torch cannot be imported.
        Exception: The original loading error. It is re-raised immediately
            when it does not look like an out-of-memory condition, and after
            the final attempt otherwise.
    """
    import gc

    import torch
    from diffusers import DiffusionPipeline, StableDiffusionXLPipeline

    # Determine precision from CLI argument (--image-precision)
    precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
    precision_map = {
        'bf16': torch.bfloat16,
        'f32': torch.float32,
        'f16': torch.float16,
    }
    if hasattr(torch, 'float8_e4m3fn'):
        precision_map['f8'] = torch.float8_e4m3fn
    dtype = precision_map.get(precision, torch.float32)
    print(f"Using precision: {precision} ({dtype})")

    # Check if CPU offload is requested via CLI
    use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)

    max_attempts = 3
    # Substrings of the lower-cased error text that are treated as OOM.
    oom_markers = ('out of memory', 'oom', 'cuda error', 'cudamalloc')

    pipeline = None
    for load_attempt in range(1, max_attempts + 1):
        try:
            print(f"Loading attempt {load_attempt}/{max_attempts}...")

            # Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
            try:
                pipeline = StableDiffusionXLPipeline.from_pretrained(
                    model_name,
                    torch_dtype=dtype,
                    use_safetensors=True,
                )
            except Exception:
                # Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
                pipeline = DiffusionPipeline.from_pretrained(
                    model_name,
                    torch_dtype=dtype,
                    use_safetensors=True,
                )

            # Apply memory optimizations based on attempt
            if torch.cuda.is_available():
                if load_attempt >= 2:
                    # Second attempt: enable attention slicing
                    print("Enabling attention slicing for lower VRAM usage...")
                    if hasattr(pipeline, 'enable_attention_slicing'):
                        pipeline.enable_attention_slicing()
                if load_attempt >= 3 or use_sequential_offload:
                    # Third attempt or offload requested: enable sequential CPU offload
                    print("Enabling sequential CPU offload for lower VRAM usage...")
                    if hasattr(pipeline, 'enable_sequential_cpu_offload'):
                        pipeline.enable_sequential_cpu_offload()
                else:
                    # First attempt: try regular GPU
                    pipeline = pipeline.to("cuda")
            else:
                pipeline = pipeline.to("cpu")
            break
        except Exception as load_error:
            error_msg = str(load_error).lower()
            is_oom = any(marker in error_msg for marker in oom_markers)
            if not is_oom or load_attempt >= max_attempts:
                # Retrying with heavier memory optimizations cannot fix a
                # non-OOM failure (missing repo, wrong format, ...), so
                # propagate right away and let the caller pick another backend.
                print(f"Failed to load model (attempt {load_attempt}): {load_error}")
                raise
            print(f"OOM during model loading: {load_error}")
            print("Retrying with more aggressive memory optimization...")

        # Only reached after a retryable OOM. Done outside the except block so
        # the exception's traceback no longer keeps the failed pipeline alive.
        pipeline = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return pipeline
def _generate_with_diffusers(pipeline, request, global_args, http_request=None):
    """Generate images using a diffusers pipeline.

    Args:
        pipeline: A loaded diffusers pipeline; it is called with the prompt
            and sampling parameters.
        request: Image generation request. The fields read are ``size``,
            ``seed``, ``quality``, ``steps``, ``guidance_scale``, ``prompt``,
            ``n`` and ``response_format``.
        global_args: Parsed CLI arguments; ``image_seed`` and
            ``image_cfg_scale`` are read if present.
        http_request: Forwarded unchanged to ``save_image_response``.

    Returns:
        A dict with ``created`` (Unix timestamp taken before generation) and
        ``data`` (one ``save_image_response`` result per generated image).

    Raises:
        Exception: If no images can be extracted from the pipeline result.
    """
    import time as time_module

    import numpy as np
    import torch

    # Determine size. The default is kept unless BOTH dimensions parse as
    # positive integers, so a half-valid value such as "512xabc" cannot leave
    # width and height mismatched.
    width, height = 1024, 1024
    if request.size:
        parts = request.size.lower().split("x")
        if len(parts) == 2:
            try:
                parsed_width, parsed_height = int(parts[0]), int(parts[1])
            except ValueError:
                pass
            else:
                if parsed_width > 0 and parsed_height > 0:
                    width, height = parsed_width, parsed_height

    # Enable memory optimizations
    try:
        if hasattr(pipeline, 'enable_attention_slicing'):
            pipeline.enable_attention_slicing(slice_size="auto")
        if hasattr(pipeline, 'enable_vae_slicing'):
            pipeline.enable_vae_slicing()
    except Exception as e:
        print(f"Warning: Could not enable memory optimizations: {e}")

    # Get timestamp BEFORE calling diffusers
    timestamp = int(time_module.time())

    # Use request seed if provided, otherwise use CLI default seed
    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
    generator = None
    if seed is not None:
        generator = torch.Generator(device=pipeline.device).manual_seed(seed)

    # Quality: "standard" or "hd"
    quality = request.quality or "standard"

    # Use request parameters if provided, otherwise fall back to quality-based defaults
    num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)

    # An explicit guidance scale of 0.0 must be honoured rather than treated
    # as "not provided", so test against None like the seed above.
    if request.guidance_scale is not None:
        cfg_scale = request.guidance_scale
    elif quality == "standard":
        cfg_scale = getattr(global_args, 'image_cfg_scale', 7.5)
    else:
        cfg_scale = 9.0

    # Generate
    result = pipeline(
        prompt=request.prompt,
        negative_prompt=None,
        # Fall back to a single image when n is missing, as the sd.cpp path does.
        num_images_per_prompt=request.n or 1,
        height=height,
        width=width,
        generator=generator,
        guidance_scale=cfg_scale,
        num_inference_steps=num_steps,
    )

    # Extract images
    try:
        result_images = result.images
    except Exception as img_err:
        print(f"Warning: Could not access result.images: {img_err}")
        # Try alternative attributes the result object might expose
        result_images = getattr(result, 'image', None) or getattr(result, 'output', None)
        if result_images is None:
            raise Exception(f"Could not extract images from diffusers result: {img_err}")

    images = []
    for img in result_images:
        # Debug: print image type and value range
        print(f"DEBUG: Image type: {type(img)}")
        if isinstance(img, np.ndarray):
            print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}")
            # Replace NaN/Inf values, then clip to the [0, 1] range
            img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0)
            img = np.clip(img, 0.0, 1.0)
            print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}")

        img_data = save_image_response(img, request.response_format, http_request)
        images.append(img_data)

    return {
        "created": timestamp,
        "data": images
    }
async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None):
    """Generate images using stable-diffusion-cpp-python.

    Args:
        sd_model: Loaded sd.cpp model; its ``generate_image`` method is run in
            a worker thread so the event loop is not blocked.
        request: Image generation request. The fields read are ``size``,
            ``seed``, ``prompt``, ``n`` and ``response_format``.
        global_args: Parsed CLI arguments; ``image_seed`` is read if present.
        http_request: Forwarded unchanged to ``save_image_response``.

    Returns:
        A dict with ``created`` (Unix timestamp taken after generation) and
        ``data`` (one ``save_image_response`` result per generated image).
    """
    import time

    # Parse size. The default is kept unless BOTH dimensions parse as positive
    # integers, so a half-valid value such as "512xabc" cannot leave width and
    # height mismatched.
    width, height = 512, 512
    if request.size:
        parts = request.size.lower().split("x")
        if len(parts) == 2:
            try:
                parsed_width, parsed_height = int(parts[0]), int(parts[1])
            except ValueError:
                pass
            else:
                if parsed_width > 0 and parsed_height > 0:
                    width, height = parsed_width, parsed_height

    # Use default steps for fast generation
    steps = 4

    # Use request seed if provided, otherwise use CLI default seed
    seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)

    result = await asyncio.to_thread(
        sd_model.generate_image,
        prompt=request.prompt,
        negative_prompt='',
        width=width,
        height=height,
        cfg_scale=get_cfg_scale(),
        sample_steps=steps,
        seed=seed if seed is not None else 42,
        batch_count=request.n if request.n else 1,
    )

    # Small delay to let Vulkan driver settle after generation. Awaited so the
    # event loop keeps serving other requests (time.sleep would block it).
    await asyncio.sleep(0.1)

    # Convert results to response format, honouring the requested
    # response_format just like the diffusers path does.
    images = [
        save_image_response(img, request.response_format, http_request)
        for img in result
    ]

    return {
        "created": int(time.time()),
        "data": images
    }
def _load_sdcpp_model(model_path: str, global_args):
    """
    Load a model using stable-diffusion-cpp-python.

    Args:
        model_path: Path passed to ``StableDiffusion`` as ``model_path``.
        global_args: Parsed CLI arguments (may be None). Non-empty
            ``vae_path`` and ``llm_path`` values are forwarded to the
            constructor under the same keyword names.

    Returns:
        The constructed ``StableDiffusion`` instance.

    Raises:
        ImportError: If stable-diffusion-cpp-python is not installed.
        Any exception raised by the ``StableDiffusion`` constructor.
    """
    from stable_diffusion_cpp import StableDiffusion

    print(f"Loading sd.cpp model from: {model_path}")

    # Build sd.cpp constructor args from config
    kwargs = {
        'model_path': model_path,
    }

    # Add optional paths from CLI args. The LLM path is forwarded as
    # ``llm_path``; it used to be passed as ``lora_model_dir``, which names a
    # LoRA directory rather than a model file.
    # NOTE(review): requires a stable-diffusion-cpp-python release whose
    # constructor accepts ``llm_path`` - confirm against the pinned version.
    if global_args:
        for option in ('vae_path', 'llm_path'):
            value = getattr(global_args, option, None)
            if value:
                kwargs[option] = value

    return StableDiffusion(**kwargs)
# ============================================================================= # =============================================================================
# Router and Endpoints # Router and Endpoints
# ============================================================================= # =============================================================================
...@@ -225,518 +484,136 @@ async def create_image_generation(request: ImageGenerationRequest, http_request: ...@@ -225,518 +484,136 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
) )
async with semaphore: async with semaphore:
image_model = multi_model_manager.image_model # =====================================================================
# Step 1: Ask the manager to resolve the model and manage VRAM
# If no image model configured, try to use main --model as fallback # =====================================================================
if not image_model: model_info = multi_model_manager.request_model(
# Try to get the main model from args requested_model=request.model,
main_model = getattr(global_args, 'model', None) model_type="image"
if main_model and isinstance(main_model, list) and len(main_model) > 0:
image_model = main_model[0]
elif main_model:
image_model = main_model
# Check if main model is a GGUF file - can't use for image generation
if image_model and ('.gguf' in image_model.lower() or 'gguf' in image_model.lower()):
print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
image_model = None # Can't use GGUF for images
# If still no image model configured, return an error
if not image_model:
raise HTTPException(
status_code=400,
detail="Image generation not configured. Use --image-model to specify a model."
) )
# Determine model to use
# Priority: 1) model specified in request, 2) default image model from --image-model
model_to_use = request.model
if not model_to_use or model_to_use == "image":
# No model specified in request, use default
model_to_use = image_model
elif model_to_use.startswith("image:"):
# Legacy format - strip prefix and use default
model_to_use = image_model
else:
# Check if model_to_use is a valid model (URL, file, or known model)
# If not, fallback to the configured image model to avoid HF resolution errors
if image_model:
is_url = model_to_use.startswith('http://') or model_to_use.startswith('https://')
is_file = os.path.isfile(model_to_use) if model_to_use else False
if not is_url and not is_file:
# Unknown model name - use default instead of trying to resolve as HF
print(f"Warning: Unknown model '{model_to_use}' in image generation request, using configured --image-model")
model_to_use = image_model
# Check if model is loaded
model_key = f"image:{model_to_use}"
pipeline = multi_model_manager.get_model(model_key)
# In ondemand mode, if ANY model is loaded in VRAM and it's different from what we need,
# fully unload it first to free VRAM
if mode == "ondemand":
from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model: model_name = model_info['model_name']
# Resolve both the requested image model and currently loaded model to their canonical names model_key = model_info['model_key']
requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}") pipeline = model_info['model_object']
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if both are image models), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Try diffusers first # If no image model configured, try to use main --model as fallback
try: if not model_name:
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline, DiffusionPipeline main_model = getattr(global_args, 'model', None)
import torch if main_model and isinstance(main_model, list) and len(main_model) > 0:
model_name = main_model[0]
# Check if model is XL elif main_model:
is_xl = "xl" in model_to_use.lower() or "sdxl" in model_to_use.lower() model_name = main_model
# Check if it's a GGUF model - skip diffusers for those
is_gguf_model = (model_to_use.endswith('.gguf') or 'gguf' in model_to_use.lower() or
(model_to_use.startswith('http') and '.gguf' in model_to_use))
if is_gguf_model:
print(f"GGUF model detected ({model_to_use}), skipping diffusers, using stable-diffusion-cpp...")
raise Exception("GGUF model - use stable-diffusion-cpp instead")
print(f"Loading diffusers model: {model_to_use}")
# Determine precision from CLI argument (--image-precision)
precision = getattr(global_args, 'image_precision', 'f32') or 'f32'
precision_map = {
'bf16': torch.bfloat16,
'f32': torch.float32,
'f16': torch.float16,
}
if hasattr(torch, 'float8_e4m3fn'):
precision_map['f8'] = torch.float8_e4m3fn
dtype = precision_map.get(precision, torch.float32)
print(f"Using precision: {precision} ({dtype})")
# Check if CPU offload is requested via CLI
use_sequential_offload = getattr(global_args, 'image_cpu_offload', False)
# Track loading attempts for OOM handling
load_attempt = 0
max_attempts = 3
while pipeline is None and load_attempt < max_attempts:
try:
load_attempt += 1
print(f"Loading attempt {load_attempt}/{max_attempts}...")
# Try to load as Stable Diffusion XL first, then generic DiffusionPipeline
try:
pipeline = StableDiffusionXLPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
use_safetensors=True,
)
except Exception:
# Try generic diffusion pipeline (supports custom pipelines like ZImagePipeline)
pipeline = DiffusionPipeline.from_pretrained(
model_to_use,
torch_dtype=dtype,
use_safetensors=True,
)
# Apply memory optimizations based on attempt
if torch.cuda.is_available():
if load_attempt >= 2:
# Second attempt: enable attention slicing
print("Enabling attention slicing for lower VRAM usage...")
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing()
if load_attempt >= 3 or use_sequential_offload:
# Third attempt or offload requested: enable sequential CPU offload
print("Enabling sequential CPU offload for lower VRAM usage...")
if hasattr(pipeline, 'enable_sequential_cpu_offload'):
pipeline.enable_sequential_cpu_offload()
else:
# First attempt: try regular GPU
pipeline = pipeline.to("cuda")
else:
pipeline = pipeline.to("cpu")
except Exception as load_error:
error_msg = str(load_error).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error', 'cudamalloc'])
if is_oom and load_attempt < max_attempts:
print(f"OOM during model loading: {load_error}")
print(f"Retrying with more aggressive memory optimization...")
pipeline = None # Reset for retry
else:
print(f"Failed to load model (attempt {load_attempt}): {load_error}")
if load_attempt >= max_attempts:
raise
pipeline = None
# Cache the model
if pipeline is not None:
multi_model_manager.add_model(model_key, pipeline)
print(f"Loaded diffusers model: {model_to_use}")
except ImportError as e: # Check if main model is a GGUF file - can't use for image generation
# diffusers not installed if model_name and _is_gguf_model(model_name):
diffusers_error = str(e) print(f"Note: Main model is a GGUF file (for text), not suitable for image generation")
print(f"diffusers not available: {diffusers_error}") model_name = None
except Exception as e:
import traceback
diffusers_error = str(e)
print(f"diffusers error: {diffusers_error}")
print(f"Traceback: {traceback.format_exc()}")
# Try diffusers if available
if pipeline is not None:
try:
# Determine size
width, height = 1024, 1024
if request.size:
parts = request.size.split("x")
if len(parts) == 2:
try:
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Check for nan/inf in dimensions if model_name:
if width != width or width == float('inf'): # NaN or inf check model_key = f"image:{model_name}"
width = 512
if height != height or height == float('inf'): # NaN or inf check # If still no image model configured, return an error
height = 512 if not model_name:
raise HTTPException(
# Import torch for generation status_code=400,
import torch detail="Image generation not configured. Use --image-model to specify a model."
# Ensure model is on correct device
backend = getattr(global_args, 'backend', 'auto')
image_backend = getattr(global_args, 'image_backend', 'auto')
use_vulkan = (backend == 'vulkan') or (image_backend == 'vulkan') or (image_backend == 'auto' and backend == 'auto')
if use_vulkan and not torch.cuda.is_available():
# CPU mode - try to reduce memory usage
try:
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing(slice_size="auto")
if hasattr(pipeline, 'enable_vae_slicing'):
pipeline.enable_vae_slicing()
except Exception as e:
print(f"Warning: Could not enable memory optimizations: {e}")
elif torch.cuda.is_available():
# Try to enable memory optimizations for CUDA
try:
if hasattr(pipeline, 'enable_attention_slicing'):
pipeline.enable_attention_slicing(slice_size="auto")
if hasattr(pipeline, 'enable_vae_slicing'):
pipeline.enable_vae_slicing()
except Exception as e:
print(f"Warning: Could not enable CUDA memory optimizations: {e}")
# Get timestamp BEFORE calling diffusers (to avoid scope issues)
import time as time_module
timestamp = int(time_module.time())
# Generate images
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
generator = None
if seed is not None:
generator = torch.Generator(device=pipeline.device).manual_seed(seed)
# Quality: "standard" or "hd"
quality = request.quality or "standard"
# Use request parameters if provided, otherwise fall back to quality-based defaults
num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
cfg_scale = request.guidance_scale if request.guidance_scale else (
getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
)
# Generate
result = pipeline(
prompt=request.prompt,
negative_prompt=None,
num_images_per_prompt=request.n,
height=height,
width=width,
generator=generator,
guidance_scale=cfg_scale,
num_inference_steps=num_steps,
) )
# Extract images # =====================================================================
images = [] # Step 2: Check if model is a sd.cpp StableDiffusion instance
# =====================================================================
is_sdcpp = False
if pipeline is not None:
try: try:
result_images = result.images from stable_diffusion_cpp import StableDiffusion
except Exception as img_err: if isinstance(pipeline, StableDiffusion):
print(f"Warning: Could not access result.images: {img_err}") is_sdcpp = True
# Try alternative: result might have 'image' or 'output' except ImportError:
result_images = getattr(result, 'image', None) or getattr(result, 'output', None) pass
if result_images is None:
raise Exception(f"Could not extract images from diffusers result: {img_err}") # =====================================================================
# Step 3: If already loaded, generate with appropriate backend
for img in result_images: # =====================================================================
# Convert to base64 if pipeline is not None:
import numpy as np if is_sdcpp:
print(f"Using cached sd.cpp model for generation")
# Debug: print image type and value range return await _generate_with_sdcpp(pipeline, request, global_args, http_request)
print(f"DEBUG: Image type: {type(img)}") else:
if isinstance(img, np.ndarray): # Assume it's a diffusers pipeline
print(f"DEBUG: Image shape: {img.shape}, dtype: {img.dtype}, min: {img.min()}, max: {img.max()}") print(f"Using cached diffusers pipeline for generation")
# Handle NaN/Inf values in image data - convert to valid values return _generate_with_diffusers(pipeline, request, global_args, http_request)
# Replace NaN and Inf with valid values
img = np.nan_to_num(img, nan=0.0, posinf=1.0, neginf=0.0) # =====================================================================
# Clip to valid range [0, 1] # Step 4: Model not loaded - try to load it
img = np.clip(img, 0.0, 1.0) # =====================================================================
print(f"DEBUG: After NaN handling - min: {img.min()}, max: {img.max()}") is_gguf = _is_gguf_model(model_name)
diffusers_error = None
sdcpp_error = None
# Try diffusers first (for non-GGUF models)
if not is_gguf:
try:
print(f"Loading diffusers model: {model_name}")
pipeline = _load_diffusers_pipeline(model_name, global_args)
# Use helper function to save and get response if pipeline is not None:
img_data = save_image_response(img, request.response_format, http_request) # Cache the loaded pipeline in the manager
images.append(img_data) multi_model_manager.add_model(model_key, pipeline)
multi_model_manager.current_model_key = model_key
return { print(f"Loaded diffusers model: {model_name}")
"created": timestamp,
"data": images return _generate_with_diffusers(pipeline, request, global_args, http_request)
}
except ImportError as e:
except ImportError as e: diffusers_error = str(e)
# diffusers/torch not installed - record error and try sd.cpp print(f"diffusers not available: {diffusers_error}")
diffusers_error = str(e) except Exception as e:
print(f"diffusers not available: {diffusers_error}, trying stable-diffusion-cpp-python...") import traceback
except Exception as e: diffusers_error = str(e)
# Other error with diffusers - record and try sd.cpp print(f"diffusers error: {diffusers_error}")
import traceback print(f"Traceback: {traceback.format_exc()}")
diffusers_error = str(e)
print(f"diffusers error: {diffusers_error}") # Try stable-diffusion-cpp-python (for GGUF models or as fallback)
print(f"Traceback: {traceback.format_exc()}")
print(f"Trying stable-diffusion-cpp-python...")
# Try stable-diffusion-cpp-python (sd.cpp) as fallback when diffusers fails
# sd.cpp works with GGUF models, but some HF models may be GGUF even without "gguf" in name
# Let sd.cpp attempt loading and fail gracefully if it's not compatible
# Try stable-diffusion-cpp-python (sd.cpp) as fallback
# First, check all available image models to find one loaded via sd.cpp
# Always check for cached models - allows dynamically loaded models to be reused across requests
sd_model = None
for key in multi_model_manager.models:
if key.startswith("image:"):
potential_model = multi_model_manager.get_model(key)
if potential_model is not None:
# Check if it's a stable-diffusion-cpp model
try:
from stable_diffusion_cpp import StableDiffusion
if isinstance(potential_model, StableDiffusion):
sd_model = potential_model
print(f"Found cached stable-diffusion-cpp model with key: {key}")
break
except ImportError:
pass
# If no cached image model found, need to load one - first cleanup any existing models
if sd_model is None:
# In ondemand mode, check if we need to unload before loading sd.cpp model
from codai.models.manager import model_manager
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if mode == "ondemand" and has_any_model:
# Resolve both the requested image model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"image:{model_to_use}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'image:{model_to_use}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading sd.cpp model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
if sd_model is not None:
# Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
try: try:
from stable_diffusion_cpp import StableDiffusion # For GGUF models or URLs, resolve the model path through the cache
if isinstance(sd_model, StableDiffusion): resolved_path = model_name
print(f"Using stable-diffusion-cpp-python for image generation") if is_gguf or model_name.startswith('http://') or model_name.startswith('https://'):
# Use sd.cpp for generation resolved_path = multi_model_manager.load_model(model_name)
# Parse size if not resolved_path:
width, height = 512, 512 raise Exception(f"Failed to resolve model path: {model_name}")
if request.size:
parts = request.size.split("x") # Only use sd.cpp if we have a local file path
if len(parts) == 2: if resolved_path and os.path.isfile(resolved_path):
try: sd_model = _load_sdcpp_model(resolved_path, global_args)
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Use default steps for Z-Image Turbo (very fast)
steps = 4 # Default for fast generation
# Generate images using sd.cpp (run in thread to not block event loop)
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
)
# Small delay to let Vulkan driver settle after generation
import time
time.sleep(0.1)
# Convert results to response format
images = []
for img in result:
# Use helper function to save and get response
img_data = save_image_response(img, http_request=http_request)
images.append(img_data)
return { if sd_model is not None:
"created": int(time.time()), # Cache the loaded model in the manager
"data": images multi_model_manager.add_model(model_key, sd_model)
} multi_model_manager.current_model_key = model_key
except ImportError as e: print(f"Loaded sd.cpp model: {model_name}")
# stable-diffusion-cpp not available
sd_cpp_error = str(e) return await _generate_with_sdcpp(sd_model, request, global_args, http_request)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
except Exception as e:
print(f"sd.cpp generation error: {e}")
sd_cpp_error = str(e)
else:
# No sd.cpp model pre-loaded, try to load dynamically
print("No pre-loaded sd.cpp model found, trying to load...")
try:
from stable_diffusion_cpp import StableDiffusion
# Use model manager to resolve and load the model
model_path = multi_model_manager.load_model(model_to_use)
# For diffusers models, model_path will be the identifier string
# For GGUF models, it will be the file path
if model_path is not None and not os.path.isfile(model_path):
# This is a diffusers model identifier (not a file path)
# Skip sd.cpp and let diffusers handle it
print(f"Model '{model_path}' is handled by diffusers library, skipping sd.cpp")
model_path = None
if model_path is not None:
# Check if it's a stable-diffusion-cpp model (has generate method from sd.cpp)
try:
from stable_diffusion_cpp import StableDiffusion
if isinstance(sd_model, StableDiffusion):
print(f"Using stable-diffusion-cpp-python for image generation")
# Use sd.cpp for generation
# Parse size
width, height = 512, 512
if request.size:
parts = request.size.split("x")
if len(parts) == 2:
try:
width = int(parts[0])
height = int(parts[1])
except ValueError:
pass
# Use default steps for Z-Image Turbo (very fast)
steps = 4 # Default for fast generation
# Generate images using sd.cpp (run in thread to not block event loop)
# Use request seed if provided, otherwise use CLI default seed
seed = request.seed if request.seed is not None else getattr(global_args, 'image_seed', None)
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
)
# Small delay to let Vulkan driver settle after generation
import time
time.sleep(0.1)
# Convert results to response format
images = []
for img in result:
# Use helper function to save and get response
img_data = save_image_response(img, http_request=http_request)
images.append(img_data)
return {
"created": int(time.time()),
"data": images
}
except ImportError as e:
# stable-diffusion-cpp not available
sd_cpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}")
except Exception as e:
print(f"sd.cpp generation error: {e}")
sd_cpp_error = str(e)
else: else:
# model_path is None - likely a diffusers model handled above sdcpp_error = f"Model '{model_name}' is not a local file, cannot use sd.cpp"
print("Model handled by diffusers library") print(sdcpp_error)
sd_cpp_error = "Model handled by diffusers"
except ImportError as e: except ImportError as e:
sd_cpp_error = str(e) sdcpp_error = str(e)
print(f"stable-diffusion-cpp-python not available: {sd_cpp_error}") print(f"stable-diffusion-cpp-python not available: {sdcpp_error}")
except Exception as e: except Exception as e:
sd_cpp_error = str(e) sdcpp_error = str(e)
print(f"sd.cpp error: {sd_cpp_error}") print(f"sd.cpp error: {sdcpp_error}")
# Both backends failed - return error with installation instructions # =====================================================================
raise HTTPException( # Step 5: Both backends failed - return error
status_code=400, # =====================================================================
detail=f"Model '{model_to_use}' does not support image generation" error_details = []
) if diffusers_error:
error_details.append(f"diffusers: {diffusers_error}")
if sdcpp_error:
error_details.append(f"sd.cpp: {sdcpp_error}")
raise HTTPException(
status_code=400,
detail=f"Failed to load image model '{model_name}'. Errors: {'; '.join(error_details) if error_details else 'No compatible backend found'}"
)
...@@ -295,46 +295,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request ...@@ -295,46 +295,13 @@ async def chat_completions(request: ChatCompletionRequest, http_request: Request
# Get the model for this request # Get the model for this request
requested_model = request.model requested_model = request.model
# Get load mode to determine if we need to unload other models first # Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
from codai.api.state import get_load_mode model_info = multi_model_manager.request_model(
load_mode = get_load_mode() requested_model=requested_model,
model_type="text"
)
# In ondemand mode (no --load-all or --loadswap), if ANY model is loaded in VRAM # Try to get the appropriate model (request_model handles VRAM cleanup)
# and it's different from what we need, fully unload it first to free VRAM
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(requested_model)
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if same type), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
# Use centralized unload method
multi_model_manager.unload_all_models()
# Also cleanup legacy model_manager
if model_manager.backend is not None:
print("Unloading legacy model_manager from VRAM...")
try:
model_manager.cleanup()
except Exception as e:
print(f"Warning during legacy model cleanup: {e}")
# Try to get the appropriate model
mm = multi_model_manager.get_model_for_request(requested_model) mm = multi_model_manager.get_model_for_request(requested_model)
if mm is None: if mm is None:
...@@ -1727,40 +1694,13 @@ async def completions(request: CompletionRequest): ...@@ -1727,40 +1694,13 @@ async def completions(request: CompletionRequest):
# Get the model for this request # Get the model for this request
requested_model = request.model requested_model = request.model
# Get load mode to determine if we need to unload other models first # Use the manager to resolve the model and manage VRAM (handles ondemand unloading)
from codai.api.state import get_load_mode model_info = multi_model_manager.request_model(
load_mode = get_load_mode() requested_model=requested_model,
model_type="text"
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first )
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(requested_model)
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models (even if same type), unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: '{requested_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading new model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Try to get the appropriate model # Try to get the appropriate model (request_model handles VRAM cleanup)
mm = multi_model_manager.get_model_for_request(requested_model) mm = multi_model_manager.get_model_for_request(requested_model)
if mm is None: if mm is None:
......
...@@ -54,52 +54,22 @@ async def create_transcription( ...@@ -54,52 +54,22 @@ async def create_transcription(
raise HTTPException(status_code=500, detail=result["error"]) raise HTTPException(status_code=500, detail=result["error"])
return {"text": result.get("text", "")} return {"text": result.get("text", "")}
audio_model = multi_model_manager.audio_models[0] if multi_model_manager.audio_models else None # Use the manager to resolve the model and manage VRAM
if not audio_model: model_info = multi_model_manager.request_model(
requested_model=model,
model_type="audio"
)
model_name = model_info['model_name']
model_key = model_info['model_key']
whisper_model = model_info['model_object']
if not model_name:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
detail="Audio transcription not configured. Use --audio-model or --whisper-server." detail="Audio transcription not configured. Use --audio-model or --whisper-server."
) )
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
from codai.models.manager import model_manager
load_mode = get_load_mode()
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested audio model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"audio:{audio_model}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'audio:{audio_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading audio model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Determine model to use
model_to_use = model
if model_to_use.startswith("whisper:") or model_to_use.startswith("audio:"):
model_to_use = audio_model
# Read the uploaded file # Read the uploaded file
file_content = await file.read() file_content = await file.read()
...@@ -113,26 +83,23 @@ async def create_transcription( ...@@ -113,26 +83,23 @@ async def create_transcription(
try: try:
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
# Determine model key
model_key = f"audio:{model_to_use}"
whisper_model = multi_model_manager.get_model(model_key)
if whisper_model is None: if whisper_model is None:
print(f"Loading faster-whisper model: {model_to_use}") print(f"Loading faster-whisper model: {model_name}")
# Determine compute type - always use int8 for CPU # Determine compute type - always use int8 for CPU
compute_type = "int8" compute_type = "int8"
# Load the model # Load the model
whisper_model = WhisperModel( whisper_model = WhisperModel(
model_to_use, model_name,
device="cpu", # Always use CPU - faster-whisper CUDA doesn't work with AMD device="cpu", # Always use CPU - faster-whisper CUDA doesn't work with AMD
compute_type=compute_type, compute_type=compute_type,
) )
# Cache the model # Cache the model
multi_model_manager.add_model(model_key, whisper_model) multi_model_manager.add_model(model_key, whisper_model)
print(f"Loaded faster-whisper model: {model_to_use}") multi_model_manager.current_model_key = model_key
print(f"Loaded faster-whisper model: {model_name}")
# Run transcription # Run transcription
segments, info = whisper_model.transcribe( segments, info = whisper_model.transcribe(
...@@ -160,24 +127,21 @@ async def create_transcription( ...@@ -160,24 +127,21 @@ async def create_transcription(
try: try:
import whispercpp import whispercpp
# Determine model key
model_key = f"audio:{model_to_use}"
whisper_model = multi_model_manager.get_model(model_key)
if whisper_model is None: if whisper_model is None:
print(f"Loading whispercpp model: {model_to_use}") print(f"Loading whispercpp model: {model_name}")
# Check if it's a built-in model name # Check if it's a built-in model name
if model_to_use in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']: if model_name in ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large']:
# It's a built-in model name # It's a built-in model name
whisper_model = whispercpp.Whisper.from_pretrained(model_to_use) whisper_model = whispercpp.Whisper.from_pretrained(model_name)
else: else:
# It's a path to a GGUF file # It's a path to a GGUF file
whisper_model = whispercpp.Whisper.from_pretrained(model_to_use) whisper_model = whispercpp.Whisper.from_pretrained(model_name)
# Cache the model # Cache the model
multi_model_manager.add_model(model_key, whisper_model) multi_model_manager.add_model(model_key, whisper_model)
print(f"Loaded whispercpp model: {model_to_use}") multi_model_manager.current_model_key = model_key
print(f"Loaded whispercpp model: {model_name}")
# Run transcription # Run transcription
result = whisper_model.transcribe(tmp_path) result = whisper_model.transcribe(tmp_path)
......
...@@ -16,18 +16,6 @@ from codai.models.manager import multi_model_manager ...@@ -16,18 +16,6 @@ from codai.models.manager import multi_model_manager
global_args = None global_args = None
def get_cached_model_path(url: str) -> str:
    """Return the cached model path for *url*, if available.

    Thin wrapper that forwards to ``codai.models.cache.get_cached_model_path``.
    """
    # Function-scope import, as in the original wrapper.
    import codai.models.cache as model_cache

    cached_path = model_cache.get_cached_model_path(url)
    return cached_path
def get_model_cache_dir() -> str:
    """Return the model cache directory.

    Thin wrapper that forwards to ``codai.models.cache.get_model_cache_dir``.
    """
    # Import the module (not the function) so the local name does not shadow
    # this wrapper's own name.
    import codai.models.cache as model_cache

    cache_dir = model_cache.get_model_cache_dir()
    return cache_dir
def set_global_args(args): def set_global_args(args):
"""Set global args from coderai.""" """Set global args from coderai."""
global global_args global global_args
...@@ -65,80 +53,46 @@ async def create_speech(request: TTSRequest): ...@@ -65,80 +53,46 @@ async def create_speech(request: TTSRequest):
Supports: Supports:
- Kokoro TTS models (when --tts-model is specified) - Kokoro TTS models (when --tts-model is specified)
""" """
tts_model = multi_model_manager.tts_model # Use the manager to resolve the model and manage VRAM
model_info = multi_model_manager.request_model(
requested_model=request.model,
model_type="tts"
)
model_name = model_info['model_name']
model_key = model_info['model_key']
kokoro_model = model_info['model_object']
# If no TTS model configured, return an error # If no TTS model configured, return an error
if not tts_model: if not model_name:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
detail="TTS not configured. Use --tts-model to specify a model." detail="TTS not configured. Use --tts-model to specify a model."
) )
# Get load mode to determine if we need to unload other models first
from codai.api.state import get_load_mode
from codai.models.manager import model_manager
load_mode = get_load_mode()
# In ondemand mode, if ANY model is loaded and it's different from what we need, unload first
if load_mode == "ondemand":
has_any_model = len(multi_model_manager.models) > 0 or model_manager.backend is not None
if has_any_model:
# Resolve both the requested TTS model and currently loaded model to their canonical names
requested_canonical = multi_model_manager.resolve_model_name(f"tts:{tts_model}")
loaded_canonical = multi_model_manager.get_currently_loaded_model_name()
# Also check legacy model_manager
if not loaded_canonical and model_manager.backend is not None:
loaded_canonical = "legacy_model_manager"
# Compare: if they're different models, unload first
already_loaded = (requested_canonical and loaded_canonical and
requested_canonical == loaded_canonical)
if not already_loaded:
print(f"In ondemand mode - model switch detected:")
print(f" Requested: 'tts:{tts_model}' (resolved to: '{requested_canonical}')")
print(f" Loaded: '{loaded_canonical}'")
print(f" -> Fully unloading current model(s) before loading TTS model...")
multi_model_manager.unload_all_models()
if model_manager.backend is not None:
try:
model_manager.cleanup()
except:
pass
# Determine model to use
model_to_use = request.model
if model_to_use.startswith("tts:"):
model_to_use = tts_model
# Try to use kokoro if available # Try to use kokoro if available
try: try:
from kokoro import Kokoro from kokoro import Kokoro
# Determine model key
model_key = f"tts:{model_to_use}"
kokoro_model = multi_model_manager.get_model(model_key)
if kokoro_model is None: if kokoro_model is None:
print(f"Loading Kokoro TTS model: {model_to_use}") print(f"Loading Kokoro TTS model: {model_name}")
# Check if model_to_use is a URL - download it (with caching) # Check if model_name is a URL - download it (with caching)
model_path = None model_path = None
if model_to_use.startswith('http://') or model_to_use.startswith('https://'): if model_name.startswith('http://') or model_name.startswith('https://'):
print(f"Loading model from URL: {model_to_use}") print(f"Loading model from URL: {model_name}")
from codai.models.cache import load_model from codai.models.cache import load_model
model_path = load_model(model_to_use) model_path = load_model(model_name)
if not model_path: if not model_path:
raise Exception(f"Failed to load model from {model_to_use}") raise Exception(f"Failed to load model from {model_name}")
else: else:
# Use local path or model name # Use local path or model name
model_path = model_to_use model_path = model_name
# Load the Kokoro model # Load the Kokoro model
kokoro_model = Kokoro(model_path if model_path else model_to_use) kokoro_model = Kokoro(model_path if model_path else model_name)
multi_model_manager.add_model(model_key, kokoro_model) multi_model_manager.add_model(model_key, kokoro_model)
multi_model_manager.current_model_key = model_key
# Generate speech # Generate speech
voice = request.voice or "af_sarah" voice = request.voice or "af_sarah"
......
...@@ -883,6 +883,133 @@ class MultiModelManager: ...@@ -883,6 +883,133 @@ class MultiModelManager:
return load_model(model_path, cache_dir, file_pattern) return load_model(model_path, cache_dir, file_pattern)
def request_model(self, requested_model: str, model_type: str = None) -> Dict[str, Any]:
    """
    Central method for API modules to request a model.

    Handles:
    1. Alias resolution (custom aliases, "default", the type aliases
       "image"/"audio"/"tts"/"vision", and "type:name" prefixed names)
    2. Checking if the model is already loaded
    3. VRAM management (unloading previous models in ondemand mode)

    This method never loads a model itself. When 'model_object' is None the
    caller is expected to load the model and register it under 'model_key'.

    Args:
        requested_model: The model name/alias from the API request. Empty/None
            (or a value equal to model_type) selects the default for model_type.
        model_type: The type of model being requested ("image", "text", "audio",
            "tts", "vision"). Used to resolve empty/None model names to the
            appropriate default and to prefix the model key.

    Returns:
        Dict with:
        - 'model_key': The key used to store/retrieve the model in self.models
        - 'model_name': The resolved model name/path/HF ID
        - 'model_object': The loaded model object if already loaded, None otherwise
        - 'config': The stored configuration for this model
        - 'already_loaded': True if the model is already present in self.models
        When no model name can be resolved, 'model_key' and 'model_name' are
        None and 'config' is an empty dict.
    """
    type_aliases = ("image", "audio", "tts", "vision")

    def default_for(kind):
        # Configured default for a model type; anything that is not one of the
        # known type aliases falls back to the general default model.
        if kind == "image":
            return self.image_models[0] if self.image_models else None
        if kind == "audio":
            return self.audio_models[0] if self.audio_models else None
        if kind == "tts":
            return self.tts_model
        if kind == "vision":
            return self.vision_models[0] if self.vision_models else None
        return self.default_model

    # Step 1: Resolve the model name from aliases
    if not requested_model or requested_model == model_type:
        # No model specified: use the default for the given type
        resolved_name = default_for(model_type)
    else:
        # Custom aliases are resolved first, so an alias may itself point at
        # "default", a type alias, or a prefixed name.
        if requested_model in self.model_aliases:
            requested_model = self.model_aliases[requested_model]

        if requested_model == "default":
            resolved_name = self.default_model
        elif requested_model in type_aliases:
            resolved_name = default_for(requested_model)
        else:
            # Strip a "type:" prefix (e.g. "image:model_name") if present
            resolved_name = requested_model
            for alias in type_aliases:
                prefix = alias + ":"
                if requested_model.startswith(prefix):
                    resolved_name = requested_model[len(prefix):]
                    break

    if not resolved_name:
        return {
            'model_key': None,
            'model_name': None,
            'model_object': None,
            'config': {},
            'already_loaded': False,
        }

    # Step 2: Build the model key (prefixed with type, except for text models)
    if model_type and model_type != "text":
        model_key = f"{model_type}:{resolved_name}"
    else:
        model_key = resolved_name

    # Step 3: Check if already loaded
    existing_model = self.models.get(model_key)
    if existing_model is not None:
        self.current_model_key = model_key
        return {
            'model_key': model_key,
            'model_name': resolved_name,
            'model_object': existing_model,
            'config': self.config.get(model_key, {}),
            'already_loaded': True,
        }

    # Step 4: In ondemand mode, unload any currently loaded model.
    # The load mode is only needed here, so the import is deferred past the
    # early-return paths above.
    from codai.api.state import get_load_mode
    if get_load_mode() == "ondemand":
        legacy_loaded = model_manager.backend is not None
        if self.models or legacy_loaded:
            loaded_canonical = self.get_currently_loaded_model_name()
            if not loaded_canonical and legacy_loaded:
                loaded_canonical = "legacy_model_manager"

            # The requested model is known not to be in self.models at this
            # point, so also unload when the loaded model's name is unknown;
            # otherwise stale models would stay in VRAM.
            if loaded_canonical != model_key:
                print("Ondemand mode - model switch detected:")
                print(f"  Requested: '{model_key}' (resolved: '{resolved_name}')")
                print(f"  Currently loaded: '{loaded_canonical}'")
                print("  -> Unloading current model(s) before loading new model...")

                self.unload_all_models()

                if model_manager.backend is not None:
                    try:
                        model_manager.cleanup()
                    except Exception as e:
                        # Best-effort cleanup: report the failure but still
                        # let the caller go on to load the new model.
                        print(f"Warning during legacy model cleanup: {e}")

    # Step 5: Return info for the caller to load the model
    return {
        'model_key': model_key,
        'model_name': resolved_name,
        'model_object': None,
        'config': self.config.get(model_key, {}),
        'already_loaded': False,
    }
def unload_all_models(self): def unload_all_models(self):
""" """
Fully unload ALL models from VRAM. Used in ondemand mode when switching Fully unload ALL models from VRAM. Used in ondemand mode when switching
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment