Phase 3: Integrate config-driven loading and admin dashboard

- Updated main.py to use ConfigManager for loading settings
- Models now loaded from config instead of CLI arguments
- Admin dashboard routes integrated into FastAPI app
- Added admin API endpoints for token management
- Added admin API endpoints for model management
- Added system config reload endpoint
- Static files mounted at /static/admin
- Admin UI available at /admin
parent 6f81dfe2
......@@ -270,7 +270,6 @@ async def api_delete_user(
username: str = Depends(require_admin)
):
"""Delete a user."""
# Find user by ID
users = session_manager._load_auth_data().get("users", [])
user = next((u for u in users if u["id"] == user_id), None)
......@@ -282,3 +281,164 @@ async def api_delete_user(
raise HTTPException(status_code=400, detail="Cannot delete user")
return {"success": True}
# --- Token management endpoints ---
@router.get("/admin/api/tokens", response_model=list)
async def api_list_tokens(username: str = Depends(require_admin)):
    """Return every stored API token as a list of plain dicts.

    Each entry exposes the token's id, name, token value, provider and
    creation timestamp, plus ``last_used`` (``None`` when the stored
    record has no such key).
    """
    stored = session_manager._load_auth_data().get("tokens", [])
    return [
        {
            "id": entry["id"],
            "name": entry["name"],
            "token": entry["token"],
            "provider": entry["provider"],
            "created_at": entry["created_at"],
            "last_used": entry.get("last_used"),
        }
        for entry in stored
    ]
@router.post("/admin/api/tokens")
async def api_create_token(request: Request, username: str = Depends(require_admin)):
    """Create a new API token, persist it, and return it to the caller.

    The JSON request body must contain ``name``; ``provider`` is optional
    and defaults to ``"openai"``.

    Returns:
        A dict with the generated ``token`` value and its ``id``, ``name``
        and ``provider``.

    Raises:
        HTTPException: 400 when ``name`` is missing or empty.
    """
    import secrets

    data = await request.json()
    name = data.get("name")
    provider = data.get("provider", "openai")
    if not name:
        raise HTTPException(status_code=400, detail="Token name is required")

    auth_data = session_manager._load_auth_data()
    tokens = auth_data.setdefault("tokens", [])

    # Derive the id from the highest id in use, not from the list length:
    # once any token has been deleted, len(tokens) + 1 can equal the id of a
    # token that still exists, and the delete endpoint (which filters by id)
    # would then remove both records at once.
    token_id = max((t["id"] for t in tokens), default=0) + 1

    new_token = {
        "id": token_id,
        "name": name,
        "token": f"sk-coderai-{secrets.token_hex(32)}",
        "provider": provider,
        "created_at": datetime.utcnow().isoformat() + "Z",
        "last_used": None
    }
    tokens.append(new_token)
    session_manager._save_auth_data(auth_data)

    return {
        "token": new_token["token"],
        "id": new_token["id"],
        "name": new_token["name"],
        "provider": new_token["provider"]
    }
@router.delete("/admin/api/tokens/{token_id}")
async def api_delete_token(token_id: int, username: str = Depends(require_admin)):
    """Remove the API token whose id equals ``token_id``.

    Raises:
        HTTPException: 404 when no stored token has that id.
    """
    auth_data = session_manager._load_auth_data()
    existing = auth_data.get("tokens", [])

    # Guard clause: nothing to delete means the id is unknown.
    if not any(entry["id"] == token_id for entry in existing):
        raise HTTPException(status_code=404, detail="Token not found")

    auth_data["tokens"] = [entry for entry in existing if entry["id"] != token_id]
    session_manager._save_auth_data(auth_data)
    return {"success": True}
# --- Models management endpoints ---
@router.get("/admin/api/models")
async def api_list_models(username: str = Depends(require_admin)):
    """List all configured models with details.

    NOTE: this endpoint is still a stub. It performs no lookup and returns
    ``None``, exactly as before; the previous body additionally read the
    auth file and built a template path, neither of which was used.
    """
    # TODO: return the model entries held by the application's config
    # manager once this handler has access to it.
    return None
@router.post("/admin/api/model-download")
async def api_download_model(
    request: Request,
    username: str = Depends(require_admin)
):
    """Download a model into the local cache.

    The JSON request body must contain ``model_id``; ``file_pattern`` is
    optional. For HuggingFace model ids without a pattern, a ``.gguf``
    download is tried first and the full repository is fetched as a
    fallback. Any other identifier is passed to ``download_model`` with the
    given pattern, or ``.gguf`` when none was supplied.

    Returns:
        ``{"success": True, "path": <cached path>}`` on success.

    Raises:
        HTTPException: 400 when ``model_id`` is missing; 500 when the
            download raises or yields no path.
    """
    data = await request.json()
    model_id = data.get("model_id")
    file_pattern = data.get("file_pattern")
    if not model_id:
        raise HTTPException(status_code=400, detail="Model ID required")

    from codai.models.cache import download_model, is_huggingface_model_id

    # NOTE(review): these download calls look synchronous; if so they block
    # the event loop for the duration of the transfer - confirm and consider
    # running them in a worker thread.
    try:
        if is_huggingface_model_id(model_id):
            if file_pattern:
                cached = download_model(model_id, file_pattern=file_pattern)
            else:
                cached = download_model(model_id, file_pattern='.gguf')
                if not cached:
                    # No GGUF file matched: download the full repo instead.
                    from huggingface_hub import snapshot_download
                    cached = snapshot_download(model_id)
        else:
            cached = download_model(model_id, file_pattern=file_pattern or '.gguf')
    except HTTPException:
        # Never re-wrap a deliberate HTTP error raised further down.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}") from e

    # Checked outside the try block so this error is not caught by the
    # generic handler above and re-wrapped with a mangled detail string.
    if cached:
        return {"success": True, "path": cached}
    raise HTTPException(status_code=500, detail="Download failed")
@router.delete("/admin/api/models/{model_identifier}")
async def api_delete_model(
    model_identifier: str,
    username: str = Depends(require_admin)
):
    """Remove a model from the local cache.

    Returns:
        ``{"success": True, "removed_count": <n>}`` where ``n`` is the
        number of entries reported by ``remove_cached_model``.

    Raises:
        HTTPException: 404 when nothing matched ``model_identifier``;
            500 when the removal itself raises.
    """
    from codai.models.cache import remove_cached_model

    try:
        removed = remove_cached_model(model_identifier)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Raised outside the try block: previously this 404 was caught by the
    # generic ``except Exception`` and reported to the client as a 500.
    if not removed:
        raise HTTPException(status_code=404, detail="Model not found")
    return {"success": True, "removed_count": len(removed)}
# --- System endpoints ---
@router.post("/admin/api/system/reload")
async def api_reload_config(request: Request, username: str = Depends(require_admin)):
    """Reload configuration from disk via the app's config manager.

    The config manager is read from ``request.app.state.config_mgr``. The
    request object is injected by FastAPI; the previous implementation
    built ``Request({})`` by hand, which has no ASGI scope and therefore no
    access to the running application, so the endpoint always failed.

    Returns:
        A dict with ``success``, ``message`` and a ``config`` summary
        containing the reloaded ``loaded``, ``preload`` and ``load_mode``
        values.

    Raises:
        HTTPException: 500 when the config manager is unavailable or the
            reload raises.
    """
    try:
        config = request.app.state.config_mgr.reload()
        return {
            "success": True,
            "message": "Configuration reloaded",
            "config": {
                "loaded": config.models.loaded,
                "preload": config.models.preload,
                "load_mode": config.models.default_load_mode
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
from datetime import datetime
......@@ -70,16 +70,25 @@ from codai.api.transcriptions import router as transcriptions_router
from codai.api.images import router as images_router
from codai.api.tts import router as tts_router
from codai.api.text import router as text_router
from codai.admin.routes import router as admin_router
# Import and add middleware
from codai.api.log import log_requests
app.middleware("http")(log_requests)
# Mount static files for admin dashboard
from fastapi.staticfiles import StaticFiles
from pathlib import Path
admin_static_dir = Path(__file__).parent.parent / "admin" / "static"
if admin_static_dir.exists():
app.mount("/static/admin", StaticFiles(directory=str(admin_static_dir)), name="admin_static")
# Include routers from submodules
app.include_router(transcriptions_router)
app.include_router(images_router)
app.include_router(tts_router)
app.include_router(text_router)
app.include_router(admin_router)
@app.get("/v1/models", response_model=ModelList)
......
......@@ -4,6 +4,8 @@ import os
# Import configuration from codai modules
from codai.cli import parse_args
from codai.config import ConfigManager
from codai.admin.routes import init_session_manager
def main():
......@@ -25,17 +27,21 @@ def main():
args = parse_args()
# Initialize ConfigManager
config_dir = args.config
config_mgr = ConfigManager(config_dir)
config = config_mgr.load()
# Initialize admin session manager
from pathlib import Path
init_session_manager(Path(config_dir))
# Handle early exit options (before heavy imports)
if args.list_cached_models:
print("\n=== Listing Cached Models ===")
# Import only what's needed for cache listing
from codai.models.cache import list_cached_models_info, get_all_cache_dirs
cache_info = list_cached_models_info()
caches = get_all_cache_dirs()
# Show CoderAI GGUF cache
coderai_dir = caches.get('coderai')
if coderai_dir:
print(f"\n--- CODERAI GGUF Cache ({coderai_dir}) ---")
......@@ -44,11 +50,6 @@ def main():
print(f" {filename} ({size_mb:.1f} MB)")
else:
print(" No cached GGUF files.")
else:
print(f"\n--- CODERAI GGUF Cache ---")
print(" (directory not found)")
# Show HuggingFace cached models
hf_dir = caches.get('huggingface')
if hf_dir:
print(f"\n--- HUGGINGFACE Models Cache ({hf_dir}) ---")
......@@ -58,71 +59,47 @@ def main():
print(f" └─ {revision_count} revision(s)")
else:
print(" No cached HuggingFace models.")
else:
print(f"\n--- HUGGINGFACE Models Cache ---")
print(" (directory not found)")
# Show summary
print(f"\n=== Summary ===")
print(f"Total cached models: {cache_info['total_models']}")
print(f"Total disk usage: {cache_info['total_size_gb']:.2f} GB")
print("\nCache locations:")
for cache_name, cache_dir in caches.items():
print(f" {cache_name}: {cache_dir}")
sys.exit(0)
# Handle --remove-all-models early
if args.remove_all_models:
print("\n=== Removing All Cached Models ===")
from codai.models.cache import remove_all_cached_models
total_removed = remove_all_cached_models()
print(f"\n=== Removed {total_removed} item(s) from all caches ===")
sys.exit(0)
# Handle --remove-model early
if args.remove_model:
print(f"\n=== Removing Cached Model Matching: {args.remove_model} ===")
from codai.models.cache import remove_cached_model
removed = remove_cached_model(args.remove_model)
if not removed:
print(f"No cached models found matching: {args.remove_model}")
print(f"\nUse --list-cached-models to see available models.")
sys.exit(0)
total_size = sum(size for _, _, size in removed)
print(f"\nRemoved {len(removed)} cached model file(s), freeing {total_size / (1024*1024):.1f} MB")
sys.exit(0)
# Handle --download-model early (before heavy imports)
if args.download_model:
print(f"\n=== Downloading Model: {args.download_model} ===")
from codai.models.cache import download_model, is_huggingface_model_id
model_id = args.download_model
file_pattern = args.download_file_pattern
try:
# For HuggingFace model IDs, try pattern-based download first
if is_huggingface_model_id(model_id):
# If file pattern specified, use it
if file_pattern:
print(f"File pattern: {file_pattern}")
cached_path = download_model(model_id, file_pattern=file_pattern)
else:
# Try common patterns: GGUF first, then full repo download
print("Trying GGUF download first...")
cached_path = download_model(model_id, file_pattern='.gguf')
if not cached_path:
# No GGUF files - download entire repo (for transformers/diffusers models)
print("No GGUF files found, downloading full HuggingFace repo...")
try:
from huggingface_hub import snapshot_download
......@@ -131,9 +108,7 @@ def main():
print(f"Error downloading full repo: {e}")
cached_path = None
else:
# URL or local path
cached_path = download_model(model_id, file_pattern=file_pattern or '.gguf')
if cached_path:
print(f"\n=== Model downloaded successfully ===")
print(f"Cached at: {cached_path}")
......@@ -145,491 +120,430 @@ def main():
print(f"\n=== Error downloading model: {e} ===")
sys.exit(1)
# Import globals from codai modules (only after early exits)
if args.vulkan_list_devices:
print("\nListing Vulkan devices...")
try:
import subprocess
result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
if result.returncode == 0:
print(result.stdout)
else:
print("Could not run vulkaninfo. Make sure vulkan-tools is installed.")
except Exception as e:
print(f"Error listing devices: {e}")
sys.exit(0)
# Import core modules (only after early exits)
from codai.api import app
from codai.api.state import (
set_global_args,
set_global_debug,
set_global_system_prompt,
set_global_tools_closer_prompt,
set_global_file_path,
set_load_mode,
set_grammar_guided_gen,
set_global_args, set_global_debug, set_global_system_prompt,
set_global_tools_closer_prompt, set_global_file_path, set_load_mode,
set_grammar_guided_gen, get_global_args
)
from codai.models.manager import ModelManager, MultiModelManager, model_manager, multi_model_manager
from codai.backends import detect_available_backends
from codai.models.cache import (
get_all_cache_dirs,
get_cached_model_path,
get_model_cache_dir,
download_model,
list_cached_models_info,
)
from codai.api.app import set_global_file_path_wrapper
# Import global setters from text module FIRST (before calling them)
# Import after early exits
from codai.api.text import (
set_global_args,
set_global_debug,
set_global_system_prompt,
set_global_tools_closer_prompt,
set_global_args as set_global_args_text,
set_global_debug as set_global_debug_text,
set_global_system_prompt as set_global_system_prompt_text,
set_global_tools_closer_prompt as set_global_tools_closer_prompt_text,
)
from codai.api.app import set_load_mode
from codai.api.app import set_load_mode as set_load_mode_app
# Store args globally for access in endpoints (both state and text.py)
set_global_args(args)
# Store config reference globally for access
fastapi_app = app
fastapi_app.state.config_mgr = config_mgr
fastapi_app.state.config = config
# Set global variables
# Set global variables from config and args (args override config for now)
global global_system_prompt, global_tools_closer_prompt, global_debug, global_dump, global_file_path, grammar_guided_gen
# Set global grammar-guided-gen flag
from codai.api.state import set_grammar_guided_gen
grammar_guided_gen = args.grammar_guided_gen
if grammar_guided_gen:
print("Grammar-guided generation enabled (--grammar-guided-gen)")
# Print --offload-strategy none status
if args.offload_strategy == "none":
print("Offload strategy 'none': CPU offloading and VRAM auto-detection disabled")
print(" Model will be loaded directly on GPU without memory limits")
# Print --no-ram mode status
if args.no_ram:
print("No-RAM mode enabled (--no-ram): maximizing VRAM usage, no CPU RAM spilling")
print(" llama-cpp-python: n_gpu_layers=-1, use_mmap=False, --n-ctx ignored")
print(" HuggingFace: device_map=cuda, low_cpu_mem_usage=True, torch_dtype=auto")
print(" Diffusers: forced full GPU loading")
print(" sd.cpp: maximizing GPU offload")
# Set global system prompt from --system-prompt flag
global_system_prompt = args.system_prompt
set_global_system_prompt(global_system_prompt)
# Set global tools-closer-prompt flag
global_tools_closer_prompt = args.tools_closer_prompt
set_global_tools_closer_prompt(global_tools_closer_prompt)
if global_tools_closer_prompt:
print("Tools closer prompt enabled (--tools-closer-prompt)")
# Set global debug flag
# Debug from command line flag (overrides config)
global_debug = args.debug
set_global_debug(global_debug)
set_global_debug_text(global_debug)
# Set global dump flag (enables debug as well for litellm output)
global_dump = args.dump
if global_dump:
global_debug = True
set_global_debug(True)
set_global_debug_text(True)
# Set global file path for storing generated files
global_file_path = args.file_path
set_global_file_path(global_file_path)
# System prompt (from config)
global_system_prompt = config.system_prompt
set_global_system_prompt(global_system_prompt)
set_global_system_prompt_text(global_system_prompt)
# Also set file path for images module
from codai.api.images import set_global_file_path as set_images_file_path
set_images_file_path(global_file_path)
# Tools closer prompt
global_tools_closer_prompt = config.tools_closer_prompt
set_global_tools_closer_prompt(global_tools_closer_prompt)
set_global_tools_closer_prompt_text(global_tools_closer_prompt)
if global_tools_closer_prompt:
print("Tools closer prompt enabled")
# Also set global args for images module (it has its own global_args)
from codai.api.images import set_global_args as set_images_global_args
set_images_global_args(args)
# Grammar guided generation
grammar_guided_gen = config.grammar_guided
if grammar_guided_gen:
set_grammar_guided_gen(True)
print("Grammar-guided generation enabled")
# Also set file path for app.py (needed for /v1/files endpoint)
from codai.api.app import set_global_file_path_wrapper
# File path
global_file_path = config.file_path
set_global_file_path(global_file_path)
set_global_file_path_wrapper(global_file_path)
from codai.api.images import set_global_file_path as set_images_file_path
set_images_file_path(global_file_path)
# Debug: print command line
if global_debug:
# Print the full command line that was used to invoke codai
import shlex
cmd_line = ' '.join(shlex.quote(arg) for arg in sys.argv)
print(f"\n{'='*80}")
print(f"=== COMMAND LINE: {cmd_line}")
print(f"{'='*80}\n")
print("DEBUG MODE ENABLED - Full requests and replies will be dumped to stdout")
# Handle --vulkan-list-devices
if args.vulkan_list_devices:
print("\nListing Vulkan devices...")
try:
import subprocess
result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
if result.returncode == 0:
print(result.stdout)
else:
print("Could not run vulkaninfo. Make sure vulkan-tools is installed.")
except Exception as e:
print(f"Error listing devices: {e}")
sys.exit(0)
# Get model names from args - support multiple models
model_names = args.model if args.model else []
# Helper function to get config value by index with fallback
def get_ctx_by_index(ctx_list, index, default):
    """Return ``ctx_list[index]``, or ``default`` when that slot is unavailable.

    ``default`` is used when ``ctx_list`` is empty/``None`` or when
    ``index`` is at or past the end of the list.
    """
    if not ctx_list or index >= len(ctx_list):
        return default
    return ctx_list[index]
# Validate: must have at least one model specified
audio_models = args.audio_model if args.audio_model else []
image_models = args.image_model if args.image_model else []
vision_models = args.vision_model if args.vision_model else []
if not model_names and not audio_models and not image_models and not vision_models and args.tts_model is None:
print("Error: At least one of --model, --audio-model, --image-model, --vision-model, or --tts-model must be specified.")
print("")
print("For NVIDIA backend (HuggingFace models):")
print(" - microsoft/DialoGPT-medium")
print(" - meta-llama/Llama-2-7b-chat-hf (requires auth)")
print(" - TinyLlama/TinyLlama-1.1B-Chat-v1.0")
print(" - Use multiple --model flags for multiple models")
print("")
print("For Vulkan backend (GGUF models):")
print(" - Local path: ./phi-3-mini-4k-instruct-q4_k_m.gguf")
print(" - Or a HuggingFace model ID: TheBloke/Mistral-7B-Instruct-v0.2-GGUF")
print(" - Use multiple --model flags for multiple models")
print("")
sys.exit(1)
# Determine load mode
# Default is ondemand: pre-load only the first model, unload/load on switch
# --loadswap: load first in VRAM, others in CPU RAM, swap on switch
# --loadall: try to load all models in VRAM, offload to CPU RAM if fails
# --nopreload: skip pre-loading in any mode, load on first request
load_mode = "ondemand" # Default: on-demand loading
if args.loadall:
load_mode = "loadall"
elif args.loadswap:
load_mode = "loadswap"
nopreload = args.nopreload
print("DEBUG MODE ENABLED")
# Determine load mode from config
load_mode = config.models.default_load_mode or "ondemand"
set_load_mode(load_mode)
multi_model_manager.set_load_mode(load_mode)
print(f"\nLoad mode: {load_mode}")
if load_mode == "ondemand":
print("Load mode: ondemand (pre-load first model, unload/load on switch)")
print(" (pre-load first model, unload/load on switch)")
elif load_mode == "loadswap":
print("Load mode: loadswap (first model in VRAM, others in CPU RAM, swap on switch)")
print(" (first model in VRAM, others in CPU RAM, swap on switch)")
elif load_mode == "loadall":
print("Load mode: loadall (load all models, offload to CPU RAM if VRAM full)")
if nopreload:
print(" --nopreload: models will load on first request instead of at startup")
# Initialize model manager
print("\n=== Initializing Model Manager ===")
print(" (load all models into VRAM, offload to CPU RAM if full)")
# Detect available backends
available_backends = detect_available_backends()
print(f"Available backends: {available_backends}")
print(f"\nAvailable backends: {available_backends}")
# Determine which backend to use
backend = args.backend
# Determine backend from config
backend = config.backend.type
if backend == "auto":
if "nvidia" in available_backends:
if available_backends.get('nvidia'):
backend = "nvidia"
elif "vulkan" in available_backends:
elif available_backends.get('vulkan'):
backend = "vulkan"
elif "opencl" in available_backends:
elif available_backends.get('opencl'):
backend = "opencl"
else:
print("Error: No supported backend detected (NVIDIA CUDA, AMD Vulkan, or OpenCL)")
print("Error: No supported backend detected")
sys.exit(1)
print(f"Using backend: {backend}")
# Set the backend for the model manager
model_manager.backend_type = backend
# Store references globally for API endpoints
from codai.api import app as fastapi_app
# Store global state
fastapi_app.state.model_manager = model_manager
fastapi_app.state.multi_model_manager = multi_model_manager
# Load main text model(s)
if model_names:
print(f"\nMain text model(s): {model_names}")
# Register models with multi_model_manager (set_default_model also resolves/caches)
for idx, model_name in enumerate(model_names):
multi_model_manager.set_default_model(model_name, {
'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
})
# Pre-load models at startup (unless --nopreload)
if nopreload:
print(f" --nopreload: text model(s) will load on first request")
elif load_mode == "ondemand":
# Ondemand: pre-load only the first model into VRAM
try:
print(f"Preloading first model into VRAM: {model_names[0]}...")
mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully: {model_names[0]}")
# =========================================================================
# Load models from config
# =========================================================================
print(f"\n=== Loading Models from Config ===")
models_config = config_mgr.models_data
# Helper to find model config
def get_model_cfg(model_type, model_id):
    """Return the config entry for ``model_id`` among ``<model_type>_models``.

    Looks in ``models_config["<model_type>_models"]`` for the first entry
    whose ``"id"`` equals ``model_id``. Returns an empty dict when the list
    is missing or no entry matches.
    """
    candidates = models_config.get(f"{model_type}_models", [])
    return next((m for m in candidates if m.get("id") == model_id), {})
# Helper to build kwargs from model config
def build_kwargs_from_config(model_cfg, model_type):
    """Translate one model's config entry into a keyword-argument dict.

    ``model_type`` selects which keys are read from ``model_cfg`` ("text",
    "image", "audio" or "vision"); missing keys fall back to the defaults
    spelled out below. Any other ``model_type`` yields an empty dict.
    """
    read = model_cfg.get

    if model_type == "text":
        return {
            'ctx': read('context_size'),
            'n_gpu_layers': read('n_gpu_layers', -1),
            'load_in_4bit': read('load_in_4bit', False),
            'load_in_8bit': read('load_in_8bit', False),
            'flash_attn': read('flash_attn', False),
            'offload_strategy': read('offload_strategy', 'auto'),
            'manual_ram_gb': read('manual_ram_gb'),
            'max_gpu_percent': read('max_gpu_percent'),
            'no_ram': read('no_ram', False),
        }

    if model_type == "image":
        return {
            'llm_path': read('llm_path'),
            'vae_path': read('vae_path'),
            'sample_method': read('sample_method', 'res_multistep'),
            'steps': read('steps', 4),
            'width': read('width', 512),
            'height': read('height', 512),
            'cfg_scale': read('cfg_scale', 1.0),
            'precision': read('precision', 'f32'),
            'cpu_offload': read('cpu_offload', False),
            'seed': read('seed'),
            'vae_tiling': read('vae_tiling', False),
            'clip_on_cpu': read('clip_on_cpu', False),
        }

    if model_type == "audio":
        return {
            'ctx': read('context_ms'),
            'offload': read('offload'),
            'vulkan_device': read('vulkan_device', 0),
        }

    if model_type == "vision":
        return {
            'ctx': read('context_size'),
            'offload': read('offload'),
            'n_gpu_layers': read('n_gpu_layers', -1),
        }

    return {}
# Load text models (main LLM)
text_models = models_config.get("text_models", [])
text_model_names = [m["id"] for m in text_models if m.get("enabled", True)]
if text_model_names:
print(f"\nMain text model(s): {text_model_names}")
for idx, model_name in enumerate(text_models):
multi_model_manager.set_default_model(
model_name["id"],
config=build_kwargs_from_config(model_name, "text"),
backend_type=model_name.get("backend", "auto")
)
# Load preload list
preload_list = models_config.get("preload", [])
loaded_list = models_config.get("loaded", [])
# Determine which models to preload at startup
# loaded: models to load into VRAM (or CPU for loadswap) immediately
# preload: models to keep in CPU RAM for fast swapping
nopreload = False # Config-based loading, no CLI preload skip
# Pre-load models at startup based on config
if not nopreload and load_mode in ("loadall", "loadswap"):
all_startup_models = loaded_list + preload_list
elif not nopreload and load_mode == "ondemand":
all_startup_models = loaded_list[:1] if loaded_list else []
else:
print(f"Warning: Model {model_names[0]} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model: {e}")
print(f"Model will load on first request")
elif load_mode == "loadswap":
# Loadswap: load first model into VRAM, others into CPU RAM
all_startup_models = []
# Pre-load process
if text_model_names:
first_text = text_models[0]["id"] if text_models else None
if not nopreload and load_mode == "ondemand" and first_text:
# Preload first model into VRAM
try:
print(f"Preloading first model into VRAM: {model_names[0]}...")
print(f"Preloading first model into VRAM: {first_text}...")
mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_names[0]}")
print(f"Model loaded successfully: {first_text}")
else:
print(f"Warning: Model {model_names[0]} failed to load")
print(f"Warning: Model {first_text} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model: {e}")
print(f"Model will load on first request")
# Load remaining text models into CPU RAM
for idx, model_name in enumerate(model_names[1:], 1):
try:
print(f"Preloading model into CPU RAM: {model_name}...")
mm2 = multi_model_manager._load_model_by_name(model_name)
if mm2 is not None:
# Move to CPU immediately (it was loaded into VRAM by default)
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model {model_name}: {e}")
elif load_mode == "loadall":
# Loadall: try to load all models into VRAM, offload to CPU RAM if fails
for idx, model_name in enumerate(model_names):
try:
if idx == 0:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_default_model()
else:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_model_by_name(model_name)
# Load audio models (registered, load on first request)
audio_models = models_config.get("audio_models", [])
for audio_m in audio_models:
if audio_m.get("enabled", True):
multi_model_manager.set_audio_model(
audio_m["id"],
config=build_kwargs_from_config(audio_m, "audio")
)
if mm is not None and (not hasattr(mm, 'backend') or mm.backend is not None):
if idx == 0:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
error_msg = str(e).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
if is_oom:
print(f"VRAM full for {model_name}, offloading to CPU RAM...")
try:
mm = multi_model_manager._load_model_by_name(model_name)
if mm is not None:
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
except Exception as e2:
print(f"Warning: Failed to load model {model_name} even to CPU: {e2}")
else:
print(f"Warning: Failed to preload model {model_name}: {e}")
# Set up audio model if specified
if audio_models:
print(f"\nAudio transcription model(s): {audio_models}")
for idx, audio_m in enumerate(audio_models):
multi_model_manager.set_audio_model(audio_m, {
'ctx': get_ctx_by_index(args.audio_ctx, idx, 0),
'offload': args.audio_offload,
})
# Set up whisper-server if specified
if args.whisper_server:
print(f"\nWhisper server: {args.whisper_server}")
print(f" Port: {args.whisper_server_port}")
# Import WhisperServerManager
from codai.models.manager import WhisperServerManager
# Check if whisper-server is already running
if multi_model_manager.whisper_server is None:
whisper_server_mgr = WhisperServerManager(
server_path=args.whisper_server,
port=args.whisper_server_port
# Load image models
image_models = models_config.get("image_models", [])
for img_m in image_models:
if img_m.get("enabled", True):
multi_model_manager.set_image_model(
img_m["id"],
config=build_kwargs_from_config(img_m, "image")
)
multi_model_manager.whisper_server = whisper_server_mgr
else:
whisper_server_mgr = multi_model_manager.whisper_server
print("Whisper server already running, using existing instance")
# Start whisper-server if we have audio_models configured
if audio_models:
model_to_use = audio_models[0] if audio_models else None
gpu_device = getattr(args, 'audio_vulkan_device', 0) or 0
print(f"DEBUG: Starting whisper-server with gpu_device={gpu_device}")
actual_model_path = whisper_server_mgr.start(model_path=model_to_use, gpu_device=gpu_device)
if actual_model_path:
# Update audio_models in multi_model_manager to store the actual path (not the URL)
if model_to_use != actual_model_path:
if multi_model_manager.audio_models and multi_model_manager.audio_models[0] == model_to_use:
multi_model_manager.audio_models[0] = actual_model_path
print(f"Whisper server started with model: {actual_model_path}")
else:
print("Warning: Failed to start whisper-server, falling back to other backends")
# Set up image model if specified
if image_models:
print(f"\nImage generation model(s): {image_models}")
for idx, img_m in enumerate(image_models):
multi_model_manager.set_image_model(img_m, {
'ctx': get_ctx_by_index(args.image_ctx, idx, 0),
'offload': args.image_offload,
'llm_path': args.llm_path,
'vae_path': args.vae_path,
'sample_method': args.image_sample_method,
'steps': args.image_steps,
'width': args.image_width,
'height': args.image_height,
'cfg_scale': args.image_cfg_scale,
})
# Set up vision model if specified
if vision_models:
print(f"\nVision model(s): {vision_models}")
for idx, vision_m in enumerate(vision_models):
multi_model_manager.set_vision_model(vision_m, {
'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
'offload': args.image_offload,
})
# Set up TTS model if specified
if args.tts_model:
print(f"\nText-to-speech model: {args.tts_model}")
multi_model_manager.set_tts_model(args.tts_model, {})
# Register model aliases if specified
if args.model_aliases:
print(f"\nRegistering model aliases:")
for alias, model in args.model_aliases:
# Load vision models
vision_models = models_config.get("vision_models", [])
for vis_m in vision_models:
if vis_m.get("enabled", True):
multi_model_manager.set_vision_model(
vis_m["id"],
config=build_kwargs_from_config(vis_m, "vision")
)
# Load TTS model
tts_model = models_config.get("tts_models", [])
if tts_model:
for tts_m in tts_model:
if tts_m.get("enabled", True):
multi_model_manager.set_tts_model(tts_m["id"], {})
# Register aliases
aliases = models_config.get("aliases", {})
for alias, model in aliases.items():
multi_model_manager.set_model_alias(alias, model)
print(f" {alias} -> {model}")
# =========================================================================
# Pre-load non-text models for loadall and loadswap modes
# (Text models are already handled above)
# =========================================================================
if not nopreload and load_mode in ("loadall", "loadswap"):
# Collect all non-text models that need pre-loading
# For loadall: load all into VRAM (offload to CPU if OOM)
# For loadswap: first model in VRAM (already done for text), rest in CPU RAM
# Print startup summary
print(f"\nBackend: {backend}")
print(f"Load mode: {load_mode}")
available_models = multi_model_manager.list_models()
print(f"\nAvailable models: {[m.id for m in available_models]}")
# Determine if the first text model is already in VRAM
first_model_loaded = multi_model_manager.active_in_vram is not None
# Register custom aliases from config
if aliases:
print(f"\nModel aliases:")
for alias, target in aliases.items():
print(f" {alias} -> {target}")
# Set global args for backward compatibility with existing code
# Empty attribute holder. The statements that follow create one instance and
# attach the config-derived settings to it as plain attributes; that instance
# is then handed to set_global_args(), set_global_args_text() and the image
# module's set_global_args().
# NOTE(review): presumably this stands in for the former CLI `args` object so
# code that reads attributes off it keeps working - confirm against callers.
class ArgsCompat:
pass
global_args = ArgsCompat()
global_args.backend = backend
global_args.host = config.server.host
global_args.port = config.server.port
global_args.url = "auto"
global_args.https = config.server.https
global_args.privkey = config.server.https_key_path
global_args.pubkey = config.server.https_cert_path
global_args.offload_dir = config.offload.directory
global_args.ram = config.offload.manual_ram_gb
global_args.offload_strategy = config.offload.strategy
global_args.no_ram = config.offload.no_ram
global_args.load_in_4bit = config.offload.load_in_4bit
global_args.load_in_8bit = config.offload.load_in_8bit
global_args.flash_attn = config.offload.flash_attention
global_args.max_gpu_percent = config.offload.max_gpu_percent
global_args.n_gpu_layers = config.vulkan.n_gpu_layers
global_args.n_ctx = [config.vulkan.n_ctx]
global_args.vulkan_device = config.vulkan.device_id
global_args.vulkan_single_gpu = config.vulkan.single_gpu
global_args.image_sample_method = config.image.sample_method
global_args.image_steps = config.image.steps
global_args.image_width = config.image.width
global_args.image_height = config.image.height
global_args.image_cfg_scale = config.image.cfg_scale
global_args.image_precision = config.image.precision
global_args.image_cpu_offload = config.image.cpu_offload
global_args.image_seed = config.image.seed
global_args.vae_tiling = config.image.vae_tiling
global_args.clip_on_cpu = config.image.clip_on_cpu
global_args.system_prompt = config.system_prompt
global_args.tools_closer_prompt = config.tools_closer_prompt
global_args.grammar_guided_gen = config.grammar_guided
global_args.debug = global_debug
global_args.dump = global_dump
global_args.file_path = config.file_path
global_args.parser = config.parser
global_args.hf_chat_template = config.hf_chat_templates
global_args.force_reasoning = config.reasoning_options
global_args.model = text_model_names
global_args.language_model = text_model_names
global_args.image_model = [m["id"] for m in image_models if m.get("enabled")]
global_args.audio_model = [m["id"] for m in audio_models if m.get("enabled")]
global_args.vision_model = [m["id"] for m in vision_models if m.get("enabled")]
global_args.tts_model = tts_model[0]["id"] if tts_model else None
global_args.model_aliases = [(k, v) for k, v in aliases.items()]
global_args.whisper_server = config.whisper.server_path
global_args.whisper_server_port = config.whisper.server_port
global_args.audio_ctx = None
global_args.audio_offload = None
global_args.audio_vulkan_device = 0
global_args.image_ctx = None
global_args.image_offload = None
global_args.download_file_pattern = None
global_args.list_cached_models = False
global_args.remove_all_models = False
global_args.remove_model = None
global_args.download_model = None
global_args.vulkan_list_devices = False
global_args.loadall = False
global_args.loadswap = False
global_args.nopreload = nopreload
set_global_args(global_args)
set_global_args_text(global_args)
set_load_mode_app(load_mode)
# Set image module global args
from codai.api.images import set_global_args as set_images_global_args
set_images_global_args(global_args)
# Vulkan list devices
if args.vulkan_list_devices:
print("\nListing Vulkan devices...")
try:
import subprocess
result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
if result.returncode == 0:
print(result.stdout)
else:
print("Could not run vulkaninfo.")
except Exception as e:
print(f"Error: {e}")
sys.exit(0)
# Startup: Preload configured models (non-text) for loadall/loadswap
if not nopreload and load_mode in ("loadall", "loadswap"):
first_loaded = multi_model_manager.active_in_vram is not None
# Pre-load image models
if image_models:
print(f"\n=== Pre-loading image model(s) ===")
for idx, img_m in enumerate(image_models):
model_key = f"image:{img_m}"
for img_m in image_models:
if not img_m.get("enabled", True):
continue
model_key = f"image:{img_m['id']}"
if model_key in multi_model_manager.models:
continue # Already loaded
continue
try:
from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model
if load_mode == "loadall":
# Try to load into VRAM
print(f"Preloading image model into VRAM: {img_m}...")
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
print(f"Preloading image model into VRAM: {img_m['id']}...")
if _is_gguf_model(img_m['id']):
resolved_path = multi_model_manager.load_model(img_m['id'])
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
sd_model = _load_sdcpp_model(resolved_path, global_args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
print(f"Image model loaded (VRAM, sd.cpp): {img_m}")
print(f"Image model loaded (VRAM): {img_m['id']}")
else:
try:
pipeline = _load_diffusers_pipeline(img_m, args)
pipeline = _load_diffusers_pipeline(img_m['id'], global_args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
print(f"Image model loaded (VRAM, diffusers): {img_m}")
print(f"Image model loaded (VRAM): {img_m['id']}")
except Exception as e:
error_msg = str(e).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
if is_oom:
print(f"VRAM full for image model {img_m}, will load on demand")
em = str(e).lower()
if any(x in em for x in ['out of memory', 'oom', 'cuda error']):
print(f"VRAM full for {img_m['id']}, will load on demand")
else:
print(f"Warning: Failed to preload image model {img_m}: {e}")
elif load_mode == "loadswap":
# Load into VRAM then move to CPU (unless it's the first model overall)
if not first_model_loaded:
# No model in VRAM yet, load this one into VRAM
print(f"Preloading image model into VRAM: {img_m}...")
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
print(f"Warning: {e}")
elif load_mode == "loadswap" and not first_loaded:
print(f"Preloading image model: {img_m['id']}...")
if _is_gguf_model(img_m['id']):
resolved_path = multi_model_manager.load_model(img_m['id'])
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
sd_model = _load_sdcpp_model(resolved_path, global_args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
first_model_loaded = True
print(f"Image model loaded (VRAM): {img_m}")
first_loaded = True
print(f"Image model loaded: {img_m['id']}")
else:
try:
pipeline = _load_diffusers_pipeline(img_m, args)
pipeline = _load_diffusers_pipeline(img_m['id'], global_args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
first_model_loaded = True
print(f"Image model loaded (VRAM): {img_m}")
first_loaded = True
print(f"Image model loaded: {img_m['id']}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
else:
# First model already in VRAM, load this to VRAM then move to CPU
print(f"Preloading image model into CPU RAM: {img_m}...")
# Move current VRAM model to CPU temporarily
current_vram = multi_model_manager.active_in_vram
if current_vram and current_vram in multi_model_manager.models:
multi_model_manager._move_model_to_cpu(current_vram)
try:
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
multi_model_manager._move_model_to_cpu(model_key)
print(f"Image model loaded (CPU RAM): {img_m}")
else:
pipeline = _load_diffusers_pipeline(img_m, args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
multi_model_manager._move_model_to_cpu(model_key)
print(f"Image model loaded (CPU RAM): {img_m}")
print(f"Warning: {e}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
# Move original model back to VRAM
if current_vram and current_vram in multi_model_manager.models:
multi_model_manager._move_model_to_vram(current_vram)
multi_model_manager.active_in_vram = current_vram
except ImportError as e:
print(f"Warning: Cannot preload image model {img_m} (missing dependency): {e}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
# Note: Audio models (faster-whisper) and TTS models (kokoro) are loaded
# by their respective API modules on first request, as they use specialized
# loading mechanisms. The model files are already cached by set_audio_model()
# and set_tts_model() above.
if audio_models:
print(f"\nAudio model(s) registered and cached, will load into memory on first request")
if args.tts_model:
print(f"TTS model registered and cached, will load into memory on first request")
print(f"Warning: {e}")
# Start the server
import uvicorn
print(f"\nStarting server on http://{args.host}:{args.port}")
print(f"API documentation available at http://{args.host}:{args.port}/docs")
print(f"\nStarting server on http://{config.server.host}:{config.server.port}")
print(f"API docs: http://{config.server.host}:{config.server.port}/docs")
print(f"Admin UI: http://{config.server.host}:{config.server.port}/admin")
if model_manager.backend is not None:
actual_backend = model_manager.backend_type
......@@ -637,47 +551,36 @@ def main():
actual_backend = "cuda (via llama-cpp-python)"
print(f"Using backend: {actual_backend}")
# Print available models
models = multi_model_manager.list_models()
print(f"Available models: {[m.id for m in models]}")
# Run server with or without HTTPS
if args.https:
if config.server.https:
import ssl
ssl_keyfile = None
ssl_certfile = None
if args.privkey and args.pubkey:
ssl_keyfile = args.privkey
ssl_certfile = args.pubkey
print(f"Using HTTPS with custom certificates: {args.pubkey}")
else:
ssl_keyfile = config.server.https_key_path
ssl_certfile = config.server.https_cert_path
if not (ssl_keyfile and ssl_certfile):
print("Generating self-signed HTTPS certificate...")
import subprocess
cert_path = config_dir / "cert.pem"
key_path = config_dir / "key.pem"
try:
cert_path = "./cert.pem"
key_path = "./key.pem"
subprocess.run([
"openssl", "req", "-x509", "-newkey", "rsa:4096",
"-keyout", key_path, "-out", cert_path,
"-days", "365", "-nodes",
"-subj", "/CN=localhost"
], check=True, capture_output=True)
ssl_keyfile = key_path
ssl_certfile = cert_path
print(f"Generated self-signed certificate: {cert_path}")
subprocess.run(
["openssl", "req", "-x509", "-newkey", "rsa:4096",
"-keyout", str(key_path), "-out", str(cert_path),
"-days", "365", "-nodes", "-subj", "/CN=localhost"],
check=True, capture_output=True
)
ssl_keyfile = str(key_path)
ssl_certfile = str(cert_path)
print(f"Generated self-signed certificate")
except Exception as e:
print(f"Warning: Could not generate certificate: {e}")
print("Falling back to HTTP...")
uvicorn.run(app, host=args.host, port=args.port)
uvicorn.run(app, host=config.server.host, port=config.server.port)
return
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ssl_context.load_cert_chain(ssl_certfile, ssl_keyfile)
uvicorn.run(app, host=args.host, port=args.port, ssl=ssl_context)
uvicorn.run(app, host=config.server.host, port=config.server.port, ssl_context=ssl_context)
else:
uvicorn.run(app, host=args.host, port=args.port)
uvicorn.run(app, host=config.server.host, port=config.server.port)
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment