Web Admin

parent f4a34bc3
......@@ -5,6 +5,8 @@
venv/
.venv/
env/
venv_all/
# Python cache
__pycache__/
......
# Backend selection file
.backend
# Virtual environments
venv/
.venv/
env/
# Python cache
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
# Debug logs
debug.log
# Test files
test_*.py
#!/bin/bash
# Build script for CoderAI - Supports NVIDIA (CUDA), Vulkan, OpenCL, and CPU backends
# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash]
# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]
# Default: all (installs all backends)
# --flash: Enable and install Flash Attention 2 (for NVIDIA GPUs)
# --venv <venv>: Specify custom virtual environment name
set -e
......@@ -16,12 +17,21 @@ NC='\033[0m' # No Color
# Determine backend and flags
BACKEND="${1:-all}"
FLASH=false
CUSTOM_VENV=""
# Check for --flash flag in any position
# Parse arguments
i=1
for arg in "$@"; do
case $arg in
--flash) FLASH=true;;
--flash)
FLASH=true
;;
--venv)
i=$((i + 1))
eval "CUSTOM_VENV=\${$i}"
;;
esac
i=$((i + 1))
done
BACKEND=$(echo "$BACKEND" | tr '[:upper:]' '[:lower:]')
......@@ -61,8 +71,23 @@ fi
echo -e "${GREEN}✓ Python version: $PYTHON_VERSION${NC}"
# Determine cmake args for stable-diffusion-cpp-python.
# The pip release is missing the libwebm/build/ cmake submodule files.
# If libwebm-dev is installed system-wide we can link against it; otherwise disable WebM.
if ldconfig -p 2>/dev/null | grep -q "libwebm" || pkg-config --exists libwebm 2>/dev/null; then
SD_CMAKE_ARGS="-DSD_USE_SYSTEM_WEBM=ON"
echo -e "${GREEN}✓ Found system libwebm — stable-diffusion-cpp-python will use it${NC}"
else
SD_CMAKE_ARGS="-DSD_WEBM=OFF"
echo -e "${YELLOW}Note: libwebm-dev not found — WebM video output disabled for stable-diffusion-cpp-python${NC}"
echo -e "${YELLOW} Install libwebm-dev to enable WebM support${NC}"
fi
# Determine venv directory based on backend
if [ "$BACKEND" = "nvidia" ]; then
if [ -n "$CUSTOM_VENV" ]; then
VENV_DIR="$CUSTOM_VENV"
echo -e "${BLUE}Using custom virtual environment: $VENV_DIR${NC}"
elif [ "$BACKEND" = "nvidia" ]; then
VENV_DIR="venv_nvidia"
elif [ "$BACKEND" = "vulkan" ]; then
VENV_DIR="venv_vulkan"
......@@ -77,7 +102,11 @@ elif [ "$BACKEND" = "all" ]; then
fi
# Create virtual environment if it doesn't exist
echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
if [ -n "$CUSTOM_VENV" ]; then
echo -e "${YELLOW}Creating custom virtual environment: $VENV_DIR${NC}"
else
echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
fi
if [ ! -d "$VENV_DIR" ]; then
python3 -m venv "$VENV_DIR"
echo -e "${GREEN}✓ Created virtual environment: $VENV_DIR${NC}"
......@@ -116,7 +145,7 @@ if [ "$BACKEND" = "nvidia" ]; then
echo ""
echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
pip install flash-attn --no-build-isolation || {
MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation || {
echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
......@@ -394,9 +423,9 @@ elif [ "$BACKEND" = "opencl" ]; then
echo -e "${YELLOW}Installing base requirements...${NC}"
pip install -r requirements.txt
# Install stable-diffusion-cpp-python with OpenCL
# Install stable-diffusion-cpp-python with OpenCL (disable WebM to avoid libwebm cmake issue)
echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL...${NC}"
pip install stable-diffusion-cpp-python || {
CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
echo ""
echo -e "${YELLOW}Note: If stable-diffusion-cpp-python is not available with pip,${NC}"
echo -e "${YELLOW}you may need to build from source.${NC}"
......@@ -448,11 +477,10 @@ elif [ "$BACKEND" = "all" ]; then
pip install whispercpp || echo -e "${YELLOW}Warning: whispercpp failed${NC}"
pip install litellm || echo -e "${YELLOW}Warning: litellm failed${NC}"
# Try procname (may fail on Python 3.13)
pip install procname || echo -e "${YELLOW}Warning: procname failed (optional)${NC}"
pip install setproctitle || echo -e "${YELLOW}Warning: setproctitle failed (optional)${NC}"
# Try stable-diffusion-cpp-python (requires CMake)
pip install stable-diffusion-cpp-python || echo -e "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
# Try stable-diffusion-cpp-python (disable WebM to avoid missing libwebm cmake submodule)
CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || echo -e "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
}
# Install PyTorch with CUDA support (for nvidia backend)
......@@ -540,7 +568,7 @@ elif [ "$BACKEND" = "all" ]; then
# Try to install stable-diffusion-cpp-python with OpenCL
if [ "$OPENCL_AVAILABLE" = true ]; then
echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"
pip install stable-diffusion-cpp-python || {
CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
echo -e "${YELLOW}Warning: stable-diffusion-cpp-python not available (requires CMake and build tools)${NC}"
}
else
......@@ -553,10 +581,10 @@ elif [ "$BACKEND" = "all" ]; then
echo -e "${YELLOW}Warning: Some additional packages failed${NC}"
}
# Try procname (optional, may fail on Python 3.13)
echo -e "${YELLOW}Installing procname (optional)...${NC}"
pip install procname || {
echo -e "${YELLOW}Note: procname failed to install (optional package, not critical)${NC}"
# Install setproctitle for process naming (Python 3.13 compatible)
echo -e "${YELLOW}Installing setproctitle...${NC}"
pip install setproctitle || {
echo -e "${YELLOW}Note: setproctitle failed to install (optional package, not critical)${NC}"
}
# Install Flash Attention 2 if requested and CUDA is available
......@@ -564,7 +592,7 @@ elif [ "$BACKEND" = "all" ]; then
echo ""
echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
pip install flash-attn --no-build-isolation || {
MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation || {
echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
......
#!/bin/bash
# Build script for CoderAI - Supports NVIDIA (CUDA), Vulkan, OpenCL, and CPU backends
# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]
# Default: all (installs all backends)
# --flash: Enable and install Flash Attention 2 (for NVIDIA GPUs)
# --venv <venv>: Specify custom virtual environment name
set -e
# Colors for output
# ANSI escape sequences stored as literal text; they only take effect because
# the messages below are printed with `echo -e`, which interprets `\033`.
# RED is used for errors, GREEN for success, YELLOW for progress and warnings,
# BLUE for banners, and NC resets the terminal color at the end of a message.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Determine backend and flags
BACKEND="all"
FLASH=false
CUSTOM_VENV=""

#######################################
# Parse the build script command line.
#
# Options may appear in any position. The first argument that is not a
# recognised option becomes the backend, so `./build.sh --flash` and
# `./build.sh --venv myenv` keep the default backend instead of treating the
# option itself as the backend. Later unrecognised arguments are ignored.
#
# Globals:
#   BACKEND     (written) lower-cased backend name, "all" when none is given
#   FLASH       (written) "true" when --flash is present, otherwise "false"
#   CUSTOM_VENV (written) value of --venv <venv> / --venv=<venv>, or empty
# Arguments:
#   The script's positional parameters ("$@").
# Returns:
#   0 on success, 2 when --venv is given without a value.
#######################################
parse_build_args() {
    local backend_seen=false
    BACKEND="all"
    FLASH=false
    CUSTOM_VENV=""
    while [ "$#" -gt 0 ]; do
        case "$1" in
            --flash)
                FLASH=true
                ;;
            --venv)
                if [ "$#" -lt 2 ] || [ -z "$2" ]; then
                    echo "Error: --venv requires a virtual environment name" >&2
                    return 2
                fi
                CUSTOM_VENV="$2"
                # Consume the value so it is not re-examined as an argument.
                shift
                ;;
            --venv=*)
                CUSTOM_VENV="${1#--venv=}"
                ;;
            *)
                # Only the first free-standing argument names the backend;
                # anything invalid is rejected by the backend check later on.
                if [ "$backend_seen" = false ]; then
                    BACKEND="$1"
                    backend_seen=true
                fi
                ;;
        esac
        shift
    done
    BACKEND=$(printf '%s' "$BACKEND" | tr '[:upper:]' '[:lower:]')
    return 0
}

parse_build_args "$@" || exit 1
# Reject unknown backends and print the full usage, including the --venv
# option that the script accepts.
case "$BACKEND" in
    nvidia|vulkan|vulkan-nvidia|cuda|opencl|all)
        ;;
    *)
        echo -e "${RED}Error: Invalid backend '$BACKEND'${NC}"
        echo "Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]"
        echo " nvidia - Use PyTorch with CUDA for NVIDIA GPUs"
        echo " vulkan - Use llama-cpp-python with Vulkan for AMD GPUs"
        echo " vulkan-nvidia - Use llama-cpp-python with Vulkan for NVIDIA GPU only"
        echo " cuda - Use llama-cpp-python with CUDA for NVIDIA GPUs"
        echo " opencl - Use stable-diffusion-cpp-python with OpenCL"
        echo " all - Install all backends (nvidia, cuda, vulkan, opencl, cpu) - DEFAULT"
        echo ""
        echo "Options:"
        echo " --flash - Install Flash Attention 2 for faster inference (NVIDIA only)"
        echo " --venv <venv> - Specify custom virtual environment name"
        exit 1
        ;;
esac
# Start-of-build banner: selected backend, plus a Flash Attention line when
# --flash was requested.
printf '%b\n' "${BLUE}========================================${NC}"
printf '%b\n' "${BLUE} CoderAI Build Script${NC}"
printf '%b\n' "${BLUE} Backend: ${GREEN}$BACKEND${NC}"
if [[ "$FLASH" == true ]]; then
    printf '%b\n' "${BLUE} Flash Attention 2: ${GREEN}ENABLED${NC}"
fi
printf '%b\n' "${BLUE}========================================${NC}"
echo
# Check Python version
REQUIRED_VERSION="3.8"

# Fail with a clear message when the interpreter is missing, instead of
# reporting an empty "found" version.
if ! command -v python3 >/dev/null 2>&1; then
    echo -e "${RED}Error: Python 3.8+ required, but python3 was not found in PATH${NC}"
    exit 1
fi

# Ask the interpreter for its own major.minor rather than scraping the
# `--version` banner with a PCRE grep.
PYTHON_VERSION=$(python3 -c 'import sys; print("%d.%d" % sys.version_info[:2])' 2>/dev/null) || PYTHON_VERSION=""
if [ -z "$PYTHON_VERSION" ]; then
    echo -e "${RED}Error: Could not determine the python3 version${NC}"
    exit 1
fi

# `sort -V` orders version strings; the requirement is met only when the
# required version sorts first (or both are equal).
if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
    echo -e "${RED}Error: Python 3.8+ required, found $PYTHON_VERSION${NC}"
    exit 1
fi
echo -e "${GREEN}✓ Python version: $PYTHON_VERSION${NC}"
# Choose the cmake flags passed (via CMAKE_ARGS) when installing
# stable-diffusion-cpp-python. The pip release lacks the libwebm/build/ cmake
# submodule files, so WebM is either linked from a system-wide libwebm or
# switched off entirely.
SD_CMAKE_ARGS="-DSD_WEBM=OFF"
if ldconfig -p 2>/dev/null | grep -q "libwebm"; then
    SD_CMAKE_ARGS="-DSD_USE_SYSTEM_WEBM=ON"
elif pkg-config --exists libwebm 2>/dev/null; then
    SD_CMAKE_ARGS="-DSD_USE_SYSTEM_WEBM=ON"
fi

if [[ "$SD_CMAKE_ARGS" == "-DSD_USE_SYSTEM_WEBM=ON" ]]; then
    printf '%b\n' "${GREEN}✓ Found system libwebm — stable-diffusion-cpp-python will use it${NC}"
else
    printf '%b\n' "${YELLOW}Note: libwebm-dev not found — WebM video output disabled for stable-diffusion-cpp-python${NC}"
    printf '%b\n' "${YELLOW} Install libwebm-dev to enable WebM support${NC}"
fi
# Pick the virtual environment directory: an explicit --venv name wins,
# otherwise each backend maps to its own fixed directory.
if [[ -n "$CUSTOM_VENV" ]]; then
    VENV_DIR="$CUSTOM_VENV"
    printf '%b\n' "${BLUE}Using custom virtual environment: $VENV_DIR${NC}"
else
    case "$BACKEND" in
        nvidia)        VENV_DIR="venv_nvidia" ;;
        vulkan)        VENV_DIR="venv_vulkan" ;;
        vulkan-nvidia) VENV_DIR="venv_vulkan_nvidia" ;;
        cuda)          VENV_DIR="venv_cuda" ;;
        opencl)        VENV_DIR="venv_opencl" ;;
        all)           VENV_DIR="venv_all" ;;
    esac
fi
# Create the virtual environment unless a usable one already exists.
# "Usable" means the activate script is present: a directory left behind
# without bin/activate is populated again instead of making the later
# `source` fail. The "Creating ..." message is printed only when a venv is
# really being created.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
    if [ -n "$CUSTOM_VENV" ]; then
        echo -e "${YELLOW}Creating custom virtual environment: $VENV_DIR${NC}"
    else
        echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
    fi
    python3 -m venv "$VENV_DIR"
    echo -e "${GREEN}✓ Created virtual environment: $VENV_DIR${NC}"
else
    echo -e "${YELLOW}Using existing virtual environment: $VENV_DIR${NC}"
fi
# Enter the virtual environment selected above.
printf '%b\n' "${YELLOW}Activating virtual environment...${NC}"
. "$VENV_DIR/bin/activate"

# pip must never prompt and must refuse to install outside a venv.
export PIP_NO_INPUT=1 PIP_REQUIRE_VIRTUALENV=1

# Bring pip itself up to date before installing anything else.
printf '%b\n' "${YELLOW}Upgrading pip...${NC}"
pip install --upgrade pip

echo
printf '%b\n' "${BLUE}Installing dependencies for $BACKEND backend...${NC}"
echo
if [ "$BACKEND" = "nvidia" ]; then
# NVIDIA/CUDA backend: PyTorch is mandatory, so a failed install aborts the
# script (set -e is active).
printf '%b\n' "${YELLOW}Installing PyTorch with CUDA support...${NC}"
pip install "torch>=2.0.0" "torchvision>=0.15.0" "torchaudio>=2.0.0"

# The NVIDIA requirements file is best-effort: warn and carry on.
printf '%b\n' "${YELLOW}Installing NVIDIA-specific requirements...${NC}"
if ! pip install -r requirements-nvidia.txt; then
    printf '%b\n' "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
fi

# Flash Attention 2 is only built when --flash was given; a failed build is
# reported but does not stop the script.
if [[ "$FLASH" == true ]]; then
    echo
    printf '%b\n' "${YELLOW}Installing Flash Attention 2...${NC}"
    printf '%b\n' "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
    if ! MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation; then
        printf '%b\n' "${RED}Warning: Flash Attention 2 installation failed${NC}"
        printf '%b\n' "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
        printf '%b\n' "${YELLOW}Continuing without Flash Attention...${NC}"
    fi
fi
# Completion banner for the NVIDIA build, printed once. The usage hint
# mentions --flash-attn only when Flash Attention was requested.
echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN} NVIDIA/CUDA build complete!${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo "Usage:"
echo " source $VENV_DIR/bin/activate"
echo " python coderai --model <huggingface-model-name>"
if [ "$FLASH" = true ]; then
    echo ""
    echo "Flash Attention 2 enabled - use --flash-attn flag when running"
fi
echo ""
echo "Example:"
echo " python coderai --model microsoft/DialoGPT-medium"
echo ""
elif [ "$BACKEND" = "vulkan" ]; then
# Vulkan backend (all GPUs): compile llama-cpp-python with GGML_VULKAN.
printf '%b\n' "${YELLOW}Installing llama-cpp-python with Vulkan support (all GPUs)...${NC}"

# Missing Vulkan development files only trigger a warning with install
# hints; the build is attempted regardless.
if ! pkg-config --exists vulkan 2>/dev/null; then
    printf '%b\n' "${YELLOW}Warning: Vulkan development libraries not found via pkg-config${NC}"
    printf '%b\n' "${YELLOW}You may need to install Vulkan drivers and SDK:${NC}"
    printf '%s\n' \
        " Debian/Ubuntu: sudo apt install libvulkan-dev vulkan-tools" \
        " Fedora: sudo dnf install vulkan-loader-devel vulkan-tools" \
        " Arch: sudo pacman -S vulkan-headers vulkan-icd-loader" \
        ""
    printf '%b\n' "${YELLOW}Attempting installation anyway...${NC}"
fi

# Look for a Vulkan shader compiler; glslc is preferred over
# glslangValidator when both are installed.
GLSLC_CMD=""
if command -v glslc >/dev/null 2>&1; then
    GLSLC_CMD="glslc"
elif command -v glslangValidator >/dev/null 2>&1; then
    GLSLC_CMD="glslangValidator"
fi
if [[ -n "$GLSLC_CMD" ]]; then
    printf '%b\n' "${GREEN}✓ Found Vulkan shader compiler: $GLSLC_CMD${NC}"
else
    printf '%b\n' "${YELLOW}Warning: glslc/glslangValidator not found in PATH${NC}"
fi

# The llama-cpp-python build is mandatory for this backend.
printf '%b\n' "${YELLOW}Building llama-cpp-python with Vulkan support...${NC}"
if ! CMAKE_ARGS="-DGGML_VULKAN=ON" pip install --upgrade llama-cpp-python --no-cache-dir; then
    printf '%b\n' "${RED}Build failed!${NC}"
    exit 1
fi

printf '%b\n' "${YELLOW}Installing Vulkan-specific requirements...${NC}"
pip install -r requirements-vulkan.txt
# Build the whispercpp Python package with Vulkan support for GPU-accelerated
# audio transcription. The whole step is best-effort: every failure is
# reported as a warning and the script continues.
echo -e "${YELLOW}Building whispercpp with Vulkan support for GPU-accelerated transcription...${NC}"
# First, uninstall any existing whispercpp (pip version doesn't have Vulkan)
pip uninstall -y whispercpp 2>/dev/null || true
# Clone whisper.cpp for its Python bindings
WHISPERCPP_DIR="$HOME/whisper.cpp"
if [ ! -d "$WHISPERCPP_DIR" ]; then
    echo "Cloning whisper.cpp..."
    git clone --depth 1 https://github.com/ggerganov/whisper.cpp "$WHISPERCPP_DIR" 2>/dev/null || {
        echo -e "${YELLOW}Warning: Could not clone whisper.cpp${NC}"
    }
fi
if [ -d "$WHISPERCPP_DIR/bindings/python" ]; then
    # Each install runs in a subshell so the script's working directory is
    # never changed, and success is reported only for the variant that was
    # actually installed.
    # CMAKE_ARGS enables Vulkan for ggml (whisper uses ggml internally).
    if (cd "$WHISPERCPP_DIR/bindings/python" \
        && CMAKE_ARGS="-DWHISPER_VULKAN=ON -DGGML_VULKAN=ON" pip install . --no-cache-dir --force-reinstall 2>/dev/null); then
        echo -e "${GREEN}✓ whispercpp with Vulkan support installed!${NC}"
    else
        # If the Vulkan build fails, try without it (falls back to CPU)
        echo -e "${YELLOW}Warning: whispercpp Vulkan build failed, will use CPU${NC}"
        if (cd "$WHISPERCPP_DIR/bindings/python" \
            && pip install . --no-cache-dir --force-reinstall 2>/dev/null); then
            echo -e "${GREEN}✓ whispercpp installed (CPU only)${NC}"
        else
            echo -e "${YELLOW}Warning: Could not install whispercpp at all${NC}"
        fi
    fi
else
    echo -e "${YELLOW}Warning: whisper.cpp Python bindings not found${NC}"
fi
# Also build the main whisper.cpp C++ with Vulkan for standalone usage.
# This step is optional: failures are reported as warnings.
echo -e "${YELLOW}Building whisper.cpp C++ with Vulkan support (optional)...${NC}"
WHISPER_DIR="$HOME/whisper.cpp"
if [ -d "$WHISPER_DIR" ]; then
    echo "Using existing whisper.cpp installation"
else
    echo "Cloning whisper.cpp..."
    git clone https://github.com/ggerganov/whisper.cpp "$WHISPER_DIR" 2>/dev/null || {
        echo -e "${YELLOW}Warning: Could not clone whisper.cpp. Audio transcription will use CPU.${NC}"
    }
fi
if [ -d "$WHISPER_DIR" ]; then
    # Configure and compile inside a subshell. The build needs two `cd`s
    # (into the checkout, then into build/), after which "$OLDPWD" points at
    # the checkout rather than the project directory; returning with
    # `cd "$OLDPWD"` would leave the rest of the script (including the
    # .backend write) running in the wrong directory. A subshell keeps the
    # caller's working directory untouched on every path.
    if ! (
        cd "$WHISPER_DIR" || exit 1
        mkdir -p build 2>/dev/null || exit 1
        cd build || exit 1
        cmake -DGGML_VULKAN=ON .. >/dev/null 2>&1 || {
            echo -e "${YELLOW}Warning: Vulkan build failed, building with OpenBLAS${NC}"
            cmake -DBUILD_SHARED_LIBS=ON .. >/dev/null 2>&1 || exit 1
        }
        make -j"$(nproc)" >/dev/null 2>&1
    ); then
        echo -e "${YELLOW}Warning: Build failed. Audio transcription will use CPU.${NC}"
    fi
    # Fetch the base model once; it is reused by later runs.
    if [ ! -f "$WHISPER_DIR/models/ggml-base.bin" ]; then
        echo "Downloading Whisper base model..."
        bash "$WHISPER_DIR/models/download-ggml-model.sh" base 2>/dev/null || {
            echo -e "${YELLOW}Warning: Could not download Whisper model.${NC}"
        }
    fi
    echo -e "${GREEN}✓ whisper.cpp ready for audio transcription!${NC}"
fi
# Completion banner and usage hint for the Vulkan build.
echo
printf '%b\n' "${GREEN}========================================${NC}"
printf '%b\n' "${GREEN} Vulkan build complete!${NC}"
printf '%b\n' "${GREEN}========================================${NC}"
printf '%s\n' \
    "" \
    "Usage:" \
    " python coderai --model <gguf-model> --backend vulkan" \
    ""
elif [ "$BACKEND" = "vulkan-nvidia" ]; then
# Vulkan backend (NVIDIA only): same Vulkan build of llama-cpp-python as the
# generic Vulkan backend.
printf '%b\n' "${YELLOW}Installing llama-cpp-python with Vulkan support (NVIDIA-only)...${NC}"

# Missing Vulkan development files are reported but do not stop the build.
if ! pkg-config --exists vulkan 2>/dev/null; then
    printf '%b\n' "${YELLOW}Warning: Vulkan development libraries not found via pkg-config${NC}"
fi

# Look for a Vulkan shader compiler; glslc is preferred over
# glslangValidator when both are installed.
GLSLC_CMD=""
if command -v glslc >/dev/null 2>&1; then
    GLSLC_CMD="glslc"
elif command -v glslangValidator >/dev/null 2>&1; then
    GLSLC_CMD="glslangValidator"
fi
if [[ -n "$GLSLC_CMD" ]]; then
    printf '%b\n' "${GREEN}✓ Found Vulkan shader compiler: $GLSLC_CMD${NC}"
else
    printf '%b\n' "${YELLOW}Warning: glslc/glslangValidator not found in PATH${NC}"
fi

# llama.cpp has no compile-time switch to exclude specific GPUs; device
# selection happens at runtime through environment variables.
printf '%b\n' "${YELLOW}Building llama-cpp-python with Vulkan support...${NC}"
if ! CMAKE_ARGS="-DGGML_VULKAN=ON" pip install --upgrade llama-cpp-python --no-cache-dir; then
    printf '%b\n' "${RED}Build failed!${NC}"
    exit 1
fi

printf '%b\n' "${YELLOW}Installing Vulkan-specific requirements...${NC}"
pip install -r requirements-vulkan.txt
# Build the whispercpp Python package with Vulkan support for GPU-accelerated
# audio transcription. Best-effort: failures only produce warnings.
echo -e "${YELLOW}Building whispercpp with Vulkan support for GPU-accelerated transcription...${NC}"
pip uninstall -y whispercpp 2>/dev/null || true
WHISPERCPP_DIR="$HOME/whisper.cpp"
if [ ! -d "$WHISPERCPP_DIR" ]; then
    git clone --depth 1 https://github.com/ggerganov/whisper.cpp "$WHISPERCPP_DIR" 2>/dev/null || true
fi
if [ -d "$WHISPERCPP_DIR/bindings/python" ]; then
    # Each install runs in a subshell so the script's working directory is
    # never changed, and the success message is printed only for the variant
    # that was actually installed (previously it was printed even when both
    # attempts failed).
    if (cd "$WHISPERCPP_DIR/bindings/python" \
        && CMAKE_ARGS="-DWHISPER_VULKAN=ON -DGGML_VULKAN=ON" pip install . --no-cache-dir --force-reinstall 2>/dev/null); then
        echo -e "${GREEN}✓ whispercpp with Vulkan support installed!${NC}"
    elif (cd "$WHISPERCPP_DIR/bindings/python" \
        && pip install . --no-cache-dir --force-reinstall 2>/dev/null); then
        echo -e "${YELLOW}Warning: whispercpp Vulkan build failed, will use CPU${NC}"
    else
        echo -e "${YELLOW}Warning: Could not install whispercpp at all${NC}"
    fi
fi
# Completion banner for the NVIDIA-only Vulkan build, with the runtime
# environment variable that restricts Vulkan to the NVIDIA ICD.
echo
printf '%b\n' "${GREEN}========================================${NC}"
printf '%b\n' "${GREEN} Vulkan (NVIDIA-only) build complete!${NC}"
printf '%b\n' "${GREEN}========================================${NC}"
printf '%s\n' \
    "" \
    "Usage:" \
    " VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \\" \
    " python coderai --model <gguf-model> --backend vulkan" \
    "" \
    "Note: This build includes both AMD and NVIDIA Vulkan support." \
    " At runtime, use VK_ICD_FILENAMES to select only NVIDIA." \
    ""
elif [ "$BACKEND" = "cuda" ]; then
# CUDA backend: llama-cpp-python compiled with GGML_CUDA (NVIDIA only).
printf '%b\n' "${YELLOW}Installing llama-cpp-python with CUDA support...${NC}"

# Report the CUDA compiler version when nvcc is on PATH; otherwise warn and
# point at the download page. Neither case stops the build.
if command -v nvcc >/dev/null 2>&1; then
    CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9.]*\),.*/\1/p')
    printf '%b\n' "${GREEN}✓ Found CUDA $CUDA_VERSION${NC}"
else
    printf '%b\n' "${YELLOW}Warning: CUDA toolkit (nvcc) not found in PATH${NC}"
    printf '%b\n' "${YELLOW}You may need to install CUDA toolkit:${NC}"
    printf '%s\n' " Download from: https://developer.nvidia.com/cuda-downloads"
fi

# Informational check for the conventional CUDA install location.
if [[ -d "/usr/local/cuda" ]]; then
    printf '%b\n' "${GREEN}✓ Found CUDA at /usr/local/cuda${NC}"
fi

# The CUDA build of llama-cpp-python is mandatory for this backend.
printf '%b\n' "${YELLOW}Building llama-cpp-python with CUDA support...${NC}"
printf '%b\n' "${YELLOW}This may take several minutes...${NC}"
if ! CMAKE_ARGS="-DGGML_CUDA=ON" pip install --upgrade llama-cpp-python --no-cache-dir; then
    echo
    printf '%b\n' "${RED}Build failed!${NC}"
    printf '%b\n' "${YELLOW}Make sure CUDA toolkit is installed:${NC}"
    printf '%s\n' \
        " sudo apt install cuda-toolkit-12" \
        " or" \
        " Download from: https://developer.nvidia.com/cuda-downloads"
    exit 1
fi

# This backend installs the same requirements file as the Vulkan backends.
printf '%b\n' "${YELLOW}Installing Vulkan-specific requirements...${NC}"
pip install -r requirements-vulkan.txt

echo
printf '%b\n' "${GREEN}========================================${NC}"
printf '%b\n' "${GREEN} llama-cpp-python CUDA build complete!${NC}"
printf '%b\n' "${GREEN}========================================${NC}"
printf '%s\n' \
    "" \
    "Usage:" \
    " source $VENV_DIR/bin/activate" \
    " python coderai --model <gguf-model> --backend vulkan --vulkan-device 0" \
    "" \
    "Note: With CUDA backend, llama-cpp-python will only use NVIDIA GPUs." \
    ""
elif [ "$BACKEND" = "opencl" ]; then
# OpenCL backend: installs stable-diffusion-cpp-python and its helpers.
printf '%b\n' "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"

# OpenCL counts as present when clinfo is on PATH or an OpenCL library is
# found under /usr/lib/*/ ; absence is only a warning.
if command -v clinfo >/dev/null 2>&1 || ls /usr/lib/*/libOpenCL* >/dev/null 2>&1; then
    printf '%b\n' "${GREEN}✓ Found OpenCL${NC}"
else
    printf '%b\n' "${YELLOW}Warning: OpenCL not found in system${NC}"
    printf '%b\n' "${YELLOW}You may need to install OpenCL runtime:${NC}"
    printf '%s\n' \
        " Debian/Ubuntu: sudo apt install ocl-icd-opencl-dev" \
        " Fedora: sudo dnf install ocl-icd-devel"
fi

# Base requirements are mandatory.
printf '%b\n' "${YELLOW}Installing base requirements...${NC}"
pip install -r requirements.txt

# stable-diffusion-cpp-python is built with the WebM-related cmake flags
# chosen earlier in SD_CMAKE_ARGS; a failure only prints a hint.
printf '%b\n' "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL...${NC}"
if ! CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python; then
    echo
    printf '%b\n' "${YELLOW}Note: If stable-diffusion-cpp-python is not available with pip,${NC}"
    printf '%b\n' "${YELLOW}you may need to build from source.${NC}"
fi

# Image helper libraries are mandatory.
printf '%b\n' "${YELLOW}Installing additional requirements for OpenCL...${NC}"
pip install numpy pillow

echo
printf '%b\n' "${GREEN}========================================${NC}"
printf '%b\n' "${GREEN} OpenCL build complete!${NC}"
printf '%b\n' "${GREEN}========================================${NC}"
printf '%s\n' \
    "" \
    "Usage:" \
    " source $VENV_DIR/bin/activate" \
    " python coderai --model <model> --image-backend opencl" \
    "" \
    "Note: With OpenCL backend, stable-diffusion-cpp-python can use various GPUs." \
    ""
elif [ "$BACKEND" = "all" ]; then
# "all" backend: NVIDIA (CUDA), Vulkan, OpenCL and CPU in one environment.
printf '%b\n' "${BLUE}========================================${NC}"
printf '%b\n' "${BLUE} Installing ALL backends${NC}"
printf '%b\n' "${BLUE} (NVIDIA/CUDA, Vulkan, OpenCL, CPU)${NC}"
printf '%b\n' "${BLUE}========================================${NC}"
echo

# Base requirements
printf '%b\n' "${YELLOW}Installing base requirements...${NC}"
pip install --upgrade pip

# Try the full requirements file first. If it fails, install the core
# packages (mandatory) and then each optional package on its own so a single
# broken package cannot block the rest.
printf '%b\n' "${YELLOW}Installing core dependencies...${NC}"
if ! pip install -r requirements.txt; then
    printf '%b\n' "${YELLOW}Some packages failed to install, trying individually...${NC}"

    # Core packages: failure here is fatal.
    if ! pip install fastapi uvicorn pydantic requests python-multipart psutil; then
        printf '%b\n' "${RED}Failed to install core dependencies${NC}"
        exit 1
    fi

    # Optional packages: each failure is only a warning.
    printf '%b\n' "${YELLOW}Installing optional packages...${NC}"
    if ! pip install transformers accelerate diffusers safetensors; then
        printf '%b\n' "${YELLOW}Warning: Some ML packages failed${NC}"
    fi
    if ! pip install faster-whisper; then
        printf '%b\n' "${YELLOW}Warning: faster-whisper failed${NC}"
    fi
    if ! pip install whispercpp; then
        printf '%b\n' "${YELLOW}Warning: whispercpp failed${NC}"
    fi
    if ! pip install litellm; then
        printf '%b\n' "${YELLOW}Warning: litellm failed${NC}"
    fi
    if ! pip install setproctitle; then
        printf '%b\n' "${YELLOW}Warning: setproctitle failed (optional)${NC}"
    fi
    # stable-diffusion-cpp-python uses the WebM cmake flags from SD_CMAKE_ARGS.
    if ! CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python; then
        printf '%b\n' "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
    fi
fi
# PyTorch for the NVIDIA backend: try the default wheels first, then the
# CPU-only index; if both fail the build is aborted.
printf '%b\n' "${YELLOW}Installing PyTorch with CUDA support (NVIDIA backend)...${NC}"
if ! pip install torch torchvision torchaudio; then
    printf '%b\n' "${YELLOW}Warning: PyTorch installation failed, will try CPU version${NC}"
    if ! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu; then
        printf '%b\n' "${RED}Failed to install PyTorch${NC}"
        exit 1
    fi
fi

# NVIDIA extras are best-effort.
printf '%b\n' "${YELLOW}Installing NVIDIA-specific requirements...${NC}"
if ! pip install -r requirements-nvidia.txt; then
    printf '%b\n' "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
fi
# Probe the host for GPU toolchains. Each probe records "true"/"false" in a
# *_AVAILABLE flag that later steps use to decide what to build.

# Vulkan: development files visible to pkg-config.
if pkg-config --exists vulkan 2>/dev/null; then
    VULKAN_AVAILABLE=true
    printf '%b\n' "${GREEN}✓ Found Vulkan development libraries${NC}"
else
    VULKAN_AVAILABLE=false
    printf '%b\n' "${YELLOW}Warning: Vulkan development libraries not found${NC}"
    printf '%b\n' "${YELLOW} Vulkan support will be limited${NC}"
fi

# CUDA: nvcc on PATH or the conventional /usr/local/cuda directory.
if command -v nvcc >/dev/null 2>&1 || [[ -d "/usr/local/cuda" ]]; then
    CUDA_AVAILABLE=true
    printf '%b\n' "${GREEN}✓ Found CUDA toolkit${NC}"
else
    CUDA_AVAILABLE=false
    printf '%b\n' "${YELLOW}Warning: CUDA toolkit not found${NC}"
    printf '%b\n' "${YELLOW} CUDA support will be limited${NC}"
fi

# OpenCL: clinfo on PATH or an OpenCL library under /usr/lib/*/ .
if command -v clinfo >/dev/null 2>&1 || ls /usr/lib/*/libOpenCL* >/dev/null 2>&1; then
    OPENCL_AVAILABLE=true
    printf '%b\n' "${GREEN}✓ Found OpenCL${NC}"
else
    OPENCL_AVAILABLE=false
    printf '%b\n' "${YELLOW}Warning: OpenCL not found${NC}"
    printf '%b\n' "${YELLOW} OpenCL support will be limited${NC}"
fi
# Build llama-cpp-python with every GPU backend that was detected.
printf '%b\n' "${YELLOW}Building llama-cpp-python with CUDA and Vulkan support...${NC}"
printf '%b\n' "${YELLOW}This may take several minutes...${NC}"

# Assemble the cmake flags from the detection results; flags are separated
# by a single space when both backends are enabled.
CMAKE_ARGS=""
if [[ "$CUDA_AVAILABLE" == true ]]; then
    CMAKE_ARGS="-DGGML_CUDA=ON"
    printf '%b\n' "${GREEN} ✓ Enabling CUDA support${NC}"
fi
if [[ "$VULKAN_AVAILABLE" == true ]]; then
    CMAKE_ARGS="${CMAKE_ARGS:+$CMAKE_ARGS }-DGGML_VULKAN=ON"
    printf '%b\n' "${GREEN} ✓ Enabling Vulkan support${NC}"
fi

if [[ -z "$CMAKE_ARGS" ]]; then
    # No GPU toolchain at all: plain CPU package.
    printf '%b\n' "${YELLOW}Warning: No GPU backends available, installing CPU version${NC}"
    pip install llama-cpp-python
else
    printf '%b\n' "${YELLOW} Building with: $CMAKE_ARGS${NC}"
    # If the GPU build fails, fall back to a plain install.
    if ! CMAKE_ARGS="$CMAKE_ARGS" pip install --upgrade llama-cpp-python --no-cache-dir; then
        printf '%b\n' "${YELLOW}Warning: llama-cpp-python build failed, installing from pip${NC}"
        pip install llama-cpp-python
    fi
fi
# Remaining packages for the "all" environment. Every step here is
# best-effort: failures are reported and the build continues.

# Vulkan requirements file
printf '%b\n' "${YELLOW}Installing Vulkan-specific requirements...${NC}"
if ! pip install -r requirements-vulkan.txt; then
    printf '%b\n' "${YELLOW}Warning: Some Vulkan packages failed to install${NC}"
fi

# stable-diffusion-cpp-python is only attempted when OpenCL was detected.
if [[ "$OPENCL_AVAILABLE" != true ]]; then
    printf '%b\n' "${YELLOW}Skipping OpenCL (stable-diffusion-cpp-python) - OpenCL not available${NC}"
else
    printf '%b\n' "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"
    if ! CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python; then
        printf '%b\n' "${YELLOW}Warning: stable-diffusion-cpp-python not available (requires CMake and build tools)${NC}"
    fi
fi

# Image helper libraries
printf '%b\n' "${YELLOW}Installing additional requirements...${NC}"
if ! pip install numpy pillow; then
    printf '%b\n' "${YELLOW}Warning: Some additional packages failed${NC}"
fi

# setproctitle provides process naming (Python 3.13 compatible)
printf '%b\n' "${YELLOW}Installing setproctitle...${NC}"
if ! pip install setproctitle; then
    printf '%b\n' "${YELLOW}Note: setproctitle failed to install (optional package, not critical)${NC}"
fi
# Install Flash Attention 2 if requested and CUDA is available.
# The build uses the same parallelism limits as the nvidia backend
# (MAX_JOBS=5 NVCC_THREADS=2) so both code paths compile flash-attn the same
# way. A failed build is reported but does not abort the script.
if [ "$FLASH" = true ] && [ "$CUDA_AVAILABLE" = true ]; then
    echo ""
    echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
    echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
    MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation || {
        echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
        echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
        echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
    }
elif [ "$FLASH" = true ]; then
    # --flash was given but no CUDA toolchain was detected.
    echo -e "${YELLOW}Warning: Flash Attention 2 requires CUDA backend${NC}"
    echo -e "${YELLOW}Skipping Flash Attention installation${NC}"
fi
# Final summary for the "all" build: which backends were detected and how to
# run each of them.
echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN} ALL backends build complete!${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo "Available backends:"
if [ "$CUDA_AVAILABLE" = true ]; then
    echo " ✓ NVIDIA/CUDA (PyTorch)"
    echo " ✓ CUDA (llama-cpp-python)"
fi
if [ "$VULKAN_AVAILABLE" = true ]; then
    echo " ✓ Vulkan (llama-cpp-python)"
fi
if [ "$OPENCL_AVAILABLE" = true ]; then
    echo " ✓ OpenCL (stable-diffusion-cpp-python)"
fi
echo " ✓ CPU (fallback for all)"
if [ "$FLASH" = true ] && [ "$CUDA_AVAILABLE" = true ]; then
    echo ""
    echo " ✓ Flash Attention 2 (NVIDIA)"
fi
echo ""
echo "Usage:"
echo " source $VENV_DIR/bin/activate"
echo ""
echo " # For text models with NVIDIA:"
echo " python coderai --model <model> --backend nvidia"
# Advertise --flash-attn only when Flash Attention was actually attempted,
# i.e. under the same condition as the "Flash Attention 2" line above; the
# install is skipped when CUDA is unavailable.
if [ "$FLASH" = true ] && [ "$CUDA_AVAILABLE" = true ]; then
    echo " python coderai --model <model> --backend nvidia --flash-attn"
fi
echo ""
echo " # For GGUF models with CUDA:"
echo " python coderai --model <gguf-model> --backend vulkan"
echo ""
echo " # For GGUF models with Vulkan:"
echo " python coderai --model <gguf-model> --backend vulkan"
echo ""
echo " # For image generation with OpenCL:"
echo " python coderai --model <model> --image-backend opencl"
echo ""
fi
# Record the backend that was built in ./.backend (one line), then tell the
# user how to re-enter the environment later.
printf '%s\n' "$BACKEND" > .backend
printf '%b\n' "${GREEN}Build completed successfully!${NC}"
printf '%s\n' \
    "" \
    "To activate the environment in the future, run:" \
    " source $VENV_DIR/bin/activate"
......@@ -18,10 +18,10 @@ def main():
original_unraisablehook(unraisable)
sys.unraisablehook = suppress_llama_del_errors
# Optional: set process name if procname is available
# Optional: set process name if setproctitle is available
try:
import procname
procname.setprocname("codai")
import setproctitle
setproctitle.setproctitle("codai")
except ImportError:
pass
......
"""Main entry point for codai server."""
import sys
import os
# Import configuration from codai modules
from codai.cli import parse_args
def main():
"""Main entry point for the codai server."""
# Suppress unraisable exceptions from LlamaModel.__del__
original_unraisablehook = sys.unraisablehook
def suppress_llama_del_errors(unraisable):
if isinstance(unraisable.exc_value, AttributeError) and 'LlamaModel' in repr(unraisable.object) and 'sampler' in str(unraisable.exc_value):
return # Ignore this specific error
original_unraisablehook(unraisable)
sys.unraisablehook = suppress_llama_del_errors
# Optional: set the process name to "codai".
# build.sh now installs setproctitle instead of procname (procname may fail
# to install on Python 3.13), so setproctitle is tried first. procname is
# kept as a fallback for environments created by an older build. If neither
# module is importable the process name is simply left unchanged.
try:
    import setproctitle
    setproctitle.setproctitle("codai")
except ImportError:
    try:
        import procname
        procname.setprocname("codai")
    except ImportError:
        pass
args = parse_args()
# Handle early exit options (before heavy imports)
if args.list_cached_models:
print("\n=== Listing Cached Models ===")
# Import only what's needed for cache listing
from codai.models.cache import list_cached_models_info, get_all_cache_dirs
cache_info = list_cached_models_info()
caches = get_all_cache_dirs()
# Show CoderAI GGUF cache
coderai_dir = caches.get('coderai')
if coderai_dir:
print(f"\n--- CODERAI GGUF Cache ({coderai_dir}) ---")
if cache_info['coderai']:
for filename, size_mb in cache_info['coderai']:
print(f" {filename} ({size_mb:.1f} MB)")
else:
print(" No cached GGUF files.")
else:
print(f"\n--- CODERAI GGUF Cache ---")
print(" (directory not found)")
# Show HuggingFace cached models
hf_dir = caches.get('huggingface')
if hf_dir:
print(f"\n--- HUGGINGFACE Models Cache ({hf_dir}) ---")
if cache_info['huggingface']:
for repo_id, size_gb, revision_count in cache_info['huggingface']:
print(f" {repo_id} ({size_gb:.2f} GB)")
print(f" └─ {revision_count} revision(s)")
else:
print(" No cached HuggingFace models.")
else:
print(f"\n--- HUGGINGFACE Models Cache ---")
print(" (directory not found)")
# Show summary
print(f"\n=== Summary ===")
print(f"Total cached models: {cache_info['total_models']}")
print(f"Total disk usage: {cache_info['total_size_gb']:.2f} GB")
print("\nCache locations:")
for cache_name, cache_dir in caches.items():
print(f" {cache_name}: {cache_dir}")
sys.exit(0)
# Handle --remove-all-models early
if args.remove_all_models:
print("\n=== Removing All Cached Models ===")
from codai.models.cache import remove_all_cached_models
total_removed = remove_all_cached_models()
print(f"\n=== Removed {total_removed} item(s) from all caches ===")
sys.exit(0)
# Handle --remove-model early
if args.remove_model:
print(f"\n=== Removing Cached Model Matching: {args.remove_model} ===")
from codai.models.cache import remove_cached_model
removed = remove_cached_model(args.remove_model)
if not removed:
print(f"No cached models found matching: {args.remove_model}")
print(f"\nUse --list-cached-models to see available models.")
sys.exit(0)
total_size = sum(size for _, _, size in removed)
print(f"\nRemoved {len(removed)} cached model file(s), freeing {total_size / (1024*1024):.1f} MB")
sys.exit(0)
# Handle --download-model early (before heavy imports)
if args.download_model:
print(f"\n=== Downloading Model: {args.download_model} ===")
from codai.models.cache import download_model
try:
cached_path = download_model(args.download_model)
if cached_path:
print(f"\n=== Model downloaded successfully ===")
print(f"Cached at: {cached_path}")
sys.exit(0)
else:
print(f"\n=== Failed to download model ===")
sys.exit(1)
except Exception as e:
print(f"\n=== Error downloading model: {e} ===")
sys.exit(1)
# Import globals from codai modules (only after early exits)
from codai.api import app
from codai.api.state import (
set_global_args,
set_global_debug,
set_global_system_prompt,
set_global_tools_closer_prompt,
set_global_file_path,
set_load_mode,
set_grammar_guided_gen,
)
from codai.models.manager import ModelManager, MultiModelManager, model_manager, multi_model_manager
from codai.backends import detect_available_backends
from codai.models.cache import (
get_all_cache_dirs,
get_cached_model_path,
get_model_cache_dir,
download_model,
list_cached_models_info,
)
# Import global setters from text module FIRST (before calling them)
# NOTE(review): these four setter names were already imported from
# codai.api.state earlier in this scope; importing them again rebinds the
# names, so the calls that follow reach only the codai.api.text versions.
# set_load_mode is likewise rebound to the codai.api.app version.
# Confirm that the codai.api.state setters do not also need to be called.
from codai.api.text import (
set_global_args,
set_global_debug,
set_global_system_prompt,
set_global_tools_closer_prompt,
)
from codai.api.app import set_load_mode
# Store args globally for access in endpoints (both state and text.py)
set_global_args(args)
# Set global variables
global global_system_prompt, global_tools_closer_prompt, global_debug, global_dump, global_file_path, grammar_guided_gen
# Set global grammar-guided-gen flag
# NOTE(review): set_grammar_guided_gen is imported here but is not called
# anywhere in the visible code; only this module's global is assigned.
# Confirm whether the setter should be invoked with args.grammar_guided_gen.
from codai.api.state import set_grammar_guided_gen
grammar_guided_gen = args.grammar_guided_gen
if grammar_guided_gen:
print("Grammar-guided generation enabled (--grammar-guided-gen)")
# Set global system prompt from --system-prompt flag
global_system_prompt = args.system_prompt
set_global_system_prompt(global_system_prompt)
# Set global tools-closer-prompt flag
global_tools_closer_prompt = args.tools_closer_prompt
set_global_tools_closer_prompt(global_tools_closer_prompt)
if global_tools_closer_prompt:
print("Tools closer prompt enabled (--tools-closer-prompt)")
# Set global debug flag
global_debug = args.debug
set_global_debug(global_debug)
# Set global dump flag (enables debug as well for litellm output)
global_dump = args.dump
if global_dump:
global_debug = True
set_global_debug(True)
# Set global file path for storing generated files
global_file_path = args.file_path
set_global_file_path(global_file_path)
# Also set file path for images module
from codai.api.images import set_global_file_path as set_images_file_path
set_images_file_path(global_file_path)
# Also set global args for images module (it has its own global_args)
from codai.api.images import set_global_args as set_images_global_args
set_images_global_args(args)
# Also set file path for app.py (needed for /v1/files endpoint)
from codai.api.app import set_global_file_path_wrapper
set_global_file_path_wrapper(global_file_path)
if global_debug:
# Print the full command line that was used to invoke codai
import shlex
cmd_line = ' '.join(shlex.quote(arg) for arg in sys.argv)
print(f"\n{'='*80}")
print(f"=== COMMAND LINE: {cmd_line}")
print(f"{'='*80}\n")
print("DEBUG MODE ENABLED - Full requests and replies will be dumped to stdout")
# Handle --vulkan-list-devices
if args.vulkan_list_devices:
print("\nListing Vulkan devices...")
try:
import subprocess
result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
if result.returncode == 0:
print(result.stdout)
else:
print("Could not run vulkaninfo. Make sure vulkan-tools is installed.")
except Exception as e:
print(f"Error listing devices: {e}")
sys.exit(0)
# Get model names from args - support multiple models
model_names = args.model if args.model else []
# Helper function to get config value by index with fallback
def get_ctx_by_index(ctx_list, index, default):
    """Get context value by model index, with fallback to default.

    Args:
        ctx_list: Sequence of per-model values; may be None or empty when
            the corresponding option was not supplied.
        index: Zero-based position of the model.
        default: Value returned when ctx_list has no entry at index.

    Returns:
        ctx_list[index] when 0 <= index < len(ctx_list), otherwise default.
    """
    # Negative positions are rejected explicitly: Python indexing would
    # otherwise wrap around and return a value meant for a different model.
    if ctx_list and 0 <= index < len(ctx_list):
        return ctx_list[index]
    return default
# Validate: must have at least one model specified
audio_models = args.audio_model if args.audio_model else []
image_models = args.image_model if args.image_model else []
vision_models = args.vision_model if args.vision_model else []
if not model_names and not audio_models and not image_models and not vision_models and args.tts_model is None:
print("Error: At least one of --model, --audio-model, --image-model, --vision-model, or --tts-model must be specified.")
print("")
print("For NVIDIA backend (HuggingFace models):")
print(" - microsoft/DialoGPT-medium")
print(" - meta-llama/Llama-2-7b-chat-hf (requires auth)")
print(" - TinyLlama/TinyLlama-1.1B-Chat-v1.0")
print(" - Use multiple --model flags for multiple models")
print("")
print("For Vulkan backend (GGUF models):")
print(" - Local path: ./phi-3-mini-4k-instruct-q4_k_m.gguf")
print(" - Or a HuggingFace model ID: TheBloke/Mistral-7B-Instruct-v0.2-GGUF")
print(" - Use multiple --model flags for multiple models")
print("")
sys.exit(1)
# Determine load mode
# Default is ondemand: pre-load only the first model, unload/load on switch
# --loadswap: load first in VRAM, others in CPU RAM, swap on switch
# --loadall: try to load all models in VRAM, offload to CPU RAM if fails
# --nopreload: skip pre-loading in any mode, load on first request
load_mode = "ondemand" # Default: on-demand loading
if args.loadall:
load_mode = "loadall"
elif args.loadswap:
load_mode = "loadswap"
nopreload = args.nopreload
set_load_mode(load_mode)
multi_model_manager.set_load_mode(load_mode)
if load_mode == "ondemand":
print("Load mode: ondemand (pre-load first model, unload/load on switch)")
elif load_mode == "loadswap":
print("Load mode: loadswap (first model in VRAM, others in CPU RAM, swap on switch)")
elif load_mode == "loadall":
print("Load mode: loadall (load all models, offload to CPU RAM if VRAM full)")
if nopreload:
print(" --nopreload: models will load on first request instead of at startup")
# Initialize model manager
print("\n=== Initializing Model Manager ===")
# Detect available backends
available_backends = detect_available_backends()
print(f"Available backends: {available_backends}")
# Determine which backend to use
backend = args.backend
if backend == "auto":
if "nvidia" in available_backends:
backend = "nvidia"
elif "vulkan" in available_backends:
backend = "vulkan"
elif "opencl" in available_backends:
backend = "opencl"
else:
print("Error: No supported backend detected (NVIDIA CUDA, AMD Vulkan, or OpenCL)")
sys.exit(1)
print(f"Using backend: {backend}")
# Set the backend for the model manager
model_manager.backend_type = backend
# Store references globally for API endpoints
from codai.api import app as fastapi_app
fastapi_app.state.model_manager = model_manager
fastapi_app.state.multi_model_manager = multi_model_manager
# Load main text model(s)
if model_names:
print(f"\nMain text model(s): {model_names}")
# Register models with multi_model_manager (set_default_model also resolves/caches)
for idx, model_name in enumerate(model_names):
multi_model_manager.set_default_model(model_name, {
'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
})
# Pre-load models at startup (unless --nopreload)
if nopreload:
print(f" --nopreload: text model(s) will load on first request")
elif load_mode == "ondemand":
# Ondemand: pre-load only the first model into VRAM
try:
print(f"Preloading first model into VRAM: {model_names[0]}...")
mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully: {model_names[0]}")
else:
print(f"Warning: Model {model_names[0]} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model: {e}")
print(f"Model will load on first request")
elif load_mode == "loadswap":
# Loadswap: load first model into VRAM, others into CPU RAM
try:
print(f"Preloading first model into VRAM: {model_names[0]}...")
mm = multi_model_manager._load_default_model()
if mm is not None and mm.backend is not None:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_names[0]}")
else:
print(f"Warning: Model {model_names[0]} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model: {e}")
# Load remaining text models into CPU RAM
for idx, model_name in enumerate(model_names[1:], 1):
try:
print(f"Preloading model into CPU RAM: {model_name}...")
mm2 = multi_model_manager._load_model_by_name(model_name)
if mm2 is not None:
# Move to CPU immediately (it was loaded into VRAM by default)
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
print(f"Warning: Failed to preload model {model_name}: {e}")
elif load_mode == "loadall":
# Loadall: try to load all models into VRAM, offload to CPU RAM if fails
for idx, model_name in enumerate(model_names):
try:
if idx == 0:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_default_model()
else:
print(f"Preloading model into VRAM: {model_name}...")
mm = multi_model_manager._load_model_by_name(model_name)
if mm is not None and (not hasattr(mm, 'backend') or mm.backend is not None):
if idx == 0:
multi_model_manager.active_in_vram = multi_model_manager.default_model
print(f"Model loaded successfully (VRAM): {model_name}")
else:
print(f"Warning: Model {model_name} failed to load")
except Exception as e:
error_msg = str(e).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
if is_oom:
print(f"VRAM full for {model_name}, offloading to CPU RAM...")
try:
mm = multi_model_manager._load_model_by_name(model_name)
if mm is not None:
multi_model_manager._move_model_to_cpu(model_name)
print(f"Model loaded successfully (CPU RAM): {model_name}")
except Exception as e2:
print(f"Warning: Failed to load model {model_name} even to CPU: {e2}")
else:
print(f"Warning: Failed to preload model {model_name}: {e}")
# Set up audio model if specified
if audio_models:
print(f"\nAudio transcription model(s): {audio_models}")
for idx, audio_m in enumerate(audio_models):
multi_model_manager.set_audio_model(audio_m, {
'ctx': get_ctx_by_index(args.audio_ctx, idx, 0),
'offload': args.audio_offload,
})
# Set up whisper-server if specified
if args.whisper_server:
print(f"\nWhisper server: {args.whisper_server}")
print(f" Port: {args.whisper_server_port}")
# Import WhisperServerManager
from codai.models.manager import WhisperServerManager
# Check if whisper-server is already running
if multi_model_manager.whisper_server is None:
whisper_server_mgr = WhisperServerManager(
server_path=args.whisper_server,
port=args.whisper_server_port
)
multi_model_manager.whisper_server = whisper_server_mgr
else:
whisper_server_mgr = multi_model_manager.whisper_server
print("Whisper server already running, using existing instance")
# Start whisper-server if we have audio_models configured
if audio_models:
model_to_use = audio_models[0] if audio_models else None
gpu_device = getattr(args, 'audio_vulkan_device', 0) or 0
print(f"DEBUG: Starting whisper-server with gpu_device={gpu_device}")
actual_model_path = whisper_server_mgr.start(model_path=model_to_use, gpu_device=gpu_device)
if actual_model_path:
# Update audio_models in multi_model_manager to store the actual path (not the URL)
if model_to_use != actual_model_path:
if multi_model_manager.audio_models and multi_model_manager.audio_models[0] == model_to_use:
multi_model_manager.audio_models[0] = actual_model_path
print(f"Whisper server started with model: {actual_model_path}")
else:
print("Warning: Failed to start whisper-server, falling back to other backends")
# Set up image model if specified
if image_models:
print(f"\nImage generation model(s): {image_models}")
for idx, img_m in enumerate(image_models):
multi_model_manager.set_image_model(img_m, {
'ctx': get_ctx_by_index(args.image_ctx, idx, 0),
'offload': args.image_offload,
'llm_path': args.llm_path,
'vae_path': args.vae_path,
'sample_method': args.image_sample_method,
'steps': args.image_steps,
'width': args.image_width,
'height': args.image_height,
'cfg_scale': args.image_cfg_scale,
})
# Set up vision model if specified
if vision_models:
print(f"\nVision model(s): {vision_models}")
for idx, vision_m in enumerate(vision_models):
multi_model_manager.set_vision_model(vision_m, {
'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
'offload': args.image_offload,
})
# Set up TTS model if specified
if args.tts_model:
print(f"\nText-to-speech model: {args.tts_model}")
multi_model_manager.set_tts_model(args.tts_model, {})
# Register model aliases if specified
if args.model_aliases:
print(f"\nRegistering model aliases:")
for alias, model in args.model_aliases:
multi_model_manager.set_model_alias(alias, model)
print(f" {alias} -> {model}")
# =========================================================================
# Pre-load non-text models for loadall and loadswap modes
# (Text models are already handled above)
# =========================================================================
if not nopreload and load_mode in ("loadall", "loadswap"):
# Collect all non-text models that need pre-loading
# For loadall: load all into VRAM (offload to CPU if OOM)
# For loadswap: first model in VRAM (already done for text), rest in CPU RAM
# Determine if the first text model is already in VRAM
first_model_loaded = multi_model_manager.active_in_vram is not None
# Pre-load image models
if image_models:
print(f"\n=== Pre-loading image model(s) ===")
for idx, img_m in enumerate(image_models):
model_key = f"image:{img_m}"
if model_key in multi_model_manager.models:
continue # Already loaded
try:
from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model
if load_mode == "loadall":
# Try to load into VRAM
print(f"Preloading image model into VRAM: {img_m}...")
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
print(f"Image model loaded (VRAM, sd.cpp): {img_m}")
else:
try:
pipeline = _load_diffusers_pipeline(img_m, args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
print(f"Image model loaded (VRAM, diffusers): {img_m}")
except Exception as e:
error_msg = str(e).lower()
is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
if is_oom:
print(f"VRAM full for image model {img_m}, will load on demand")
else:
print(f"Warning: Failed to preload image model {img_m}: {e}")
elif load_mode == "loadswap":
# Load into VRAM then move to CPU (unless it's the first model overall)
if not first_model_loaded:
# No model in VRAM yet, load this one into VRAM
print(f"Preloading image model into VRAM: {img_m}...")
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
first_model_loaded = True
print(f"Image model loaded (VRAM): {img_m}")
else:
try:
pipeline = _load_diffusers_pipeline(img_m, args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
first_model_loaded = True
print(f"Image model loaded (VRAM): {img_m}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
else:
# First model already in VRAM, load this to VRAM then move to CPU
print(f"Preloading image model into CPU RAM: {img_m}...")
# Move current VRAM model to CPU temporarily
current_vram = multi_model_manager.active_in_vram
if current_vram and current_vram in multi_model_manager.models:
multi_model_manager._move_model_to_cpu(current_vram)
try:
if _is_gguf_model(img_m):
resolved_path = multi_model_manager.load_model(img_m)
if resolved_path and os.path.isfile(resolved_path):
sd_model = _load_sdcpp_model(resolved_path, args)
if sd_model:
multi_model_manager.add_model(model_key, sd_model)
multi_model_manager._move_model_to_cpu(model_key)
print(f"Image model loaded (CPU RAM): {img_m}")
else:
pipeline = _load_diffusers_pipeline(img_m, args)
if pipeline:
multi_model_manager.add_model(model_key, pipeline)
multi_model_manager._move_model_to_cpu(model_key)
print(f"Image model loaded (CPU RAM): {img_m}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
# Move original model back to VRAM
if current_vram and current_vram in multi_model_manager.models:
multi_model_manager._move_model_to_vram(current_vram)
multi_model_manager.active_in_vram = current_vram
except ImportError as e:
print(f"Warning: Cannot preload image model {img_m} (missing dependency): {e}")
except Exception as e:
print(f"Warning: Failed to preload image model {img_m}: {e}")
# Note: Audio models (faster-whisper) and TTS models (kokoro) are loaded
# by their respective API modules on first request, as they use specialized
# loading mechanisms. The model files are already cached by set_audio_model()
# and set_tts_model() above.
if audio_models:
print(f"\nAudio model(s) registered and cached, will load into memory on first request")
if args.tts_model:
print(f"TTS model registered and cached, will load into memory on first request")
# Start the server
import uvicorn
print(f"\nStarting server on http://{args.host}:{args.port}")
print(f"API documentation available at http://{args.host}:{args.port}/docs")
if model_manager.backend is not None:
actual_backend = model_manager.backend_type
if hasattr(model_manager.backend, 'force_cuda') and model_manager.backend.force_cuda:
actual_backend = "cuda (via llama-cpp-python)"
print(f"Using backend: {actual_backend}")
# Print available models
models = multi_model_manager.list_models()
print(f"Available models: {[m.id for m in models]}")
# Run server with or without HTTPS
# uvicorn.run() has no `ssl` keyword; TLS is enabled by passing the key and
# certificate file paths (ssl_keyfile / ssl_certfile) and uvicorn builds the
# SSL context itself. An empty ssl_options dict means plain HTTP.
ssl_options = {}
if args.https:
    if args.privkey and args.pubkey:
        # Caller supplied both halves of the key pair: use them as-is.
        ssl_options = {"ssl_keyfile": args.privkey, "ssl_certfile": args.pubkey}
        print(f"Using HTTPS with custom certificates: {args.pubkey}")
    else:
        print("Generating self-signed HTTPS certificate...")
        import subprocess
        try:
            cert_path = "./cert.pem"
            key_path = "./key.pem"
            # check=True raises if openssl exits non-zero; a missing openssl
            # binary raises as well. Both land in the fallback below.
            subprocess.run([
                "openssl", "req", "-x509", "-newkey", "rsa:4096",
                "-keyout", key_path, "-out", cert_path,
                "-days", "365", "-nodes",
                "-subj", "/CN=localhost"
            ], check=True, capture_output=True)
            ssl_options = {"ssl_keyfile": key_path, "ssl_certfile": cert_path}
            print(f"Generated self-signed certificate: {cert_path}")
        except Exception as e:
            # Best-effort: serve over HTTP rather than refusing to start.
            print(f"Warning: Could not generate certificate: {e}")
            print("Falling back to HTTP...")
            ssl_options = {}
uvicorn.run(app, host=args.host, port=args.port, **ssl_options)
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()
......@@ -15,7 +15,7 @@ psutil>=5.9.0
# Optional: Audio transcription dependencies
faster-whisper>=0.10.0 # For NVIDIA/CUDA whisper transcription
whispercpp>=1.0.0 # Alternative whisper library (works without PyTorch)
whispercpp>=0.0.17 # Alternative whisper library (works without PyTorch)
# Optional: for better performance with NVIDIA GPUs
bitsandbytes>=0.41.0
......
......@@ -17,4 +17,4 @@ huggingface-hub>=0.19.0
# Optional: Audio transcription without PyTorch (whispercpp)
# Note: faster-whisper requires PyTorch, but whispercpp works without it
whispercpp>=1.0.0 # For GGUF-based Whisper transcription without PyTorch
whispercpp>=0.0.17 # For GGUF-based Whisper transcription without PyTorch
......@@ -36,15 +36,15 @@ transformers>=4.35.0
accelerate>=0.24.0
diffusers>=0.25.0 # For Stable Diffusion image generation
safetensors>=0.4.0 # Required by diffusers
stable-diffusion-cpp-python # For Vulkan/AMD image generation (no version pin for Python 3.13 compat)
# stable-diffusion-cpp-python is installed by build.sh with CMAKE_ARGS to fix the libwebm submodule issue
# System resource detection
psutil>=5.9.0
procname # Process naming (no version pin for Python 3.13 compatibility)
setproctitle>=1.1 # Process naming (replaces procname, Python 3.13 compatible)
# Optional: Audio transcription dependencies
faster-whisper>=0.10.0 # For NVIDIA/CUDA whisper transcription
whispercpp>=0.0.6 # Alternative whisper library (works without PyTorch)
whispercpp>=0.0.17 # Alternative whisper library (works without PyTorch)
# LiteLLM for standardized API responses
litellm>=1.40.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment