Web Admin

80b1c060 · Stefy Lanza (nextime / spora ) · f4a34bc3 · 80b1c060 · 80b1c060 · 80b1c060
Commit 80b1c060 authored May 03, 2026 by Stefy Lanza (nextime / spora )
9 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,10 @@
 venv/
 .venv/
 env/
+venv_all/
+
+# Editor backup files
+*~

 # Python cache
 __pycache__/

--- a/.gitignore~
+++ b/.gitignore~
+# Backend selection file
+.backend
+
+# Virtual environments
+venv/
+.venv/
+env/
+
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+
+# Debug logs
+debug.log
+
+# Test files
+test_*.py
--- a/build.sh
+++ b/build.sh
 #!/bin/bash
 # Build script for CoderAI - Supports NVIDIA (CUDA), Vulkan, OpenCL, and CPU backends
-# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash]
+# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]
 # Default: all (installs all backends)
 # --flash: Enable and install Flash Attention 2 (for NVIDIA GPUs)
+# --venv <venv>: Specify custom virtual environment name

 set -e

@@ -16,12 +17,21 @@ NC='\033[0m' # No Color
 # Determine backend and flags
 BACKEND="${1:-all}"
 FLASH=false
+CUSTOM_VENV=""

-# Check for --flash flag in any position
+# Parse arguments; i is the 1-based position of the argument being examined
+i=1
 for arg in "$@"; do
    case $arg in
-        --flash) FLASH=true;;
+        --flash)
+            FLASH=true
+            ;;
+        --venv)
+            next=$((i + 1))
+            CUSTOM_VENV="${!next}"
+            ;;
    esac
+    i=$((i + 1))
 done

 BACKEND=$(echo "$BACKEND" | tr '[:upper:]' '[:lower:]')
@@ -61,8 +71,23 @@ fi

 echo -e "${GREEN}✓ Python version: $PYTHON_VERSION${NC}"

+# Determine cmake args for stable-diffusion-cpp-python.
+# The pip release is missing the libwebm/build/ cmake submodule files.
+# If libwebm-dev is installed system-wide we can link against it; otherwise disable WebM.
+if ldconfig -p 2>/dev/null | grep -q "libwebm" || pkg-config --exists libwebm 2>/dev/null; then
+    SD_CMAKE_ARGS="-DSD_USE_SYSTEM_WEBM=ON"
+    echo -e "${GREEN}✓ Found system libwebm — stable-diffusion-cpp-python will use it${NC}"
+else
+    SD_CMAKE_ARGS="-DSD_WEBM=OFF"
+    echo -e "${YELLOW}Note: libwebm-dev not found — WebM video output disabled for stable-diffusion-cpp-python${NC}"
+    echo -e "${YELLOW}      Install libwebm-dev to enable WebM support${NC}"
+fi
+
 # Determine venv directory based on backend
-if [ "$BACKEND" = "nvidia" ]; then
+if [ -n "$CUSTOM_VENV" ]; then
+    VENV_DIR="$CUSTOM_VENV"
+    echo -e "${BLUE}Using custom virtual environment: $VENV_DIR${NC}"
+elif [ "$BACKEND" = "nvidia" ]; then
    VENV_DIR="venv_nvidia"
 elif [ "$BACKEND" = "vulkan" ]; then
    VENV_DIR="venv_vulkan"
@@ -77,7 +102,11 @@ elif [ "$BACKEND" = "all" ]; then
 fi

 # Create virtual environment if it doesn't exist
-echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
+if [ -n "$CUSTOM_VENV" ]; then
+    echo -e "${YELLOW}Creating custom virtual environment: $VENV_DIR${NC}"
+else
+    echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
+fi
 if [ ! -d "$VENV_DIR" ]; then
    python3 -m venv "$VENV_DIR"
    echo -e "${GREEN}✓ Created virtual environment: $VENV_DIR${NC}"
@@ -116,7 +145,7 @@ if [ "$BACKEND" = "nvidia" ]; then
        echo ""
        echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
        echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
-        pip install flash-attn --no-build-isolation || {
+        MAX_JOBS=5 NVCC_THREADS=2  pip install flash-attn --no-build-isolation || {
            echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
            echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
            echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
@@ -394,9 +423,9 @@ elif [ "$BACKEND" = "opencl" ]; then
    echo -e "${YELLOW}Installing base requirements...${NC}"
    pip install -r requirements.txt
    
-    # Install stable-diffusion-cpp-python with OpenCL
+    # Install stable-diffusion-cpp-python with OpenCL (disable WebM to avoid libwebm cmake issue)
    echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL...${NC}"
-    pip install stable-diffusion-cpp-python || {
+    CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
        echo ""
        echo -e "${YELLOW}Note: If stable-diffusion-cpp-python is not available with pip,${NC}"
        echo -e "${YELLOW}you may need to build from source.${NC}"
@@ -448,11 +477,10 @@ elif [ "$BACKEND" = "all" ]; then
        pip install whispercpp || echo -e "${YELLOW}Warning: whispercpp failed${NC}"
        pip install litellm || echo -e "${YELLOW}Warning: litellm failed${NC}"
        
-        # Try procname (may fail on Python 3.13)
-        pip install procname || echo -e "${YELLOW}Warning: procname failed (optional)${NC}"
+        pip install setproctitle || echo -e "${YELLOW}Warning: setproctitle failed (optional)${NC}"

-        # Try stable-diffusion-cpp-python (requires CMake)
-        pip install stable-diffusion-cpp-python || echo -e "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
+        # Try stable-diffusion-cpp-python (disable WebM to avoid missing libwebm cmake submodule)
+        CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || echo -e "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
    }
    
    # Install PyTorch with CUDA support (for nvidia backend)
@@ -540,7 +568,7 @@ elif [ "$BACKEND" = "all" ]; then
    # Try to install stable-diffusion-cpp-python with OpenCL
    if [ "$OPENCL_AVAILABLE" = true ]; then
        echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"
-        pip install stable-diffusion-cpp-python || {
+        CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
            echo -e "${YELLOW}Warning: stable-diffusion-cpp-python not available (requires CMake and build tools)${NC}"
        }
    else
@@ -553,10 +581,10 @@ elif [ "$BACKEND" = "all" ]; then
        echo -e "${YELLOW}Warning: Some additional packages failed${NC}"
    }

-    # Try procname (optional, may fail on Python 3.13)
-    echo -e "${YELLOW}Installing procname (optional)...${NC}"
-    pip install procname || {
-        echo -e "${YELLOW}Note: procname failed to install (optional package, not critical)${NC}"
+    # Install setproctitle for process naming (Python 3.13 compatible)
+    echo -e "${YELLOW}Installing setproctitle...${NC}"
+    pip install setproctitle || {
+        echo -e "${YELLOW}Note: setproctitle failed to install (optional package, not critical)${NC}"
    }
    
    # Install Flash Attention 2 if requested and CUDA is available
@@ -564,7 +592,7 @@ elif [ "$BACKEND" = "all" ]; then
        echo ""
        echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
        echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
-        pip install flash-attn --no-build-isolation || {
+        MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation || {
            echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
            echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
            echo -e "${YELLOW}Continuing without Flash Attention...${NC}"

--- a/build.sh~
+++ b/build.sh~
+#!/bin/bash
+# Build script for CoderAI - Supports NVIDIA (CUDA), Vulkan, OpenCL, and CPU backends
+# Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]
+# Default: all (installs all backends)
+# --flash: Enable and install Flash Attention 2 (for NVIDIA GPUs)
+# --venv <venv>: Specify custom virtual environment name
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Determine backend and flags
+BACKEND="${1:-all}"
+FLASH=false
+CUSTOM_VENV=""
+
+# Parse arguments. i holds the 1-based position of the argument currently
+# being examined, so the value of --venv is the positional parameter at i + 1.
+i=1
+for arg in "$@"; do
+    case $arg in
+        --flash)
+            FLASH=true
+            ;;
+        --venv)
+            # Indirect expansion reads the next positional parameter without
+            # eval and without advancing i, which keeps i in step with the loop
+            # (a repeated --venv would otherwise read the wrong position).
+            next=$((i + 1))
+            CUSTOM_VENV="${!next}"
+            ;;
+    esac
+    i=$((i + 1))
+done
+
+BACKEND=$(echo "$BACKEND" | tr '[:upper:]' '[:lower:]')
+
+# Reject unknown backends and print the full usage, including every option
+# accepted by the argument parser (--flash and --venv).
+if [[ "$BACKEND" != "nvidia" && "$BACKEND" != "vulkan" && "$BACKEND" != "vulkan-nvidia" && "$BACKEND" != "cuda" && "$BACKEND" != "opencl" && "$BACKEND" != "all" ]]; then
+    echo -e "${RED}Error: Invalid backend '$BACKEND'${NC}"
+    echo "Usage: ./build.sh [nvidia|vulkan|vulkan-nvidia|cuda|opencl|all] [--flash] [--venv <venv>]"
+    echo "  nvidia       - Use PyTorch with CUDA for NVIDIA GPUs"
+    echo "  vulkan      - Use llama-cpp-python with Vulkan for AMD GPUs"
+    echo "  vulkan-nvidia - Use llama-cpp-python with Vulkan for NVIDIA GPU only"
+    echo "  cuda        - Use llama-cpp-python with CUDA for NVIDIA GPUs"
+    echo "  opencl      - Use stable-diffusion-cpp-python with OpenCL"
+    echo "  all         - Install all backends (nvidia, cuda, vulkan, opencl, cpu) - DEFAULT"
+    echo ""
+    echo "Options:"
+    echo "  --flash     - Install Flash Attention 2 for faster inference (NVIDIA only)"
+    echo "  --venv <venv> - Use a custom virtual environment directory"
+    exit 1
+fi
+
+echo -e "${BLUE}========================================${NC}"
+echo -e "${BLUE}  CoderAI Build Script${NC}"
+echo -e "${BLUE}  Backend: ${GREEN}$BACKEND${NC}"
+if [ "$FLASH" = true ]; then
+    echo -e "${BLUE}  Flash Attention 2: ${GREEN}ENABLED${NC}"
+fi
+echo -e "${BLUE}========================================${NC}"
+echo ""
+
+# Check Python version
+PYTHON_VERSION=$(python3 --version 2>&1 | grep -oP '\d+\.\d+' | head -1)
+REQUIRED_VERSION="3.8"
+
+if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
+    echo -e "${RED}Error: Python 3.8+ required, found $PYTHON_VERSION${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✓ Python version: $PYTHON_VERSION${NC}"
+
+# Determine cmake args for stable-diffusion-cpp-python.
+# The pip release is missing the libwebm/build/ cmake submodule files.
+# If libwebm-dev is installed system-wide we can link against it; otherwise disable WebM.
+if ldconfig -p 2>/dev/null | grep -q "libwebm" || pkg-config --exists libwebm 2>/dev/null; then
+    SD_CMAKE_ARGS="-DSD_USE_SYSTEM_WEBM=ON"
+    echo -e "${GREEN}✓ Found system libwebm — stable-diffusion-cpp-python will use it${NC}"
+else
+    SD_CMAKE_ARGS="-DSD_WEBM=OFF"
+    echo -e "${YELLOW}Note: libwebm-dev not found — WebM video output disabled for stable-diffusion-cpp-python${NC}"
+    echo -e "${YELLOW}      Install libwebm-dev to enable WebM support${NC}"
+fi
+
+# Determine venv directory based on backend
+if [ -n "$CUSTOM_VENV" ]; then
+    VENV_DIR="$CUSTOM_VENV"
+    echo -e "${BLUE}Using custom virtual environment: $VENV_DIR${NC}"
+elif [ "$BACKEND" = "nvidia" ]; then
+    VENV_DIR="venv_nvidia"
+elif [ "$BACKEND" = "vulkan" ]; then
+    VENV_DIR="venv_vulkan"
+elif [ "$BACKEND" = "vulkan-nvidia" ]; then
+    VENV_DIR="venv_vulkan_nvidia"
+elif [ "$BACKEND" = "cuda" ]; then
+    VENV_DIR="venv_cuda"
+elif [ "$BACKEND" = "opencl" ]; then
+    VENV_DIR="venv_opencl"
+elif [ "$BACKEND" = "all" ]; then
+    VENV_DIR="venv_all"
+fi
+
+# Create virtual environment if it doesn't exist
+if [ -n "$CUSTOM_VENV" ]; then
+    echo -e "${YELLOW}Creating custom virtual environment: $VENV_DIR${NC}"
+else
+    echo -e "${YELLOW}Creating virtual environment: $VENV_DIR${NC}"
+fi
+if [ ! -d "$VENV_DIR" ]; then
+    python3 -m venv "$VENV_DIR"
+    echo -e "${GREEN}✓ Created virtual environment: $VENV_DIR${NC}"
+else
+    echo -e "${YELLOW}Using existing virtual environment: $VENV_DIR${NC}"
+fi
+
+# Activate virtual environment
+echo -e "${YELLOW}Activating virtual environment...${NC}"
+source "$VENV_DIR/bin/activate"
+
+# Force pip to use this venv and install packages
+export PIP_NO_INPUT=1
+export PIP_REQUIRE_VIRTUALENV=1
+
+# Upgrade pip
+echo -e "${YELLOW}Upgrading pip...${NC}"
+pip install --upgrade pip
+
+echo ""
+echo -e "${BLUE}Installing dependencies for $BACKEND backend...${NC}"
+echo ""
+
+if [ "$BACKEND" = "nvidia" ]; then
+    # NVIDIA/CUDA backend
+    echo -e "${YELLOW}Installing PyTorch with CUDA support...${NC}"
+    pip install "torch>=2.0.0" "torchvision>=0.15.0" "torchaudio>=2.0.0"
+    
+    echo -e "${YELLOW}Installing NVIDIA-specific requirements...${NC}"
+    pip install -r requirements-nvidia.txt || {
+        echo -e "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
+    }
+    
+    # Install Flash Attention 2 if requested
+    if [ "$FLASH" = true ]; then
+        echo ""
+        echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
+        echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
+        MAX_JOBS=5 NVCC_THREADS=2  pip install flash-attn --no-build-isolation || {
+            echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
+            echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
+            echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
+        }
+    fi
+    
+    # Print the completion banner, usage and example exactly once
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  NVIDIA/CUDA build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Usage:"
+    echo "  source $VENV_DIR/bin/activate"
+    echo "  python coderai --model <huggingface-model-name>"
+    if [ "$FLASH" = true ]; then
+        echo ""
+        echo "Flash Attention 2 enabled - use --flash-attn flag when running"
+    fi
+    echo ""
+    echo "Example:"
+    echo "  python coderai --model microsoft/DialoGPT-medium"
+    echo ""
+    
+elif [ "$BACKEND" = "vulkan" ]; then
+    # Vulkan backend (all GPUs)
+    echo -e "${YELLOW}Installing llama-cpp-python with Vulkan support (all GPUs)...${NC}"
+    
+    # Check for required Vulkan development libraries
+    if ! pkg-config --exists vulkan 2>/dev/null; then
+        echo -e "${YELLOW}Warning: Vulkan development libraries not found via pkg-config${NC}"
+        echo -e "${YELLOW}You may need to install Vulkan drivers and SDK:${NC}"
+        echo "  Debian/Ubuntu: sudo apt install libvulkan-dev vulkan-tools"
+        echo "  Fedora: sudo dnf install vulkan-loader-devel vulkan-tools"
+        echo "  Arch: sudo pacman -S vulkan-headers vulkan-icd-loader"
+        echo ""
+        echo -e "${YELLOW}Attempting installation anyway...${NC}"
+    fi
+    
+    # Check for glslc (Vulkan shader compiler)
+    GLSLC_CMD=""
+    if command -v glslc &> /dev/null; then
+        GLSLC_CMD="glslc"
+    elif command -v glslangValidator &> /dev/null; then
+        GLSLC_CMD="glslangValidator"
+    fi
+    
+    if [ -z "$GLSLC_CMD" ]; then
+        echo -e "${YELLOW}Warning: glslc/glslangValidator not found in PATH${NC}"
+    else
+        echo -e "${GREEN}✓ Found Vulkan shader compiler: $GLSLC_CMD${NC}"
+    fi
+    
+    # Build with Vulkan support
+    echo -e "${YELLOW}Building llama-cpp-python with Vulkan support...${NC}"
+    CMAKE_ARGS="-DGGML_VULKAN=ON" pip install --upgrade llama-cpp-python --no-cache-dir || {
+        echo -e "${RED}Build failed!${NC}"
+        exit 1
+    }
+    
+    echo -e "${YELLOW}Installing Vulkan-specific requirements...${NC}"
+    pip install -r requirements-vulkan.txt
+    
+    # Build whispercpp Python package with Vulkan support for GPU-accelerated audio transcription
+    echo -e "${YELLOW}Building whispercpp with Vulkan support for GPU-accelerated transcription...${NC}"
+    
+    # First, uninstall any existing whispercpp (pip version doesn't have Vulkan)
+    pip uninstall -y whispercpp 2>/dev/null || true
+    
+    # Clone and build whisper.cpp with Vulkan for Python bindings
+    WHISPERCPP_DIR="$HOME/whisper.cpp"
+    if [ ! -d "$WHISPERCPP_DIR" ]; then
+        echo "Cloning whisper.cpp..."
+        git clone --depth 1 https://github.com/ggerganov/whisper.cpp "$WHISPERCPP_DIR" 2>/dev/null || {
+            echo -e "${YELLOW}Warning: Could not clone whisper.cpp${NC}"
+        }
+    fi
+    
+    if [ -d "$WHISPERCPP_DIR/bindings/python" ]; then
+        cd "$WHISPERCPP_DIR/bindings/python"
+        
+        # Build with Vulkan support
+        # Set CMAKE_ARGS to enable Vulkan for ggml (whisper uses ggml library internally)
+        # The success line is printed only when the Vulkan build itself succeeded,
+        # so a failed or CPU-only install is never reported as Vulkan-enabled.
+        if CMAKE_ARGS="-DWHISPER_VULKAN=ON -DGGML_VULKAN=ON" pip install . --no-cache-dir --force-reinstall 2>/dev/null; then
+            echo -e "${GREEN}✓ whispercpp with Vulkan support installed!${NC}"
+        else
+            # If Vulkan build fails, try without (will fall back to CPU)
+            echo -e "${YELLOW}Warning: whispercpp Vulkan build failed, will use CPU${NC}"
+            pip install . --no-cache-dir --force-reinstall 2>/dev/null || {
+                echo -e "${YELLOW}Warning: Could not install whispercpp at all${NC}"
+            }
+        fi
+        cd "$OLDPWD"
+    else
+        echo -e "${YELLOW}Warning: whisper.cpp Python bindings not found${NC}"
+    fi
+    
+    # Also build the main whisper.cpp C++ with Vulkan for standalone usage
+    echo -e "${YELLOW}Building whisper.cpp C++ with Vulkan support (optional)...${NC}"
+    WHISPER_DIR="$HOME/whisper.cpp"
+    if [ -d "$WHISPER_DIR" ]; then
+        echo "Using existing whisper.cpp installation"
+    else
+        echo "Cloning whisper.cpp..."
+        git clone https://github.com/ggerganov/whisper.cpp "$WHISPER_DIR" 2>/dev/null || {
+            echo -e "${YELLOW}Warning: Could not clone whisper.cpp. Audio transcription will use CPU.${NC}"
+        }
+    fi
+    
+    if [ -d "$WHISPER_DIR" ]; then
+        cd "$WHISPER_DIR"
+        mkdir -p build 2>/dev/null
+        cd build
+        cmake -DGGML_VULKAN=ON .. >/dev/null 2>&1 || {
+            echo -e "${YELLOW}Warning: Vulkan build failed, building with OpenBLAS${NC}"
+            cmake -DBUILD_SHARED_LIBS=ON .. >/dev/null 2>&1
+        }
+        make -j$(nproc) >/dev/null 2>&1 || {
+            echo -e "${YELLOW}Warning: Build failed. Audio transcription will use CPU.${NC}"
+        }
+        cd "$OLDPWD"
+        
+        if [ ! -f "$WHISPER_DIR/models/ggml-base.bin" ]; then
+            echo "Downloading Whisper base model..."
+            bash "$WHISPER_DIR/models/download-ggml-model.sh" base 2>/dev/null || {
+                echo -e "${YELLOW}Warning: Could not download Whisper model.${NC}"
+            }
+        fi
+        echo -e "${GREEN}✓ whisper.cpp ready for audio transcription!${NC}"
+    fi
+    
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  Vulkan build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Usage:"
+    echo "  python coderai --model <gguf-model> --backend vulkan"
+    echo ""
+    
+elif [ "$BACKEND" = "vulkan-nvidia" ]; then
+    # Vulkan backend (NVIDIA only)
+    echo -e "${YELLOW}Installing llama-cpp-python with Vulkan support (NVIDIA-only)...${NC}"
+    
+    # Check for required Vulkan development libraries
+    if ! pkg-config --exists vulkan 2>/dev/null; then
+        echo -e "${YELLOW}Warning: Vulkan development libraries not found via pkg-config${NC}"
+    fi
+    
+    # Check for glslc (Vulkan shader compiler)
+    GLSLC_CMD=""
+    if command -v glslc &> /dev/null; then
+        GLSLC_CMD="glslc"
+    elif command -v glslangValidator &> /dev/null; then
+        GLSLC_CMD="glslangValidator"
+    fi
+    
+    if [ -z "$GLSLC_CMD" ]; then
+        echo -e "${YELLOW}Warning: glslc/glslangValidator not found in PATH${NC}"
+    else
+        echo -e "${GREEN}✓ Found Vulkan shader compiler: $GLSLC_CMD${NC}"
+    fi
+    
+    # Build with Vulkan support
+    # Note: llama.cpp doesn't have a compile-time option to disable specific GPUs
+    # The device selection happens at runtime via environment variables
+    echo -e "${YELLOW}Building llama-cpp-python with Vulkan support...${NC}"
+    CMAKE_ARGS="-DGGML_VULKAN=ON" pip install --upgrade llama-cpp-python --no-cache-dir || {
+        echo -e "${RED}Build failed!${NC}"
+        exit 1
+    }
+    
+    echo -e "${YELLOW}Installing Vulkan-specific requirements...${NC}"
+    pip install -r requirements-vulkan.txt
+    
+    # Build whispercpp Python package with Vulkan support for GPU-accelerated audio transcription
+    echo -e "${YELLOW}Building whispercpp with Vulkan support for GPU-accelerated transcription...${NC}"
+    pip uninstall -y whispercpp 2>/dev/null || true
+    WHISPERCPP_DIR="$HOME/whisper.cpp"
+    if [ ! -d "$WHISPERCPP_DIR" ]; then
+        git clone --depth 1 https://github.com/ggerganov/whisper.cpp "$WHISPERCPP_DIR" 2>/dev/null || true
+    fi
+    if [ -d "$WHISPERCPP_DIR/bindings/python" ]; then
+        cd "$WHISPERCPP_DIR/bindings/python"
+        # Print the Vulkan success line only when the Vulkan build succeeded;
+        # otherwise fall back to a plain install on a best-effort basis.
+        if CMAKE_ARGS="-DWHISPER_VULKAN=ON -DGGML_VULKAN=ON" pip install . --no-cache-dir --force-reinstall 2>/dev/null; then
+            echo -e "${GREEN}✓ whispercpp with Vulkan support installed!${NC}"
+        else
+            pip install . --no-cache-dir --force-reinstall 2>/dev/null || true
+        fi
+        cd "$OLDPWD"
+    fi
+    
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  Vulkan (NVIDIA-only) build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Usage:"
+    echo "  VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json \\"
+    echo "  python coderai --model <gguf-model> --backend vulkan"
+    echo ""
+    echo "Note: This build includes both AMD and NVIDIA Vulkan support."
+    echo "      At runtime, use VK_ICD_FILENAMES to select only NVIDIA."
+    echo ""
+ 
+elif [ "$BACKEND" = "cuda" ]; then
+    # llama-cpp-python with CUDA backend (NVIDIA only)
+    echo -e "${YELLOW}Installing llama-cpp-python with CUDA support...${NC}"
+    
+    # Check for CUDA toolkit
+    if ! command -v nvcc &> /dev/null; then
+        echo -e "${YELLOW}Warning: CUDA toolkit (nvcc) not found in PATH${NC}"
+        echo -e "${YELLOW}You may need to install CUDA toolkit:${NC}"
+        echo "  Download from: https://developer.nvidia.com/cuda-downloads"
+    else
+        CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9.]*\),.*/\1/p')
+        echo -e "${GREEN}✓ Found CUDA $CUDA_VERSION${NC}"
+    fi
+    
+    # Check for CUDA libraries
+    if [ -d "/usr/local/cuda" ]; then
+        echo -e "${GREEN}✓ Found CUDA at /usr/local/cuda${NC}"
+    fi
+    
+    # Build llama-cpp-python with CUDA support
+    echo -e "${YELLOW}Building llama-cpp-python with CUDA support...${NC}"
+    echo -e "${YELLOW}This may take several minutes...${NC}"
+    CMAKE_ARGS="-DGGML_CUDA=ON" pip install --upgrade llama-cpp-python --no-cache-dir || {
+        echo ""
+        echo -e "${RED}Build failed!${NC}"
+        echo -e "${YELLOW}Make sure CUDA toolkit is installed:${NC}"
+        echo "  sudo apt install cuda-toolkit-12"
+        echo "  or"
+        echo "  Download from: https://developer.nvidia.com/cuda-downloads"
+        exit 1
+    }
+    
+    echo -e "${YELLOW}Installing Vulkan-specific requirements...${NC}"
+    pip install -r requirements-vulkan.txt
+    
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  llama-cpp-python CUDA build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Usage:"
+    echo "  source $VENV_DIR/bin/activate"
+    echo "  python coderai --model <gguf-model> --backend vulkan --vulkan-device 0"
+    echo ""
+    echo "Note: With CUDA backend, llama-cpp-python will only use NVIDIA GPUs."
+    echo ""
+elif [ "$BACKEND" = "opencl" ]; then
+    # stable-diffusion-cpp-python with OpenCL backend
+    echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"
+    
+    # Check for OpenCL
+    if ! command -v clinfo &> /dev/null && ! ls /usr/lib/*/libOpenCL* &> /dev/null; then
+        echo -e "${YELLOW}Warning: OpenCL not found in system${NC}"
+        echo -e "${YELLOW}You may need to install OpenCL runtime:${NC}"
+        echo "  Debian/Ubuntu: sudo apt install ocl-icd-opencl-dev"
+        echo "  Fedora: sudo dnf install ocl-icd-devel"
+    else
+        echo -e "${GREEN}✓ Found OpenCL${NC}"
+    fi
+    
+    # Install base requirements
+    echo -e "${YELLOW}Installing base requirements...${NC}"
+    pip install -r requirements.txt
+    
+    # Install stable-diffusion-cpp-python with OpenCL (disable WebM to avoid libwebm cmake issue)
+    echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL...${NC}"
+    CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
+        echo ""
+        echo -e "${YELLOW}Note: If stable-diffusion-cpp-python is not available with pip,${NC}"
+        echo -e "${YELLOW}you may need to build from source.${NC}"
+    }
+    
+    # Install additional requirements for OpenCL
+    echo -e "${YELLOW}Installing additional requirements for OpenCL...${NC}"
+    pip install numpy pillow
+    
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  OpenCL build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Usage:"
+    echo "  source $VENV_DIR/bin/activate"
+    echo "  python coderai --model <model> --image-backend opencl"
+    echo ""
+    echo "Note: With OpenCL backend, stable-diffusion-cpp-python can use various GPUs."
+    echo ""
+
+elif [ "$BACKEND" = "all" ]; then
+    # Install ALL backends: nvidia (CUDA), vulkan, opencl, and cpu
+    echo -e "${BLUE}========================================${NC}"
+    echo -e "${BLUE}  Installing ALL backends${NC}"
+    echo -e "${BLUE}  (NVIDIA/CUDA, Vulkan, OpenCL, CPU)${NC}"
+    echo -e "${BLUE}========================================${NC}"
+    echo ""
+    
+    # Install base requirements
+    echo -e "${YELLOW}Installing base requirements...${NC}"
+    pip install --upgrade pip
+    
+    # Install requirements with error handling for problematic packages
+    echo -e "${YELLOW}Installing core dependencies...${NC}"
+    pip install -r requirements.txt || {
+        echo -e "${YELLOW}Some packages failed to install, trying individually...${NC}"
+        
+        # Install core packages that should always work
+        pip install fastapi uvicorn pydantic requests python-multipart psutil || {
+            echo -e "${RED}Failed to install core dependencies${NC}"
+            exit 1
+        }
+        
+        # Try optional packages individually
+        echo -e "${YELLOW}Installing optional packages...${NC}"
+        pip install transformers accelerate diffusers safetensors || echo -e "${YELLOW}Warning: Some ML packages failed${NC}"
+        pip install faster-whisper || echo -e "${YELLOW}Warning: faster-whisper failed${NC}"
+        pip install whispercpp || echo -e "${YELLOW}Warning: whispercpp failed${NC}"
+        pip install litellm || echo -e "${YELLOW}Warning: litellm failed${NC}"
+        
+        pip install setproctitle || echo -e "${YELLOW}Warning: setproctitle failed (optional)${NC}"
+
+        # Try stable-diffusion-cpp-python (disable WebM to avoid missing libwebm cmake submodule)
+        CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || echo -e "${YELLOW}Warning: stable-diffusion-cpp-python failed (optional)${NC}"
+    }
+    
+    # Install PyTorch with CUDA support (for nvidia backend)
+    echo -e "${YELLOW}Installing PyTorch with CUDA support (NVIDIA backend)...${NC}"
+    pip install torch torchvision torchaudio || {
+        echo -e "${YELLOW}Warning: PyTorch installation failed, will try CPU version${NC}"
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu || {
+            echo -e "${RED}Failed to install PyTorch${NC}"
+            exit 1
+        }
+    }
+    
+    echo -e "${YELLOW}Installing NVIDIA-specific requirements...${NC}"
+    pip install -r requirements-nvidia.txt || {
+        echo -e "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
+    }
+    
+    # Check for Vulkan development libraries
+    VULKAN_AVAILABLE=false
+    if pkg-config --exists vulkan 2>/dev/null; then
+        VULKAN_AVAILABLE=true
+        echo -e "${GREEN}✓ Found Vulkan development libraries${NC}"
+    else
+        echo -e "${YELLOW}Warning: Vulkan development libraries not found${NC}"
+        echo -e "${YELLOW}  Vulkan support will be limited${NC}"
+    fi
+    
+    # Check for CUDA
+    CUDA_AVAILABLE=false
+    if command -v nvcc &> /dev/null || [ -d "/usr/local/cuda" ]; then
+        CUDA_AVAILABLE=true
+        echo -e "${GREEN}✓ Found CUDA toolkit${NC}"
+    else
+        echo -e "${YELLOW}Warning: CUDA toolkit not found${NC}"
+        echo -e "${YELLOW}  CUDA support will be limited${NC}"
+    fi
+    
+    # Check for OpenCL
+    OPENCL_AVAILABLE=false
+    if command -v clinfo &> /dev/null || ls /usr/lib/*/libOpenCL* &> /dev/null 2>&1; then
+        OPENCL_AVAILABLE=true
+        echo -e "${GREEN}✓ Found OpenCL${NC}"
+    else
+        echo -e "${YELLOW}Warning: OpenCL not found${NC}"
+        echo -e "${YELLOW}  OpenCL support will be limited${NC}"
+    fi
+    
+    # Build llama-cpp-python with both CUDA and Vulkan support
+    echo -e "${YELLOW}Building llama-cpp-python with CUDA and Vulkan support...${NC}"
+    echo -e "${YELLOW}This may take several minutes...${NC}"
+    
+    # Determine CMAKE_ARGS based on available hardware
+    CMAKE_ARGS=""
+    if [ "$CUDA_AVAILABLE" = true ]; then
+        CMAKE_ARGS="-DGGML_CUDA=ON"
+        echo -e "${GREEN}  ✓ Enabling CUDA support${NC}"
+    fi
+    
+    if [ "$VULKAN_AVAILABLE" = true ]; then
+        if [ -n "$CMAKE_ARGS" ]; then
+            CMAKE_ARGS="$CMAKE_ARGS -DGGML_VULKAN=ON"
+        else
+            CMAKE_ARGS="-DGGML_VULKAN=ON"
+        fi
+        echo -e "${GREEN}  ✓ Enabling Vulkan support${NC}"
+    fi
+    
+    if [ -n "$CMAKE_ARGS" ]; then
+        echo -e "${YELLOW}  Building with: $CMAKE_ARGS${NC}"
+        CMAKE_ARGS="$CMAKE_ARGS" pip install --upgrade llama-cpp-python --no-cache-dir || {
+            echo -e "${YELLOW}Warning: llama-cpp-python build failed, installing from pip${NC}"
+            pip install llama-cpp-python
+        }
+    else
+        echo -e "${YELLOW}Warning: No GPU backends available, installing CPU version${NC}"
+        pip install llama-cpp-python
+    fi
+    
+    # Install Vulkan-specific requirements
+    echo -e "${YELLOW}Installing Vulkan-specific requirements...${NC}"
+    pip install -r requirements-vulkan.txt || {
+        echo -e "${YELLOW}Warning: Some Vulkan packages failed to install${NC}"
+    }
+    
+    # Try to install stable-diffusion-cpp-python with OpenCL
+    if [ "$OPENCL_AVAILABLE" = true ]; then
+        echo -e "${YELLOW}Installing stable-diffusion-cpp-python with OpenCL support...${NC}"
+        CMAKE_ARGS="$SD_CMAKE_ARGS" pip install stable-diffusion-cpp-python || {
+            echo -e "${YELLOW}Warning: stable-diffusion-cpp-python not available (requires CMake and build tools)${NC}"
+        }
+    else
+        echo -e "${YELLOW}Skipping OpenCL (stable-diffusion-cpp-python) - OpenCL not available${NC}"
+    fi
+
+    # Install additional requirements
+    echo -e "${YELLOW}Installing additional requirements...${NC}"
+    pip install numpy pillow || {
+        echo -e "${YELLOW}Warning: Some additional packages failed${NC}"
+    }
+
+    # Install setproctitle for process naming (Python 3.13 compatible)
+    echo -e "${YELLOW}Installing setproctitle...${NC}"
+    pip install setproctitle || {
+        echo -e "${YELLOW}Note: setproctitle failed to install (optional package, not critical)${NC}"
+    }
+    
+    # Install Flash Attention 2 if requested and CUDA is available
+    if [ "$FLASH" = true ] && [ "$CUDA_AVAILABLE" = true ]; then
+        echo ""
+        echo -e "${YELLOW}Installing Flash Attention 2...${NC}"
+        echo -e "${YELLOW}This may take several minutes and requires CUDA 11.6+${NC}"
+        # Same build environment variables as the nvidia backend branch,
+        # so both install paths build flash-attn with identical settings.
+        MAX_JOBS=5 NVCC_THREADS=2 pip install flash-attn --no-build-isolation || {
+            echo -e "${RED}Warning: Flash Attention 2 installation failed${NC}"
+            echo -e "${YELLOW}Requirements: CUDA 11.6+, Linux, Ampere/Ada/Hopper GPU${NC}"
+            echo -e "${YELLOW}Continuing without Flash Attention...${NC}"
+        }
+    elif [ "$FLASH" = true ]; then
+        echo -e "${YELLOW}Warning: Flash Attention 2 requires CUDA backend${NC}"
+        echo -e "${YELLOW}Skipping Flash Attention installation${NC}"
+    fi
+    
+    echo ""
+    echo -e "${GREEN}========================================${NC}"
+    echo -e "${GREEN}  ALL backends build complete!${NC}"
+    echo -e "${GREEN}========================================${NC}"
+    echo ""
+    echo "Available backends:"
+    [ "$CUDA_AVAILABLE" = true ] && echo "  ✓ NVIDIA/CUDA (PyTorch)"
+    [ "$CUDA_AVAILABLE" = true ] && echo "  ✓ CUDA (llama-cpp-python)"
+    [ "$VULKAN_AVAILABLE" = true ] && echo "  ✓ Vulkan (llama-cpp-python)"
+    [ "$OPENCL_AVAILABLE" = true ] && echo "  ✓ OpenCL (stable-diffusion-cpp-python)"
+    echo "  ✓ CPU (fallback for all)"
+    if [ "$FLASH" = true ] && [ "$CUDA_AVAILABLE" = true ]; then
+        echo ""
+        echo "  ✓ Flash Attention 2 (NVIDIA)"
+    fi
+    echo ""
+    echo "Usage:"
+    echo "  source $VENV_DIR/bin/activate"
+    echo ""
+    echo "  # For text models with NVIDIA:"
+    echo "  python coderai --model <model> --backend nvidia"
+    if [ "$FLASH" = true ]; then
+        echo "  python coderai --model <model> --backend nvidia --flash-attn"
+    fi
+    echo ""
+    echo "  # For GGUF models with CUDA:"
+    echo "  python coderai --model <gguf-model> --backend vulkan"
+    echo ""
+    echo "  # For GGUF models with Vulkan:"
+    echo "  python coderai --model <gguf-model> --backend vulkan"
+    echo ""
+    echo "  # For image generation with OpenCL:"
+    echo "  python coderai --model <model> --image-backend opencl"
+    echo ""
+fi
+
+# Create .backend file to track which backend was used
+echo "$BACKEND" > .backend
+
+echo -e "${GREEN}Build completed successfully!${NC}"
+echo ""
+echo "To activate the environment in the future, run:"
+echo "  source $VENV_DIR/bin/activate"
--- a/codai/main.py
+++ b/codai/main.py
@@ -18,10 +18,10 @@ def main():
        original_unraisablehook(unraisable)
    sys.unraisablehook = suppress_llama_del_errors
    
-    # Optional: set process name if procname is available
+    # Optional: set process name if setproctitle is available
    try:
-        import procname
-        procname.setprocname("codai")
+        import setproctitle
+        setproctitle.setproctitle("codai")
    except ImportError:
        pass
    

--- a/codai/main.py~
+++ b/codai/main.py~
-"""Main entry point for codai server."""
-import sys
-import os
-
-# Import configuration from codai modules
-from codai.cli import parse_args
-
-
-def main():
-    """Main entry point for the codai server."""
-    # Suppress unraisable exceptions from LlamaModel.__del__
-    original_unraisablehook = sys.unraisablehook
-    def suppress_llama_del_errors(unraisable):
-        if isinstance(unraisable.exc_value, AttributeError) and 'LlamaModel' in repr(unraisable.object) and 'sampler' in str(unraisable.exc_value):
-            return  # Ignore this specific error
-        original_unraisablehook(unraisable)
-    sys.unraisablehook = suppress_llama_del_errors
-    
-    # Optional: set process name if procname is available
-    try:
-        import procname
-        procname.setprocname("codai")
-    except ImportError:
-        pass
-    
-    args = parse_args()
-
-    # Handle early exit options (before heavy imports)
-    if args.list_cached_models:
-        print("\n=== Listing Cached Models ===")
-
-        # Import only what's needed for cache listing
-        from codai.models.cache import list_cached_models_info, get_all_cache_dirs
-
-        cache_info = list_cached_models_info()
-        caches = get_all_cache_dirs()
-
-        # Show CoderAI GGUF cache
-        coderai_dir = caches.get('coderai')
-        if coderai_dir:
-            print(f"\n--- CODERAI GGUF Cache ({coderai_dir}) ---")
-            if cache_info['coderai']:
-                for filename, size_mb in cache_info['coderai']:
-                    print(f"  {filename} ({size_mb:.1f} MB)")
-            else:
-                print("  No cached GGUF files.")
-        else:
-            print(f"\n--- CODERAI GGUF Cache ---")
-            print("  (directory not found)")
-
-        # Show HuggingFace cached models
-        hf_dir = caches.get('huggingface')
-        if hf_dir:
-            print(f"\n--- HUGGINGFACE Models Cache ({hf_dir}) ---")
-            if cache_info['huggingface']:
-                for repo_id, size_gb, revision_count in cache_info['huggingface']:
-                    print(f"  {repo_id} ({size_gb:.2f} GB)")
-                    print(f"    └─ {revision_count} revision(s)")
-            else:
-                print("  No cached HuggingFace models.")
-        else:
-            print(f"\n--- HUGGINGFACE Models Cache ---")
-            print("  (directory not found)")
-
-        # Show summary
-        print(f"\n=== Summary ===")
-        print(f"Total cached models: {cache_info['total_models']}")
-        print(f"Total disk usage: {cache_info['total_size_gb']:.2f} GB")
-        print("\nCache locations:")
-        for cache_name, cache_dir in caches.items():
-            print(f"  {cache_name}: {cache_dir}")
-
-        sys.exit(0)
-
-    # Handle --remove-all-models early
-    if args.remove_all_models:
-        print("\n=== Removing All Cached Models ===")
-
-        from codai.models.cache import remove_all_cached_models
-
-        total_removed = remove_all_cached_models()
-
-        print(f"\n=== Removed {total_removed} item(s) from all caches ===")
-        sys.exit(0)
-
-    # Handle --remove-model early
-    if args.remove_model:
-        print(f"\n=== Removing Cached Model Matching: {args.remove_model} ===")
-
-        from codai.models.cache import remove_cached_model
-
-        removed = remove_cached_model(args.remove_model)
-
-        if not removed:
-            print(f"No cached models found matching: {args.remove_model}")
-            print(f"\nUse --list-cached-models to see available models.")
-            sys.exit(0)
-
-        total_size = sum(size for _, _, size in removed)
-        print(f"\nRemoved {len(removed)} cached model file(s), freeing {total_size / (1024*1024):.1f} MB")
-        sys.exit(0)
-
-    # Handle --download-model early (before heavy imports)
-    if args.download_model:
-        print(f"\n=== Downloading Model: {args.download_model} ===")
-
-        from codai.models.cache import download_model
-
-        try:
-            cached_path = download_model(args.download_model)
-
-            if cached_path:
-                print(f"\n=== Model downloaded successfully ===")
-                print(f"Cached at: {cached_path}")
-                sys.exit(0)
-            else:
-                print(f"\n=== Failed to download model ===")
-                sys.exit(1)
-        except Exception as e:
-            print(f"\n=== Error downloading model: {e} ===")
-            sys.exit(1)
-
-    # Import globals from codai modules (only after early exits)
-    from codai.api import app
-    from codai.api.state import (
-        set_global_args,
-        set_global_debug,
-        set_global_system_prompt,
-        set_global_tools_closer_prompt,
-        set_global_file_path,
-        set_load_mode,
-        set_grammar_guided_gen,
-    )
-    from codai.models.manager import ModelManager, MultiModelManager, model_manager, multi_model_manager
-    from codai.backends import detect_available_backends
-    from codai.models.cache import (
-        get_all_cache_dirs,
-        get_cached_model_path,
-        get_model_cache_dir,
-        download_model,
-        list_cached_models_info,
-    )
-
-    # Import global setters from text module FIRST (before calling them)
-    from codai.api.text import (
-        set_global_args,
-        set_global_debug,
-        set_global_system_prompt,
-        set_global_tools_closer_prompt,
-    )
-    from codai.api.app import set_load_mode
-    
-    # Store args globally for access in endpoints (both state and text.py)
-    set_global_args(args)
-    
-    # Set global variables
-    global global_system_prompt, global_tools_closer_prompt, global_debug, global_dump, global_file_path, grammar_guided_gen
-    
-    # Set global grammar-guided-gen flag
-    from codai.api.state import set_grammar_guided_gen
-    grammar_guided_gen = args.grammar_guided_gen
-    if grammar_guided_gen:
-        print("Grammar-guided generation enabled (--grammar-guided-gen)")
-    
-    # Set global system prompt from --system-prompt flag
-    global_system_prompt = args.system_prompt
-    set_global_system_prompt(global_system_prompt)
-    
-    # Set global tools-closer-prompt flag
-    global_tools_closer_prompt = args.tools_closer_prompt
-    set_global_tools_closer_prompt(global_tools_closer_prompt)
-    if global_tools_closer_prompt:
-        print("Tools closer prompt enabled (--tools-closer-prompt)")
-    
-    # Set global debug flag
-    global_debug = args.debug
-    set_global_debug(global_debug)
-    
-    # Set global dump flag (enables debug as well for litellm output)
-    global_dump = args.dump
-    if global_dump:
-        global_debug = True
-        set_global_debug(True)
-    
-    # Set global file path for storing generated files
-    global_file_path = args.file_path
-    set_global_file_path(global_file_path)
-    
-    # Also set file path for images module
-    from codai.api.images import set_global_file_path as set_images_file_path
-    set_images_file_path(global_file_path)
-    
-    # Also set global args for images module (it has its own global_args)
-    from codai.api.images import set_global_args as set_images_global_args
-    set_images_global_args(args)
-    
-    # Also set file path for app.py (needed for /v1/files endpoint)
-    from codai.api.app import set_global_file_path_wrapper
-    set_global_file_path_wrapper(global_file_path)
-    
-    if global_debug:
-        # Print the full command line that was used to invoke codai
-        import shlex
-        cmd_line = ' '.join(shlex.quote(arg) for arg in sys.argv)
-        print(f"\n{'='*80}")
-        print(f"=== COMMAND LINE: {cmd_line}")
-        print(f"{'='*80}\n")
-        print("DEBUG MODE ENABLED - Full requests and replies will be dumped to stdout")
-    
-    # Handle --vulkan-list-devices
-    if args.vulkan_list_devices:
-        print("\nListing Vulkan devices...")
-        try:
-            import subprocess
-            result = subprocess.run(['vulkaninfo', '--summary'], capture_output=True, text=True)
-            if result.returncode == 0:
-                print(result.stdout)
-            else:
-                print("Could not run vulkaninfo. Make sure vulkan-tools is installed.")
-        except Exception as e:
-            print(f"Error listing devices: {e}")
-        sys.exit(0)
-    
-    # Get model names from args - support multiple models
-    model_names = args.model if args.model else []
-    
-    # Helper function to get config value by index with fallback
-    def get_ctx_by_index(ctx_list, index, default):
-        """Get context value by model index, with fallback to default."""
-        if ctx_list and index < len(ctx_list):
-            return ctx_list[index]
-        return default
-    
-    # Validate: must have at least one model specified
-    audio_models = args.audio_model if args.audio_model else []
-    image_models = args.image_model if args.image_model else []
-    vision_models = args.vision_model if args.vision_model else []
-    
-    if not model_names and not audio_models and not image_models and not vision_models and args.tts_model is None:
-        print("Error: At least one of --model, --audio-model, --image-model, --vision-model, or --tts-model must be specified.")
-        print("")
-        print("For NVIDIA backend (HuggingFace models):")
-        print("  - microsoft/DialoGPT-medium")
-        print("  - meta-llama/Llama-2-7b-chat-hf (requires auth)")
-        print("  - TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-        print("  - Use multiple --model flags for multiple models")
-        print("")
-        print("For Vulkan backend (GGUF models):")
-        print("  - Local path: ./phi-3-mini-4k-instruct-q4_k_m.gguf")
-        print("  - Or a HuggingFace model ID: TheBloke/Mistral-7B-Instruct-v0.2-GGUF")
-        print("  - Use multiple --model flags for multiple models")
-        print("")
-        sys.exit(1)
-    
-    # Determine load mode
-    # Default is ondemand: pre-load only the first model, unload/load on switch
-    # --loadswap: load first in VRAM, others in CPU RAM, swap on switch
-    # --loadall: try to load all models in VRAM, offload to CPU RAM if fails
-    # --nopreload: skip pre-loading in any mode, load on first request
-    load_mode = "ondemand"  # Default: on-demand loading
-    if args.loadall:
-        load_mode = "loadall"
-    elif args.loadswap:
-        load_mode = "loadswap"
-    
-    nopreload = args.nopreload
-    
-    set_load_mode(load_mode)
-    multi_model_manager.set_load_mode(load_mode)
-    
-    if load_mode == "ondemand":
-        print("Load mode: ondemand (pre-load first model, unload/load on switch)")
-    elif load_mode == "loadswap":
-        print("Load mode: loadswap (first model in VRAM, others in CPU RAM, swap on switch)")
-    elif load_mode == "loadall":
-        print("Load mode: loadall (load all models, offload to CPU RAM if VRAM full)")
-    if nopreload:
-        print("  --nopreload: models will load on first request instead of at startup")
-    
-    # Initialize model manager
-    print("\n=== Initializing Model Manager ===")
-    
-    # Detect available backends
-    available_backends = detect_available_backends()
-    print(f"Available backends: {available_backends}")
-    
-    # Determine which backend to use
-    backend = args.backend
-    if backend == "auto":
-        if "nvidia" in available_backends:
-            backend = "nvidia"
-        elif "vulkan" in available_backends:
-            backend = "vulkan"
-        elif "opencl" in available_backends:
-            backend = "opencl"
-        else:
-            print("Error: No supported backend detected (NVIDIA CUDA, AMD Vulkan, or OpenCL)")
-            sys.exit(1)
-    
-    print(f"Using backend: {backend}")
-    
-    # Set the backend for the model manager
-    model_manager.backend_type = backend
-    
-    # Store references globally for API endpoints
-    from codai.api import app as fastapi_app
-    fastapi_app.state.model_manager = model_manager
-    fastapi_app.state.multi_model_manager = multi_model_manager
-    
-    # Load main text model(s)
-    if model_names:
-        print(f"\nMain text model(s): {model_names}")
-        
-        # Register models with multi_model_manager (set_default_model also resolves/caches)
-        for idx, model_name in enumerate(model_names):
-            multi_model_manager.set_default_model(model_name, {
-                'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
-            })
-        
-        # Pre-load models at startup (unless --nopreload)
-        if nopreload:
-            print(f"  --nopreload: text model(s) will load on first request")
-        elif load_mode == "ondemand":
-            # Ondemand: pre-load only the first model into VRAM
-            try:
-                print(f"Preloading first model into VRAM: {model_names[0]}...")
-                mm = multi_model_manager._load_default_model()
-                if mm is not None and mm.backend is not None:
-                    multi_model_manager.active_in_vram = multi_model_manager.default_model
-                    print(f"Model loaded successfully: {model_names[0]}")
-                else:
-                    print(f"Warning: Model {model_names[0]} failed to load")
-            except Exception as e:
-                print(f"Warning: Failed to preload model: {e}")
-                print(f"Model will load on first request")
-        elif load_mode == "loadswap":
-            # Loadswap: load first model into VRAM, others into CPU RAM
-            try:
-                print(f"Preloading first model into VRAM: {model_names[0]}...")
-                mm = multi_model_manager._load_default_model()
-                if mm is not None and mm.backend is not None:
-                    multi_model_manager.active_in_vram = multi_model_manager.default_model
-                    print(f"Model loaded successfully (VRAM): {model_names[0]}")
-                else:
-                    print(f"Warning: Model {model_names[0]} failed to load")
-            except Exception as e:
-                print(f"Warning: Failed to preload model: {e}")
-            
-            # Load remaining text models into CPU RAM
-            for idx, model_name in enumerate(model_names[1:], 1):
-                try:
-                    print(f"Preloading model into CPU RAM: {model_name}...")
-                    mm2 = multi_model_manager._load_model_by_name(model_name)
-                    if mm2 is not None:
-                        # Move to CPU immediately (it was loaded into VRAM by default)
-                        multi_model_manager._move_model_to_cpu(model_name)
-                        print(f"Model loaded successfully (CPU RAM): {model_name}")
-                    else:
-                        print(f"Warning: Model {model_name} failed to load")
-                except Exception as e:
-                    print(f"Warning: Failed to preload model {model_name}: {e}")
-        elif load_mode == "loadall":
-            # Loadall: try to load all models into VRAM, offload to CPU RAM if fails
-            for idx, model_name in enumerate(model_names):
-                try:
-                    if idx == 0:
-                        print(f"Preloading model into VRAM: {model_name}...")
-                        mm = multi_model_manager._load_default_model()
-                    else:
-                        print(f"Preloading model into VRAM: {model_name}...")
-                        mm = multi_model_manager._load_model_by_name(model_name)
-                    
-                    if mm is not None and (not hasattr(mm, 'backend') or mm.backend is not None):
-                        if idx == 0:
-                            multi_model_manager.active_in_vram = multi_model_manager.default_model
-                        print(f"Model loaded successfully (VRAM): {model_name}")
-                    else:
-                        print(f"Warning: Model {model_name} failed to load")
-                except Exception as e:
-                    error_msg = str(e).lower()
-                    is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
-                    if is_oom:
-                        print(f"VRAM full for {model_name}, offloading to CPU RAM...")
-                        try:
-                            mm = multi_model_manager._load_model_by_name(model_name)
-                            if mm is not None:
-                                multi_model_manager._move_model_to_cpu(model_name)
-                                print(f"Model loaded successfully (CPU RAM): {model_name}")
-                        except Exception as e2:
-                            print(f"Warning: Failed to load model {model_name} even to CPU: {e2}")
-                    else:
-                        print(f"Warning: Failed to preload model {model_name}: {e}")
-    
-    # Set up audio model if specified
-    if audio_models:
-        print(f"\nAudio transcription model(s): {audio_models}")
-        
-        for idx, audio_m in enumerate(audio_models):
-            multi_model_manager.set_audio_model(audio_m, {
-                'ctx': get_ctx_by_index(args.audio_ctx, idx, 0),
-                'offload': args.audio_offload,
-            })
-    
-    # Set up whisper-server if specified
-    if args.whisper_server:
-        print(f"\nWhisper server: {args.whisper_server}")
-        print(f"  Port: {args.whisper_server_port}")
-        
-        # Import WhisperServerManager
-        from codai.models.manager import WhisperServerManager
-        
-        # Check if whisper-server is already running
-        if multi_model_manager.whisper_server is None:
-            whisper_server_mgr = WhisperServerManager(
-                server_path=args.whisper_server,
-                port=args.whisper_server_port
-            )
-            multi_model_manager.whisper_server = whisper_server_mgr
-        else:
-            whisper_server_mgr = multi_model_manager.whisper_server
-            print("Whisper server already running, using existing instance")
-        
-        # Start whisper-server if we have audio_models configured
-        if audio_models:
-            model_to_use = audio_models[0] if audio_models else None
-            gpu_device = getattr(args, 'audio_vulkan_device', 0) or 0
-            print(f"DEBUG: Starting whisper-server with gpu_device={gpu_device}")
-            actual_model_path = whisper_server_mgr.start(model_path=model_to_use, gpu_device=gpu_device)
-            if actual_model_path:
-                # Update audio_models in multi_model_manager to store the actual path (not the URL)
-                if model_to_use != actual_model_path:
-                    if multi_model_manager.audio_models and multi_model_manager.audio_models[0] == model_to_use:
-                        multi_model_manager.audio_models[0] = actual_model_path
-                print(f"Whisper server started with model: {actual_model_path}")
-            else:
-                print("Warning: Failed to start whisper-server, falling back to other backends")
-    
-    # Set up image model if specified
-    if image_models:
-        print(f"\nImage generation model(s): {image_models}")
-        
-        for idx, img_m in enumerate(image_models):
-            multi_model_manager.set_image_model(img_m, {
-                'ctx': get_ctx_by_index(args.image_ctx, idx, 0),
-                'offload': args.image_offload,
-                'llm_path': args.llm_path,
-                'vae_path': args.vae_path,
-                'sample_method': args.image_sample_method,
-                'steps': args.image_steps,
-                'width': args.image_width,
-                'height': args.image_height,
-                'cfg_scale': args.image_cfg_scale,
-            })
-    
-    # Set up vision model if specified
-    if vision_models:
-        print(f"\nVision model(s): {vision_models}")
-        
-        for idx, vision_m in enumerate(vision_models):
-            multi_model_manager.set_vision_model(vision_m, {
-                'ctx': get_ctx_by_index(args.n_ctx, idx, 0),
-                'offload': args.image_offload,
-            })
-    
-    # Set up TTS model if specified
-    if args.tts_model:
-        print(f"\nText-to-speech model: {args.tts_model}")
-        multi_model_manager.set_tts_model(args.tts_model, {})
-    
-    # Register model aliases if specified
-    if args.model_aliases:
-        print(f"\nRegistering model aliases:")
-        for alias, model in args.model_aliases:
-            multi_model_manager.set_model_alias(alias, model)
-            print(f"  {alias} -> {model}")
-    
-    # =========================================================================
-    # Pre-load non-text models for loadall and loadswap modes
-    # (Text models are already handled above)
-    # =========================================================================
-    if not nopreload and load_mode in ("loadall", "loadswap"):
-        # Collect all non-text models that need pre-loading
-        # For loadall: load all into VRAM (offload to CPU if OOM)
-        # For loadswap: first model in VRAM (already done for text), rest in CPU RAM
-        
-        # Determine if the first text model is already in VRAM
-        first_model_loaded = multi_model_manager.active_in_vram is not None
-        
-        # Pre-load image models
-        if image_models:
-            print(f"\n=== Pre-loading image model(s) ===")
-            for idx, img_m in enumerate(image_models):
-                model_key = f"image:{img_m}"
-                if model_key in multi_model_manager.models:
-                    continue  # Already loaded
-                
-                try:
-                    from codai.api.images import _load_diffusers_pipeline, _is_gguf_model, _load_sdcpp_model
-                    
-                    if load_mode == "loadall":
-                        # Try to load into VRAM
-                        print(f"Preloading image model into VRAM: {img_m}...")
-                        if _is_gguf_model(img_m):
-                            resolved_path = multi_model_manager.load_model(img_m)
-                            if resolved_path and os.path.isfile(resolved_path):
-                                sd_model = _load_sdcpp_model(resolved_path, args)
-                                if sd_model:
-                                    multi_model_manager.add_model(model_key, sd_model)
-                                    print(f"Image model loaded (VRAM, sd.cpp): {img_m}")
-                        else:
-                            try:
-                                pipeline = _load_diffusers_pipeline(img_m, args)
-                                if pipeline:
-                                    multi_model_manager.add_model(model_key, pipeline)
-                                    print(f"Image model loaded (VRAM, diffusers): {img_m}")
-                            except Exception as e:
-                                error_msg = str(e).lower()
-                                is_oom = any(x in error_msg for x in ['out of memory', 'oom', 'cuda error'])
-                                if is_oom:
-                                    print(f"VRAM full for image model {img_m}, will load on demand")
-                                else:
-                                    print(f"Warning: Failed to preload image model {img_m}: {e}")
-                    
-                    elif load_mode == "loadswap":
-                        # Load into VRAM then move to CPU (unless it's the first model overall)
-                        if not first_model_loaded:
-                            # No model in VRAM yet, load this one into VRAM
-                            print(f"Preloading image model into VRAM: {img_m}...")
-                            if _is_gguf_model(img_m):
-                                resolved_path = multi_model_manager.load_model(img_m)
-                                if resolved_path and os.path.isfile(resolved_path):
-                                    sd_model = _load_sdcpp_model(resolved_path, args)
-                                    if sd_model:
-                                        multi_model_manager.add_model(model_key, sd_model)
-                                        first_model_loaded = True
-                                        print(f"Image model loaded (VRAM): {img_m}")
-                            else:
-                                try:
-                                    pipeline = _load_diffusers_pipeline(img_m, args)
-                                    if pipeline:
-                                        multi_model_manager.add_model(model_key, pipeline)
-                                        first_model_loaded = True
-                                        print(f"Image model loaded (VRAM): {img_m}")
-                                except Exception as e:
-                                    print(f"Warning: Failed to preload image model {img_m}: {e}")
-                        else:
-                            # First model already in VRAM, load this to VRAM then move to CPU
-                            print(f"Preloading image model into CPU RAM: {img_m}...")
-                            # Move current VRAM model to CPU temporarily
-                            current_vram = multi_model_manager.active_in_vram
-                            if current_vram and current_vram in multi_model_manager.models:
-                                multi_model_manager._move_model_to_cpu(current_vram)
-                            
-                            try:
-                                if _is_gguf_model(img_m):
-                                    resolved_path = multi_model_manager.load_model(img_m)
-                                    if resolved_path and os.path.isfile(resolved_path):
-                                        sd_model = _load_sdcpp_model(resolved_path, args)
-                                        if sd_model:
-                                            multi_model_manager.add_model(model_key, sd_model)
-                                            multi_model_manager._move_model_to_cpu(model_key)
-                                            print(f"Image model loaded (CPU RAM): {img_m}")
-                                else:
-                                    pipeline = _load_diffusers_pipeline(img_m, args)
-                                    if pipeline:
-                                        multi_model_manager.add_model(model_key, pipeline)
-                                        multi_model_manager._move_model_to_cpu(model_key)
-                                        print(f"Image model loaded (CPU RAM): {img_m}")
-                            except Exception as e:
-                                print(f"Warning: Failed to preload image model {img_m}: {e}")
-                            
-                            # Move original model back to VRAM
-                            if current_vram and current_vram in multi_model_manager.models:
-                                multi_model_manager._move_model_to_vram(current_vram)
-                                multi_model_manager.active_in_vram = current_vram
-                
-                except ImportError as e:
-                    print(f"Warning: Cannot preload image model {img_m} (missing dependency): {e}")
-                except Exception as e:
-                    print(f"Warning: Failed to preload image model {img_m}: {e}")
-        
-        # Note: Audio models (faster-whisper) and TTS models (kokoro) are loaded
-        # by their respective API modules on first request, as they use specialized
-        # loading mechanisms. The model files are already cached by set_audio_model()
-        # and set_tts_model() above.
-        if audio_models:
-            print(f"\nAudio model(s) registered and cached, will load into memory on first request")
-        if args.tts_model:
-            print(f"TTS model registered and cached, will load into memory on first request")
-    
-    # Start the server
-    import uvicorn
-    print(f"\nStarting server on http://{args.host}:{args.port}")
-    print(f"API documentation available at http://{args.host}:{args.port}/docs")
-    
-    if model_manager.backend is not None:
-        actual_backend = model_manager.backend_type
-        if hasattr(model_manager.backend, 'force_cuda') and model_manager.backend.force_cuda:
-            actual_backend = "cuda (via llama-cpp-python)"
-        print(f"Using backend: {actual_backend}")
-    
-    # Print available models
-    models = multi_model_manager.list_models()
-    print(f"Available models: {[m.id for m in models]}")
-    
-    # Run server with or without HTTPS
-    if args.https:
-        import ssl
-        
-        ssl_keyfile = None
-        ssl_certfile = None
-        
-        if args.privkey and args.pubkey:
-            ssl_keyfile = args.privkey
-            ssl_certfile = args.pubkey
-            print(f"Using HTTPS with custom certificates: {args.pubkey}")
-        else:
-            print("Generating self-signed HTTPS certificate...")
-            import subprocess
-            try:
-                cert_path = "./cert.pem"
-                key_path = "./key.pem"
-                subprocess.run([
-                    "openssl", "req", "-x509", "-newkey", "rsa:4096",
-                    "-keyout", key_path, "-out", cert_path,
-                    "-days", "365", "-nodes",
-                    "-subj", "/CN=localhost"
-                ], check=True, capture_output=True)
-                ssl_keyfile = key_path
-                ssl_certfile = cert_path
-                print(f"Generated self-signed certificate: {cert_path}")
-            except Exception as e:
-                print(f"Warning: Could not generate certificate: {e}")
-                print("Falling back to HTTP...")
-                uvicorn.run(app, host=args.host, port=args.port)
-                return
-        
-        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-        ssl_context.load_cert_chain(ssl_certfile, ssl_keyfile)
-        uvicorn.run(app, host=args.host, port=args.port, ssl=ssl_context)
-    else:
-        uvicorn.run(app, host=args.host, port=args.port)
-
-
-if __name__ == "__main__":
-    main()
--- a/requirements-nvidia.txt
+++ b/requirements-nvidia.txt
@@ -15,7 +15,7 @@ psutil>=5.9.0

 # Optional: Audio transcription dependencies
 faster-whisper>=0.10.0  # For NVIDIA/CUDA whisper transcription
-whispercpp>=1.0.0  # Alternative whisper library (works without PyTorch)
+whispercpp>=0.0.17  # Alternative whisper library (works without PyTorch)

 # Optional: for better performance with NVIDIA GPUs
 bitsandbytes>=0.41.0

--- a/requirements-vulkan.txt
+++ b/requirements-vulkan.txt
@@ -17,4 +17,4 @@ huggingface-hub>=0.19.0

 # Optional: Audio transcription without PyTorch (whispercpp)
 # Note: faster-whisper requires PyTorch, but whispercpp works without it
-whispercpp>=1.0.0  # For GGUF-based Whisper transcription without PyTorch
+whispercpp>=0.0.17  # For GGUF-based Whisper transcription without PyTorch
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,15 +36,15 @@ transformers>=4.35.0
 accelerate>=0.24.0
 diffusers>=0.25.0  # For Stable Diffusion image generation
 safetensors>=0.4.0  # Required by diffusers
-stable-diffusion-cpp-python  # For Vulkan/AMD image generation (no version pin for Python 3.13 compat)
+# stable-diffusion-cpp-python is installed by build.sh with CMAKE_ARGS to fix the libwebm submodule issue

 # System resource detection
 psutil>=5.9.0
-procname  # Process naming (no version pin for Python 3.13 compatibility)
+setproctitle>=1.1  # Process naming (replaces procname, Python 3.13 compatible)

 # Optional: Audio transcription dependencies
 faster-whisper>=0.10.0  # For NVIDIA/CUDA whisper transcription
-whispercpp>=0.0.6  # Alternative whisper library (works without PyTorch)
+whispercpp>=0.0.17  # Alternative whisper library (works without PyTorch)

 # LiteLLM for standardized API responses
 litellm>=1.40.0