Allow CPU-only cluster clients and flexible backend support

- Removed GPU-only requirement for cluster client connections
- CPU-only clients can now join cluster and run CPU-based workers
- Master accepts all clients regardless of GPU availability
- Nodes are properly marked as CPU-only when no GPUs detected
- Driver selection modal supports CUDA, ROCm, and CPU backends
- Local and remote workers can use any available backend (GPU or CPU)
- Enhanced cluster flexibility for mixed hardware environments
- CPU nodes contribute to cluster for CPU-only processing tasks
- Maintains backward compatibility with existing GPU-only workflows
- Clear node type identification in cluster management interface
parent f57a1468
......@@ -214,7 +214,7 @@ function renderNodesTable() {
}
function openDriverModal(hostname, token, hostnameValue) {
// Find the node data to get available GPU backends
// Find the node data to get available backends
const node = nodesData.find(n => n.hostname === hostname && n.token === token);
if (!node) {
console.error('Node not found:', hostname, token);
......@@ -225,15 +225,19 @@ function openDriverModal(hostname, token, hostnameValue) {
document.getElementById('modalHostnameInput').value = hostnameValue;
document.getElementById('modalTokenInput').value = token;
// Populate driver options based on available GPU backends
// Populate driver options based on available backends (GPU and CPU)
const driverSelect = document.getElementById('driverSelect');
driverSelect.innerHTML = '';
const availableBackends = node.available_gpu_backends || [];
const availableBackends = node.available_backends || [];
if (availableBackends.length === 0) {
// Fallback for nodes without backend info
if (node.is_cpu_only) {
availableBackends.push('cpu');
} else {
availableBackends.push('cuda', 'rocm');
}
}
availableBackends.forEach(backend => {
const option = document.createElement('option');
......
......@@ -54,17 +54,6 @@ class ClusterClient:
async def connect(self) -> bool:
"""Connect to cluster master via secure websocket."""
try:
# Detect available backends first
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Check if we have any GPU backends available
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
if not gpu_backends:
print("No GPU backends detected (CUDA/ROCm). Cluster client requires GPU capabilities to connect.")
return False
# Create SSL context that accepts self-signed certificates
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
......@@ -73,6 +62,11 @@ class ClusterClient:
uri = f"wss://{self.host}:{self.port}/cluster"
self.websocket = await websockets.connect(uri, ssl=ssl_context)
# Detect available backends
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Get hostname
import socket
hostname = socket.gethostname()
......@@ -139,16 +133,16 @@ class ClusterClient:
self.connected = False
async def start_local_processes(self) -> None:
"""Start local worker processes based on available GPU backends."""
"""Start local worker processes based on available backends (GPU and CPU)."""
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
print(f"Client GPU detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}")
print(f"Client backend detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}")
print(f"Available backends: {available_backends}")
# Start analysis workers for available backends
# Start analysis workers for available backends (including CPU)
for backend in available_backends:
proc_name = f'analysis_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_analysis', backend]
......@@ -161,7 +155,7 @@ class ClusterClient:
self.process_models[proc_name] = 'Qwen/Qwen2.5-VL-7B-Instruct'
print(f"Started analysis worker for {backend}")
# Start training workers for available backends
# Start training workers for available backends (including CPU)
for backend in available_backends:
proc_name = f'training_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_training', backend]
......
......@@ -194,16 +194,14 @@ class ClusterMaster:
if not token:
return {'type': 'auth_failed', 'message': 'No token provided'}
# Check if client has GPU capabilities
# Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
# Check GPU capabilities for logging
gpu_info = client_info.get('gpu_info', {})
available_backends = gpu_info.get('available_backends', [])
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
if not gpu_backends:
return {'type': 'auth_failed', 'message': 'Client must have GPU capabilities (CUDA or ROCm) to join cluster'}
# Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
has_gpu = len(gpu_backends) > 0
# Store client info including GPU capabilities and weight
self.clients[client_id] = {
......@@ -222,7 +220,8 @@ class ClusterMaster:
self.weight = 0
print("First client connected - changing master weight to 0 (automatic)")
print(f"Client {client_id} authenticated with GPU backends: {gpu_backends}")
backend_type = "GPU" if has_gpu else "CPU-only"
print(f"Client {client_id} authenticated ({backend_type}) with backends: {available_backends}")
return {'type': 'auth_success', 'client_id': client_id}
def _handle_register_processes(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]:
......@@ -604,6 +603,7 @@ class ClusterMaster:
return False
if backend not in ['cuda', 'rocm', 'cpu']:
print(f"Invalid backend requested: {backend} - only CUDA, ROCm, and CPU supported")
return False
# Send restart command to client
......
......@@ -1535,7 +1535,7 @@ def get_client_driver_preference(hostname: str, token: str) -> str:
def set_client_driver_preference(hostname: str, token: str, driver: str) -> bool:
"""Set the preferred driver for a client (hostname + token)."""
if driver not in ['cuda', 'rocm']:
if driver not in ['cuda', 'rocm', 'cpu']:
return False
conn = get_db_connection()
......
......@@ -419,18 +419,17 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at
# Detect mixed GPU availability and available backends
# Detect GPU capabilities and available backends
gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False)
has_rocm = gpu_info.get('rocm_available', False)
mixed_gpu = has_cuda and has_rocm
available_backends = gpu_info.get('available_backends', [])
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
cpu_backends = [b for b in available_backends if b == 'cpu']
# Determine available GPU backends for this node
available_gpu_backends = []
if has_cuda:
available_gpu_backends.append('cuda')
if has_rocm:
available_gpu_backends.append('rocm')
has_cuda = 'cuda' in gpu_backends
has_rocm = 'rocm' in gpu_backends
has_cpu = len(cpu_backends) > 0
mixed_gpu = has_cuda and has_rocm
is_cpu_only = not gpu_backends and has_cpu
node_map[node_key] = {
'token': token,
......@@ -448,7 +447,8 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100),
'is_local': False,
'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends,
'is_cpu_only': is_cpu_only,
'available_backends': available_backends,
'workers': [] # Will collect worker details
}
......@@ -537,8 +537,9 @@ def api_cluster_nodes():
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends
'mixed_gpu': len(local_gpu_backends) > 1,
'is_cpu_only': not local_gpu_backends and local_cpu_backends,
'available_backends': local_available_backends
}
nodes.append(local_node)
......@@ -640,8 +641,8 @@ def api_set_client_driver():
if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm']:
return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400
if driver not in ['cuda', 'rocm', 'cpu']:
return {'success': False, 'error': 'Invalid driver - only CUDA, ROCm, and CPU are supported'}, 400
# Handle local workers
if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment