Restrict driver selection to available GPU backends only

- Removed CPU option from driver selection (only CUDA/ROCm GPU drivers)
- Set CUDA as default driver selection when available
- Added available_gpu_backends field to node API responses
- Frontend dynamically populates driver options based on node's available GPUs
- API validation rejects non-GPU driver requests
- Cluster clients only accept CUDA/ROCm backend restart commands
- Improved user experience by showing only relevant driver options per node
parent bedc1de9
......@@ -127,11 +127,9 @@
<input type="hidden" id="modalHostnameInput" name="hostname">
<input type="hidden" id="modalTokenInput" name="token">
<div class="form-group">
<label for="driverSelect">Preferred Driver:</label>
<label for="driverSelect">Preferred GPU Driver:</label>
<select id="driverSelect" name="driver">
<option value="cuda">CUDA</option>
<option value="rocm">ROCm</option>
<option value="cpu">CPU</option>
<!-- Options will be populated dynamically -->
</select>
</div>
<div class="modal-footer">
......@@ -215,13 +213,39 @@ function renderNodesTable() {
`).join('');
}
function openDriverModal(hostname, tokenName, hostnameValue) {
// Find the token for this hostname (simplified - in real implementation, pass token)
// For now, we'll use tokenName as token, but actually need the actual token string
// This is a placeholder - you'd need to modify the API to include token
function openDriverModal(hostname, token, hostnameValue) {
// Find the node data to get available GPU backends
const node = nodesData.find(n => n.hostname === hostname && n.token === token);
if (!node) {
console.error('Node not found:', hostname, token);
return;
}
document.getElementById('modalHostname').textContent = hostname;
document.getElementById('modalHostnameInput').value = hostnameValue;
document.getElementById('modalTokenInput').value = tokenName; // This should be the actual token, not name
document.getElementById('modalTokenInput').value = token;
// Populate driver options based on available GPU backends
const driverSelect = document.getElementById('driverSelect');
driverSelect.innerHTML = '';
const availableBackends = node.available_gpu_backends || [];
if (availableBackends.length === 0) {
// Fallback for nodes without backend info
availableBackends.push('cuda', 'rocm');
}
availableBackends.forEach(backend => {
const option = document.createElement('option');
option.value = backend;
option.textContent = backend.toUpperCase();
driverSelect.appendChild(option);
});
// Set CUDA as default if available, otherwise first available
const defaultBackend = availableBackends.includes('cuda') ? 'cuda' : availableBackends[0];
driverSelect.value = defaultBackend;
document.getElementById('driverModal').style.display = 'block';
}
......
......@@ -363,8 +363,8 @@ class ClusterClient:
"""Handle restart workers command from master."""
backend = message.get('backend', 'cuda')
if backend not in ['cuda', 'rocm', 'cpu']:
print(f"Invalid backend requested: {backend}")
if backend not in ['cuda', 'rocm']:
print(f"Invalid backend requested: {backend} - only CUDA and ROCm supported")
return
print(f"Restarting workers with {backend} backend")
......
......@@ -419,12 +419,19 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at
# Detect mixed GPU availability
# Detect mixed GPU availability and available backends
gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False)
has_rocm = gpu_info.get('rocm_available', False)
mixed_gpu = has_cuda and has_rocm
# Determine available GPU backends for this node
available_gpu_backends = []
if has_cuda:
available_gpu_backends.append('cuda')
if has_rocm:
available_gpu_backends.append('rocm')
node_map[node_key] = {
'token': token,
'token_name': token_name,
......@@ -441,6 +448,7 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100),
'is_local': False,
'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends,
'workers': [] # Will collect worker details
}
......@@ -507,6 +515,10 @@ def api_cluster_nodes():
# Use the longest uptime as representative
max_uptime = max((w.get('uptime_seconds', 0) for w in local_workers), default=0)
# Get available GPU backends for local system
from vidai.compat import get_available_backends
local_available_gpu_backends = [b for b in get_available_backends() if b in ['cuda', 'rocm']]
local_node = {
'token': 'local',
'token_name': 'Local Master Node',
......@@ -524,7 +536,9 @@ def api_cluster_nodes():
'active_jobs': 0, # Placeholder
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True
'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends
}
nodes.append(local_node)
......@@ -626,8 +640,8 @@ def api_set_client_driver():
if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm', 'cpu']:
return {'success': False, 'error': 'Invalid driver'}, 400
if driver not in ['cuda', 'rocm']:
return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400
# Handle local workers
if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment