Restrict driver selection to available GPU backends only

- Removed CPU option from driver selection (only CUDA/ROCm GPU drivers)
- Set CUDA as default driver selection when available
- Added available_gpu_backends field to node API responses
- Frontend dynamically populates driver options based on node's available GPUs
- API validation rejects non-GPU driver requests
- Cluster clients only accept CUDA/ROCm backend restart commands
- Improved user experience by showing only relevant driver options per node
parent bedc1de9
...@@ -127,11 +127,9 @@ ...@@ -127,11 +127,9 @@
<input type="hidden" id="modalHostnameInput" name="hostname"> <input type="hidden" id="modalHostnameInput" name="hostname">
<input type="hidden" id="modalTokenInput" name="token"> <input type="hidden" id="modalTokenInput" name="token">
<div class="form-group"> <div class="form-group">
<label for="driverSelect">Preferred Driver:</label> <label for="driverSelect">Preferred GPU Driver:</label>
<select id="driverSelect" name="driver"> <select id="driverSelect" name="driver">
<option value="cuda">CUDA</option> <!-- Options will be populated dynamically -->
<option value="rocm">ROCm</option>
<option value="cpu">CPU</option>
</select> </select>
</div> </div>
<div class="modal-footer"> <div class="modal-footer">
...@@ -215,13 +213,39 @@ function renderNodesTable() { ...@@ -215,13 +213,39 @@ function renderNodesTable() {
`).join(''); `).join('');
} }
function openDriverModal(hostname, token, hostnameValue) {
    // Look up the node entry so we can read its available GPU backends.
    const node = nodesData.find(n => n.hostname === hostname && n.token === token);
    if (!node) {
        console.error('Node not found:', hostname, token);
        return;
    }

    document.getElementById('modalHostname').textContent = hostname;
    document.getElementById('modalHostnameInput').value = hostnameValue;
    document.getElementById('modalTokenInput').value = token;

    // Rebuild the driver <select> from the node's reported GPU backends.
    const driverSelect = document.getElementById('driverSelect');
    driverSelect.innerHTML = '';

    // Copy the array so the fallback push() below cannot mutate the shared
    // node data held in nodesData (the original aliased it directly).
    const availableBackends = (node.available_gpu_backends || []).slice();
    if (availableBackends.length === 0) {
        // Fallback for nodes that did not report backend info.
        availableBackends.push('cuda', 'rocm');
    }

    availableBackends.forEach(backend => {
        const option = document.createElement('option');
        option.value = backend;
        option.textContent = backend.toUpperCase();
        driverSelect.appendChild(option);
    });

    // Default to CUDA when present, otherwise the first available backend.
    const defaultBackend = availableBackends.includes('cuda') ? 'cuda' : availableBackends[0];
    driverSelect.value = defaultBackend;

    document.getElementById('driverModal').style.display = 'block';
}
......
...@@ -363,8 +363,8 @@ class ClusterClient: ...@@ -363,8 +363,8 @@ class ClusterClient:
"""Handle restart workers command from master.""" """Handle restart workers command from master."""
backend = message.get('backend', 'cuda') backend = message.get('backend', 'cuda')
if backend not in ['cuda', 'rocm', 'cpu']: if backend not in ['cuda', 'rocm']:
print(f"Invalid backend requested: {backend}") print(f"Invalid backend requested: {backend} - only CUDA and ROCm supported")
return return
print(f"Restarting workers with {backend} backend") print(f"Restarting workers with {backend} backend")
......
...@@ -419,12 +419,19 @@ def api_cluster_nodes(): ...@@ -419,12 +419,19 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time) connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at uptime_seconds = current_time - connected_at
# Detect mixed GPU availability # Detect mixed GPU availability and available backends
gpu_info = client_info.get('gpu_info', {}) gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False) has_cuda = gpu_info.get('cuda_available', False)
has_rocm = gpu_info.get('rocm_available', False) has_rocm = gpu_info.get('rocm_available', False)
mixed_gpu = has_cuda and has_rocm mixed_gpu = has_cuda and has_rocm
# Determine available GPU backends for this node
available_gpu_backends = []
if has_cuda:
available_gpu_backends.append('cuda')
if has_rocm:
available_gpu_backends.append('rocm')
node_map[node_key] = { node_map[node_key] = {
'token': token, 'token': token,
'token_name': token_name, 'token_name': token_name,
...@@ -441,6 +448,7 @@ def api_cluster_nodes(): ...@@ -441,6 +448,7 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100), 'weight': client_info.get('weight', 100),
'is_local': False, 'is_local': False,
'mixed_gpu': mixed_gpu, 'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends,
'workers': [] # Will collect worker details 'workers': [] # Will collect worker details
} }
...@@ -507,6 +515,10 @@ def api_cluster_nodes(): ...@@ -507,6 +515,10 @@ def api_cluster_nodes():
# Use the longest uptime as representative # Use the longest uptime as representative
max_uptime = max((w.get('uptime_seconds', 0) for w in local_workers), default=0) max_uptime = max((w.get('uptime_seconds', 0) for w in local_workers), default=0)
# Get available GPU backends for local system
from vidai.compat import get_available_backends
local_available_gpu_backends = [b for b in get_available_backends() if b in ['cuda', 'rocm']]
local_node = { local_node = {
'token': 'local', 'token': 'local',
'token_name': 'Local Master Node', 'token_name': 'Local Master Node',
...@@ -524,7 +536,9 @@ def api_cluster_nodes(): ...@@ -524,7 +536,9 @@ def api_cluster_nodes():
'active_jobs': 0, # Placeholder 'active_jobs': 0, # Placeholder
'completed_jobs': 0, # Placeholder 'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing 'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True 'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends
} }
nodes.append(local_node) nodes.append(local_node)
...@@ -626,8 +640,8 @@ def api_set_client_driver(): ...@@ -626,8 +640,8 @@ def api_set_client_driver():
if not hostname or not driver: if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400 return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm', 'cpu']: if driver not in ['cuda', 'rocm']:
return {'success': False, 'error': 'Invalid driver'}, 400 return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400
# Handle local workers # Handle local workers
if token == 'local': if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment