Restrict driver selection to available GPU backends only

- Removed CPU option from driver selection (only CUDA/ROCm GPU drivers)
- Set CUDA as default driver selection when available
- Added available_gpu_backends field to node API responses
- Frontend dynamically populates driver options based on node's available GPUs
- API validation rejects non-GPU driver requests
- Cluster clients only accept CUDA/ROCm backend restart commands
- Improved user experience by showing only relevant driver options per node
parent bedc1de9
......@@ -127,11 +127,9 @@
<input type="hidden" id="modalHostnameInput" name="hostname">
<input type="hidden" id="modalTokenInput" name="token">
<div class="form-group">
<label for="driverSelect">Preferred Driver:</label>
<label for="driverSelect">Preferred GPU Driver:</label>
<select id="driverSelect" name="driver">
<option value="cuda">CUDA</option>
<option value="rocm">ROCm</option>
<option value="cpu">CPU</option>
<!-- Options will be populated dynamically -->
</select>
</div>
<div class="modal-footer">
......@@ -215,13 +213,39 @@ function renderNodesTable() {
`).join('');
}
function openDriverModal(hostname, tokenName, hostnameValue) {
// Find the token for this hostname (simplified - in real implementation, pass token)
// For now, we'll use tokenName as token, but actually need the actual token string
// This is a placeholder - you'd need to modify the API to include token
function openDriverModal(hostname, token, hostnameValue) {
// Find the node data to get available GPU backends
const node = nodesData.find(n => n.hostname === hostname && n.token === token);
if (!node) {
console.error('Node not found:', hostname, token);
return;
}
document.getElementById('modalHostname').textContent = hostname;
document.getElementById('modalHostnameInput').value = hostnameValue;
document.getElementById('modalTokenInput').value = tokenName; // This should be the actual token, not name
document.getElementById('modalTokenInput').value = token;
// Populate driver options based on available GPU backends
const driverSelect = document.getElementById('driverSelect');
driverSelect.innerHTML = '';
const availableBackends = node.available_gpu_backends || [];
if (availableBackends.length === 0) {
// Fallback for nodes without backend info
availableBackends.push('cuda', 'rocm');
}
availableBackends.forEach(backend => {
const option = document.createElement('option');
option.value = backend;
option.textContent = backend.toUpperCase();
driverSelect.appendChild(option);
});
// Set CUDA as default if available, otherwise first available
const defaultBackend = availableBackends.includes('cuda') ? 'cuda' : availableBackends[0];
driverSelect.value = defaultBackend;
document.getElementById('driverModal').style.display = 'block';
}
......
......@@ -363,8 +363,8 @@ class ClusterClient:
"""Handle restart workers command from master."""
backend = message.get('backend', 'cuda')
if backend not in ['cuda', 'rocm', 'cpu']:
print(f"Invalid backend requested: {backend}")
if backend not in ['cuda', 'rocm']:
print(f"Invalid backend requested: {backend} - only CUDA and ROCm supported")
return
print(f"Restarting workers with {backend} backend")
......
......@@ -419,12 +419,19 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at
# Detect mixed GPU availability
# Detect mixed GPU availability and available backends
gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False)
has_rocm = gpu_info.get('rocm_available', False)
mixed_gpu = has_cuda and has_rocm
# Determine available GPU backends for this node
available_gpu_backends = []
if has_cuda:
available_gpu_backends.append('cuda')
if has_rocm:
available_gpu_backends.append('rocm')
node_map[node_key] = {
'token': token,
'token_name': token_name,
......@@ -441,6 +448,7 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100),
'is_local': False,
'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends,
'workers': [] # Will collect worker details
}
......@@ -507,6 +515,10 @@ def api_cluster_nodes():
# Use the longest uptime as representative
max_uptime = max((w.get('uptime_seconds', 0) for w in local_workers), default=0)
# Get available GPU backends for local system
from vidai.compat import get_available_backends
local_available_gpu_backends = [b for b in get_available_backends() if b in ['cuda', 'rocm']]
local_node = {
'token': 'local',
'token_name': 'Local Master Node',
......@@ -524,7 +536,9 @@ def api_cluster_nodes():
'active_jobs': 0, # Placeholder
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True
'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends
}
nodes.append(local_node)
......@@ -626,8 +640,8 @@ def api_set_client_driver():
if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm', 'cpu']:
return {'success': False, 'error': 'Invalid driver'}, 400
if driver not in ['cuda', 'rocm']:
return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400
# Handle local workers
if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment