Enable driver switching for local workers and show master weight

- Display actual cluster master weight instead of 'N/A' for local node
- Implement driver switching for local workers via modal popup
- Add switch_local_worker_backends() function to restart workers with new backends
- Update API endpoint to handle local worker driver changes
- Add CPU option to driver selection modal
- Local workers can now switch between CUDA, ROCm, and CPU backends dynamically
- Workers are terminated and restarted with new backend configuration
parent fb7ad973
......@@ -131,6 +131,7 @@
<select id="driverSelect" name="driver">
<option value="cuda">CUDA</option>
<option value="rocm">ROCm</option>
<option value="cpu">CPU</option>
</select>
</div>
<div class="modal-footer">
......@@ -192,7 +193,7 @@ function renderNodesTable() {
</td>
<td>${node.token_name}${node.backend ? ` (${node.backend.toUpperCase()})` : ''}</td>
<td>${node.hostname}</td>
<td>${node.is_local ? 'N/A' : (node.weight || 100)}</td>
<td>${node.weight || 100}</td>
<td>${node.gpus}</td>
<td>
${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'}
......@@ -207,10 +208,7 @@ function renderNodesTable() {
<td>${node.active_jobs || 0}</td>
<td>${node.completed_jobs || 0}</td>
<td>
${node.is_local ?
'<span class="local-worker-note">Local worker - restart required</span>' :
`<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>`
}
<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>
</td>
</tr>
`).join('');
......
......@@ -611,22 +611,100 @@ def detect_local_workers():
@app.route('/api/admin/cluster_nodes/set_driver', methods=['POST'])
@admin_required
def api_set_client_driver():
    """API endpoint to set driver preference for a client or local workers.

    Reads ``hostname``, ``token`` and ``driver`` from the POSTed form.

    * ``hostname`` and ``driver`` are always required; ``driver`` must be one
      of ``cuda``, ``rocm`` or ``cpu``.
    * A ``token`` equal to ``'local'`` selects the local workers, which are
      switched via ``switch_local_worker_backends``.
    * Any other request is treated as a remote client and must carry a
      non-empty ``token``; the preference is stored through
      ``set_client_driver_preference``.

    Returns:
        ``{'success': bool}`` on a handled request, or a
        ``({'success': False, 'error': str}, 400)`` pair when validation fails.
    """
    hostname = request.form.get('hostname')
    token = request.form.get('token')
    driver = request.form.get('driver')

    # The token is validated later: only the remote-client path needs it.
    if not hostname or not driver:
        return {'success': False, 'error': 'Missing required parameters'}, 400

    if driver not in ['cuda', 'rocm', 'cpu']:
        return {'success': False, 'error': 'Invalid driver'}, 400

    # Handle local workers
    if token == 'local':
        success = switch_local_worker_backends(driver)
        return {'success': success}

    # Handle remote clients
    if not token:
        return {'success': False, 'error': 'Missing token for remote client'}, 400

    # Imported lazily so the local-worker path does not touch the database module.
    from .database import set_client_driver_preference
    success = set_client_driver_preference(hostname, token, driver)
    return {'success': success}
def switch_local_worker_backends(new_backend):
    """Switch local worker processes to use a different backend.

    The requested backend is resolved first. If it is not reported by
    ``vidai.compat.get_available_backends()`` the first available backend is
    used instead; if none is available the function returns ``False`` and
    leaves the currently running workers untouched.

    Once a usable backend is known, every Python process whose command line
    mentions ``vidai.worker_analysis`` or ``vidai.worker_training`` is asked
    to terminate, given up to two seconds to exit, and killed if it is still
    alive. One analysis worker and one training worker are then started with
    ``<python> -m <module> <backend>``.

    Args:
        new_backend: Name of the backend to switch to (the API caller passes
            ``cuda``, ``rocm`` or ``cpu``).

    Returns:
        ``True`` if both workers were started, ``False`` on any failure.
    """
    import subprocess
    import sys

    import psutil

    # (label used in log messages, module run with ``python -m``)
    worker_modules = (
        ('analysis', 'vidai.worker_analysis'),
        ('training', 'vidai.worker_training'),
    )

    try:
        # Resolve the backend BEFORE stopping anything, so a host with no
        # usable backend keeps its current workers instead of losing them.
        from vidai.compat import get_available_backends
        available_backends = get_available_backends()
        if new_backend not in available_backends:
            print(f"Warning: {new_backend} backend not available, available: {available_backends}")
            # Fall back to the first available backend, if there is one.
            # NOTE(review): assumes get_available_backends() returns an
            # iterable of backend names - confirm against vidai.compat.
            fallback = next(iter(available_backends), None)
            if fallback is None:
                return False
            new_backend = fallback

        # Find and terminate existing local worker processes
        workers = []
        for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
            try:
                name = proc.info['name']
                if not name or 'python' not in name.lower():
                    continue
                cmdline = ' '.join(proc.info.get('cmdline') or [])
                if any(module in cmdline for _, module in worker_modules):
                    proc.terminate()
                    workers.append(proc)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        # Wait (at most 2 s) for the workers to exit, then force kill the
        # stragglers. Holding the Process objects instead of re-looking-up
        # PIDs avoids signalling an unrelated process that reused a PID.
        _, still_alive = psutil.wait_procs(workers, timeout=2)
        for proc in still_alive:
            try:
                proc.kill()
            except psutil.NoSuchProcess:
                pass

        # Start new worker processes with the new backend
        for label, module in worker_modules:
            try:
                subprocess.Popen([sys.executable, '-m', module, new_backend])
                print(f"Started {label} worker with {new_backend} backend")
            except (OSError, ValueError) as e:
                print(f"Failed to start {label} worker: {e}")
                return False

        return True

    except Exception as e:
        # Boundary for the HTTP handler: report failure instead of raising.
        print(f"Error switching local worker backends: {e}")
        return False
@app.route('/api_tokens')
@login_required
def api_tokens():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment