Enable driver switching for local workers and show master weight

- Display actual cluster master weight instead of 'N/A' for local node
- Implement driver switching for local workers via modal popup
- Add switch_local_worker_backends() function to restart workers with new backends
- Update API endpoint to handle local worker driver changes (see the example request after this list)
- Add CPU option to driver selection modal
- Local workers can now switch between CUDA, ROCm, and CPU backends dynamically
- Workers are terminated and restarted with new backend configuration
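
As a rough illustration (not part of the diff below), the new endpoint can be exercised with a plain form POST. The route path and the hostname/token/driver fields come from this commit; the base URL, example hostname, and admin session cookie are placeholders. Sending token='local' takes the local-worker restart path; any other token is treated as a remote client and stored via set_client_driver_preference.

# Hypothetical admin request against the new endpoint; the host, port and
# session cookie below are assumptions, only the path and form fields are
# taken from the route added in this commit.
import requests

resp = requests.post(
    'http://localhost:5000/api/admin/cluster_nodes/set_driver',  # assumed base URL
    data={
        'hostname': 'workstation-01',  # node hostname as listed in the cluster table
        'token': 'local',              # 'local' restarts the local workers in place
        'driver': 'cpu',               # one of: cuda, rocm, cpu
    },
    cookies={'session': '<admin session cookie>'},  # @admin_required expects an admin login
)
print(resp.json())  # {'success': True} when the workers were switched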
parent fb7ad973
@@ -131,6 +131,7 @@
             <select id="driverSelect" name="driver">
                 <option value="cuda">CUDA</option>
                 <option value="rocm">ROCm</option>
+                <option value="cpu">CPU</option>
             </select>
         </div>
         <div class="modal-footer">
@@ -192,7 +193,7 @@ function renderNodesTable() {
             </td>
             <td>${node.token_name}${node.backend ? ` (${node.backend.toUpperCase()})` : ''}</td>
             <td>${node.hostname}</td>
-            <td>${node.is_local ? 'N/A' : (node.weight || 100)}</td>
+            <td>${node.weight || 100}</td>
             <td>${node.gpus}</td>
             <td>
                 ${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'}
@@ -207,10 +208,7 @@ function renderNodesTable() {
             <td>${node.active_jobs || 0}</td>
             <td>${node.completed_jobs || 0}</td>
             <td>
-                ${node.is_local ?
-                    '<span class="local-worker-note">Local worker - restart required</span>' :
-                    `<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>`
-                }
+                <button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>
             </td>
         </tr>
     `).join('');
...
@@ -611,22 +611,100 @@ def detect_local_workers():
 @app.route('/api/admin/cluster_nodes/set_driver', methods=['POST'])
 @admin_required
 def api_set_client_driver():
-    """API endpoint to set driver preference for a client."""
-    from .database import set_client_driver_preference
+    """API endpoint to set driver preference for a client or local workers."""
     hostname = request.form.get('hostname')
     token = request.form.get('token')
     driver = request.form.get('driver')
 
-    if not hostname or not token or not driver:
+    if not hostname or not driver:
         return {'success': False, 'error': 'Missing required parameters'}, 400
 
-    if driver not in ['cuda', 'rocm']:
+    if driver not in ['cuda', 'rocm', 'cpu']:
         return {'success': False, 'error': 'Invalid driver'}, 400
 
+    # Handle local workers
+    if token == 'local':
+        success = switch_local_worker_backends(driver)
+        return {'success': success}
+
+    # Handle remote clients
+    if not token:
+        return {'success': False, 'error': 'Missing token for remote client'}, 400
+
+    from .database import set_client_driver_preference
     success = set_client_driver_preference(hostname, token, driver)
     return {'success': success}
 
 
+def switch_local_worker_backends(new_backend):
+    """Switch local worker processes to use a different backend."""
+    import psutil
+    import subprocess
+    import sys
+
+    try:
+        # Find and terminate existing local worker processes
+        terminated_pids = []
+        for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
+            try:
+                if proc.info['name'] and 'python' in proc.info['name'].lower():
+                    cmdline = proc.info.get('cmdline', [])
+                    if cmdline and ('vidai.worker_analysis' in ' '.join(cmdline) or
+                                    'vidai.worker_training' in ' '.join(cmdline)):
+                        proc.terminate()
+                        terminated_pids.append(proc.info['pid'])
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                continue
+
+        # Wait for processes to terminate
+        import time
+        time.sleep(2)
+
+        # Force kill if still running
+        for pid in terminated_pids:
+            try:
+                proc = psutil.Process(pid)
+                if proc.is_running():
+                    proc.kill()
+            except psutil.NoSuchProcess:
+                pass
+
+        # Start new worker processes with the new backend
+        from vidai.compat import get_available_backends
+        available_backends = get_available_backends()
+
+        if new_backend not in available_backends:
+            print(f"Warning: {new_backend} backend not available, available: {available_backends}")
+            # Try to start with available backends instead
+            backends_to_use = [b for b in available_backends if b != new_backend][:1]  # Use first available
+            if not backends_to_use:
+                return False
+            new_backend = backends_to_use[0]
+
+        # Start analysis worker
+        try:
+            cmd = [sys.executable, '-m', 'vidai.worker_analysis', new_backend]
+            subprocess.Popen(cmd)
+            print(f"Started analysis worker with {new_backend} backend")
+        except Exception as e:
+            print(f"Failed to start analysis worker: {e}")
+            return False
+
+        # Start training worker
+        try:
+            cmd = [sys.executable, '-m', 'vidai.worker_training', new_backend]
+            subprocess.Popen(cmd)
+            print(f"Started training worker with {new_backend} backend")
+        except Exception as e:
+            print(f"Failed to start training worker: {e}")
+            return False
+
+        return True
+
+    except Exception as e:
+        print(f"Error switching local worker backends: {e}")
+        return False
+
+
 @app.route('/api_tokens')
 @login_required
 def api_tokens():
...
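
A quick way to confirm a switch took effect (again only a sketch, not part of this commit) is to reuse the same command-line match that switch_local_worker_backends() applies and print the backend argument the restarted workers were launched with:

# Minimal verification sketch; assumes psutil is installed, as it already is
# for the restart code above.
import psutil

for proc in psutil.process_iter(['pid', 'cmdline']):
    cmdline = proc.info.get('cmdline') or []
    joined = ' '.join(cmdline)
    if 'vidai.worker_analysis' in joined or 'vidai.worker_training' in joined:
        # The restart code passes the backend (cuda/rocm/cpu) as the last argument.
        print(proc.info['pid'], cmdline[-1])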