Add local worker processes to cluster nodes display

- Detect running local worker processes on cluster master using psutil
- Include local workers in cluster nodes API response with distinct styling
- Show local workers with blue background and 'Local' status indicator
- Display backend information (CUDA/ROCm) in worker names
- Indicate that local workers require manual restart for driver changes
- Update API documentation with local worker response format
- Local workers show N/A for weight since they don't participate in cluster load balancing
parent 1c9ae89a
...@@ -33,6 +33,11 @@ ...@@ -33,6 +33,11 @@
.modal-footer { text-align: right; margin-top: 2rem; } .modal-footer { text-align: right; margin-top: 2rem; }
.btn-secondary { background: #6b7280; } .btn-secondary { background: #6b7280; }
.btn-secondary:hover { background: #4b5563; } .btn-secondary:hover { background: #4b5563; }
/* Local worker styling */
.local-worker-row { background-color: #f0f9ff; }
.local-worker-row td { border-color: #bae6fd; }
.local-worker-note { color: #64748b; font-size: 0.875rem; font-style: italic; }
</style> </style>
{% endblock %} {% endblock %}
...@@ -179,15 +184,15 @@ function renderNodesTable() { ...@@ -179,15 +184,15 @@ function renderNodesTable() {
} }
tbody.innerHTML = nodesData.map(node => ` tbody.innerHTML = nodesData.map(node => `
<tr> <tr class="${node.is_local ? 'local-worker-row' : ''}">
<td> <td>
<span class="status-${node.connected ? 'connected' : 'disconnected'}"> <span class="status-${node.connected ? 'connected' : 'disconnected'}">
${node.connected ? '● Connected' : '● Disconnected'} ${node.connected ? (node.is_local ? '● Local' : '● Connected') : '● Disconnected'}
</span> </span>
</td> </td>
<td>${node.token_name}</td> <td>${node.token_name}${node.backend ? ` (${node.backend.toUpperCase()})` : ''}</td>
<td>${node.hostname}</td> <td>${node.hostname}</td>
<td>${node.weight || 100}</td> <td>${node.is_local ? 'N/A' : (node.weight || 100)}</td>
<td>${node.gpus}</td> <td>${node.gpus}</td>
<td> <td>
${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'} ${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'}
...@@ -199,9 +204,10 @@ function renderNodesTable() { ...@@ -199,9 +204,10 @@ function renderNodesTable() {
<td>${node.active_jobs || 0}</td> <td>${node.active_jobs || 0}</td>
<td>${node.completed_jobs || 0}</td> <td>${node.completed_jobs || 0}</td>
<td> <td>
<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')"> ${node.is_local ?
Set Driver '<span class="local-worker-note">Local worker - restart required</span>' :
</button> `<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>`
}
</td> </td>
</tr> </tr>
`).join(''); `).join('');
......
...@@ -455,7 +455,26 @@ ...@@ -455,7 +455,26 @@
"last_seen": 1640995300.0, "last_seen": 1640995300.0,
"uptime_seconds": 1800.5, "uptime_seconds": 1800.5,
"active_jobs": 1, "active_jobs": 1,
"completed_jobs": 5 "completed_jobs": 5,
"is_local": false
},
{
"token": "local",
"token_name": "Local Analysis Worker (CUDA)",
"hostname": "localhost",
"weight": 0,
"gpus": 1,
"gpu_memory": ["CUDA Device 0: 8GB VRAM"],
"total_memory": 8,
"workers_available": 1,
"ip_address": "127.0.0.1",
"connected": true,
"last_seen": 1640995300.0,
"uptime_seconds": 3600.0,
"active_jobs": 0,
"completed_jobs": 0,
"is_local": true,
"backend": "cuda"
} }
] ]
}</div> }</div>
......
...@@ -444,7 +444,30 @@ def api_cluster_nodes(): ...@@ -444,7 +444,30 @@ def api_cluster_nodes():
'uptime_seconds': uptime_seconds, 'uptime_seconds': uptime_seconds,
'active_jobs': active_jobs, 'active_jobs': active_jobs,
'completed_jobs': completed_jobs, 'completed_jobs': completed_jobs,
'weight': client_info.get('weight', 100) 'weight': client_info.get('weight', 100),
'is_local': False
})
# Detect local worker processes on master
local_workers = detect_local_workers()
for worker in local_workers:
nodes.append({
'token': 'local',
'token_name': f'Local {worker["type"].title()} Worker',
'hostname': 'localhost',
'gpus': worker.get('gpus', 0),
'gpu_memory': worker.get('gpu_memory', []),
'total_memory': worker.get('total_memory', 0),
'workers_available': 1,
'ip_address': '127.0.0.1',
'connected': True,
'last_seen': current_time,
'uptime_seconds': worker.get('uptime_seconds', 0),
'active_jobs': 0, # Placeholder
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True,
'backend': worker.get('backend', 'unknown')
}) })
# Get recently disconnected clients (last 10 that were connected in last 10 minutes) # Get recently disconnected clients (last 10 that were connected in last 10 minutes)
...@@ -467,6 +490,77 @@ def api_cluster_nodes(): ...@@ -467,6 +490,77 @@ def api_cluster_nodes():
return {'master_stats': master_stats, 'nodes': nodes} return {'master_stats': master_stats, 'nodes': nodes}
def detect_local_workers():
    """Detect local worker processes running on this master host.

    Scans the process table for Python processes whose command line invokes
    ``vidai.worker_analysis`` or ``vidai.worker_training`` and builds a
    summary dict for each one so they can be shown in the cluster nodes view.

    Returns:
        list[dict]: One entry per detected worker with keys:
            ``type`` ('analysis' or 'training'), ``backend`` ('cuda',
            'rocm', 'cpu', or 'unknown'), ``gpus`` (int), ``gpu_memory``
            (list[str]), ``total_memory`` (int, GB), ``uptime_seconds``
            (float), ``pid`` (int). Empty list when nothing is found or
            process iteration fails entirely.
    """
    import psutil
    import time

    workers = []
    current_time = time.time()

    try:
        for proc in psutil.process_iter(['pid', 'name', 'cmdline', 'create_time']):
            try:
                name = proc.info['name']
                if not name or 'python' not in name.lower():
                    continue
                cmdline = proc.info.get('cmdline') or []
                if len(cmdline) < 3:
                    continue

                # Join once; reused for the module check and type detection.
                cmd_str = ' '.join(cmdline)
                if ('vidai.worker_analysis' not in cmd_str and
                        'vidai.worker_training' not in cmd_str):
                    continue

                # Backend is passed as a bare positional argument on the
                # worker command line (e.g. "python -m vidai.worker_analysis cuda").
                backend = 'unknown'
                for arg in cmdline:
                    if arg in ('cuda', 'rocm', 'cpu'):
                        backend = arg
                        break

                worker_type = 'analysis' if 'worker_analysis' in cmd_str else 'training'

                # Simplified GPU info: device counts come from the shared
                # backend probe; per-device VRAM figures are placeholders
                # (8GB CUDA / 16GB ROCm) pending real per-device detection.
                from vidai.compat import detect_gpu_backends
                gpu_info = detect_gpu_backends()
                gpu_memory = []
                total_memory = 0
                gpus = 0
                if backend == 'cuda' and gpu_info['cuda']:
                    gpus = gpu_info['cuda_devices']
                    gpu_memory = [f"CUDA Device {i}: 8GB VRAM" for i in range(gpus)]
                    total_memory = gpus * 8
                elif backend == 'rocm' and gpu_info['rocm']:
                    gpus = gpu_info['rocm_devices']
                    gpu_memory = [f"ROCm Device {i}: 16GB VRAM" for i in range(gpus)]
                    total_memory = gpus * 16

                # Fall back to zero uptime if create_time was unavailable.
                uptime_seconds = current_time - proc.info.get('create_time', current_time)

                workers.append({
                    'type': worker_type,
                    'backend': backend,
                    'gpus': gpus,
                    'gpu_memory': gpu_memory,
                    'total_memory': total_memory,
                    'uptime_seconds': uptime_seconds,
                    'pid': proc.info['pid'],
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process exited mid-scan or is not ours to inspect; skip it.
                continue
    except Exception as e:
        # Best-effort: local worker detection must never break the nodes API.
        print(f"Error detecting local workers: {e}")

    return workers
@app.route('/api/admin/cluster_nodes/set_driver', methods=['POST']) @app.route('/api/admin/cluster_nodes/set_driver', methods=['POST'])
@admin_required @admin_required
def api_set_client_driver(): def api_set_client_driver():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment