Add local worker processes to cluster nodes display

- Detect running local worker processes on cluster master using psutil
- Include local workers in cluster nodes API response with distinct styling
- Show local workers with blue background and 'Local' status indicator
- Display backend information (CUDA/ROCm) in worker names
- Indicate that local workers require manual restart for driver changes
- Update API documentation with local worker response format
- Local workers show N/A for weight since they don't participate in cluster load balancing
parent 1c9ae89a
...@@ -33,6 +33,11 @@ ...@@ -33,6 +33,11 @@
.modal-footer { text-align: right; margin-top: 2rem; } .modal-footer { text-align: right; margin-top: 2rem; }
.btn-secondary { background: #6b7280; } .btn-secondary { background: #6b7280; }
.btn-secondary:hover { background: #4b5563; } .btn-secondary:hover { background: #4b5563; }
/* Local worker styling */
.local-worker-row { background-color: #f0f9ff; }
.local-worker-row td { border-color: #bae6fd; }
.local-worker-note { color: #64748b; font-size: 0.875rem; font-style: italic; }
</style> </style>
{% endblock %} {% endblock %}
...@@ -179,15 +184,15 @@ function renderNodesTable() { ...@@ -179,15 +184,15 @@ function renderNodesTable() {
} }
tbody.innerHTML = nodesData.map(node => ` tbody.innerHTML = nodesData.map(node => `
<tr> <tr class="${node.is_local ? 'local-worker-row' : ''}">
<td> <td>
<span class="status-${node.connected ? 'connected' : 'disconnected'}"> <span class="status-${node.connected ? 'connected' : 'disconnected'}">
${node.connected ? '● Connected' : '● Disconnected'} ${node.connected ? (node.is_local ? '● Local' : '● Connected') : '● Disconnected'}
</span> </span>
</td> </td>
<td>${node.token_name}</td> <td>${node.token_name}${node.backend ? ` (${node.backend.toUpperCase()})` : ''}</td>
<td>${node.hostname}</td> <td>${node.hostname}</td>
<td>${node.weight || 100}</td> <td>${node.is_local ? 'N/A' : (node.weight || 100)}</td>
<td>${node.gpus}</td> <td>${node.gpus}</td>
<td> <td>
${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'} ${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'}
...@@ -199,9 +204,10 @@ function renderNodesTable() { ...@@ -199,9 +204,10 @@ function renderNodesTable() {
<td>${node.active_jobs || 0}</td> <td>${node.active_jobs || 0}</td>
<td>${node.completed_jobs || 0}</td> <td>${node.completed_jobs || 0}</td>
<td> <td>
<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')"> ${node.is_local ?
Set Driver '<span class="local-worker-note">Local worker - restart required</span>' :
</button> `<button class="btn btn-sm" onclick="openDriverModal('${node.hostname}', '${node.token}', '${node.hostname}')">Set Driver</button>`
}
</td> </td>
</tr> </tr>
`).join(''); `).join('');
......
...@@ -455,7 +455,26 @@ ...@@ -455,7 +455,26 @@
"last_seen": 1640995300.0, "last_seen": 1640995300.0,
"uptime_seconds": 1800.5, "uptime_seconds": 1800.5,
"active_jobs": 1, "active_jobs": 1,
"completed_jobs": 5 "completed_jobs": 5,
"is_local": false
},
{
"token": "local",
"token_name": "Local Analysis Worker (CUDA)",
"hostname": "localhost",
"weight": 0,
"gpus": 1,
"gpu_memory": ["CUDA Device 0: 8GB VRAM"],
"total_memory": 8,
"workers_available": 1,
"ip_address": "127.0.0.1",
"connected": true,
"last_seen": 1640995300.0,
"uptime_seconds": 3600.0,
"active_jobs": 0,
"completed_jobs": 0,
"is_local": true,
"backend": "cuda"
} }
] ]
}</div> }</div>
......
...@@ -444,7 +444,30 @@ def api_cluster_nodes(): ...@@ -444,7 +444,30 @@ def api_cluster_nodes():
'uptime_seconds': uptime_seconds, 'uptime_seconds': uptime_seconds,
'active_jobs': active_jobs, 'active_jobs': active_jobs,
'completed_jobs': completed_jobs, 'completed_jobs': completed_jobs,
'weight': client_info.get('weight', 100) 'weight': client_info.get('weight', 100),
'is_local': False
})
# Detect local worker processes on master
local_workers = detect_local_workers()
for worker in local_workers:
nodes.append({
'token': 'local',
'token_name': f'Local {worker["type"].title()} Worker',
'hostname': 'localhost',
'gpus': worker.get('gpus', 0),
'gpu_memory': worker.get('gpu_memory', []),
'total_memory': worker.get('total_memory', 0),
'workers_available': 1,
'ip_address': '127.0.0.1',
'connected': True,
'last_seen': current_time,
'uptime_seconds': worker.get('uptime_seconds', 0),
'active_jobs': 0, # Placeholder
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True,
'backend': worker.get('backend', 'unknown')
}) })
# Get recently disconnected clients (last 10 that were connected in last 10 minutes) # Get recently disconnected clients (last 10 that were connected in last 10 minutes)
...@@ -467,6 +490,77 @@ def api_cluster_nodes(): ...@@ -467,6 +490,77 @@ def api_cluster_nodes():
return {'master_stats': master_stats, 'nodes': nodes} return {'master_stats': master_stats, 'nodes': nodes}
def detect_local_workers():
    """Detect local worker processes running on this master host.

    Scans the process table for Python processes whose command line invokes
    ``vidai.worker_analysis`` or ``vidai.worker_training`` and builds a
    summary dict for each one so they can be shown in the cluster nodes view.

    Returns:
        list[dict]: One entry per detected worker with keys:
            ``type`` ('analysis' or 'training'), ``backend`` ('cuda',
            'rocm', 'cpu', or 'unknown'), ``gpus`` (int), ``gpu_memory``
            (list[str]), ``total_memory`` (int, GB), ``uptime_seconds``
            (float), ``pid`` (int). Empty list when nothing is found or
            process iteration fails entirely.
    """
    import psutil
    import time

    workers = []
    current_time = time.time()

    try:
        for proc in psutil.process_iter(['pid', 'name', 'cmdline', 'create_time']):
            try:
                name = proc.info['name']
                if not name or 'python' not in name.lower():
                    continue
                cmdline = proc.info.get('cmdline') or []
                if len(cmdline) < 3:
                    continue

                # Join once; reused for the module check and type detection.
                cmd_str = ' '.join(cmdline)
                if ('vidai.worker_analysis' not in cmd_str and
                        'vidai.worker_training' not in cmd_str):
                    continue

                # Backend is passed as a bare positional argument on the
                # worker command line (e.g. "python -m vidai.worker_analysis cuda").
                backend = 'unknown'
                for arg in cmdline:
                    if arg in ('cuda', 'rocm', 'cpu'):
                        backend = arg
                        break

                worker_type = 'analysis' if 'worker_analysis' in cmd_str else 'training'

                # Simplified GPU info: device counts come from the shared
                # backend probe; per-device VRAM figures are placeholders
                # (8GB CUDA / 16GB ROCm) pending real per-device detection.
                from vidai.compat import detect_gpu_backends
                gpu_info = detect_gpu_backends()
                gpu_memory = []
                total_memory = 0
                gpus = 0
                if backend == 'cuda' and gpu_info['cuda']:
                    gpus = gpu_info['cuda_devices']
                    gpu_memory = [f"CUDA Device {i}: 8GB VRAM" for i in range(gpus)]
                    total_memory = gpus * 8
                elif backend == 'rocm' and gpu_info['rocm']:
                    gpus = gpu_info['rocm_devices']
                    gpu_memory = [f"ROCm Device {i}: 16GB VRAM" for i in range(gpus)]
                    total_memory = gpus * 16

                # Fall back to zero uptime if create_time was unavailable.
                uptime_seconds = current_time - proc.info.get('create_time', current_time)

                workers.append({
                    'type': worker_type,
                    'backend': backend,
                    'gpus': gpus,
                    'gpu_memory': gpu_memory,
                    'total_memory': total_memory,
                    'uptime_seconds': uptime_seconds,
                    'pid': proc.info['pid'],
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process exited mid-scan or is not ours to inspect; skip it.
                continue
    except Exception as e:
        # Best-effort: local worker detection must never break the nodes API.
        print(f"Error detecting local workers: {e}")

    return workers
@app.route('/api/admin/cluster_nodes/set_driver', methods=['POST']) @app.route('/api/admin/cluster_nodes/set_driver', methods=['POST'])
@admin_required @admin_required
def api_set_client_driver(): def api_set_client_driver():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment