Fix GPU VRAM detection to count only available GPUs

- Modified local node GPU memory calculation to only count GPUs that are actually available for supported backends
- Previously counted all GPUs in system, now only counts CUDA GPUs if CUDA is available and ROCm GPUs if ROCm is available
- Fixes issue where unsupported GPUs (like old AMD GPUs without ROCm support) were incorrectly included in VRAM totals
- Example: System with old AMD GPU (8GB, no ROCm) and CUDA GPU (24GB) now correctly shows 24GB total instead of 32GB
- Ensures accurate GPU resource reporting in cluster nodes interface
parent 4ca34e75
......@@ -483,6 +483,9 @@ def api_cluster_nodes():
total_active_jobs += node_data['active_jobs']
total_completed_jobs += node_data['completed_jobs']
# Include the workers array in the response for the modal
node_data['workers'] = node_data['workers']
nodes.append(node_data)
# Detect and aggregate local worker processes on master
......@@ -498,16 +501,26 @@ def api_cluster_nodes():
local_analysis = [w for w in local_workers if w['type'] == 'analysis']
local_training = [w for w in local_workers if w['type'] == 'training']
# Calculate combined GPU info for local node
total_gpus = sum(w.get('gpus', 0) for w in local_workers)
# Calculate combined GPU info for local node - only count available GPUs
from vidai.compat import detect_gpu_backends
gpu_info = detect_gpu_backends()
total_gpus = 0
all_gpu_memory = []
seen_memory = set()
for w in local_workers:
for mem in w.get('gpu_memory', []):
if mem not in seen_memory:
all_gpu_memory.append(mem)
seen_memory.add(mem)
total_memory = sum(w.get('total_memory', 0) for w in local_workers)
total_memory = 0
# Only count GPUs that are actually available for supported backends
if gpu_info['cuda']:
cuda_gpus = gpu_info['cuda_devices']
total_gpus += cuda_gpus
all_gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_gpus)])
total_memory += cuda_gpus * 8
if gpu_info['rocm']:
rocm_gpus = gpu_info['rocm_devices']
total_gpus += rocm_gpus
all_gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_gpus)])
total_memory += rocm_gpus * 16
# Worker summary for local node
worker_types = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.