Fix GPU VRAM detection for cluster clients

- Update detect_gpu_backends to collect actual VRAM for each GPU device
- Store device info including VRAM in gpu_info sent to master
- Use real VRAM data in cluster nodes API instead of hardcoded values
- Ensure consistent VRAM reporting between master and clients
parent 97a13987
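
For context, the `gpu_info` payload a client reports to the master now carries per-device entries. A minimal sketch of what `detect_gpu_backends()` might return on a single-GPU CUDA host, assuming only what the diff below shows; the device name, driver version, and sizes are illustrative:

```python
# Illustrative only: shape of the dict built by detect_gpu_backends() after this change.
# Field names come from the diff; the concrete values are made up for a 1-GPU CUDA host.
gpu_info = {
    'cuda': True,
    'rocm': False,
    'cuda_version': '12.1',
    'rocm_version': None,
    'cuda_devices': 1,
    'rocm_devices': 0,
    'cuda_device_info': [
        {'device_id': 0, 'name': 'NVIDIA GeForce RTX 3090', 'vram_gb': 24, 'compute_capability': '8.6'}
    ],
    'rocm_device_info': []
}
```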
@@ -255,7 +255,9 @@ def detect_gpu_backends() -> dict:
         'cuda_version': None,
         'rocm_version': None,
         'cuda_devices': 0,
-        'rocm_devices': 0
+        'rocm_devices': 0,
+        'cuda_device_info': [],  # List of dicts with device info including VRAM
+        'rocm_device_info': []   # List of dicts with device info including VRAM
     }

     # Check CUDA availability - only count working, functional CUDA devices
@@ -263,6 +265,7 @@ def detect_gpu_backends() -> dict:
         import torch
         if torch.cuda.is_available():
             working_cuda_devices = 0
+            cuda_device_info = []
             for i in range(torch.cuda.device_count()):
                 try:
                     # Test if device can actually perform CUDA operations
@@ -271,7 +274,20 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
+                    # Get actual VRAM (same as /api/stats)
+                    props = torch.cuda.get_device_properties(i)
+                    vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                    device_info = {
+                        'device_id': i,
+                        'name': props.name,
+                        'vram_gb': vram_gb,
+                        'compute_capability': f"{props.major}.{props.minor}"
+                    }
+                    cuda_device_info.append(device_info)
                     working_cuda_devices += 1
                 except Exception:
                     # Device not functional, skip it
                     continue
@@ -282,6 +298,7 @@ def detect_gpu_backends() -> dict:
         if working_cuda_devices > 0:
             backends['cuda'] = True
             backends['cuda_devices'] = working_cuda_devices
+            backends['cuda_device_info'] = cuda_device_info
             try:
                 backends['cuda_version'] = torch.version.cuda
             except:
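
The CUDA path reads VRAM through `torch.cuda.get_device_properties`, the same call the in-diff comment says `/api/stats` uses. As a quick sanity check outside the cluster client, roughly this (a sketch; output naturally depends on the host GPUs):

```python
# Standalone sanity check of the PyTorch call the CUDA branch relies on.
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        vram_gb = props.total_memory / 1024**3  # bytes -> GiB, before the int() truncation used above
        print(f"cuda:{i} {props.name} {vram_gb:.1f} GB (compute {props.major}.{props.minor})")
```

One small caveat: `int()` truncates, so a device whose `total_memory` lands just under a whole GiB boundary is reported one GB low; whether that matters for cluster reporting is a judgment call.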
@@ -312,6 +329,7 @@ def detect_gpu_backends() -> dict:
         import torch
         if hasattr(torch, 'hip') and torch.hip.is_available():
             working_rocm_devices = 0
+            rocm_device_info = []
             for i in range(torch.hip.device_count()):
                 try:
                     # Test if ROCm device is actually functional
@@ -323,7 +341,40 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'hip:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
+                    # ROCm VRAM detection is harder, use estimate since PyTorch doesn't expose ROCm device properties easily
+                    # Try to get VRAM info via rocm-smi if available
+                    vram_gb = 16  # Default estimate for ROCm GPUs
+                    try:
+                        import subprocess
+                        result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--device', str(i)],
+                                                capture_output=True, text=True, timeout=2)
+                        if result.returncode == 0:
+                            # Parse VRAM info from rocm-smi output
+                            lines = result.stdout.strip().split('\n')
+                            for line in lines:
+                                if 'GPU Memory Total' in line and 'GB' in line:
+                                    # Extract number before GB
+                                    parts = line.split()
+                                    for part in parts:
+                                        if part.replace('.', '').isdigit():
+                                            try:
+                                                vram_gb = int(float(part))
+                                                break
+                                            except:
+                                                pass
+                                    break
+                    except:
+                        pass  # Keep default estimate
+                    device_info = {
+                        'device_id': i,
+                        'name': device_name,
+                        'vram_gb': vram_gb
+                    }
+                    rocm_device_info.append(device_info)
                     working_rocm_devices += 1
                 except Exception:
                     # Device not functional, skip it
                     continue
@@ -334,6 +385,7 @@ def detect_gpu_backends() -> dict:
         if working_rocm_devices > 0:
             backends['rocm'] = True
             backends['rocm_devices'] = working_rocm_devices
+            backends['rocm_device_info'] = rocm_device_info
             try:
                 backends['rocm_version'] = torch.version.hip
             except:
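
The ROCm branch shells out to `rocm-smi` and keeps a 16 GB estimate when parsing fails; `rocm-smi`'s output format is not guaranteed to be stable across versions, which is presumably why the default remains. A hypothetical helper (not part of the commit) that mirrors the same parsing logic, pulled out so the string handling can be exercised without a GPU:

```python
# Hypothetical helper, not in the commit: mirrors the rocm-smi parsing in the ROCm branch
# so it can be unit-tested without rocm-smi installed.
def parse_rocm_smi_vram_gb(output: str, default: int = 16) -> int:
    for line in output.strip().split('\n'):
        if 'GPU Memory Total' in line and 'GB' in line:
            for part in line.split():
                if part.replace('.', '').isdigit():
                    try:
                        return int(float(part))
                    except ValueError:
                        pass
            break
    return default

# e.g. parse_rocm_smi_vram_gb("GPU[0] GPU Memory Total: 16.0 GB") == 16
# anything unparseable falls back to the 16 GB default
```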
@@ -406,17 +406,39 @@ def api_cluster_nodes():
         if node_key not in node_map:
             gpu_info = client.get('gpu_info', {})
             available_backends = client.get('available_backends', [])
-            cuda_devices = gpu_info.get('cuda_devices', 0)
-            rocm_devices = gpu_info.get('rocm_devices', 0)
-            # Get GPU memory info
+            # Get GPU memory info from actual device info
             gpu_memory = []
-            if cuda_devices > 0:
-                gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_devices)])
-            if rocm_devices > 0:
-                gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_devices)])
-            total_memory = sum([8 if 'CUDA' in mem else 16 if 'ROCm' in mem else 0 for mem in gpu_memory])
+            total_memory = 0
+            # Use CUDA device info if available
+            cuda_device_info = gpu_info.get('cuda_device_info', [])
+            for device in cuda_device_info:
+                device_id = device.get('device_id', 0)
+                vram_gb = device.get('vram_gb', 8)
+                name = device.get('name', f'CUDA Device {device_id}')
+                gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
+                total_memory += vram_gb
+            # Use ROCm device info if available
+            rocm_device_info = gpu_info.get('rocm_device_info', [])
+            for device in rocm_device_info:
+                device_id = device.get('device_id', 0)
+                vram_gb = device.get('vram_gb', 16)
+                name = device.get('name', f'ROCm Device {device_id}')
+                gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
+                total_memory += vram_gb
+            # Fallback to old format if no device info available
+            if not gpu_memory:
+                cuda_devices = gpu_info.get('cuda_devices', 0)
+                rocm_devices = gpu_info.get('rocm_devices', 0)
+                if cuda_devices > 0:
+                    gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_devices)])
+                    total_memory += cuda_devices * 8
+                if rocm_devices > 0:
+                    gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_devices)])
+                    total_memory += rocm_devices * 16
             # Calculate uptime from last_seen
             last_seen = client.get('last_seen')
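
To illustrate what the master-side aggregation now yields, a small sketch (device names and sizes are made up; field names match the diff):

```python
# Illustrative only: what the loops above produce for a made-up two-GPU client payload.
gpu_info = {
    'cuda_device_info': [{'device_id': 0, 'name': 'NVIDIA GeForce RTX 3090', 'vram_gb': 24}],
    'rocm_device_info': [{'device_id': 0, 'name': 'AMD Radeon RX 7900 XTX', 'vram_gb': 24}],
}

gpu_memory, total_memory = [], 0
for device in gpu_info.get('cuda_device_info', []) + gpu_info.get('rocm_device_info', []):
    name = device.get('name', f"Device {device.get('device_id', 0)}")
    vram_gb = device.get('vram_gb', 0)
    gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
    total_memory += vram_gb

# gpu_memory == ['NVIDIA GeForce RTX 3090: 24GB VRAM', 'AMD Radeon RX 7900 XTX: 24GB VRAM']
# total_memory == 48
```

The fallback branch only runs for clients that predate this change and report device counts without `cuda_device_info`/`rocm_device_info`; for those, the old 8 GB/16 GB assumptions still apply.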