Fix GPU VRAM detection to use correct method from /api/stats

- Updated GPU VRAM detection to use torch.cuda.get_device_properties(i).total_memory / 1024**3 (see the sketch below)
- Same method as used in /api/stats endpoint for consistency
- Still filters out non-NVIDIA and non-functional GPUs
- Now shows correct VRAM amounts (e.g., 24GB for an RTX 3090 instead of the hardcoded 8GB)
- Fixed both worker-level and node-level GPU detection
parent f91fafcf
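For reference, a minimal sketch of the VRAM query this commit standardizes on, using only public `torch.cuda` calls. The helper name `cuda_vram_gb` is illustrative only and does not exist in the codebase:

```python
import torch

def cuda_vram_gb() -> list[str]:
    """List each visible CUDA device with its total VRAM in whole GB."""
    devices = []
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            vram_gb = int(props.total_memory / 1024**3)  # bytes -> GiB, truncated; same math as /api/stats
            devices.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
    return devices
```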
@@ -267,11 +267,9 @@ def detect_gpu_backends() -> dict:
             try:
                 # Test if device is actually functional by trying a simple operation
                 device_name = torch.cuda.get_device_name(i).lower()
-                print(f"CUDA Device {i}: {device_name}")  # Debug output
                 # Only consider NVIDIA GPUs
                 if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                    print(f"Skipping non-NVIDIA device {i}: {device_name}")
                     continue
                 # Test device functionality
@@ -280,15 +278,12 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
-                    print(f"CUDA Device {i} is functional: {device_name}")
                     working_cuda_devices += 1
-                except Exception as e:
-                    print(f"CUDA Device {i} failed functional test: {device_name} - {e}")
+                except Exception:
                     # Device not functional, skip it
                     continue
-            except Exception as e:
-                print(f"Error checking CUDA device {i}: {e}")
+            except Exception:
                 continue
         if working_cuda_devices > 0:
...
@@ -501,26 +501,59 @@ def api_cluster_nodes():
     local_analysis = [w for w in local_workers if w['type'] == 'analysis']
     local_training = [w for w in local_workers if w['type'] == 'training']
-    # Calculate combined GPU info for local node - only count available GPUs
-    from vidai.compat import detect_gpu_backends
-    gpu_info = detect_gpu_backends()
+    # Calculate combined GPU info for local node - only count available GPUs with real VRAM
+    import torch
     total_gpus = 0
     all_gpu_memory = []
     total_memory = 0
-    # Only count GPUs that are actually available for supported backends
-    if gpu_info['cuda']:
-        cuda_gpus = gpu_info['cuda_devices']
-        total_gpus += cuda_gpus
-        all_gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_gpus)])
-        total_memory += cuda_gpus * 8
-    if gpu_info['rocm']:
-        rocm_gpus = gpu_info['rocm_devices']
-        total_gpus += rocm_gpus
-        all_gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_gpus)])
-        total_memory += rocm_gpus * 16
+    # Only count working NVIDIA GPUs with actual VRAM detection
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            try:
+                device_name = torch.cuda.get_device_name(i).lower()
+                # Only consider NVIDIA GPUs
+                if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                    continue
+                # Test device functionality
+                try:
+                    with torch.cuda.device(i):
+                        test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                        # Get actual VRAM (same as /api/stats)
+                        props = torch.cuda.get_device_properties(i)
+                        vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                        all_gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                        total_memory += vram_gb
+                        total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
+    # Only count working ROCm GPUs
+    if hasattr(torch, 'hip') and torch.hip.is_available():
+        for i in range(torch.hip.device_count()):
+            try:
+                # Test device functionality
+                try:
+                    with torch.device(f'hip:{i}'):
+                        test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                        # ROCm VRAM detection is harder, use estimate
+                        all_gpu_memory.append(f"ROCm Device {i}: 16GB VRAM")
+                        total_memory += 16
+                        total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
     # Worker summary for local node
     worker_types = []
@@ -608,22 +641,58 @@ def detect_local_workers():
             # Determine worker type
             worker_type = 'analysis' if 'worker_analysis' in ' '.join(cmdline) else 'training'
-            # Get GPU info (simplified - would need better detection)
-            from vidai.compat import detect_gpu_backends
-            gpu_info = detect_gpu_backends()
+            # Get actual GPU info with real VRAM detection (only working GPUs)
+            import torch
             gpu_memory = []
             total_memory = 0
             gpus = 0
-            if backend == 'cuda' and gpu_info['cuda']:
-                gpus = gpu_info['cuda_devices']
-                gpu_memory = [f"CUDA Device {i}: 8GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 8
-            elif backend == 'rocm' and gpu_info['rocm']:
-                gpus = gpu_info['rocm_devices']
-                gpu_memory = [f"ROCm Device {i}: 16GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 16
+            if backend == 'cuda' and torch.cuda.is_available():
+                # Only count working NVIDIA GPUs
+                for i in range(torch.cuda.device_count()):
+                    try:
+                        device_name = torch.cuda.get_device_name(i).lower()
+                        # Only consider NVIDIA GPUs
+                        if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                            continue
+                        # Test device functionality
+                        try:
+                            with torch.cuda.device(i):
+                                test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                                # Get actual VRAM (same as /api/stats)
+                                props = torch.cuda.get_device_properties(i)
+                                vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                                gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                                total_memory += vram_gb
+                                gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
+            elif backend == 'rocm' and hasattr(torch, 'hip') and torch.hip.is_available():
+                # Only count working ROCm GPUs
+                for i in range(torch.hip.device_count()):
+                    try:
+                        # Test device functionality
+                        try:
+                            with torch.device(f'hip:{i}'):
+                                test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                                # Get actual VRAM (ROCm doesn't have easy property access, estimate)
+                                gpu_memory.append(f"ROCm Device {i}: 16GB VRAM")  # Estimate
+                                total_memory += 16
+                                gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
             uptime_seconds = current_time - proc.info.get('create_time', current_time)
...
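The same probe (NVIDIA name filter, tiny tensor operation as a functional test, then `get_device_properties` for the real VRAM) now appears in `detect_gpu_backends`, `api_cluster_nodes`, and `detect_local_workers`. Below is a minimal sketch of that repeated pattern consolidated into a single helper; `probe_nvidia_gpus` is a hypothetical name and does not exist in this diff:

```python
import torch

NVIDIA_KEYWORDS = ('nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx')

def probe_nvidia_gpus() -> list[dict]:
    """Return one entry per working NVIDIA CUDA device with its real VRAM in GB."""
    gpus = []
    if not torch.cuda.is_available():
        return gpus
    for i in range(torch.cuda.device_count()):
        try:
            name = torch.cuda.get_device_name(i).lower()
            if not any(keyword in name for keyword in NVIDIA_KEYWORDS):
                continue  # skip non-NVIDIA devices
            # Functional test: a trivial allocation and op on the device
            test = torch.tensor([1.0], device=f'cuda:{i}') + 1
            del test
            # Real VRAM, same property read as /api/stats
            vram_gb = int(torch.cuda.get_device_properties(i).total_memory / 1024**3)
            gpus.append({'index': i, 'name': name, 'vram_gb': vram_gb})
        except Exception:
            continue  # device present but not functional; skip it
    return gpus
```

Callers such as the node and worker views could then sum `vram_gb` and format the display strings, rather than repeating the probe in each location.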