Fix GPU VRAM detection to use correct method from /api/stats

- Updated GPU VRAM detection to use torch.cuda.get_device_properties(i).total_memory / 1024**3 (see the sketch below)
- Same method as used in /api/stats endpoint for consistency
- Still filters out non-NVIDIA and non-functional GPUs
- Now shows correct VRAM amounts (e.g., 24GB for an RTX 3090 instead of the hardcoded 8GB)
- Fixed both worker-level and node-level GPU detection
parent f91fafcf
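For reference, a minimal sketch of the VRAM query this commit standardizes on, using only public `torch.cuda` calls. The helper name `cuda_vram_gb` is illustrative only and does not exist in the codebase:

```python
import torch

def cuda_vram_gb() -> list[str]:
    """List each visible CUDA device with its total VRAM in whole GB."""
    devices = []
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            vram_gb = int(props.total_memory / 1024**3)  # bytes -> GiB, truncated; same math as /api/stats
            devices.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
    return devices
```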
@@ -267,11 +267,9 @@ def detect_gpu_backends() -> dict:
             try:
                 # Test if device is actually functional by trying a simple operation
                 device_name = torch.cuda.get_device_name(i).lower()
-                print(f"CUDA Device {i}: {device_name}")  # Debug output
                 # Only consider NVIDIA GPUs
                 if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                    print(f"Skipping non-NVIDIA device {i}: {device_name}")
                     continue
                 # Test device functionality
@@ -280,15 +278,12 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
-                    print(f"CUDA Device {i} is functional: {device_name}")
                     working_cuda_devices += 1
-                except Exception as e:
-                    print(f"CUDA Device {i} failed functional test: {device_name} - {e}")
+                except Exception:
                     # Device not functional, skip it
                     continue
-            except Exception as e:
-                print(f"Error checking CUDA device {i}: {e}")
+            except Exception:
                 continue
         if working_cuda_devices > 0:
...
@@ -501,26 +501,59 @@ def api_cluster_nodes():
     local_analysis = [w for w in local_workers if w['type'] == 'analysis']
     local_training = [w for w in local_workers if w['type'] == 'training']
-    # Calculate combined GPU info for local node - only count available GPUs
-    from vidai.compat import detect_gpu_backends
-    gpu_info = detect_gpu_backends()
+    # Calculate combined GPU info for local node - only count available GPUs with real VRAM
+    import torch
     total_gpus = 0
     all_gpu_memory = []
     total_memory = 0
-    # Only count GPUs that are actually available for supported backends
-    if gpu_info['cuda']:
-        cuda_gpus = gpu_info['cuda_devices']
-        total_gpus += cuda_gpus
-        all_gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_gpus)])
-        total_memory += cuda_gpus * 8
-    if gpu_info['rocm']:
-        rocm_gpus = gpu_info['rocm_devices']
-        total_gpus += rocm_gpus
-        all_gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_gpus)])
-        total_memory += rocm_gpus * 16
+    # Only count working NVIDIA GPUs with actual VRAM detection
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            try:
+                device_name = torch.cuda.get_device_name(i).lower()
+                # Only consider NVIDIA GPUs
+                if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                    continue
+                # Test device functionality
+                try:
+                    with torch.cuda.device(i):
+                        test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                        # Get actual VRAM (same as /api/stats)
+                        props = torch.cuda.get_device_properties(i)
+                        vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                        all_gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                        total_memory += vram_gb
+                        total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
+    # Only count working ROCm GPUs
+    if hasattr(torch, 'hip') and torch.hip.is_available():
+        for i in range(torch.hip.device_count()):
+            try:
+                # Test device functionality
+                try:
+                    with torch.device(f'hip:{i}'):
+                        test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                        # ROCm VRAM detection is harder, use estimate
+                        all_gpu_memory.append(f"ROCm Device {i}: 16GB VRAM")
+                        total_memory += 16
+                        total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
     # Worker summary for local node
     worker_types = []
@@ -608,22 +641,58 @@ def detect_local_workers():
             # Determine worker type
             worker_type = 'analysis' if 'worker_analysis' in ' '.join(cmdline) else 'training'
-            # Get GPU info (simplified - would need better detection)
-            from vidai.compat import detect_gpu_backends
-            gpu_info = detect_gpu_backends()
+            # Get actual GPU info with real VRAM detection (only working GPUs)
+            import torch
             gpu_memory = []
             total_memory = 0
             gpus = 0
-            if backend == 'cuda' and gpu_info['cuda']:
-                gpus = gpu_info['cuda_devices']
-                gpu_memory = [f"CUDA Device {i}: 8GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 8
-            elif backend == 'rocm' and gpu_info['rocm']:
-                gpus = gpu_info['rocm_devices']
-                gpu_memory = [f"ROCm Device {i}: 16GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 16
+            if backend == 'cuda' and torch.cuda.is_available():
+                # Only count working NVIDIA GPUs
+                for i in range(torch.cuda.device_count()):
+                    try:
+                        device_name = torch.cuda.get_device_name(i).lower()
+                        # Only consider NVIDIA GPUs
+                        if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                            continue
+                        # Test device functionality
+                        try:
+                            with torch.cuda.device(i):
+                                test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                                # Get actual VRAM (same as /api/stats)
+                                props = torch.cuda.get_device_properties(i)
+                                vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                                gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                                total_memory += vram_gb
+                                gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
+            elif backend == 'rocm' and hasattr(torch, 'hip') and torch.hip.is_available():
+                # Only count working ROCm GPUs
+                for i in range(torch.hip.device_count()):
+                    try:
+                        # Test device functionality
+                        try:
+                            with torch.device(f'hip:{i}'):
+                                test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                                # Get actual VRAM (ROCm doesn't have easy property access, estimate)
+                                gpu_memory.append(f"ROCm Device {i}: 16GB VRAM")  # Estimate
+                                total_memory += 16
+                                gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
             uptime_seconds = current_time - proc.info.get('create_time', current_time)
...
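The same probe (NVIDIA name filter, tiny tensor operation as a functional test, then `get_device_properties` for the real VRAM) now appears in `detect_gpu_backends`, `api_cluster_nodes`, and `detect_local_workers`. Below is a minimal sketch of that repeated pattern consolidated into a single helper; `probe_nvidia_gpus` is a hypothetical name and does not exist in this diff:

```python
import torch

NVIDIA_KEYWORDS = ('nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx')

def probe_nvidia_gpus() -> list[dict]:
    """Return one entry per working NVIDIA CUDA device with its real VRAM in GB."""
    gpus = []
    if not torch.cuda.is_available():
        return gpus
    for i in range(torch.cuda.device_count()):
        try:
            name = torch.cuda.get_device_name(i).lower()
            if not any(keyword in name for keyword in NVIDIA_KEYWORDS):
                continue  # skip non-NVIDIA devices
            # Functional test: a trivial allocation and op on the device
            test = torch.tensor([1.0], device=f'cuda:{i}') + 1
            del test
            # Real VRAM, same property read as /api/stats
            vram_gb = int(torch.cuda.get_device_properties(i).total_memory / 1024**3)
            gpus.append({'index': i, 'name': name, 'vram_gb': vram_gb})
        except Exception:
            continue  # device present but not functional; skip it
    return gpus
```

Callers such as the node and worker views could then sum `vram_gb` and format the display strings, rather than repeating the probe in each location.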