Fix GPU VRAM detection to use correct method from /api/stats

- Updated GPU VRAM detection to use torch.cuda.get_device_properties(i).total_memory / 1024**3 (sketched below)
- Same method as used by the /api/stats endpoint, for consistency
- Still filters out non-NVIDIA and non-functional GPUs
- Now shows correct VRAM amounts (e.g., 24GB for RTX 3090 instead of hardcoded 8GB)
- Fixed both worker-level and node-level GPU detection
parent f91fafcf
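For reference, the detection now reads the per-device figure PyTorch reports instead of assuming a fixed size. A minimal standalone sketch of that calculation (the helper name report_cuda_vram and its print format are illustrative, not part of this commit):

    import torch

    def report_cuda_vram() -> None:
        # Print total VRAM per visible CUDA device, using the same
        # get_device_properties().total_memory read as /api/stats.
        if not torch.cuda.is_available():
            return
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            vram_gb = int(props.total_memory / 1024**3)  # bytes -> GiB, truncated
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)} - {vram_gb}GB VRAM")

The same read appears in both changed call sites below (api_cluster_nodes and detect_local_workers).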
@@ -267,11 +267,9 @@ def detect_gpu_backends() -> dict:
         try:
             # Test if device is actually functional by trying a simple operation
             device_name = torch.cuda.get_device_name(i).lower()
-            print(f"CUDA Device {i}: {device_name}") # Debug output
             # Only consider NVIDIA GPUs
             if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                print(f"Skipping non-NVIDIA device {i}: {device_name}")
                 continue
             # Test device functionality
@@ -280,15 +278,12 @@
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                     test_result = test_tensor + 1 # Simple operation
                     del test_tensor, test_result
-                print(f"CUDA Device {i} is functional: {device_name}")
                 working_cuda_devices += 1
-            except Exception as e:
-                print(f"CUDA Device {i} failed functional test: {device_name} - {e}")
+            except Exception:
                 # Device not functional, skip it
                 continue
-        except Exception as e:
-            print(f"Error checking CUDA device {i}: {e}")
+        except Exception:
             continue
     if working_cuda_devices > 0:
@@ -501,26 +501,59 @@ def api_cluster_nodes():
     local_analysis = [w for w in local_workers if w['type'] == 'analysis']
     local_training = [w for w in local_workers if w['type'] == 'training']
-    # Calculate combined GPU info for local node - only count available GPUs
-    from vidai.compat import detect_gpu_backends
-    gpu_info = detect_gpu_backends()
+    # Calculate combined GPU info for local node - only count available GPUs with real VRAM
+    import torch
     total_gpus = 0
     all_gpu_memory = []
     total_memory = 0
-    # Only count GPUs that are actually available for supported backends
-    if gpu_info['cuda']:
-        cuda_gpus = gpu_info['cuda_devices']
-        total_gpus += cuda_gpus
-        all_gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_gpus)])
-        total_memory += cuda_gpus * 8
-    if gpu_info['rocm']:
-        rocm_gpus = gpu_info['rocm_devices']
-        total_gpus += rocm_gpus
-        all_gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_gpus)])
-        total_memory += rocm_gpus * 16
+    # Only count working NVIDIA GPUs with actual VRAM detection
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            try:
+                device_name = torch.cuda.get_device_name(i).lower()
+                # Only consider NVIDIA GPUs
+                if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                    continue
+                # Test device functionality
+                try:
+                    with torch.cuda.device(i):
+                        test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                    # Get actual VRAM (same as /api/stats)
+                    props = torch.cuda.get_device_properties(i)
+                    vram_gb = int(props.total_memory / 1024**3) # Convert bytes to GB
+                    all_gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                    total_memory += vram_gb
+                    total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
+    # Only count working ROCm GPUs
+    if hasattr(torch, 'hip') and torch.hip.is_available():
+        for i in range(torch.hip.device_count()):
+            try:
+                # Test device functionality
+                try:
+                    with torch.device(f'hip:{i}'):
+                        test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                        test_result = test_tensor + 1
+                        del test_tensor, test_result
+                    # ROCm VRAM detection is harder, use estimate
+                    all_gpu_memory.append(f"ROCm Device {i}: 16GB VRAM")
+                    total_memory += 16
+                    total_gpus += 1
+                except:
+                    continue
+            except:
+                continue
     # Worker summary for local node
     worker_types = []
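The CUDA probe-and-measure loop above is repeated almost verbatim in detect_local_workers below. A hedged sketch of a shared helper both call sites could use (the name detect_working_cuda_gpus is hypothetical, not something this commit adds):

    import torch

    def detect_working_cuda_gpus() -> list:
        # Return (index, name, vram_gb) tuples for NVIDIA GPUs that pass a
        # small functional test; mirrors the loop used in this commit.
        found = []
        if not torch.cuda.is_available():
            return found
        keywords = ('nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx')
        for i in range(torch.cuda.device_count()):
            try:
                name = torch.cuda.get_device_name(i)
                if not any(k in name.lower() for k in keywords):
                    continue  # skip non-NVIDIA devices
                with torch.cuda.device(i):
                    probe = torch.tensor([1.0], device=f'cuda:{i}') + 1  # functional test
                    del probe
                vram_gb = int(torch.cuda.get_device_properties(i).total_memory / 1024**3)
                found.append((i, name, vram_gb))
            except Exception:
                continue  # device present but not usable
        return found

Each call site would then only format its display strings and sum the totals.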
@@ -608,22 +641,58 @@ def detect_local_workers():
             # Determine worker type
             worker_type = 'analysis' if 'worker_analysis' in ' '.join(cmdline) else 'training'
-            # Get GPU info (simplified - would need better detection)
-            from vidai.compat import detect_gpu_backends
-            gpu_info = detect_gpu_backends()
+            # Get actual GPU info with real VRAM detection (only working GPUs)
+            import torch
             gpu_memory = []
             total_memory = 0
             gpus = 0
-            if backend == 'cuda' and gpu_info['cuda']:
-                gpus = gpu_info['cuda_devices']
-                gpu_memory = [f"CUDA Device {i}: 8GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 8
-            elif backend == 'rocm' and gpu_info['rocm']:
-                gpus = gpu_info['rocm_devices']
-                gpu_memory = [f"ROCm Device {i}: 16GB VRAM" for i in range(gpus)]
-                total_memory = gpus * 16
+            if backend == 'cuda' and torch.cuda.is_available():
+                # Only count working NVIDIA GPUs
+                for i in range(torch.cuda.device_count()):
+                    try:
+                        device_name = torch.cuda.get_device_name(i).lower()
+                        # Only consider NVIDIA GPUs
+                        if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
+                            continue
+                        # Test device functionality
+                        try:
+                            with torch.cuda.device(i):
+                                test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                            # Get actual VRAM (same as /api/stats)
+                            props = torch.cuda.get_device_properties(i)
+                            vram_gb = int(props.total_memory / 1024**3) # Convert bytes to GB
+                            gpu_memory.append(f"CUDA Device {i}: {vram_gb}GB VRAM")
+                            total_memory += vram_gb
+                            gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
+            elif backend == 'rocm' and hasattr(torch, 'hip') and torch.hip.is_available():
+                # Only count working ROCm GPUs
+                for i in range(torch.hip.device_count()):
+                    try:
+                        # Test device functionality
+                        try:
+                            with torch.device(f'hip:{i}'):
+                                test_tensor = torch.tensor([1.0], device=f'hip:{i}')
+                                test_result = test_tensor + 1
+                                del test_tensor, test_result
+                            # Get actual VRAM (ROCm doesn't have easy property access, estimate)
+                            gpu_memory.append(f"ROCm Device {i}: 16GB VRAM") # Estimate
+                            total_memory += 16
+                            gpus += 1
+                        except:
+                            continue
+                    except:
+                        continue
             uptime_seconds = current_time - proc.info.get('create_time', current_time)
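The ROCm branches above still report a 16GB estimate per device. On ROCm builds of PyTorch, HIP devices are exposed through the torch.cuda namespace and torch.version.hip is set, so the same total_memory read can work there as well; a sketch under that assumption (the helper name rocm_vram_gb is illustrative):

    import torch

    def rocm_vram_gb(i: int) -> int:
        # Assumption: a ROCm build of PyTorch, where HIP devices appear under
        # torch.cuda and torch.version.hip is a non-empty string.
        if getattr(torch.version, 'hip', None) and torch.cuda.is_available():
            return int(torch.cuda.get_device_properties(i).total_memory / 1024**3)
        return 16  # fall back to the estimate used in the committed code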