Fix GPU VRAM detection for cluster clients

- Update detect_gpu_backends to collect actual VRAM for each GPU device
- Store device info including VRAM in gpu_info sent to master
- Use real VRAM data in cluster nodes API instead of hardcoded values
- Ensure consistent VRAM reporting between master and clients
parent 97a13987
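
For context, the `gpu_info` payload a client reports to the master now carries per-device entries. A minimal sketch of what `detect_gpu_backends()` might return on a single-GPU CUDA host, assuming only what the diff below shows; the device name, driver version, and sizes are illustrative:

```python
# Illustrative only: shape of the dict built by detect_gpu_backends() after this change.
# Field names come from the diff; the concrete values are made up for a 1-GPU CUDA host.
gpu_info = {
    'cuda': True,
    'rocm': False,
    'cuda_version': '12.1',
    'rocm_version': None,
    'cuda_devices': 1,
    'rocm_devices': 0,
    'cuda_device_info': [
        {'device_id': 0, 'name': 'NVIDIA GeForce RTX 3090', 'vram_gb': 24, 'compute_capability': '8.6'}
    ],
    'rocm_device_info': []
}
```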
@@ -255,7 +255,9 @@ def detect_gpu_backends() -> dict:
         'cuda_version': None,
         'rocm_version': None,
         'cuda_devices': 0,
-        'rocm_devices': 0
+        'rocm_devices': 0,
+        'cuda_device_info': [],  # List of dicts with device info including VRAM
+        'rocm_device_info': []   # List of dicts with device info including VRAM
     }

     # Check CUDA availability - only count working, functional CUDA devices
@@ -263,6 +265,7 @@ def detect_gpu_backends() -> dict:
         import torch
         if torch.cuda.is_available():
             working_cuda_devices = 0
+            cuda_device_info = []
             for i in range(torch.cuda.device_count()):
                 try:
                     # Test if device can actually perform CUDA operations
@@ -271,7 +274,20 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
+                    # Get actual VRAM (same as /api/stats)
+                    props = torch.cuda.get_device_properties(i)
+                    vram_gb = int(props.total_memory / 1024**3)  # Convert bytes to GB
+                    device_info = {
+                        'device_id': i,
+                        'name': props.name,
+                        'vram_gb': vram_gb,
+                        'compute_capability': f"{props.major}.{props.minor}"
+                    }
+                    cuda_device_info.append(device_info)
                     working_cuda_devices += 1
                 except Exception:
                     # Device not functional, skip it
                     continue
@@ -282,6 +298,7 @@ def detect_gpu_backends() -> dict:
         if working_cuda_devices > 0:
             backends['cuda'] = True
             backends['cuda_devices'] = working_cuda_devices
+            backends['cuda_device_info'] = cuda_device_info
             try:
                 backends['cuda_version'] = torch.version.cuda
             except:
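
The CUDA path reads VRAM through `torch.cuda.get_device_properties`, the same call the in-diff comment says `/api/stats` uses. As a quick sanity check outside the cluster client, roughly this (a sketch; output naturally depends on the host GPUs):

```python
# Standalone sanity check of the PyTorch call the CUDA branch relies on.
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        vram_gb = props.total_memory / 1024**3  # bytes -> GiB, before the int() truncation used above
        print(f"cuda:{i} {props.name} {vram_gb:.1f} GB (compute {props.major}.{props.minor})")
```

One small caveat: `int()` truncates, so a device whose `total_memory` lands just under a whole GiB boundary is reported one GB low; whether that matters for cluster reporting is a judgment call.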
@@ -312,6 +329,7 @@ def detect_gpu_backends() -> dict:
         import torch
         if hasattr(torch, 'hip') and torch.hip.is_available():
             working_rocm_devices = 0
+            rocm_device_info = []
             for i in range(torch.hip.device_count()):
                 try:
                     # Test if ROCm device is actually functional
@@ -323,7 +341,40 @@ def detect_gpu_backends() -> dict:
                     test_tensor = torch.tensor([1.0], device=f'hip:{i}')
                     test_result = test_tensor + 1  # Simple operation
                     del test_tensor, test_result
+                    # ROCm VRAM detection is harder, use estimate since PyTorch doesn't expose ROCm device properties easily
+                    # Try to get VRAM info via rocm-smi if available
+                    vram_gb = 16  # Default estimate for ROCm GPUs
+                    try:
+                        import subprocess
+                        result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--device', str(i)],
+                                                capture_output=True, text=True, timeout=2)
+                        if result.returncode == 0:
+                            # Parse VRAM info from rocm-smi output
+                            lines = result.stdout.strip().split('\n')
+                            for line in lines:
+                                if 'GPU Memory Total' in line and 'GB' in line:
+                                    # Extract number before GB
+                                    parts = line.split()
+                                    for part in parts:
+                                        if part.replace('.', '').isdigit():
+                                            try:
+                                                vram_gb = int(float(part))
+                                                break
+                                            except:
+                                                pass
+                                    break
+                    except:
+                        pass  # Keep default estimate
+                    device_info = {
+                        'device_id': i,
+                        'name': device_name,
+                        'vram_gb': vram_gb
+                    }
+                    rocm_device_info.append(device_info)
                     working_rocm_devices += 1
                 except Exception:
                     # Device not functional, skip it
                     continue
@@ -334,6 +385,7 @@ def detect_gpu_backends() -> dict:
         if working_rocm_devices > 0:
             backends['rocm'] = True
             backends['rocm_devices'] = working_rocm_devices
+            backends['rocm_device_info'] = rocm_device_info
             try:
                 backends['rocm_version'] = torch.version.hip
             except:
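
The ROCm branch shells out to `rocm-smi` and keeps a 16 GB estimate when parsing fails; `rocm-smi`'s output format is not guaranteed to be stable across versions, which is presumably why the default remains. A hypothetical helper (not part of the commit) that mirrors the same parsing logic, pulled out so the string handling can be exercised without a GPU:

```python
# Hypothetical helper, not in the commit: mirrors the rocm-smi parsing in the ROCm branch
# so it can be unit-tested without rocm-smi installed.
def parse_rocm_smi_vram_gb(output: str, default: int = 16) -> int:
    for line in output.strip().split('\n'):
        if 'GPU Memory Total' in line and 'GB' in line:
            for part in line.split():
                if part.replace('.', '').isdigit():
                    try:
                        return int(float(part))
                    except ValueError:
                        pass
            break
    return default

# e.g. parse_rocm_smi_vram_gb("GPU[0] GPU Memory Total: 16.0 GB") == 16
# anything unparseable falls back to the 16 GB default
```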
@@ -406,17 +406,39 @@ def api_cluster_nodes():
         if node_key not in node_map:
             gpu_info = client.get('gpu_info', {})
             available_backends = client.get('available_backends', [])
-            cuda_devices = gpu_info.get('cuda_devices', 0)
-            rocm_devices = gpu_info.get('rocm_devices', 0)
-            # Get GPU memory info
+            # Get GPU memory info from actual device info
             gpu_memory = []
-            if cuda_devices > 0:
-                gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_devices)])
-            if rocm_devices > 0:
-                gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_devices)])
-            total_memory = sum([8 if 'CUDA' in mem else 16 if 'ROCm' in mem else 0 for mem in gpu_memory])
+            total_memory = 0
+            # Use CUDA device info if available
+            cuda_device_info = gpu_info.get('cuda_device_info', [])
+            for device in cuda_device_info:
+                device_id = device.get('device_id', 0)
+                vram_gb = device.get('vram_gb', 8)
+                name = device.get('name', f'CUDA Device {device_id}')
+                gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
+                total_memory += vram_gb
+            # Use ROCm device info if available
+            rocm_device_info = gpu_info.get('rocm_device_info', [])
+            for device in rocm_device_info:
+                device_id = device.get('device_id', 0)
+                vram_gb = device.get('vram_gb', 16)
+                name = device.get('name', f'ROCm Device {device_id}')
+                gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
+                total_memory += vram_gb
+            # Fallback to old format if no device info available
+            if not gpu_memory:
+                cuda_devices = gpu_info.get('cuda_devices', 0)
+                rocm_devices = gpu_info.get('rocm_devices', 0)
+                if cuda_devices > 0:
+                    gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_devices)])
+                    total_memory += cuda_devices * 8
+                if rocm_devices > 0:
+                    gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_devices)])
+                    total_memory += rocm_devices * 16
             # Calculate uptime from last_seen
             last_seen = client.get('last_seen')
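
To illustrate what the master-side aggregation now yields, a small sketch (device names and sizes are made up; field names match the diff):

```python
# Illustrative only: what the loops above produce for a made-up two-GPU client payload.
gpu_info = {
    'cuda_device_info': [{'device_id': 0, 'name': 'NVIDIA GeForce RTX 3090', 'vram_gb': 24}],
    'rocm_device_info': [{'device_id': 0, 'name': 'AMD Radeon RX 7900 XTX', 'vram_gb': 24}],
}

gpu_memory, total_memory = [], 0
for device in gpu_info.get('cuda_device_info', []) + gpu_info.get('rocm_device_info', []):
    name = device.get('name', f"Device {device.get('device_id', 0)}")
    vram_gb = device.get('vram_gb', 0)
    gpu_memory.append(f"{name}: {vram_gb}GB VRAM")
    total_memory += vram_gb

# gpu_memory == ['NVIDIA GeForce RTX 3090: 24GB VRAM', 'AMD Radeon RX 7900 XTX: 24GB VRAM']
# total_memory == 48
```

The fallback branch only runs for clients that predate this change and report device counts without `cuda_device_info`/`rocm_device_info`; for those, the old 8 GB/16 GB assumptions still apply.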