Fix GPU VRAM detection to count only available GPUs

- Modified local node GPU memory calculation to only count GPUs that are actually available for supported backends
- Previously counted all GPUs in system, now only counts CUDA GPUs if CUDA is available and ROCm GPUs if ROCm is available
- Fixes issue where unsupported GPUs (like old AMD GPUs without ROCm support) were incorrectly included in VRAM totals
- Example: System with old AMD GPU (8GB, no ROCm) and CUDA GPU (24GB) now correctly shows 24GB total instead of 32GB
- Ensures accurate GPU resource reporting in cluster nodes interface
parent 4ca34e75
......@@ -483,6 +483,9 @@ def api_cluster_nodes():
total_active_jobs += node_data['active_jobs']
total_completed_jobs += node_data['completed_jobs']
# Include the workers array in the response for the modal
node_data['workers'] = node_data['workers']
nodes.append(node_data)
# Detect and aggregate local worker processes on master
......@@ -498,16 +501,26 @@ def api_cluster_nodes():
local_analysis = [w for w in local_workers if w['type'] == 'analysis']
local_training = [w for w in local_workers if w['type'] == 'training']
# Calculate combined GPU info for local node
total_gpus = sum(w.get('gpus', 0) for w in local_workers)
# Calculate combined GPU info for local node - only count available GPUs
from vidai.compat import detect_gpu_backends
gpu_info = detect_gpu_backends()
total_gpus = 0
all_gpu_memory = []
seen_memory = set()
for w in local_workers:
for mem in w.get('gpu_memory', []):
if mem not in seen_memory:
all_gpu_memory.append(mem)
seen_memory.add(mem)
total_memory = sum(w.get('total_memory', 0) for w in local_workers)
total_memory = 0
# Only count GPUs that are actually available for supported backends
if gpu_info['cuda']:
cuda_gpus = gpu_info['cuda_devices']
total_gpus += cuda_gpus
all_gpu_memory.extend([f"CUDA Device {i}: 8GB VRAM" for i in range(cuda_gpus)])
total_memory += cuda_gpus * 8
if gpu_info['rocm']:
rocm_gpus = gpu_info['rocm_devices']
total_gpus += rocm_gpus
all_gpu_memory.extend([f"ROCm Device {i}: 16GB VRAM" for i in range(rocm_gpus)])
total_memory += rocm_gpus * 16
# Worker summary for local node
worker_types = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.