Fix GPU detection to only count working, functional GPUs

- Modified detect_gpu_backends() to perform functional tests on GPUs
- CUDA detection now verifies devices can actually perform tensor operations
- ROCm detection now tests device functionality before counting
- Only NVIDIA GPUs are counted for CUDA, and only functional devices
- Prevents counting of non-working GPUs like old AMD cards misreported as CUDA
- Example: System with old AMD GPU (device 0) + working CUDA GPU (device 1) now correctly shows only the functional CUDA GPU
- Total VRAM calculation now reflects only actually usable GPUs
- Both PyTorch and nvidia-smi/rocm-smi detection paths updated
parent ffe34516
......@@ -258,34 +258,88 @@ def detect_gpu_backends() -> dict:
'rocm_devices': 0
}
# Check CUDA availability
# Check CUDA availability - only count working, functional CUDA devices
try:
import torch
if torch.cuda.is_available():
working_cuda_devices = 0
for i in range(torch.cuda.device_count()):
try:
# Test if device is actually functional by trying a simple operation
device_name = torch.cuda.get_device_name(i).lower()
# Only consider NVIDIA GPUs
if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
continue
# Test device functionality
try:
with torch.cuda.device(i):
test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
test_result = test_tensor + 1 # Simple operation
del test_tensor, test_result
working_cuda_devices += 1
except Exception:
# Device not functional, skip it
continue
except Exception:
continue
if working_cuda_devices > 0:
backends['cuda'] = True
backends['cuda_devices'] = torch.cuda.device_count()
backends['cuda_devices'] = working_cuda_devices
try:
backends['cuda_version'] = torch.version.cuda
except:
pass
except ImportError:
# Try to detect CUDA without torch
# Try to detect CUDA without torch using nvidia-smi
try:
result = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits'],
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=5)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
working_cuda_devices = 0
for line in lines:
if ',' in line:
name = line.split(',')[0].strip()
# Check if it's an NVIDIA GPU and has memory info (indicating it's functional)
if any(keyword in name.lower() for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
working_cuda_devices += 1
if working_cuda_devices > 0:
backends['cuda'] = True
backends['cuda_devices'] = len(lines)
backends['cuda_devices'] = working_cuda_devices
except (subprocess.TimeoutExpired, FileNotFoundError):
pass
# Check ROCm availability
# Check ROCm availability - only count working, functional ROCm devices
try:
import torch
if hasattr(torch, 'hip') and torch.hip.is_available():
working_rocm_devices = 0
for i in range(torch.hip.device_count()):
try:
# Test if ROCm device is actually functional
device_name = torch.hip.get_device_name(i).lower() if hasattr(torch.hip, 'get_device_name') else f"hip:{i}"
# Test device functionality
try:
with torch.device(f'hip:{i}'):
test_tensor = torch.tensor([1.0], device=f'hip:{i}')
test_result = test_tensor + 1 # Simple operation
del test_tensor, test_result
working_rocm_devices += 1
except Exception:
# Device not functional, skip it
continue
except Exception:
continue
if working_rocm_devices > 0:
backends['rocm'] = True
backends['rocm_devices'] = torch.hip.device_count()
backends['rocm_devices'] = working_rocm_devices
try:
backends['rocm_version'] = torch.version.hip
except:
......@@ -295,11 +349,23 @@ def detect_gpu_backends() -> dict:
try:
result = subprocess.run(['rocm-smi', '--showid'], capture_output=True, text=True, timeout=5)
if result.returncode == 0:
# Count GPU lines (excluding header)
# Count GPU lines and verify they are functional
lines = result.stdout.strip().split('\n')
gpu_lines = [line for line in lines if 'GPU' in line and any(char.isdigit() for char in line)]
backends['rocm'] = len(gpu_lines) > 0
backends['rocm_devices'] = len(gpu_lines)
working_rocm_devices = 0
# Additional check: try to get memory info to verify functionality
try:
mem_result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram'], capture_output=True, text=True, timeout=5)
if mem_result.returncode == 0:
working_rocm_devices = len(gpu_lines)
except:
# If memory check fails, assume GPUs are working if detected
working_rocm_devices = len(gpu_lines)
if working_rocm_devices > 0:
backends['rocm'] = True
backends['rocm_devices'] = working_rocm_devices
except (subprocess.TimeoutExpired, FileNotFoundError):
pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment