Fix GPU detection to only count working, functional GPUs

- Modified detect_gpu_backends() to perform functional tests on GPUs
- CUDA detection now verifies devices can actually perform tensor operations
- ROCm detection now tests device functionality before counting
- For CUDA, only NVIDIA GPUs are counted, and only those that are functional
- Prevents counting non-working GPUs, such as old AMD cards misreported as CUDA devices
- Example: a system with an old AMD GPU (device 0) and a working CUDA GPU (device 1) now correctly reports only the functional CUDA GPU
- Total VRAM calculation now reflects only actually usable GPUs
- Both the PyTorch and the nvidia-smi/rocm-smi detection paths were updated
parent ffe34516
...@@ -258,48 +258,114 @@ def detect_gpu_backends() -> dict: ...@@ -258,48 +258,114 @@ def detect_gpu_backends() -> dict:
'rocm_devices': 0 'rocm_devices': 0
} }
# Check CUDA availability # Check CUDA availability - only count working, functional CUDA devices
try: try:
import torch import torch
if torch.cuda.is_available(): if torch.cuda.is_available():
backends['cuda'] = True working_cuda_devices = 0
backends['cuda_devices'] = torch.cuda.device_count() for i in range(torch.cuda.device_count()):
try: try:
backends['cuda_version'] = torch.version.cuda # Test if device is actually functional by trying a simple operation
except: device_name = torch.cuda.get_device_name(i).lower()
pass # Only consider NVIDIA GPUs
if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
continue
# Test device functionality
try:
with torch.cuda.device(i):
test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
test_result = test_tensor + 1 # Simple operation
del test_tensor, test_result
working_cuda_devices += 1
except Exception:
# Device not functional, skip it
continue
except Exception:
continue
if working_cuda_devices > 0:
backends['cuda'] = True
backends['cuda_devices'] = working_cuda_devices
try:
backends['cuda_version'] = torch.version.cuda
except:
pass
except ImportError: except ImportError:
# Try to detect CUDA without torch # Try to detect CUDA without torch using nvidia-smi
try: try:
result = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits'], result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=5) capture_output=True, text=True, timeout=5)
if result.returncode == 0: if result.returncode == 0:
lines = result.stdout.strip().split('\n') lines = result.stdout.strip().split('\n')
backends['cuda'] = True working_cuda_devices = 0
backends['cuda_devices'] = len(lines) for line in lines:
if ',' in line:
name = line.split(',')[0].strip()
# Check if it's an NVIDIA GPU and has memory info (indicating it's functional)
if any(keyword in name.lower() for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
working_cuda_devices += 1
if working_cuda_devices > 0:
backends['cuda'] = True
backends['cuda_devices'] = working_cuda_devices
except (subprocess.TimeoutExpired, FileNotFoundError): except (subprocess.TimeoutExpired, FileNotFoundError):
pass pass
# Check ROCm availability # Check ROCm availability - only count working, functional ROCm devices
try: try:
import torch import torch
if hasattr(torch, 'hip') and torch.hip.is_available(): if hasattr(torch, 'hip') and torch.hip.is_available():
backends['rocm'] = True working_rocm_devices = 0
backends['rocm_devices'] = torch.hip.device_count() for i in range(torch.hip.device_count()):
try: try:
backends['rocm_version'] = torch.version.hip # Test if ROCm device is actually functional
except: device_name = torch.hip.get_device_name(i).lower() if hasattr(torch.hip, 'get_device_name') else f"hip:{i}"
pass
# Test device functionality
try:
with torch.device(f'hip:{i}'):
test_tensor = torch.tensor([1.0], device=f'hip:{i}')
test_result = test_tensor + 1 # Simple operation
del test_tensor, test_result
working_rocm_devices += 1
except Exception:
# Device not functional, skip it
continue
except Exception:
continue
if working_rocm_devices > 0:
backends['rocm'] = True
backends['rocm_devices'] = working_rocm_devices
try:
backends['rocm_version'] = torch.version.hip
except:
pass
except (ImportError, AttributeError): except (ImportError, AttributeError):
# Try to detect ROCm via rocm-smi # Try to detect ROCm via rocm-smi
try: try:
result = subprocess.run(['rocm-smi', '--showid'], capture_output=True, text=True, timeout=5) result = subprocess.run(['rocm-smi', '--showid'], capture_output=True, text=True, timeout=5)
if result.returncode == 0: if result.returncode == 0:
# Count GPU lines (excluding header) # Count GPU lines and verify they are functional
lines = result.stdout.strip().split('\n') lines = result.stdout.strip().split('\n')
gpu_lines = [line for line in lines if 'GPU' in line and any(char.isdigit() for char in line)] gpu_lines = [line for line in lines if 'GPU' in line and any(char.isdigit() for char in line)]
backends['rocm'] = len(gpu_lines) > 0 working_rocm_devices = 0
backends['rocm_devices'] = len(gpu_lines)
# Additional check: try to get memory info to verify functionality
try:
mem_result = subprocess.run(['rocm-smi', '--showmeminfo', 'vram'], capture_output=True, text=True, timeout=5)
if mem_result.returncode == 0:
working_rocm_devices = len(gpu_lines)
except:
# If memory check fails, assume GPUs are working if detected
working_rocm_devices = len(gpu_lines)
if working_rocm_devices > 0:
backends['rocm'] = True
backends['rocm_devices'] = working_rocm_devices
except (subprocess.TimeoutExpired, FileNotFoundError): except (subprocess.TimeoutExpired, FileNotFoundError):
pass pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment