Remove NVIDIA-only GPU filtering, detect all working CUDA/ROCm GPUs

- Removed brand-specific filtering that only allowed NVIDIA GPUs
- Now detects any GPU that can actually perform CUDA or ROCm operations
- A functional test, not the brand string, determines whether a GPU is included (see the sketch below)
- GPUs are shown with correct system indices (Device 0, 1, etc.)
- AMD GPUs that support ROCm will be shown if functional
- CUDA GPUs from any vendor will be shown if functional
parent efbb77ce
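
The probe is the same in all three functions touched by this diff: allocate a tiny tensor on the device and treat success as proof the device works, regardless of vendor string. A standalone sketch of that pattern (the helper name is illustrative, not part of this commit):

import torch

def is_device_functional(i: int) -> bool:
    # A device counts as usable if a trivial CUDA op succeeds on it,
    # regardless of the vendor string. ROCm builds of PyTorch expose
    # AMD GPUs through the same torch.cuda API, so this covers both.
    try:
        with torch.cuda.device(i):
            test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
            return bool(test_tensor.item() == 1.0)
    except Exception:
        return False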
@@ -265,14 +265,7 @@ def detect_gpu_backends() -> dict:
     working_cuda_devices = 0
     for i in range(torch.cuda.device_count()):
         try:
-            # Test if device is actually functional by trying a simple operation
-            device_name = torch.cuda.get_device_name(i).lower()
-            # Only consider NVIDIA GPUs
-            if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                continue
-            # Test device functionality
+            # Test if device can actually perform CUDA operations
             try:
                 with torch.cuda.device(i):
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
@@ -508,16 +508,11 @@ def api_cluster_nodes():
     all_gpu_memory = []
     total_memory = 0
-    # Only count working NVIDIA GPUs with actual VRAM detection
+    # Count all working CUDA GPUs with actual VRAM detection
     if torch.cuda.is_available():
         for i in range(torch.cuda.device_count()):
             try:
-                device_name = torch.cuda.get_device_name(i).lower()
-                # Only consider NVIDIA GPUs
-                if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                    continue
-                # Test device functionality
+                # Test device functionality - if it can perform CUDA operations, include it
                 try:
                     with torch.cuda.device(i):
                         test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
@@ -531,6 +526,7 @@ def api_cluster_nodes():
                         total_memory += vram_gb
                         total_gpus += 1
                 except:
+                    # Device not functional, skip it
                     continue
             except:
                 continue
@@ -648,15 +644,10 @@ def detect_local_workers():
         gpus = 0
         if backend == 'cuda' and torch.cuda.is_available():
-            # Only count working NVIDIA GPUs
+            # Count all working CUDA GPUs (any brand that actually works)
             for i in range(torch.cuda.device_count()):
                 try:
-                    device_name = torch.cuda.get_device_name(i).lower()
-                    # Only consider NVIDIA GPUs
-                    if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                        continue
-                    # Test device functionality
+                    # Test device functionality - if it can perform CUDA operations, include it
                     try:
                         with torch.cuda.device(i):
                             test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
@@ -670,6 +661,7 @@ def detect_local_workers():
                             total_memory += vram_gb
                             gpus += 1
                     except:
+                        # Device not functional, skip it
                         continue
                 except:
                     continue
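
A note on the ROCm claim in the commit message: ROCm builds of PyTorch expose AMD GPUs through the same torch.cuda API, which is why a single functional probe covers both backends. If the backend name is needed for reporting, it can be read from the build metadata (a hedged sketch, not part of this commit):

import torch

def gpu_backend_name() -> str:
    # ROCm builds set torch.version.hip; CUDA builds set torch.version.cuda.
    # Hypothetical helper for reporting, not from this diff.
    if getattr(torch.version, 'hip', None):
        return 'rocm'
    if torch.version.cuda:
        return 'cuda'
    return 'none'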