Remove NVIDIA-only GPU filtering, detect all working CUDA/ROCm GPUs

- Removed brand-specific filtering that only allowed NVIDIA GPUs
- Now detects any GPU that can actually perform CUDA or ROCm operations
- A functional test, not the brand name, determines whether a GPU is included
- GPUs are shown with correct system indices (Device 0, 1, etc.)
- AMD GPUs that support ROCm will be shown if functional
- CUDA GPUs from any vendor will be shown if functional
parent efbb77ce
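
The hunks below apply the same change in three places: drop the vendor-keyword check and keep any device that survives a small allocation. As a standalone illustration, here is a minimal sketch of that pattern (the helper name detect_working_cuda_devices is hypothetical, not part of this commit):

import torch

def detect_working_cuda_devices():
    # Hypothetical helper mirroring the probe used in the hunks below:
    # keep any device that can run a tiny CUDA operation, regardless of brand.
    working = []
    if not torch.cuda.is_available():
        return working
    for i in range(torch.cuda.device_count()):
        try:
            with torch.cuda.device(i):
                # A small allocation is enough to prove the device works.
                test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
                _ = test_tensor * 2
            working.append(i)  # preserve the real system index (Device 0, 1, ...)
        except Exception:
            # Device not functional, skip it
            continue
    return working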
@@ -265,14 +265,7 @@ def detect_gpu_backends() -> dict:
 working_cuda_devices = 0
 for i in range(torch.cuda.device_count()):
     try:
-        # Test if device is actually functional by trying a simple operation
-        device_name = torch.cuda.get_device_name(i).lower()
-        # Only consider NVIDIA GPUs
-        if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-            continue
-        # Test device functionality
+        # Test if device can actually perform CUDA operations
         try:
             with torch.cuda.device(i):
                 test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
...
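
Why this also covers AMD: on ROCm builds of PyTorch, AMD GPUs are exposed through the same torch.cuda API, so removing the NVIDIA keyword filter is enough to let them pass the functional test. A hedged sketch of telling the two backends apart (the commit's own backend detection is not shown in these hunks):

import torch

# torch.version.hip is a version string on ROCm builds of PyTorch and None
# on CUDA builds, so a single check distinguishes the backend behind torch.cuda.
backend = 'rocm' if getattr(torch.version, 'hip', None) else 'cuda'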
@@ -508,16 +508,11 @@ def api_cluster_nodes():
 all_gpu_memory = []
 total_memory = 0
-# Only count working NVIDIA GPUs with actual VRAM detection
+# Count all working CUDA GPUs with actual VRAM detection
 if torch.cuda.is_available():
     for i in range(torch.cuda.device_count()):
         try:
-            device_name = torch.cuda.get_device_name(i).lower()
-            # Only consider NVIDIA GPUs
-            if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                continue
-            # Test device functionality
+            # Test device functionality - if it can perform CUDA operations, include it
             try:
                 with torch.cuda.device(i):
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
@@ -531,6 +526,7 @@ def api_cluster_nodes():
                 total_memory += vram_gb
                 total_gpus += 1
             except:
+                # Device not functional, skip it
                 continue
         except:
             continue
@@ -648,15 +644,10 @@ def detect_local_workers():
 gpus = 0
 if backend == 'cuda' and torch.cuda.is_available():
-    # Only count working NVIDIA GPUs
+    # Count all working CUDA GPUs (any brand that actually works)
     for i in range(torch.cuda.device_count()):
         try:
-            device_name = torch.cuda.get_device_name(i).lower()
-            # Only consider NVIDIA GPUs
-            if not any(keyword in device_name for keyword in ['nvidia', 'geforce', 'quadro', 'tesla', 'rtx', 'gtx']):
-                continue
-            # Test device functionality
+            # Test device functionality - if it can perform CUDA operations, include it
             try:
                 with torch.cuda.device(i):
                     test_tensor = torch.tensor([1.0], device=f'cuda:{i}')
@@ -670,6 +661,7 @@ def detect_local_workers():
                 total_memory += vram_gb
                 gpus += 1
             except:
+                # Device not functional, skip it
                 continue
         except:
             continue
...
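
The context lines above accumulate vram_gb, but its computation sits outside the visible hunks. A plausible sketch using PyTorch's public API (an assumption about this codebase, not confirmed by the diff):

import torch

i = 0  # a device index that already passed the functional test
props = torch.cuda.get_device_properties(i)
vram_gb = props.total_memory / (1024 ** 3)  # total_memory is in bytes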