Fix GPU memory and utilization stats to show actual values using pynvml...

Fix GPU memory and utilization stats to show actual values using pynvml instead of PyTorch-only stats
parent 566d8113
......@@ -135,21 +135,65 @@ def api_stats():
pass
# GPU stats (local machine).
# Prefer pynvml (NVIDIA Management Library) for device-wide memory and
# utilization: torch.cuda.memory_allocated() only reports memory that THIS
# process's PyTorch caching allocator has claimed, and PyTorch exposes no
# utilization figure at all.
data['gpu_count'] = 0
data['gpus'] = []
try:
    # NOTE: the pip package is "nvidia-ml-py", but the module it installs
    # is named "pynvml" — `import nvidia_ml_py` always raises ImportError,
    # which silently forced the fallback path and hid the real stats.
    import pynvml
    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        data['gpu_count'] = device_count
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            data['gpus'].append({
                # Older pynvml returns bytes, newer returns str — handle both.
                'name': name.decode('utf-8') if isinstance(name, bytes) else str(name),
                'memory_used': memory_info.used / 1024**3,    # bytes -> GB
                'memory_total': memory_info.total / 1024**3,  # bytes -> GB
                'utilization': utilization.gpu,               # percent, device-wide
                'backend': 'cuda',
            })
    finally:
        # Always release the NVML session, even if a query above raised.
        pynvml.nvmlShutdown()
except Exception as e:
    # Covers both ImportError (pynvml not installed) and NVML runtime
    # failures. Fall back to PyTorch-only stats: no utilization, and
    # memory_used reflects only this process's allocations.
    print(f"pynvml unavailable or failed ({e}); falling back to PyTorch GPU stats")
    if torch.cuda.is_available():
        data['gpu_count'] = torch.cuda.device_count()
        data['gpus'] = [
            {
                'name': torch.cuda.get_device_name(i),
                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB, this process only
                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
                'utilization': 0,  # pynvml required for actual utilization
                'backend': 'cuda',
            }
            for i in range(torch.cuda.device_count())
        ]
# CPU and RAM (local machine)
data['cpu_percent'] = psutil.cpu_percent()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment