Improve GPU memory detection fallback chain

- Add nvidia-smi as intermediate fallback before PyTorch in GPU stats collection
- Fallback order: pynvml -> nvidia-smi -> PyTorch
- Applied to api.py, backend.py, and cluster_client.py GPU stats functions
- nvidia-smi provides accurate memory usage and utilization data
- Fix SocketCommunicator.receive_message() timeout parameter error
- Add optional timeout parameter to receive_message method
- Fix 'unexpected keyword argument timeout' error in api_stats and backend functions
parent ec797e8e
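
For reference, the fallback chain described above condensed into one standalone helper. This is an illustrative sketch only, not code from this commit: the function name collect_gpu_stats, the flat list return shape, and the broad exception handling are assumptions.

# Illustrative sketch of the pynvml -> nvidia-smi -> PyTorch fallback chain.
# Not code from this commit: function name and return shape are assumptions.
import subprocess
from typing import Dict, List


def collect_gpu_stats() -> List[Dict]:
    # 1) Preferred: pynvml reports exact per-device memory use and utilization.
    try:
        import pynvml
        pynvml.nvmlInit()
        try:
            gpus = []
            for i in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                if isinstance(name, bytes):  # older pynvml builds return bytes
                    name = name.decode()
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpus.append({
                    'name': name,
                    'memory_used': mem.used / 1024**3,    # bytes -> GB
                    'memory_total': mem.total / 1024**3,  # bytes -> GB
                    'utilization': util.gpu,
                    'backend': 'cuda',
                })
            return gpus
        finally:
            pynvml.nvmlShutdown()
    except Exception:
        pass  # pynvml missing or NVML unavailable

    # 2) Intermediate: nvidia-smi still gives real memory use and utilization.
    try:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=name,memory.used,memory.total,utilization.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, timeout=10, check=True)
        gpus = []
        for line in result.stdout.strip().splitlines():
            parts = [p.strip() for p in line.split(',')]
            if len(parts) >= 4:
                gpus.append({
                    'name': parts[0],
                    'memory_used': float(parts[1]) / 1024,   # MiB -> GB
                    'memory_total': float(parts[2]) / 1024,  # MiB -> GB
                    'utilization': int(parts[3]),
                    'backend': 'cuda',
                })
        return gpus
    except Exception:
        pass  # binary missing, timed out, or non-zero exit code

    # 3) Last resort: PyTorch only sees memory allocated by this process and
    #    cannot report utilization, hence the hard-coded 0.
    try:
        import torch
        if torch.cuda.is_available():
            return [{
                'name': torch.cuda.get_device_name(i),
                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,
                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
                'utilization': 0,
                'backend': 'cuda',
            } for i in range(torch.cuda.device_count())]
    except Exception:
        pass
    return []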
@@ -166,35 +166,103 @@ def api_stats():
             pynvml.nvmlShutdown()
         except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            log_message("pynvml not available, falling back to PyTorch GPU stats")
-            if torch.cuda.is_available():
-                data['gpu_count'] = torch.cuda.device_count()
-                data['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,  # pynvml required for actual utilization
-                        'backend': 'cuda'
-                    }
-                    data['gpus'].append(gpu)
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+                import json
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    data['gpu_count'] = len(lines)
+                    data['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                data['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    data['gpu_count'] = torch.cuda.device_count()
+                    data['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                            'backend': 'cuda'
+                        }
+                        data['gpus'].append(gpu)
         except Exception as e:
             log_message(f"Error getting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            if torch.cuda.is_available():
-                data['gpu_count'] = torch.cuda.device_count()
-                data['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,
-                        'backend': 'cuda'
-                    }
-                    data['gpus'].append(gpu)
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    data['gpu_count'] = len(lines)
+                    data['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                data['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    data['gpu_count'] = torch.cuda.device_count()
+                    data['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,
+                            'backend': 'cuda'
+                        }
+                        data['gpus'].append(gpu)
 
     # CPU and RAM (local machine)
     data['cpu_percent'] = psutil.cpu_percent()
...
@@ -165,35 +165,102 @@ def handle_web_message(message: Message, client_sock=None) -> Message:
             pynvml.nvmlShutdown()
         except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            log_message("pynvml not available, falling back to PyTorch GPU stats")
-            if torch.cuda.is_available():
-                stats['gpu_count'] = torch.cuda.device_count()
-                stats['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,  # pynvml required for actual utilization
-                        'backend': 'cuda'
-                    }
-                    stats['gpus'].append(gpu)
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    stats['gpu_count'] = len(lines)
+                    stats['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                stats['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    stats['gpu_count'] = torch.cuda.device_count()
+                    stats['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                            'backend': 'cuda'
+                        }
+                        stats['gpus'].append(gpu)
         except Exception as e:
             log_message(f"Error getting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            if torch.cuda.is_available():
-                stats['gpu_count'] = torch.cuda.device_count()
-                stats['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,
-                        'backend': 'cuda'
-                    }
-                    stats['gpus'].append(gpu)
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    stats['gpu_count'] = len(lines)
+                    stats['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                stats['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    stats['gpu_count'] = torch.cuda.device_count()
+                    stats['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,
+                            'backend': 'cuda'
+                        }
+                        stats['gpus'].append(gpu)
 
     # CPU and RAM (local machine)
     stats['cpu_percent'] = psutil.cpu_percent()
...
@@ -503,43 +503,113 @@ class ClusterClient:
            return {'gpus': gpu_stats, 'backend': 'cuda'}
        except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            try:
-                import torch
-                if torch.cuda.is_available():
-                    gpu_stats = []
-                    for i in range(torch.cuda.device_count()):
-                        gpu_stats.append({
-                            'device_id': i,
-                            'name': torch.cuda.get_device_name(i),
-                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                            'utilization': 0,  # pynvml required for actual utilization
-                            'backend': 'cuda'
-                        })
-                    return {'gpus': gpu_stats, 'backend': 'cuda'}
-            except:
-                pass
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    gpu_stats = []
+
+                    for i, line in enumerate(lines):
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu_stats.append({
+                                    'device_id': i,
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                })
+                    return {'gpus': gpu_stats, 'backend': 'cuda'}
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        gpu_stats = []
+                        for i in range(torch.cuda.device_count()):
+                            gpu_stats.append({
+                                'device_id': i,
+                                'name': torch.cuda.get_device_name(i),
+                                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                                'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                                'backend': 'cuda'
+                            })
+                        return {'gpus': gpu_stats, 'backend': 'cuda'}
+                except:
+                    pass
        except Exception as e:
            log_message(f"Error collecting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            try:
-                import torch
-                if torch.cuda.is_available():
-                    gpu_stats = []
-                    for i in range(torch.cuda.device_count()):
-                        gpu_stats.append({
-                            'device_id': i,
-                            'name': torch.cuda.get_device_name(i),
-                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                            'utilization': 0,
-                            'backend': 'cuda'
-                        })
-                    return {'gpus': gpu_stats, 'backend': 'cuda'}
-            except:
-                pass
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    gpu_stats = []
+
+                    for i, line in enumerate(lines):
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu_stats.append({
+                                    'device_id': i,
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                })
+                    return {'gpus': gpu_stats, 'backend': 'cuda'}
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        gpu_stats = []
+                        for i in range(torch.cuda.device_count()):
+                            gpu_stats.append({
+                                'device_id': i,
+                                'name': torch.cuda.get_device_name(i),
+                                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                                'utilization': 0,
+                                'backend': 'cuda'
+                            })
+                        return {'gpus': gpu_stats, 'backend': 'cuda'}
+                except:
+                    pass
        return None
...
@@ -76,10 +76,12 @@ class SocketCommunicator:
             log_message(f"DEBUG: SocketCommunicator sending: {full_data}")
             self.sock.sendall(full_data)
 
-    def receive_message(self) -> Optional[Message]:
-        """Receive a message."""
+    def receive_message(self, timeout: Optional[float] = None) -> Optional[Message]:
+        """Receive a message with optional timeout."""
         if self.sock:
             try:
+                if timeout is not None:
+                    self.sock.settimeout(timeout)
                 data = self.sock.recv(4096)
                 if data:
                     decoded = data.decode('utf-8').strip()
...
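
A minimal usage sketch for the new timeout parameter, assuming a connected SocketCommunicator instance named comm (the name is hypothetical). Whether an expired timeout surfaces as socket.timeout or as a None return depends on the except clause that is truncated above.

import socket

# Wait at most 5 seconds for the next message.
try:
    reply = comm.receive_message(timeout=5.0)
except socket.timeout:
    reply = None  # recv() timed out and the error was not swallowed internally

if reply is None:
    print("no message within 5 seconds")

# Note: settimeout() is sticky on the underlying socket, so the 5-second limit
# also applies to later recv() calls until it is changed again, e.g.:
# comm.sock.settimeout(None)  # restore blocking behaviour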