Improve GPU memory detection fallback chain

- Add nvidia-smi as intermediate fallback before PyTorch in GPU stats collection
- Fallback order: pynvml -> nvidia-smi -> PyTorch
- Applied to api.py, backend.py, and cluster_client.py GPU stats functions
- nvidia-smi reports device-wide memory usage and GPU utilization, which the PyTorch-only fallback cannot (torch.cuda.memory_allocated() covers only the current process and utilization is left at 0); see the sketch after the commit metadata below
- Fix SocketCommunicator.receive_message() timeout parameter error
- Add an optional timeout parameter to the receive_message method
- Fix the 'unexpected keyword argument timeout' error in api_stats and backend functions (a short usage sketch follows the receive_message hunk below)
parent ec797e8e
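For reference, every hunk below adds the same nvidia-smi query-and-parse step. A minimal standalone sketch of that logic, with an illustrative helper name (`query_nvidia_smi` is not part of this commit):

```python
import subprocess
from typing import Dict, List, Optional


def query_nvidia_smi(timeout: float = 10.0) -> Optional[List[Dict]]:
    """Query nvidia-smi for per-GPU name, memory, and utilization.

    Returns a list of GPU dicts, or None when nvidia-smi is missing or fails,
    so the caller can fall through to the next provider (PyTorch).
    """
    try:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=name,memory.used,memory.total,utilization.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, timeout=timeout)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None
    if result.returncode != 0:
        return None

    gpus = []
    for line in result.stdout.strip().splitlines():
        parts = [p.strip() for p in line.split(',')]
        if len(parts) < 4:
            continue
        gpus.append({
            'name': parts[0],
            'memory_used': float(parts[1]) / 1024,   # nvidia-smi reports MiB; convert to GiB
            'memory_total': float(parts[2]) / 1024,  # MiB -> GiB
            'utilization': int(parts[3]),
            'backend': 'cuda',
        })
    return gpus or None
```

Factoring the query out this way would also remove the near-identical parsing blocks that the diff duplicates across api.py, backend.py, and cluster_client.py.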
@@ -166,35 +166,103 @@ def api_stats():
pynvml.nvmlShutdown()
except ImportError:
# Fallback to PyTorch-only stats if pynvml not available
log_message("pynvml not available, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
data['gpu_count'] = torch.cuda.device_count()
data['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml required for actual utilization
'backend': 'cuda'
}
data['gpus'].append(gpu)
# Fallback to nvidia-smi if pynvml not available
log_message("pynvml not available, trying nvidia-smi")
try:
import subprocess
import json
# Try to get GPU stats using nvidia-smi
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
data['gpu_count'] = len(lines)
data['gpus'] = []
for line in lines:
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu = {
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
}
data['gpus'].append(gpu)
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
data['gpu_count'] = torch.cuda.device_count()
data['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml or nvidia-smi required for actual utilization
'backend': 'cuda'
}
data['gpus'].append(gpu)
except Exception as e:
log_message(f"Error getting GPU stats with pynvml: {e}")
# Fallback to PyTorch if pynvml fails
if torch.cuda.is_available():
data['gpu_count'] = torch.cuda.device_count()
data['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
}
data['gpus'].append(gpu)
# Fallback to nvidia-smi
try:
import subprocess
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
data['gpu_count'] = len(lines)
data['gpus'] = []
for line in lines:
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu = {
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
}
data['gpus'].append(gpu)
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
data['gpu_count'] = torch.cuda.device_count()
data['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
}
data['gpus'].append(gpu)
# CPU and RAM (local machine)
data['cpu_percent'] = psutil.cpu_percent()
......
@@ -165,35 +165,102 @@ def handle_web_message(message: Message, client_sock=None) -> Message:
pynvml.nvmlShutdown()
except ImportError:
# Fallback to PyTorch-only stats if pynvml not available
log_message("pynvml not available, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
stats['gpu_count'] = torch.cuda.device_count()
stats['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml required for actual utilization
'backend': 'cuda'
}
stats['gpus'].append(gpu)
# Fallback to nvidia-smi if pynvml not available
log_message("pynvml not available, trying nvidia-smi")
try:
import subprocess
# Try to get GPU stats using nvidia-smi
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
stats['gpu_count'] = len(lines)
stats['gpus'] = []
for line in lines:
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu = {
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
}
stats['gpus'].append(gpu)
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
stats['gpu_count'] = torch.cuda.device_count()
stats['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml or nvidia-smi required for actual utilization
'backend': 'cuda'
}
stats['gpus'].append(gpu)
except Exception as e:
log_message(f"Error getting GPU stats with pynvml: {e}")
# Fallback to PyTorch if pynvml fails
if torch.cuda.is_available():
stats['gpu_count'] = torch.cuda.device_count()
stats['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
}
stats['gpus'].append(gpu)
# Fallback to nvidia-smi
try:
import subprocess
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
stats['gpu_count'] = len(lines)
stats['gpus'] = []
for line in lines:
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu = {
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
}
stats['gpus'].append(gpu)
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
if torch.cuda.is_available():
stats['gpu_count'] = torch.cuda.device_count()
stats['gpus'] = []
for i in range(torch.cuda.device_count()):
gpu = {
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
}
stats['gpus'].append(gpu)
# CPU and RAM (local machine)
stats['cpu_percent'] = psutil.cpu_percent()
......
@@ -503,43 +503,113 @@ class ClusterClient:
return {'gpus': gpu_stats, 'backend': 'cuda'}
except ImportError:
# Fallback to PyTorch-only stats if pynvml not available
# Fallback to nvidia-smi if pynvml not available
log_message("pynvml not available, trying nvidia-smi")
try:
import torch
if torch.cuda.is_available():
import subprocess
# Try to get GPU stats using nvidia-smi
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
gpu_stats = []
for i in range(torch.cuda.device_count()):
gpu_stats.append({
'device_id': i,
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml required for actual utilization
'backend': 'cuda'
})
for i, line in enumerate(lines):
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu_stats.append({
'device_id': i,
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
})
return {'gpus': gpu_stats, 'backend': 'cuda'}
except:
pass
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
try:
import torch
if torch.cuda.is_available():
gpu_stats = []
for i in range(torch.cuda.device_count()):
gpu_stats.append({
'device_id': i,
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0, # pynvml or nvidia-smi required for actual utilization
'backend': 'cuda'
})
return {'gpus': gpu_stats, 'backend': 'cuda'}
except:
pass
except Exception as e:
log_message(f"Error collecting GPU stats with pynvml: {e}")
# Fallback to PyTorch if pynvml fails
# Fallback to nvidia-smi
try:
import torch
if torch.cuda.is_available():
import subprocess
result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
lines = result.stdout.strip().split('\n')
gpu_stats = []
for i in range(torch.cuda.device_count()):
gpu_stats.append({
'device_id': i,
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
})
for i, line in enumerate(lines):
if line.strip():
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 4:
name = parts[0]
memory_used = float(parts[1]) / 1024 # Convert MB to GB
memory_total = float(parts[2]) / 1024 # Convert MB to GB
utilization = int(parts[3])
gpu_stats.append({
'device_id': i,
'name': name,
'memory_used': memory_used,
'memory_total': memory_total,
'utilization': utilization,
'backend': 'cuda'
})
return {'gpus': gpu_stats, 'backend': 'cuda'}
except:
pass
else:
raise Exception("nvidia-smi command failed")
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
try:
import torch
if torch.cuda.is_available():
gpu_stats = []
for i in range(torch.cuda.device_count()):
gpu_stats.append({
'device_id': i,
'name': torch.cuda.get_device_name(i),
'memory_used': torch.cuda.memory_allocated(i) / 1024**3, # GB
'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
'utilization': 0,
'backend': 'cuda'
})
return {'gpus': gpu_stats, 'backend': 'cuda'}
except:
pass
return None
......
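All three hunks above repeat the pynvml -> nvidia-smi -> PyTorch chain as nested try/except blocks. One possible way to express the same ordering, sketched here with hypothetical provider names (not what this commit implements):

```python
import logging
from typing import Callable, Dict, List, Optional

log = logging.getLogger(__name__)

# A provider returns a list of GPU dicts, or None/[] when it has nothing to report.
Provider = Callable[[], Optional[List[Dict]]]


def collect_gpu_stats(providers: List[Provider]) -> List[Dict]:
    """Return stats from the first provider that succeeds and reports GPUs."""
    for provider in providers:
        try:
            gpus = provider()
        except Exception as exc:  # a failing provider simply falls through to the next one
            log.warning("GPU stats provider %s failed: %s",
                        getattr(provider, '__name__', provider), exc)
            continue
        if gpus:
            return gpus
    return []


# Intended order, matching the commit: pynvml, then nvidia-smi, then PyTorch.
# stats = collect_gpu_stats([query_pynvml, query_nvidia_smi, query_torch_cuda])
```

Keeping the fallback order in one place like this would also avoid catching `Exception` alongside the more specific subprocess exceptions, as the nested handlers in the diff currently do.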
@@ -76,10 +76,12 @@ class SocketCommunicator:
log_message(f"DEBUG: SocketCommunicator sending: {full_data}")
self.sock.sendall(full_data)
def receive_message(self) -> Optional[Message]:
"""Receive a message."""
def receive_message(self, timeout: Optional[float] = None) -> Optional[Message]:
"""Receive a message with optional timeout."""
if self.sock:
try:
if timeout is not None:
self.sock.settimeout(timeout)
data = self.sock.recv(4096)
if data:
decoded = data.decode('utf-8').strip()
......
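A hedged caller-side sketch of the new optional timeout; SocketCommunicator and Message come from the module patched above, and the function name here is illustrative:

```python
import socket
from typing import Optional


def poll_backend(comm: "SocketCommunicator", deadline_s: float = 5.0) -> Optional["Message"]:
    """Wait up to deadline_s for a reply instead of blocking indefinitely."""
    try:
        # New keyword added by this commit; before the fix this call raised
        # TypeError: ... unexpected keyword argument 'timeout'.
        return comm.receive_message(timeout=deadline_s)
    except socket.timeout:
        # Depending on the except branch truncated from the hunk above, the
        # method may catch this itself and return None instead of raising.
        return None
```

Note that, in the hunk shown, settimeout() is applied only when a timeout is passed and is never restored, so a later call made without a timeout keeps the previous deadline; callers that expect a fully blocking read may want to account for that.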