Improve GPU memory detection fallback chain

- Add nvidia-smi as intermediate fallback before PyTorch in GPU stats collection
- Fallback order: pynvml -> nvidia-smi -> PyTorch
- Applied to api.py, backend.py, and cluster_client.py GPU stats functions
- nvidia-smi provides accurate memory usage and utilization data
- Fix SocketCommunicator.receive_message() timeout parameter error
- Add optional timeout parameter to receive_message method
- Fix 'unexpected keyword argument timeout' error in api_stats and backend functions
parent ec797e8e
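
For reference, the fallback chain described above condensed into one standalone helper. This is an illustrative sketch only, not code from this commit: the function name collect_gpu_stats, the flat list return shape, and the broad exception handling are assumptions.

# Illustrative sketch of the pynvml -> nvidia-smi -> PyTorch fallback chain.
# Not code from this commit: function name and return shape are assumptions.
import subprocess
from typing import Dict, List


def collect_gpu_stats() -> List[Dict]:
    # 1) Preferred: pynvml reports exact per-device memory use and utilization.
    try:
        import pynvml
        pynvml.nvmlInit()
        try:
            gpus = []
            for i in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                if isinstance(name, bytes):  # older pynvml builds return bytes
                    name = name.decode()
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpus.append({
                    'name': name,
                    'memory_used': mem.used / 1024**3,    # bytes -> GB
                    'memory_total': mem.total / 1024**3,  # bytes -> GB
                    'utilization': util.gpu,
                    'backend': 'cuda',
                })
            return gpus
        finally:
            pynvml.nvmlShutdown()
    except Exception:
        pass  # pynvml missing or NVML unavailable

    # 2) Intermediate: nvidia-smi still gives real memory use and utilization.
    try:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=name,memory.used,memory.total,utilization.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, timeout=10, check=True)
        gpus = []
        for line in result.stdout.strip().splitlines():
            parts = [p.strip() for p in line.split(',')]
            if len(parts) >= 4:
                gpus.append({
                    'name': parts[0],
                    'memory_used': float(parts[1]) / 1024,   # MiB -> GB
                    'memory_total': float(parts[2]) / 1024,  # MiB -> GB
                    'utilization': int(parts[3]),
                    'backend': 'cuda',
                })
        return gpus
    except Exception:
        pass  # binary missing, timed out, or non-zero exit code

    # 3) Last resort: PyTorch only sees memory allocated by this process and
    #    cannot report utilization, hence the hard-coded 0.
    try:
        import torch
        if torch.cuda.is_available():
            return [{
                'name': torch.cuda.get_device_name(i),
                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,
                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
                'utilization': 0,
                'backend': 'cuda',
            } for i in range(torch.cuda.device_count())]
    except Exception:
        pass
    return []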
@@ -166,35 +166,103 @@ def api_stats():
             pynvml.nvmlShutdown()
         except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            log_message("pynvml not available, falling back to PyTorch GPU stats")
-            if torch.cuda.is_available():
-                data['gpu_count'] = torch.cuda.device_count()
-                data['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,  # pynvml required for actual utilization
-                        'backend': 'cuda'
-                    }
-                    data['gpus'].append(gpu)
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+                import json
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    data['gpu_count'] = len(lines)
+                    data['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                data['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    data['gpu_count'] = torch.cuda.device_count()
+                    data['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                            'backend': 'cuda'
+                        }
+                        data['gpus'].append(gpu)
         except Exception as e:
             log_message(f"Error getting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            if torch.cuda.is_available():
-                data['gpu_count'] = torch.cuda.device_count()
-                data['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,
-                        'backend': 'cuda'
-                    }
-                    data['gpus'].append(gpu)
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    data['gpu_count'] = len(lines)
+                    data['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                data['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    data['gpu_count'] = torch.cuda.device_count()
+                    data['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,
+                            'backend': 'cuda'
+                        }
+                        data['gpus'].append(gpu)
 
     # CPU and RAM (local machine)
     data['cpu_percent'] = psutil.cpu_percent()
...
@@ -165,35 +165,102 @@ def handle_web_message(message: Message, client_sock=None) -> Message:
             pynvml.nvmlShutdown()
         except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            log_message("pynvml not available, falling back to PyTorch GPU stats")
-            if torch.cuda.is_available():
-                stats['gpu_count'] = torch.cuda.device_count()
-                stats['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,  # pynvml required for actual utilization
-                        'backend': 'cuda'
-                    }
-                    stats['gpus'].append(gpu)
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    stats['gpu_count'] = len(lines)
+                    stats['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                stats['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    stats['gpu_count'] = torch.cuda.device_count()
+                    stats['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                            'backend': 'cuda'
+                        }
+                        stats['gpus'].append(gpu)
         except Exception as e:
             log_message(f"Error getting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            if torch.cuda.is_available():
-                stats['gpu_count'] = torch.cuda.device_count()
-                stats['gpus'] = []
-                for i in range(torch.cuda.device_count()):
-                    gpu = {
-                        'name': torch.cuda.get_device_name(i),
-                        'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                        'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                        'utilization': 0,
-                        'backend': 'cuda'
-                    }
-                    stats['gpus'].append(gpu)
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    stats['gpu_count'] = len(lines)
+                    stats['gpus'] = []
+
+                    for line in lines:
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu = {
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                }
+                                stats['gpus'].append(gpu)
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                if torch.cuda.is_available():
+                    stats['gpu_count'] = torch.cuda.device_count()
+                    stats['gpus'] = []
+                    for i in range(torch.cuda.device_count()):
+                        gpu = {
+                            'name': torch.cuda.get_device_name(i),
+                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                            'utilization': 0,
+                            'backend': 'cuda'
+                        }
+                        stats['gpus'].append(gpu)
 
     # CPU and RAM (local machine)
     stats['cpu_percent'] = psutil.cpu_percent()
...
@@ -503,43 +503,113 @@ class ClusterClient:
            return {'gpus': gpu_stats, 'backend': 'cuda'}
        except ImportError:
-            # Fallback to PyTorch-only stats if pynvml not available
-            try:
-                import torch
-                if torch.cuda.is_available():
-                    gpu_stats = []
-                    for i in range(torch.cuda.device_count()):
-                        gpu_stats.append({
-                            'device_id': i,
-                            'name': torch.cuda.get_device_name(i),
-                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                            'utilization': 0,  # pynvml required for actual utilization
-                            'backend': 'cuda'
-                        })
-                    return {'gpus': gpu_stats, 'backend': 'cuda'}
-            except:
-                pass
+            # Fallback to nvidia-smi if pynvml not available
+            log_message("pynvml not available, trying nvidia-smi")
+            try:
+                import subprocess
+
+                # Try to get GPU stats using nvidia-smi
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    gpu_stats = []
+
+                    for i, line in enumerate(lines):
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu_stats.append({
+                                    'device_id': i,
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                })
+                    return {'gpus': gpu_stats, 'backend': 'cuda'}
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
+                log_message(f"nvidia-smi not available or failed: {e}, falling back to PyTorch GPU stats")
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        gpu_stats = []
+                        for i in range(torch.cuda.device_count()):
+                            gpu_stats.append({
+                                'device_id': i,
+                                'name': torch.cuda.get_device_name(i),
+                                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                                'utilization': 0,  # pynvml or nvidia-smi required for actual utilization
+                                'backend': 'cuda'
+                            })
+                        return {'gpus': gpu_stats, 'backend': 'cuda'}
+                except:
+                    pass
        except Exception as e:
            log_message(f"Error collecting GPU stats with pynvml: {e}")
-            # Fallback to PyTorch if pynvml fails
-            try:
-                import torch
-                if torch.cuda.is_available():
-                    gpu_stats = []
-                    for i in range(torch.cuda.device_count()):
-                        gpu_stats.append({
-                            'device_id': i,
-                            'name': torch.cuda.get_device_name(i),
-                            'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
-                            'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
-                            'utilization': 0,
-                            'backend': 'cuda'
-                        })
-                    return {'gpus': gpu_stats, 'backend': 'cuda'}
-            except:
-                pass
+            # Fallback to nvidia-smi
+            try:
+                import subprocess
+
+                result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.used,memory.total,utilization.gpu', '--format=csv,noheader,nounits'],
+                                        capture_output=True, text=True, timeout=10)
+                if result.returncode == 0:
+                    lines = result.stdout.strip().split('\n')
+                    gpu_stats = []
+
+                    for i, line in enumerate(lines):
+                        if line.strip():
+                            parts = [p.strip() for p in line.split(',')]
+                            if len(parts) >= 4:
+                                name = parts[0]
+                                memory_used = float(parts[1]) / 1024  # Convert MB to GB
+                                memory_total = float(parts[2]) / 1024  # Convert MB to GB
+                                utilization = int(parts[3])
+                                gpu_stats.append({
+                                    'device_id': i,
+                                    'name': name,
+                                    'memory_used': memory_used,
+                                    'memory_total': memory_total,
+                                    'utilization': utilization,
+                                    'backend': 'cuda'
+                                })
+                    return {'gpus': gpu_stats, 'backend': 'cuda'}
+                else:
+                    raise Exception("nvidia-smi command failed")
+            except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError, Exception) as e2:
+                log_message(f"nvidia-smi also failed: {e2}, falling back to PyTorch GPU stats")
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        gpu_stats = []
+                        for i in range(torch.cuda.device_count()):
+                            gpu_stats.append({
+                                'device_id': i,
+                                'name': torch.cuda.get_device_name(i),
+                                'memory_used': torch.cuda.memory_allocated(i) / 1024**3,  # GB
+                                'memory_total': torch.cuda.get_device_properties(i).total_memory / 1024**3,
+                                'utilization': 0,
+                                'backend': 'cuda'
+                            })
+                        return {'gpus': gpu_stats, 'backend': 'cuda'}
+                except:
+                    pass
        return None
...
@@ -76,10 +76,12 @@ class SocketCommunicator:
             log_message(f"DEBUG: SocketCommunicator sending: {full_data}")
             self.sock.sendall(full_data)
 
-    def receive_message(self) -> Optional[Message]:
-        """Receive a message."""
+    def receive_message(self, timeout: Optional[float] = None) -> Optional[Message]:
+        """Receive a message with optional timeout."""
         if self.sock:
             try:
+                if timeout is not None:
+                    self.sock.settimeout(timeout)
                 data = self.sock.recv(4096)
                 if data:
                     decoded = data.decode('utf-8').strip()
...
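
A minimal usage sketch for the new timeout parameter, assuming a connected SocketCommunicator instance named comm (the name is hypothetical). Whether an expired timeout surfaces as socket.timeout or as a None return depends on the except clause that is truncated above.

import socket

# Wait at most 5 seconds for the next message.
try:
    reply = comm.receive_message(timeout=5.0)
except socket.timeout:
    reply = None  # recv() timed out and the error was not swallowed internally

if reply is None:
    print("no message within 5 seconds")

# Note: settimeout() is sticky on the underlying socket, so the 5-second limit
# also applies to later recv() calls until it is changed again, e.g.:
# comm.sock.settimeout(None)  # restore blocking behaviour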