Enforce GPU-only cluster participation

- Cluster clients now refuse to connect without GPU capabilities (CUDA/ROCm)
- Cluster master rejects authentication from clients without GPU backends
- Local master node only appears in cluster nodes list if GPU backends are available
- The master already refused to launch local worker processes on hosts without GPUs
- Systems without GPUs cannot participate in distributed processing
- Clear error messages when GPU requirements are not met
- Maintains cluster integrity by ensuring every node contributes GPU compute (see the gating sketch below)
parent abec9e31
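All three enforcement points (client connect, master authentication, local node listing) reduce to the same test against the backends reported by `vidai.compat`. Below is a minimal sketch of that check, assuming `get_available_backends()` returns a flat list of backend names as the diff implies; the `has_gpu_backend` helper is hypothetical and only illustrates the shared predicate.

```python
# Minimal sketch of the GPU gate used by client connect, master auth and the
# local node listing. Assumes vidai.compat.get_available_backends() returns a
# list of backend names (e.g. ['cuda', 'cpu']); has_gpu_backend is a
# hypothetical helper, not part of the diff.
from vidai.compat import get_available_backends

GPU_BACKENDS = ('cuda', 'rocm')

def has_gpu_backend() -> bool:
    """Return True only if at least one CUDA or ROCm backend is usable."""
    return any(b in GPU_BACKENDS for b in get_available_backends())
```

Keeping the allowed set in one helper like this would also stop the three call sites from drifting apart, though the diff repeats the `['cuda', 'rocm']` filter inline at each site.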
@@ -54,6 +54,17 @@ class ClusterClient:
    async def connect(self) -> bool:
        """Connect to cluster master via secure websocket."""
        try:
+            # Detect available backends first
+            from .compat import detect_gpu_backends, get_available_backends
+            gpu_info = detect_gpu_backends()
+            available_backends = get_available_backends()
+
+            # Check if we have any GPU backends available
+            gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
+            if not gpu_backends:
+                print("No GPU backends detected (CUDA/ROCm). Cluster client requires GPU capabilities to connect.")
+                return False
+
            # Create SSL context that accepts self-signed certificates
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
@@ -62,11 +73,6 @@ class ClusterClient:
            uri = f"wss://{self.host}:{self.port}/cluster"
            self.websocket = await websockets.connect(uri, ssl=ssl_context)

-            # Detect available backends
-            from .compat import detect_gpu_backends, get_available_backends
-            gpu_info = detect_gpu_backends()
-            available_backends = get_available_backends()
-
            # Get hostname
            import socket
            hostname = socket.gethostname()
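With this change the client bails out before any websocket is opened, so a CPU-only host fails fast instead of being rejected later by the master. A rough usage sketch follows; the module path `vidai.cluster_client` and the `host`/`port` constructor arguments are assumptions (only `self.host` and `self.port` are visible in the diff).

```python
# Hypothetical caller; module path and constructor signature are assumed,
# only self.host / self.port appear in the diff above.
import asyncio

from vidai.cluster_client import ClusterClient

async def main() -> None:
    client = ClusterClient(host="master.example.com", port=8443)
    if not await client.connect():
        # On a host without CUDA/ROCm, connect() prints its "No GPU backends
        # detected" message and returns False without contacting the master.
        print("This machine cannot join the cluster.")

asyncio.run(main())
```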
@@ -186,7 +186,7 @@ class ClusterMaster:
        return {'type': 'error', 'message': 'Unknown message type'}

-    def _handle_auth(self, message: Dict[str, Any], client_sock: socket.socket) -> Dict[str, Any]:
+    def _handle_auth(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]:
        """Handle client authentication."""
        token = message.get('token')
        client_info = message.get('client_info', {})
@@ -194,6 +194,14 @@
        if not token:
            return {'type': 'auth_failed', 'message': 'No token provided'}

+        # Check if client has GPU capabilities
+        gpu_info = client_info.get('gpu_info', {})
+        available_backends = gpu_info.get('available_backends', [])
+        gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
+        if not gpu_backends:
+            return {'type': 'auth_failed', 'message': 'Client must have GPU capabilities (CUDA or ROCm) to join cluster'}
+
        # Generate client ID from token
        client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
@@ -202,7 +210,7 @@
            'token': token,
            'info': client_info,
            'weight': client_info.get('weight', 100),
-            'gpu_info': client_info.get('gpu_info', {}),
+            'gpu_info': gpu_info,
            'connected_at': time.time(),
            'last_seen': time.time()
        }
@@ -214,7 +222,7 @@
            self.weight = 0
            print("First client connected - changing master weight to 0 (automatic)")

-        print(f"Client {client_id} authenticated")
+        print(f"Client {client_id} authenticated with GPU backends: {gpu_backends}")
        return {'type': 'auth_success', 'client_id': client_id}

    def _handle_register_processes(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]:
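On the master side the same requirement is enforced at authentication time, using the `gpu_info` block the client sends inside `client_info`. The sketch below shows the payload shape `_handle_auth` now validates; the `'type': 'auth'` field and the concrete values are assumptions, while the nested keys are the ones the handler actually reads.

```python
# Example auth payload as read by _handle_auth; 'type': 'auth' and the concrete
# values are illustrative assumptions, the key names come from the diff.
auth_message = {
    'type': 'auth',
    'token': 'shared-cluster-token',
    'client_info': {
        'hostname': 'gpu-node-01',
        'weight': 100,
        'gpu_info': {
            'available_backends': ['cuda'],  # must include 'cuda' or 'rocm'
        },
    },
}

# A CPU-only client (e.g. available_backends == ['cpu']) is answered with:
# {'type': 'auth_failed',
#  'message': 'Client must have GPU capabilities (CUDA or ROCm) to join cluster'}
```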
@@ -486,61 +486,61 @@ def api_cluster_nodes():
        nodes.append(node_data)

    # Detect and aggregate local worker processes on master
-    local_workers = detect_local_workers()
-    if local_workers:
+    # Only show local node if GPU backends are available
+    from vidai.compat import get_available_backends
+    local_available_gpu_backends = [b for b in get_available_backends() if b in ['cuda', 'rocm']]
+    if local_available_gpu_backends:
+        local_workers = detect_local_workers()
+        if local_workers:
            # Group local workers by type
            local_analysis = [w for w in local_workers if w['type'] == 'analysis']
            local_training = [w for w in local_workers if w['type'] == 'training']

            # Calculate combined GPU info for local node
            total_gpus = sum(w.get('gpus', 0) for w in local_workers)
            all_gpu_memory = []
            seen_memory = set()
            for w in local_workers:
                for mem in w.get('gpu_memory', []):
                    if mem not in seen_memory:
                        all_gpu_memory.append(mem)
                        seen_memory.add(mem)
            total_memory = sum(w.get('total_memory', 0) for w in local_workers)

            # Worker summary for local node
            worker_types = []
            if local_analysis:
                backends = set(w['backend'] for w in local_analysis)
                worker_types.append(f"Analysis ({', '.join(backends)})")
            if local_training:
                backends = set(w['backend'] for w in local_training)
                worker_types.append(f"Training ({', '.join(backends)})")

            # Use the longest uptime as representative
            max_uptime = max((w.get('uptime_seconds', 0) for w in local_workers), default=0)

-            # Get available GPU backends for local system
-            from vidai.compat import get_available_backends
-            local_available_gpu_backends = [b for b in get_available_backends() if b in ['cuda', 'rocm']]
-
            local_node = {
                'token': 'local',
                'token_name': 'Local Master Node',
                'hostname': 'localhost',
                'gpus': total_gpus,
                'gpu_memory': all_gpu_memory,
                'total_memory': total_memory,
                'workers_available': len(local_workers),
                'worker_types': worker_types,
                'workers_summary': ', '.join(worker_types) if worker_types else 'No workers',
                'ip_address': '127.0.0.1',
                'connected': True,
                'last_seen': current_time,
                'uptime_seconds': max_uptime,
                'active_jobs': 0,  # Placeholder
                'completed_jobs': 0,  # Placeholder
                'weight': 0,  # Local workers don't participate in cluster load balancing
                'is_local': True,
                'mixed_gpu': len(local_available_gpu_backends) > 1,
                'available_gpu_backends': local_available_gpu_backends
            }
            nodes.append(local_node)

    # Sort: active first, then by last_seen desc
    nodes.sort(key=lambda x: (not x['connected'], -x['last_seen']))
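The practical effect is that the nodes list built by `api_cluster_nodes()` only contains the `local` entry when the master itself has a usable GPU backend. Below is roughly what that entry looks like on a single-GPU CUDA host; the keys mirror the diff, the values are made up, and the timestamp/counter fields are left out.

```python
# Illustrative local master entry from api_cluster_nodes() on a single-GPU CUDA
# host; keys mirror the diff, values are invented, timestamp/counter keys omitted.
local_node = {
    'token': 'local',
    'token_name': 'Local Master Node',
    'hostname': 'localhost',
    'gpus': 1,
    'gpu_memory': [24576],
    'total_memory': 24576,
    'workers_available': 2,
    'worker_types': ['Analysis (cuda)', 'Training (cuda)'],
    'workers_summary': 'Analysis (cuda), Training (cuda)',
    'ip_address': '127.0.0.1',
    'connected': True,
    'weight': 0,             # local workers stay out of cluster load balancing
    'is_local': True,
    'mixed_gpu': False,      # True only when both CUDA and ROCm are detected
    'available_gpu_backends': ['cuda'],
}
# On a master without CUDA or ROCm this entry is simply absent from the list.
```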