Allow CPU-only cluster clients and flexible backend support

- Removed GPU-only requirement for cluster client connections
- CPU-only clients can now join cluster and run CPU-based workers
- Master accepts all clients regardless of GPU availability
- Nodes are properly marked as CPU-only when no GPUs detected
- Driver selection modal supports CUDA, ROCm, and CPU backends
- Local and remote workers can use any available backend (GPU or CPU)
- Enhanced cluster flexibility for mixed hardware environments
- CPU nodes contribute to cluster for CPU-only processing tasks
- Maintains backward compatibility with existing GPU-only workflows
- Clear node type identification in cluster management interface
parent f57a1468
...@@ -214,7 +214,7 @@ function renderNodesTable() { ...@@ -214,7 +214,7 @@ function renderNodesTable() {
} }
function openDriverModal(hostname, token, hostnameValue) { function openDriverModal(hostname, token, hostnameValue) {
// Find the node data to get available GPU backends // Find the node data to get available backends
const node = nodesData.find(n => n.hostname === hostname && n.token === token); const node = nodesData.find(n => n.hostname === hostname && n.token === token);
if (!node) { if (!node) {
console.error('Node not found:', hostname, token); console.error('Node not found:', hostname, token);
...@@ -225,14 +225,18 @@ function openDriverModal(hostname, token, hostnameValue) { ...@@ -225,14 +225,18 @@ function openDriverModal(hostname, token, hostnameValue) {
document.getElementById('modalHostnameInput').value = hostnameValue; document.getElementById('modalHostnameInput').value = hostnameValue;
document.getElementById('modalTokenInput').value = token; document.getElementById('modalTokenInput').value = token;
// Populate driver options based on available GPU backends // Populate driver options based on available backends (GPU and CPU)
const driverSelect = document.getElementById('driverSelect'); const driverSelect = document.getElementById('driverSelect');
driverSelect.innerHTML = ''; driverSelect.innerHTML = '';
const availableBackends = node.available_gpu_backends || []; const availableBackends = node.available_backends || [];
if (availableBackends.length === 0) { if (availableBackends.length === 0) {
// Fallback for nodes without backend info // Fallback for nodes without backend info
availableBackends.push('cuda', 'rocm'); if (node.is_cpu_only) {
availableBackends.push('cpu');
} else {
availableBackends.push('cuda', 'rocm');
}
} }
availableBackends.forEach(backend => { availableBackends.forEach(backend => {
......
...@@ -54,17 +54,6 @@ class ClusterClient: ...@@ -54,17 +54,6 @@ class ClusterClient:
async def connect(self) -> bool: async def connect(self) -> bool:
"""Connect to cluster master via secure websocket.""" """Connect to cluster master via secure websocket."""
try: try:
# Detect available backends first
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Check if we have any GPU backends available
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
if not gpu_backends:
print("No GPU backends detected (CUDA/ROCm). Cluster client requires GPU capabilities to connect.")
return False
# Create SSL context that accepts self-signed certificates # Create SSL context that accepts self-signed certificates
ssl_context = ssl.create_default_context() ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False ssl_context.check_hostname = False
...@@ -73,6 +62,11 @@ class ClusterClient: ...@@ -73,6 +62,11 @@ class ClusterClient:
uri = f"wss://{self.host}:{self.port}/cluster" uri = f"wss://{self.host}:{self.port}/cluster"
self.websocket = await websockets.connect(uri, ssl=ssl_context) self.websocket = await websockets.connect(uri, ssl=ssl_context)
# Detect available backends
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Get hostname # Get hostname
import socket import socket
hostname = socket.gethostname() hostname = socket.gethostname()
...@@ -139,16 +133,16 @@ class ClusterClient: ...@@ -139,16 +133,16 @@ class ClusterClient:
self.connected = False self.connected = False
async def start_local_processes(self) -> None: async def start_local_processes(self) -> None:
"""Start local worker processes based on available GPU backends.""" """Start local worker processes based on available backends (GPU and CPU)."""
from .compat import detect_gpu_backends, get_available_backends from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends() gpu_info = detect_gpu_backends()
available_backends = get_available_backends() available_backends = get_available_backends()
print(f"Client GPU detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}") print(f"Client backend detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}")
print(f"Available backends: {available_backends}") print(f"Available backends: {available_backends}")
# Start analysis workers for available backends # Start analysis workers for available backends (including CPU)
for backend in available_backends: for backend in available_backends:
proc_name = f'analysis_{backend}' proc_name = f'analysis_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_analysis', backend] cmd = [sys.executable, '-m', 'vidai.worker_analysis', backend]
...@@ -161,7 +155,7 @@ class ClusterClient: ...@@ -161,7 +155,7 @@ class ClusterClient:
self.process_models[proc_name] = 'Qwen/Qwen2.5-VL-7B-Instruct' self.process_models[proc_name] = 'Qwen/Qwen2.5-VL-7B-Instruct'
print(f"Started analysis worker for {backend}") print(f"Started analysis worker for {backend}")
# Start training workers for available backends # Start training workers for available backends (including CPU)
for backend in available_backends: for backend in available_backends:
proc_name = f'training_{backend}' proc_name = f'training_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_training', backend] cmd = [sys.executable, '-m', 'vidai.worker_training', backend]
......
...@@ -194,16 +194,14 @@ class ClusterMaster: ...@@ -194,16 +194,14 @@ class ClusterMaster:
if not token: if not token:
return {'type': 'auth_failed', 'message': 'No token provided'} return {'type': 'auth_failed', 'message': 'No token provided'}
# Check if client has GPU capabilities # Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
# Check GPU capabilities for logging
gpu_info = client_info.get('gpu_info', {}) gpu_info = client_info.get('gpu_info', {})
available_backends = gpu_info.get('available_backends', []) available_backends = gpu_info.get('available_backends', [])
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']] gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
has_gpu = len(gpu_backends) > 0
if not gpu_backends:
return {'type': 'auth_failed', 'message': 'Client must have GPU capabilities (CUDA or ROCm) to join cluster'}
# Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
# Store client info including GPU capabilities and weight # Store client info including GPU capabilities and weight
self.clients[client_id] = { self.clients[client_id] = {
...@@ -222,7 +220,8 @@ class ClusterMaster: ...@@ -222,7 +220,8 @@ class ClusterMaster:
self.weight = 0 self.weight = 0
print("First client connected - changing master weight to 0 (automatic)") print("First client connected - changing master weight to 0 (automatic)")
print(f"Client {client_id} authenticated with GPU backends: {gpu_backends}") backend_type = "GPU" if has_gpu else "CPU-only"
print(f"Client {client_id} authenticated ({backend_type}) with backends: {available_backends}")
return {'type': 'auth_success', 'client_id': client_id} return {'type': 'auth_success', 'client_id': client_id}
def _handle_register_processes(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]: def _handle_register_processes(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]:
...@@ -604,6 +603,7 @@ class ClusterMaster: ...@@ -604,6 +603,7 @@ class ClusterMaster:
return False return False
if backend not in ['cuda', 'rocm', 'cpu']: if backend not in ['cuda', 'rocm', 'cpu']:
print(f"Invalid backend requested: {backend} - only CUDA, ROCm, and CPU supported")
return False return False
# Send restart command to client # Send restart command to client
......
...@@ -1535,7 +1535,7 @@ def get_client_driver_preference(hostname: str, token: str) -> str: ...@@ -1535,7 +1535,7 @@ def get_client_driver_preference(hostname: str, token: str) -> str:
def set_client_driver_preference(hostname: str, token: str, driver: str) -> bool: def set_client_driver_preference(hostname: str, token: str, driver: str) -> bool:
"""Set the preferred driver for a client (hostname + token).""" """Set the preferred driver for a client (hostname + token)."""
if driver not in ['cuda', 'rocm']: if driver not in ['cuda', 'rocm', 'cpu']:
return False return False
conn = get_db_connection() conn = get_db_connection()
......
...@@ -419,18 +419,17 @@ def api_cluster_nodes(): ...@@ -419,18 +419,17 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time) connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at uptime_seconds = current_time - connected_at
# Detect mixed GPU availability and available backends # Detect GPU capabilities and available backends
gpu_info = client_info.get('gpu_info', {}) gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False) available_backends = gpu_info.get('available_backends', [])
has_rocm = gpu_info.get('rocm_available', False) gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
mixed_gpu = has_cuda and has_rocm cpu_backends = [b for b in available_backends if b == 'cpu']
# Determine available GPU backends for this node has_cuda = 'cuda' in gpu_backends
available_gpu_backends = [] has_rocm = 'rocm' in gpu_backends
if has_cuda: has_cpu = len(cpu_backends) > 0
available_gpu_backends.append('cuda') mixed_gpu = has_cuda and has_rocm
if has_rocm: is_cpu_only = not gpu_backends and has_cpu
available_gpu_backends.append('rocm')
node_map[node_key] = { node_map[node_key] = {
'token': token, 'token': token,
...@@ -448,7 +447,8 @@ def api_cluster_nodes(): ...@@ -448,7 +447,8 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100), 'weight': client_info.get('weight', 100),
'is_local': False, 'is_local': False,
'mixed_gpu': mixed_gpu, 'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends, 'is_cpu_only': is_cpu_only,
'available_backends': available_backends,
'workers': [] # Will collect worker details 'workers': [] # Will collect worker details
} }
...@@ -537,8 +537,9 @@ def api_cluster_nodes(): ...@@ -537,8 +537,9 @@ def api_cluster_nodes():
'completed_jobs': 0, # Placeholder 'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing 'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True, 'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1, 'mixed_gpu': len(local_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends 'is_cpu_only': not local_gpu_backends and local_cpu_backends,
'available_backends': local_available_backends
} }
nodes.append(local_node) nodes.append(local_node)
...@@ -640,8 +641,8 @@ def api_set_client_driver(): ...@@ -640,8 +641,8 @@ def api_set_client_driver():
if not hostname or not driver: if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400 return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm']: if driver not in ['cuda', 'rocm', 'cpu']:
return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400 return {'success': False, 'error': 'Invalid driver - only CUDA, ROCm, and CPU are supported'}, 400
# Handle local workers # Handle local workers
if token == 'local': if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment