Allow CPU-only cluster clients and flexible backend support

- Removed GPU-only requirement for cluster client connections
- CPU-only clients can now join cluster and run CPU-based workers
- Master accepts all clients regardless of GPU availability
- Nodes are properly marked as CPU-only when no GPUs detected
- Driver selection modal supports CUDA, ROCm, and CPU backends
- Local and remote workers can use any available backend (GPU or CPU)
- Enhanced cluster flexibility for mixed hardware environments
- CPU nodes contribute to cluster for CPU-only processing tasks
- Maintains backward compatibility with existing GPU-only workflows
- Clear node type identification in cluster management interface
parent f57a1468
......@@ -214,7 +214,7 @@ function renderNodesTable() {
}
function openDriverModal(hostname, token, hostnameValue) {
// Find the node data to get available GPU backends
// Find the node data to get available backends
const node = nodesData.find(n => n.hostname === hostname && n.token === token);
if (!node) {
console.error('Node not found:', hostname, token);
......@@ -225,15 +225,19 @@ function openDriverModal(hostname, token, hostnameValue) {
document.getElementById('modalHostnameInput').value = hostnameValue;
document.getElementById('modalTokenInput').value = token;
// Populate driver options based on available GPU backends
// Populate driver options based on available backends (GPU and CPU)
const driverSelect = document.getElementById('driverSelect');
driverSelect.innerHTML = '';
const availableBackends = node.available_gpu_backends || [];
const availableBackends = node.available_backends || [];
if (availableBackends.length === 0) {
// Fallback for nodes without backend info
if (node.is_cpu_only) {
availableBackends.push('cpu');
} else {
availableBackends.push('cuda', 'rocm');
}
}
availableBackends.forEach(backend => {
const option = document.createElement('option');
......
......@@ -54,17 +54,6 @@ class ClusterClient:
async def connect(self) -> bool:
"""Connect to cluster master via secure websocket."""
try:
# Detect available backends first
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Check if we have any GPU backends available
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
if not gpu_backends:
print("No GPU backends detected (CUDA/ROCm). Cluster client requires GPU capabilities to connect.")
return False
# Create SSL context that accepts self-signed certificates
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
......@@ -73,6 +62,11 @@ class ClusterClient:
uri = f"wss://{self.host}:{self.port}/cluster"
self.websocket = await websockets.connect(uri, ssl=ssl_context)
# Detect available backends
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Get hostname
import socket
hostname = socket.gethostname()
......@@ -139,16 +133,16 @@ class ClusterClient:
self.connected = False
async def start_local_processes(self) -> None:
"""Start local worker processes based on available GPU backends."""
"""Start local worker processes based on available backends (GPU and CPU)."""
from .compat import detect_gpu_backends, get_available_backends
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
print(f"Client GPU detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}")
print(f"Client backend detection: CUDA={gpu_info['cuda']}, ROCm={gpu_info['rocm']}")
print(f"Available backends: {available_backends}")
# Start analysis workers for available backends
# Start analysis workers for available backends (including CPU)
for backend in available_backends:
proc_name = f'analysis_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_analysis', backend]
......@@ -161,7 +155,7 @@ class ClusterClient:
self.process_models[proc_name] = 'Qwen/Qwen2.5-VL-7B-Instruct'
print(f"Started analysis worker for {backend}")
# Start training workers for available backends
# Start training workers for available backends (including CPU)
for backend in available_backends:
proc_name = f'training_{backend}'
cmd = [sys.executable, '-m', 'vidai.worker_training', backend]
......
......@@ -194,16 +194,14 @@ class ClusterMaster:
if not token:
return {'type': 'auth_failed', 'message': 'No token provided'}
# Check if client has GPU capabilities
# Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
# Check GPU capabilities for logging
gpu_info = client_info.get('gpu_info', {})
available_backends = gpu_info.get('available_backends', [])
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
if not gpu_backends:
return {'type': 'auth_failed', 'message': 'Client must have GPU capabilities (CUDA or ROCm) to join cluster'}
# Generate client ID from token
client_id = hashlib.sha256(token.encode()).hexdigest()[:16]
has_gpu = len(gpu_backends) > 0
# Store client info including GPU capabilities and weight
self.clients[client_id] = {
......@@ -222,7 +220,8 @@ class ClusterMaster:
self.weight = 0
print("First client connected - changing master weight to 0 (automatic)")
print(f"Client {client_id} authenticated with GPU backends: {gpu_backends}")
backend_type = "GPU" if has_gpu else "CPU-only"
print(f"Client {client_id} authenticated ({backend_type}) with backends: {available_backends}")
return {'type': 'auth_success', 'client_id': client_id}
def _handle_register_processes(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Dict[str, Any]:
......@@ -604,6 +603,7 @@ class ClusterMaster:
return False
if backend not in ['cuda', 'rocm', 'cpu']:
print(f"Invalid backend requested: {backend} - only CUDA, ROCm, and CPU supported")
return False
# Send restart command to client
......
......@@ -1535,7 +1535,7 @@ def get_client_driver_preference(hostname: str, token: str) -> str:
def set_client_driver_preference(hostname: str, token: str, driver: str) -> bool:
"""Set the preferred driver for a client (hostname + token)."""
if driver not in ['cuda', 'rocm']:
if driver not in ['cuda', 'rocm', 'cpu']:
return False
conn = get_db_connection()
......
......@@ -419,18 +419,17 @@ def api_cluster_nodes():
connected_at = client_info.get('connected_at', current_time)
uptime_seconds = current_time - connected_at
# Detect mixed GPU availability and available backends
# Detect GPU capabilities and available backends
gpu_info = client_info.get('gpu_info', {})
has_cuda = gpu_info.get('cuda_available', False)
has_rocm = gpu_info.get('rocm_available', False)
mixed_gpu = has_cuda and has_rocm
available_backends = gpu_info.get('available_backends', [])
gpu_backends = [b for b in available_backends if b in ['cuda', 'rocm']]
cpu_backends = [b for b in available_backends if b == 'cpu']
# Determine available GPU backends for this node
available_gpu_backends = []
if has_cuda:
available_gpu_backends.append('cuda')
if has_rocm:
available_gpu_backends.append('rocm')
has_cuda = 'cuda' in gpu_backends
has_rocm = 'rocm' in gpu_backends
has_cpu = len(cpu_backends) > 0
mixed_gpu = has_cuda and has_rocm
is_cpu_only = not gpu_backends and has_cpu
node_map[node_key] = {
'token': token,
......@@ -448,7 +447,8 @@ def api_cluster_nodes():
'weight': client_info.get('weight', 100),
'is_local': False,
'mixed_gpu': mixed_gpu,
'available_gpu_backends': available_gpu_backends,
'is_cpu_only': is_cpu_only,
'available_backends': available_backends,
'workers': [] # Will collect worker details
}
......@@ -537,8 +537,9 @@ def api_cluster_nodes():
'completed_jobs': 0, # Placeholder
'weight': 0, # Local workers don't participate in cluster load balancing
'is_local': True,
'mixed_gpu': len(local_available_gpu_backends) > 1,
'available_gpu_backends': local_available_gpu_backends
'mixed_gpu': len(local_gpu_backends) > 1,
'is_cpu_only': not local_gpu_backends and local_cpu_backends,
'available_backends': local_available_backends
}
nodes.append(local_node)
......@@ -640,8 +641,8 @@ def api_set_client_driver():
if not hostname or not driver:
return {'success': False, 'error': 'Missing required parameters'}, 400
if driver not in ['cuda', 'rocm']:
return {'success': False, 'error': 'Invalid driver - only CUDA and ROCm are supported'}, 400
if driver not in ['cuda', 'rocm', 'cpu']:
return {'success': False, 'error': 'Invalid driver - only CUDA, ROCm, and CPU are supported'}, 400
# Handle local workers
if token == 'local':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment