Fix job execution in cluster: implement proper job assignment and result handling

- Fixed process type mapping in queue manager ('analyze' -> 'analysis', 'train' -> 'training')
- Implemented actual job sending in cluster master assign_job_to_worker()
- Modified cluster client to forward jobs to local backend and monitor results
- Added result polling mechanism for cluster jobs
- Jobs should now execute on connected cluster workers instead of remaining queued

The issue was that jobs were being assigned but never sent to workers. Now:
1. Queue manager selects worker using VRAM-aware logic
2. Cluster master assigns job and sends it via websocket
3. Cluster client receives job and forwards to local backend
4. Cluster client polls backend for results and sends back to master
5. Results are properly returned to web interface
parent 0de46dee
......@@ -49,6 +49,7 @@ class ClusterClient:
self.local_processes = {} # type: Dict[str, subprocess.Popen]
self.process_weights = {} # type: Dict[str, int]
self.process_models = {} # type: Dict[str, str]
self.pending_jobs = {} # type: Dict[str, asyncio.Task] # job_id -> result monitoring task
self.loop = None
async def connect(self) -> bool:
......@@ -292,57 +293,102 @@ class ClusterClient:
async def _handle_job_assignment(self, message: Dict[str, Any]) -> None:
    """Handle a job assignment pushed from the cluster master.

    Forwards the job payload unchanged to the local backend over its unix
    socket (the backend routes it to the appropriate worker), then starts a
    background task that polls the backend for the result.  If forwarding
    fails, the master is immediately notified with a failed ``job_result``.

    Args:
        message: Master payload with ``job_id``, ``process_type``
            ('analysis' or 'training' — defaults to 'analysis'), and
            ``job_data`` (opaque dict passed through to the backend).
    """
    job_id = message.get('job_id')
    process_type = message.get('process_type', 'analysis')
    job_data = message.get('job_data', {})

    print(f"Received job assignment: {job_id} for process type: {process_type}")

    # Forward job to local backend, which will route it to the appropriate worker
    try:
        # Local imports keep cluster-client startup free of backend deps.
        from .comm import SocketCommunicator
        from .compat import get_socket_path

        # Connect to local backend over its unix socket.
        backend_comm = SocketCommunicator(socket_path=get_socket_path('worker'), comm_type='unix')
        backend_comm.connect()

        # msg_type encodes the request kind the backend dispatches on
        # (e.g. 'analysis_request' / 'training_request'); msg_id doubles as
        # the job id so results can be correlated when polling.
        job_message = {
            'msg_type': f'{process_type}_request',
            'msg_id': job_id,
            'data': job_data
        }
        backend_comm.send_message(job_message)
        print(f"Job {job_id} forwarded to local backend for {process_type} processing")

        # Start monitoring for the result; the task removes itself from
        # pending_jobs when it finishes.
        self.pending_jobs[job_id] = asyncio.create_task(self._monitor_job_result(job_id))
    except Exception as e:
        # Forwarding failed — tell the master so the job is not stuck assigned.
        print(f"Failed to forward job {job_id} to local backend: {e}")
        await self._send_message({
            'type': 'job_result',
            'job_id': job_id,
            'result': {'status': 'failed', 'error': f'Failed to forward to worker: {str(e)}'}
        })
async def _monitor_job_result(self, job_id: str) -> None:
    """Poll the local backend for the result of *job_id* and relay it to the master.

    Polls roughly once per second for up to ~5 minutes.  Sends a ``job_result``
    message to the master on success or on timeout, and always removes the job
    from ``self.pending_jobs`` on exit.
    """
    try:
        from .comm import SocketCommunicator
        from .compat import get_socket_path

        for _ in range(300):  # ~5 minutes at one poll per second
            try:
                # NOTE(review): a fresh communicator is created on every poll
                # and never explicitly closed — confirm SocketCommunicator
                # releases the socket on GC, or add an explicit close here.
                backend_comm = SocketCommunicator(socket_path=get_socket_path('worker'), comm_type='unix')
                backend_comm.connect()

                # Ask the backend for the result of this request id.
                backend_comm.send_message({
                    'msg_type': 'get_result',
                    'msg_id': f'poll_{job_id}',
                    'data': {'request_id': job_id}
                })

                response = backend_comm.receive_message()
                if response and response.get('msg_type') in ('analyze_response', 'train_response'):
                    result_data = response.get('data', {})
                    print(f"Received result for job {job_id}")
                    # Relay the result to the cluster master; bookkeeping
                    # cleanup happens in the finally block below.
                    await self._send_message({
                        'type': 'job_result',
                        'job_id': job_id,
                        'result': result_data
                    })
                    return
            except Exception as e:
                print(f"Error polling for job {job_id} result: {e}")

            # Result not ready (pending, empty, unrecognized, or poll error) —
            # wait before retrying.  Sleeping on every not-done path keeps the
            # 300-iteration cap at ~5 minutes; the original busy-spun with no
            # delay when the backend returned None or an unrecognized reply.
            await asyncio.sleep(1)

        # Exhausted all polls — report a timeout to the master.
        print(f"Job {job_id} timed out waiting for result")
        await self._send_message({
            'type': 'job_result',
            'job_id': job_id,
            'result': {'status': 'failed', 'error': 'Job timed out'}
        })
    except Exception as e:
        print(f"Error monitoring job {job_id}: {e}")
    finally:
        # Single cleanup point for every exit path (result, timeout, error).
        self.pending_jobs.pop(job_id, None)
async def _handle_receive_file(self, message: Dict[str, Any]) -> None:
"""Handle receiving a file from master."""
filename = message.get('filename')
......
......@@ -454,11 +454,34 @@ class ClusterMaster:
if media_path and client_id in self.client_websockets:
self._transfer_job_files(client_id, job_data, job_id)
# Send job assignment (simplified for now - would need async handling in real implementation)
# Send job assignment
if client_id in self.client_websockets:
# For synchronous version, we'll skip the websocket send for now
# In a real implementation, this would need to be handled asynchronously
pass
try:
# Create the job assignment message
job_message = {
'type': 'job_assignment',
'job_id': job_id,
'process_type': self.processes[worker_key]['name'].split('_')[0], # 'analysis' or 'training'
'job_data': job_data
}
# Send via websocket (async)
asyncio.create_task(self.client_websockets[client_id].send(json.dumps(job_message)))
print(f"Job {job_id} assigned to worker {worker_key} on client {client_id}")
except Exception as e:
print(f"Failed to send job {job_id} to worker {worker_key}: {e}")
# Clean up the failed assignment
self.worker_jobs[worker_key].remove(job_id)
self.worker_vram_usage[worker_key] -= vram_required
del self.active_jobs[job_id]
return None
else:
print(f"Client {client_id} not connected, cannot assign job {job_id}")
# Clean up the failed assignment
self.worker_jobs[worker_key].remove(job_id)
self.worker_vram_usage[worker_key] -= vram_required
del self.active_jobs[job_id]
return None
return job_id
......
......@@ -127,8 +127,14 @@ class QueueManager:
"""Execute job using local workers or distributed cluster."""
from .cluster_master import cluster_master
# Determine process type
process_type = job['request_type'] # 'analyze' or 'train'
# Determine process type - map to cluster master naming convention
request_type = job['request_type']
if request_type == 'analyze':
process_type = 'analysis'
elif request_type == 'train':
process_type = 'training'
else:
process_type = request_type
model_path = job['data'].get('model_path', 'Qwen/Qwen2.5-VL-7B-Instruct')
# Use advanced job scheduling with VRAM requirements
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment