Change local job monitoring to poll database instead of TCP

- Use get_queue_by_job_id to check job status
- More reliable than TCP polling for local jobs

Change local job monitoring to poll database instead of TCP
- Use get_queue_by_job_id to check job status
- More reliable than TCP polling for local jobs
09a3589a · Stefy Lanza (nextime / spora ) · 08a9a99e · 09a3589a
Commit 09a3589a authored Oct 08, 2025 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 36 deletions

cluster_master.py vidai/cluster_master.py +42 -36

No files found.
--- a/vidai/cluster_master.py
+++ b/vidai/cluster_master.py
@@ -628,49 +628,55 @@ class ClusterMaster:
            return None
    async def _monitor_job_result(self, job_id: str, process_type: str) -> None:
-        """Monitor for job result from local backend."""
+        """Monitor for job result by polling database."""
        try:
-            from .comm import SocketCommunicator
+            from .database import get_queue_by_job_id
-            from .config import get_backend_web_port
+            import json
            # Poll for result
            for _ in range(300):  # Poll for up to 5 minutes (300 * 1s)
                try:
-                    backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
+                    queue_entry = get_queue_by_job_id(job_id)
-                    backend_comm.connect()
+                    if queue_entry:
+                        status = queue_entry['status']
-                    # Send get_result request
+                        if status == 'completed':
-                    result_request = Message(
+                            result = json.loads(queue_entry['result']) if queue_entry['result'] else {}
-                        msg_type='get_result',
+                            print(f"Received result for job {job_id}")
-                        msg_id=f'poll_{job_id}',
-                        data={'request_id': job_id}
+                            # Handle result
-                    )
+                            await self._handle_job_result({
-                    backend_comm.send_message(result_request)
+                                'job_id': job_id,
+                                'result': result
-                    # Try to receive response
+                            })
-                    response = backend_comm.receive_message()
-                    if response and response.msg_type in ['analyze_response', 'train_response']:
+                            # Clean up
-                        result_data = response.data
+                            if job_id in self.pending_jobs:
-                        print(f"Received result for job {job_id}")
+                                del self.pending_jobs[job_id]
+                            return
-                        # Handle result
-                        await self._handle_job_result({
+                        elif status == 'failed':
-                            'job_id': job_id,
+                            error = queue_entry.get('error', 'Unknown error')
-                            'result': result_data
+                            result = {'status': 'failed', 'error': error}
-                        })
+                            print(f"Job {job_id} failed: {error}")
-                        # Clean up
+                            # Handle failed result
-                        if job_id in self.pending_jobs:
+                            await self._handle_job_result({
-                            del self.pending_jobs[job_id]
+                                'job_id': job_id,
-                        return
+                                'result': result
+                            })
-                    elif response and response.msg_type == 'result_pending':
-                        # Result not ready yet, wait and try again
+                            # Clean up
-                        await asyncio.sleep(1)
+                            if job_id in self.pending_jobs:
-                        continue
+                                del self.pending_jobs[job_id]
+                            return
+                        elif status == 'processing':
+                            # Still processing, wait and try again
+                            await asyncio.sleep(1)
+                            continue
                except Exception as e:
-                    print(f"Error polling for job {job_id} result: {e}")
+                    print(f"Error polling for job {job_id} status: {e}")
                    await asyncio.sleep(1)
            # Timeout - job took too long