Add debug prints to worker and revert monitoring to TCP

- Worker now prints when receiving jobs and sending results
- Cluster master uses TCP polling for consistency with clients

Add debug prints to worker and revert monitoring to TCP
- Worker now prints when receiving jobs and sending results
- Cluster master uses TCP polling for consistency with clients
82f5cbfe · Stefy Lanza (nextime / spora ) · 09a3589a · 82f5cbfe · 82f5cbfe
Commit 82f5cbfe authored Oct 08, 2025 by Stefy Lanza (nextime / spora )
Show whitespace changes
Inline Side-by-side

Showing with 41 additions and 42 deletions

cluster_master.py vidai/cluster_master.py +36 -42

worker_analysis.py vidai/worker_analysis.py +5 -0

No files found.
--- a/vidai/cluster_master.py
+++ b/vidai/cluster_master.py
@@ -628,41 +628,35 @@ class ClusterMaster:
            return None

    async def _monitor_job_result(self, job_id: str, process_type: str) -> None:
-        """Monitor for job result by polling database."""
+        """Monitor for job result from local backend."""
        try:
-            from .database import get_queue_by_job_id
-            import json
+            from .comm import SocketCommunicator
+            from .config import get_backend_web_port

            # Poll for result
            for _ in range(300):  # Poll for up to 5 minutes (300 * 1s)
                try:
-                    queue_entry = get_queue_by_job_id(job_id)
-                    if queue_entry:
-                        status = queue_entry['status']
-                        if status == 'completed':
-                            result = json.loads(queue_entry['result']) if queue_entry['result'] else {}
-                            print(f"Received result for job {job_id}")
-
-                            # Handle result
-                            await self._handle_job_result({
-                                'job_id': job_id,
-                                'result': result
-                            })
+                    backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
+                    backend_comm.connect()

-                            # Clean up
-                            if job_id in self.pending_jobs:
-                                del self.pending_jobs[job_id]
-                            return
+                    # Send get_result request
+                    result_request = Message(
+                        msg_type='get_result',
+                        msg_id=f'poll_{job_id}',
+                        data={'request_id': job_id}
+                    )
+                    backend_comm.send_message(result_request)

-                        elif status == 'failed':
-                            error = queue_entry.get('error', 'Unknown error')
-                            result = {'status': 'failed', 'error': error}
-                            print(f"Job {job_id} failed: {error}")
+                    # Try to receive response
+                    response = backend_comm.receive_message()
+                    if response and response.msg_type in ['analyze_response', 'train_response']:
+                        result_data = response.data
+                        print(f"Received result for job {job_id}")

-                            # Handle failed result
+                        # Handle result
                        await self._handle_job_result({
                            'job_id': job_id,
-                                'result': result
+                            'result': result_data
                        })

                        # Clean up
@@ -670,13 +664,13 @@ class ClusterMaster:
                            del self.pending_jobs[job_id]
                        return

-                        elif status == 'processing':
-                            # Still processing, wait and try again
+                    elif response and response.msg_type == 'result_pending':
+                        # Result not ready yet, wait and try again
                        await asyncio.sleep(1)
                        continue

                except Exception as e:
-                    print(f"Error polling for job {job_id} status: {e}")
+                    print(f"Error polling for job {job_id} result: {e}")
                    await asyncio.sleep(1)

            # Timeout - job took too long

--- a/vidai/worker_analysis.py
+++ b/vidai/worker_analysis.py
@@ -250,22 +250,27 @@ def worker_process(backend_type: str):
        try:
            message = comm.receive_message()
            if message and message.msg_type == 'analyze_request':
+                print(f"DEBUG: Worker received analyze_request: {message.msg_id}")
                data = message.data
                media_path = data.get('local_path', data.get('file_name', ''))
                if not media_path:
                    result = 'No media path provided'
+                    print(f"DEBUG: No media path provided for job {message.msg_id}")
                else:
                    prompt = data.get('prompt', 'Describe this image.')
                    model_path = data.get('model_path', 'Qwen/Qwen2.5-VL-7B-Instruct')
                    interval = data.get('interval', 10)
                    job_id = data.get('job_id')  # Extract job_id for cancellation checking
+                    print(f"DEBUG: Starting analysis of {media_path} with model {model_path} for job {message.msg_id}")
                    result = analyze_media(media_path, prompt, model_path, interval, job_id)
+                    print(f"DEBUG: Analysis completed for job {message.msg_id}")

                    # Release model reference (don't unload yet, per requirements)
                    release_model(model_path)

                # Send result back
                response = Message('analyze_response', message.msg_id, {'result': result})
+                print(f"DEBUG: Sending analyze_response for job {message.msg_id}")
                comm.send_message(response)

                # If in cluster mode, also notify cluster master