Fix job timeout in cluster master by extending timeout when progress is detected

d3e3fb72 · Stefy Lanza (nextime / spora ) · b305223e · d3e3fb72
Commit d3e3fb72 authored Oct 09, 2025 by Stefy Lanza (nextime / spora )
Show whitespace changes
Inline Side-by-side

Showing with 17 additions and 3 deletions

cluster_master.py vidai/cluster_master.py +17 -3

No files found.
--- a/vidai/cluster_master.py
+++ b/vidai/cluster_master.py
@@ -719,8 +719,13 @@ class ClusterMaster:
            from .comm import SocketCommunicator
            from .config import get_backend_web_port

+            # Progress tracking for timeout extension
+            has_progress = False
+            last_progress_time = time.time()
+            max_iterations = 91  # Base timeout: 91 seconds
+
            # Poll for result
-            for _ in range(91):  # Poll for up to 91 seconds (91 * 1s)
+            for _ in range(max_iterations):
                try:
                    backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
                    backend_comm.connect()
@@ -777,6 +782,15 @@ class ClusterMaster:
                            return

                    elif response and response.msg_type == 'result_pending':
+                        # Check if we have recent progress to extend timeout
+                        from .web import get_progress
+                        progress_response = get_progress(job_id)
+                        if progress_response and 'progress' in progress_response:
+                            has_progress = True
+                            last_progress_time = time.time()
+                            # Extend timeout when progress is active
+                            max_iterations = min(max_iterations + 50, 500)  # Add up to 5 more seconds per progress check
+
                        # Result not ready yet, wait and try again
                        await asyncio.sleep(1)
                        continue
@@ -786,10 +800,10 @@ class ClusterMaster:
                    await asyncio.sleep(1)

            # Timeout - job took too long
-            print(f"Job {job_id} timed out waiting for result (91 seconds)")
+            print(f"Job {job_id} timed out waiting for result ({max_iterations} seconds)")
            await self._handle_job_result({
                'job_id': job_id,
-                'result': {'status': 'failed', 'error': 'Job timed out after 91 seconds'}
+                'result': {'status': 'failed', 'error': f'Job timed out after {max_iterations} seconds'}
            })

        except Exception as e: