Fix job timeout in cluster master by extending timeout when progress is detected

parent b305223e
...@@ -719,8 +719,13 @@ class ClusterMaster: ...@@ -719,8 +719,13 @@ class ClusterMaster:
from .comm import SocketCommunicator from .comm import SocketCommunicator
from .config import get_backend_web_port from .config import get_backend_web_port
# Progress tracking for timeout extension
has_progress = False
last_progress_time = time.time()
max_iterations = 91 # Base timeout: 91 seconds
# Poll for result # Poll for result
for _ in range(91): # Poll for up to 91 seconds (91 * 1s) for _ in range(max_iterations):
try: try:
backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp') backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
backend_comm.connect() backend_comm.connect()
...@@ -777,6 +782,15 @@ class ClusterMaster: ...@@ -777,6 +782,15 @@ class ClusterMaster:
return return
elif response and response.msg_type == 'result_pending': elif response and response.msg_type == 'result_pending':
# Check if we have recent progress to extend timeout
from .web import get_progress
progress_response = get_progress(job_id)
if progress_response and 'progress' in progress_response:
has_progress = True
last_progress_time = time.time()
# Extend timeout when progress is active
max_iterations = min(max_iterations + 50, 500) # Add up to 5 more seconds per progress check
# Result not ready yet, wait and try again # Result not ready yet, wait and try again
await asyncio.sleep(1) await asyncio.sleep(1)
continue continue
...@@ -786,10 +800,10 @@ class ClusterMaster: ...@@ -786,10 +800,10 @@ class ClusterMaster:
await asyncio.sleep(1) await asyncio.sleep(1)
# Timeout - job took too long # Timeout - job took too long
print(f"Job {job_id} timed out waiting for result (91 seconds)") print(f"Job {job_id} timed out waiting for result ({max_iterations} seconds)")
await self._handle_job_result({ await self._handle_job_result({
'job_id': job_id, 'job_id': job_id,
'result': {'status': 'failed', 'error': 'Job timed out after 91 seconds'} 'result': {'status': 'failed', 'error': f'Job timed out after {max_iterations} seconds'}
}) })
except Exception as e: except Exception as e:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment