Fix job timeout in cluster master by extending timeout when progress is detected

parent b305223e
......@@ -719,8 +719,13 @@ class ClusterMaster:
from .comm import SocketCommunicator
from .config import get_backend_web_port
# Progress tracking for timeout extension
has_progress = False
last_progress_time = time.time()
max_iterations = 91 # Base timeout: 91 seconds
# Poll for result
for _ in range(91): # Poll for up to 91 seconds (91 * 1s)
for _ in range(max_iterations):
try:
backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
backend_comm.connect()
......@@ -777,6 +782,15 @@ class ClusterMaster:
return
elif response and response.msg_type == 'result_pending':
# Check if we have recent progress to extend timeout
from .web import get_progress
progress_response = get_progress(job_id)
if progress_response and 'progress' in progress_response:
has_progress = True
last_progress_time = time.time()
# Extend timeout when progress is active
max_iterations = min(max_iterations + 50, 500) # Add up to 5 more seconds per progress check
# Result not ready yet, wait and try again
await asyncio.sleep(1)
continue
......@@ -786,10 +800,10 @@ class ClusterMaster:
await asyncio.sleep(1)
# Timeout - job took too long
print(f"Job {job_id} timed out waiting for result (91 seconds)")
print(f"Job {job_id} timed out waiting for result ({max_iterations} seconds)")
await self._handle_job_result({
'job_id': job_id,
'result': {'status': 'failed', 'error': 'Job timed out after 91 seconds'}
'result': {'status': 'failed', 'error': f'Job timed out after {max_iterations} seconds'}
})
except Exception as e:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment