Update _monitor_job_result to use a time-based timeout that is extended on ping/progress activity (capped at one hour)

parent bd927991
......@@ -769,12 +769,12 @@ class ClusterMaster:
from .config import get_backend_web_port
# Progress tracking for timeout extension
has_progress = False
last_progress_time = time.time()
max_iterations = 91 # Base timeout: 91 seconds
start_time = time.time()
timeout = 91 # Base timeout: 91 seconds
last_progress_time = start_time
# Poll for result
for _ in range(max_iterations):
while time.time() - start_time < timeout:
try:
backend_comm = SocketCommunicator(host='localhost', port=get_backend_web_port(), comm_type='tcp')
backend_comm.connect()
......@@ -834,15 +834,15 @@ class ClusterMaster:
# Check if we have recent progress to extend timeout
from .database import get_queue_by_job_id
job = get_queue_by_job_id(job_id)
print(f"DEBUG: job_id = {job_id}, job = {job}")
if job:
print(f"DEBUG: progress = {job.get('progress', 0)}")
if job and job.get('progress', 0) > 0:
print(f"DEBUG: extending timeout, has_progress = {has_progress}")
has_progress = True
last_progress_time = time.time()
# Extend timeout when progress is active
max_iterations = min(max_iterations + 50, 500) # Add up to 5 more seconds per progress check
timeout = min(timeout + 60, 3600) # Extend by 1 minute, max 1 hour
# Check if we have recent activity (progress or ping within 60 seconds)
if time.time() - last_progress_time < 60:
# Extend timeout
timeout = min(timeout + 60, 3600)
# Result not ready yet, wait and try again
await asyncio.sleep(1)
......@@ -853,10 +853,11 @@ class ClusterMaster:
await asyncio.sleep(1)
# Timeout - job took too long
print(f"Job {job_id} timed out waiting for result ({max_iterations} seconds)")
elapsed = time.time() - start_time
print(f"Job {job_id} timed out waiting for result ({elapsed:.0f} seconds)")
await self._handle_job_result({
'job_id': job_id,
'result': {'status': 'failed', 'error': f'Job timed out after {max_iterations} seconds'}
'result': {'status': 'failed', 'error': f'Job timed out after {elapsed:.0f} seconds'}
})
except Exception as e:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment