Add backend response checking in cluster master to re-queue jobs when workers are not available

parent d3e3fb72
......@@ -656,11 +656,24 @@ class ClusterMaster:
)
backend_comm.send_message(job_message)
print(f"Job {job_id} forwarded to local backend for {process_type} processing")
# Check response from backend
response = backend_comm.receive_message()
if response and response.msg_type == 'error':
error_msg = response.data.get('error', 'Unknown error')
print(f"Backend refused job {job_id}: {error_msg}")
# Re-queue the job
from .database import update_queue_status
update_queue_status(queue_id, 'queued', error=f'Backend refused job: {error_msg}')
# Clean up
self.worker_jobs[worker_key].remove(job_id)
self.worker_vram_usage[worker_key] -= vram_required
del self.active_jobs[job_id]
return None
else:
print(f"Job {job_id} forwarded to local backend for {process_type} processing")
# Start monitoring for result
self.pending_jobs[job_id] = asyncio.create_task(self._monitor_job_result(job_id, process_type))
return job_id
except Exception as e:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment