Add immediate job assignment notification when workers register

parent 28c25ebb
......@@ -144,6 +144,17 @@ def handle_worker_message(message: Message, client_sock) -> None:
if worker_type:
worker_sockets[worker_type] = client_sock
print(f"Worker {worker_type} registered")
# Notify cluster master that a worker has registered
try:
import socket
notification_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
notification_sock.connect(('localhost', 5004)) # Cluster master port + 1
notification_sock.sendall(b"worker_registered")
notification_sock.close()
print(f"Notified cluster master of worker {worker_type} registration")
except Exception as e:
print(f"Failed to notify cluster master: {e}")
elif message.msg_type == 'progress':
# Store progress update for web to poll and update progress timestamp
progress_key = f"progress_{message.data.get('job_id')}"
......
......@@ -146,6 +146,12 @@ class ClusterMaster:
start_server = websockets.serve(self._handle_client, self.host, self.port, ssl=ssl_context)
await start_server
# Start notification server for backend notifications
notification_server = await asyncio.start_server(
self._handle_notification, 'localhost', self.port + 1
)
print(f"Cluster master notification server started on port {self.port + 1}")
# Register local processes if master has weight
if self.weight > 0:
await self._register_local_processes()
......@@ -227,6 +233,21 @@ class ClusterMaster:
if client_id:
self._remove_client(client_id)
async def _handle_notification(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
    """Handle a one-shot notification connection from the backend.

    Reads at most 1024 bytes from the connection, decodes and strips them,
    and, when the payload is exactly ``worker_registered``, awaits
    ``self._check_pending_jobs()`` so queued jobs are considered right away.
    Any other payload (including an empty read) is ignored.

    The handler is best-effort: every failure is printed and swallowed so a
    bad notification never takes down the server task, and the connection is
    always closed before returning.

    Args:
        reader: Stream to read the notification payload from.
        writer: Stream for the same connection; only used to close it.
    """
    # Upper bound on how long we wait for the payload. Without a bound, a
    # peer that connects and never sends (or never closes) would keep this
    # handler task alive indefinitely.
    read_timeout_seconds = 5

    try:
        data = await asyncio.wait_for(reader.read(1024), timeout=read_timeout_seconds)
        message = data.decode().strip()
        if message == "worker_registered":
            print("Received worker registration notification - checking for pending jobs")
            # Trigger immediate job assignment check.
            # NOTE(review): the management loop also awaits
            # _check_pending_jobs(); confirm it tolerates overlapping runs.
            await self._check_pending_jobs()
    except asyncio.TimeoutError:
        # str(TimeoutError) is empty, so report the cause explicitly.
        print("Notification handling error: timed out waiting for notification data")
    except Exception as e:
        print(f"Notification handling error: {e}")
    finally:
        writer.close()
        try:
            await writer.wait_closed()
        except Exception as e:
            # wait_closed() can surface transport errors (e.g. a reset
            # connection); they must not escape the handler.
            print(f"Notification connection close error: {e}")
async def _process_message(self, message: Dict[str, Any], websocket: websockets.WebSocketServerProtocol) -> Optional[Dict[str, Any]]:
"""Process a message from a client."""
msg_type = message.get('type')
......@@ -1424,23 +1445,8 @@ class ClusterMaster:
self._remove_client(client_id)
return False
async def _management_loop(self) -> None:
"""Main management loop."""
while self.running:
try:
# Clean up dead clients
current_time = time.time()
dead_clients = []
for client_id, client_info in self.clients.items():
if client_info.get('local'):
continue # Don't clean up local clients
if current_time - client_info['last_seen'] > 60: # 1 minute timeout
dead_clients.append(client_id)
for client_id in dead_clients:
self._remove_client(client_id)
# Poll for jobs to assign
async def _check_pending_jobs(self) -> None:
"""Check for and assign pending jobs."""
from .database import get_db_connection
conn = get_db_connection()
cursor = conn.cursor()
......@@ -1490,6 +1496,25 @@ class ClusterMaster:
print(f"No suitable worker found for job {job['id']}, re-queuing")
update_queue_status(job['id'], 'queued', error='No suitable worker found, re-queued')
async def _management_loop(self) -> None:
"""Main management loop."""
while self.running:
try:
# Clean up dead clients
current_time = time.time()
dead_clients = []
for client_id, client_info in self.clients.items():
if client_info.get('local'):
continue # Don't clean up local clients
if current_time - client_info['last_seen'] > 60: # 1 minute timeout
dead_clients.append(client_id)
for client_id in dead_clients:
self._remove_client(client_id)
# Poll for jobs to assign
await self._check_pending_jobs()
# Check for cancelled jobs that need to be stopped
cancelled_jobs = []
for job_id, job_info in self.active_jobs.items():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment