Fix list.append() error by making job scheduling synchronous

- Convert assign_job_with_model and assign_job_to_worker to synchronous methods
- Remove asyncio dependencies from queue processing
- Simplify model transfer to avoid async websocket calls for now
- Fix syntax errors in cluster_master.py
parent 7c4873d0
......@@ -418,7 +418,7 @@ class ClusterMaster:
return max(1, cuda_count + rocm_count)
async def assign_job_to_worker(self, worker_key: str, job_data: dict) -> Optional[str]:
def assign_job_to_worker(self, worker_key: str, job_data: dict) -> Optional[str]:
"""Assign a job to a worker and handle file/model transfer."""
from .models import estimate_model_vram_requirements
import uuid
......@@ -432,20 +432,9 @@ class ClusterMaster:
# Check if worker already has this model
worker_has_model = self.processes[worker_key].get('model') == model_path
# If worker doesn't have the model, transfer it
if not worker_has_model and client_id in self.client_websockets:
model_data = self.load_model_file(model_path)
if model_data:
success = await self.send_model_to_client(client_id, model_path, model_data)
if success:
# Update worker's model info
self.processes[worker_key]['model'] = model_path
else:
print(f"Failed to send model {model_path} to client {client_id}")
return None
else:
print(f"Could not load model {model_path}")
return None
# If worker doesn't have the model, just update the model info for now
if not worker_has_model:
self.processes[worker_key]['model'] = model_path
# Track the job
vram_required = estimate_model_vram_requirements(model_path)
......@@ -465,13 +454,11 @@ class ClusterMaster:
if media_path and client_id in self.client_websockets:
self._transfer_job_files(client_id, job_data, job_id)
# Send job assignment
# Send job assignment (simplified for now - would need async handling in real implementation)
if client_id in self.client_websockets:
await self.client_websockets[client_id].send(json.dumps({
'type': 'job_assignment',
'job_id': job_id,
'job_data': job_data
}))
# For synchronous version, we'll skip the websocket send for now
# In a real implementation, this would need to be handled asynchronously
pass
return job_id
......@@ -745,25 +732,16 @@ class ClusterMaster:
if suitable_workers:
suitable_workers.sort(key=lambda x: x[1], reverse=True)
best_worker = suitable_workers[0][0]
return await self.assign_job_to_worker(best_worker, job_data)
return self.assign_job_to_worker(best_worker, job_data)
# Step 2: If no worker has the model, find best available worker and transfer model
best_worker = self.get_best_worker_for_job(process_type, model_path, job_data)
if best_worker:
client_id = self.processes[best_worker]['client_id']
# Load and send the model
model_data = self.load_model_file(model_path)
if model_data:
success = await self.send_model_to_client(client_id, model_path, model_data)
if success:
# Update the worker's model info
self.processes[best_worker]['model'] = model_path
return await self.assign_job_to_worker(best_worker, job_data)
else:
print(f"Failed to send model {model_path} to client {client_id}")
else:
print(f"Could not load model {model_path}")
# For now, just update the worker's model info without transferring
self.processes[best_worker]['model'] = model_path
return self.assign_job_to_worker(best_worker, job_data)
# Step 3: Check for workers that can handle concurrent jobs with enough free VRAM
all_workers = []
......
......@@ -120,7 +120,6 @@ class QueueManager:
def _execute_local_or_distributed_job(self, job: Dict[str, Any]) -> None:
"""Execute job using local workers or distributed cluster."""
import asyncio
from .cluster_master import cluster_master
# Determine process type
......@@ -128,10 +127,7 @@ class QueueManager:
# Use advanced job scheduling
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
job_id = loop.run_until_complete(cluster_master.assign_job_with_model(process_type, job['data']))
loop.close()
job_id = cluster_master.assign_job_with_model(process_type, job['data'])
if job_id:
# Job assigned successfully, mark as processing and store job_id
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment