Fix job re-queuing logic to prevent fallback to local processing

- Modified queue.py to allow retried jobs to use distributed processing when available
- Fixed async coroutine warning by adding await to _transfer_job_files call
- Jobs that fail on clients will now be properly re-queued for distributed processing instead of falling back to local workers that may not exist
parent d5d30329
...@@ -478,7 +478,7 @@ class ClusterMaster: ...@@ -478,7 +478,7 @@ class ClusterMaster:
# Handle file transfer # Handle file transfer
media_path = job_data.get('local_path') media_path = job_data.get('local_path')
if media_path and client_id in self.client_websockets: if media_path and client_id in self.client_websockets:
self._transfer_job_files(client_id, job_data, job_id) await self._transfer_job_files(client_id, job_data, job_id)
# Send job assignment # Send job assignment
if client_id in self.client_websockets: if client_id in self.client_websockets:
......
...@@ -210,20 +210,22 @@ class QueueManager: ...@@ -210,20 +210,22 @@ class QueueManager:
# Check if this job has failed before (has retry_count) # Check if this job has failed before (has retry_count)
retry_count = job.get('retry_count', 0) retry_count = job.get('retry_count', 0)
# For jobs that have failed before, prefer local processing to avoid # Check if distributed workers are available
# immediate re-assignment to the same failing distributed worker has_distributed = self._has_distributed_worker(process_type)
if retry_count == 0 and self._has_distributed_worker(process_type):
# Mark as processing, cluster master will assign it if has_distributed:
# Use distributed processing for all jobs when available
# The cluster master will handle retry logic and worker selection
from .database import update_queue_status from .database import update_queue_status
update_queue_status(job['id'], 'processing', {'status': 'Waiting for assignment'}) update_queue_status(job['id'], 'processing', {'status': 'Waiting for assignment'})
if retry_count > 0:
print(f"Job {job['id']} retry {retry_count}, marked for distributed processing")
else:
print(f"Job {job['id']} marked for distributed processing") print(f"Job {job['id']} marked for distributed processing")
return return
# Fall back to local processing (also used for retried jobs) # No distributed workers available, fall back to local processing
if retry_count > 0: print(f"No distributed workers available for job {job['id']}, falling back to local processing")
print(f"Job {job['id']} failed before (retry {retry_count}), using local processing")
else:
print(f"No suitable distributed worker available for job {job['id']}, falling back to local processing")
self._execute_local_job(job) self._execute_local_job(job)
def _send_to_distributed_worker(self, job: Dict[str, Any], worker_key: str) -> None: def _send_to_distributed_worker(self, job: Dict[str, Any], worker_key: str) -> None:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment