Add asynchronous ping mechanism to prevent job timeouts

- Implement BackgroundPing class that runs in a separate thread
- Send ping messages at a configurable interval during long operations
- Fix worker to use background ping during frame processing
- Ensure ping works even during blocking model inference operations
- Add proper cleanup of the ping thread in a finally block
parent c94f9425
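The BackgroundPing class named in the description is not visible in the hunks below, so the following is only a rough sketch of how such a helper might be structured: a daemon thread driven by a threading.Event, reusing the Message/comm.send_message pattern the worker already uses and the get_job_ping_interval() setting added in this commit. The import path for Message and all other names here are assumptions, not the actual implementation.

    import threading
    import time

    from .comm import Message  # assumed location; the worker hunk uses Message without showing its import
    from .config import get_job_ping_interval


    class BackgroundPing:
        """Send periodic ping messages from a daemon thread while a blocking operation runs."""

        def __init__(self, comm, job_id, interval=None):
            self.comm = comm
            self.job_id = job_id
            self.interval = interval or get_job_ping_interval()
            self._stop = threading.Event()
            self._thread = threading.Thread(target=self._run, daemon=True)

        def start(self):
            self._thread.start()

        def stop(self):
            # Signal the loop to exit and give the thread a moment to finish
            self._stop.set()
            self._thread.join(timeout=5)

        def _run(self):
            count = 0
            # Event.wait() returns False on timeout (keep pinging) and True once stop() is called
            while not self._stop.wait(self.interval):
                count += 1
                ping_msg = Message('ping', f'ping_{self.job_id}_{count}', {
                    'job_id': self.job_id,
                    'timestamp': time.time()
                })
                self.comm.send_message(ping_msg)

Wrapped around a blocking inference call, with the cleanup in a finally block that the description mentions:

    ping = BackgroundPing(comm, job_id)
    ping.start()
    try:
        desc, tokens = analyze_single_image(frame_path, full_prompt, model)
    finally:
        ping.stop()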
@@ -155,3 +155,6 @@ vidai-analysis-cuda
vidai-analysis-rocm
vidai-training-cuda
vidai-training-rocm
# Mobile app (separate repository)
mobileapp/
\ No newline at end of file
@@ -767,11 +767,11 @@ class ClusterMaster:
"""Monitor for job result from local backend."""
try:
from .comm import SocketCommunicator
from .config import get_backend_web_port
from .config import get_backend_web_port, get_job_timeout_base, get_job_timeout_max
# Progress tracking for timeout extension
start_time = time.time()
timeout = 91 # Base timeout: 91 seconds
timeout = get_job_timeout_base() # Configurable base timeout
last_progress_time = start_time
# Poll for result
@@ -843,7 +843,7 @@ class ClusterMaster:
# Check if we have recent activity (progress or ping within 60 seconds)
if time.time() - last_progress_time < 60:
# Extend timeout
timeout = min(timeout + 60, 3600)
timeout = min(timeout + 60, get_job_timeout_max())
# Result not ready yet, wait and try again
await asyncio.sleep(1)
@@ -856,6 +856,12 @@ class ClusterMaster:
# Timeout - job took too long
elapsed = time.time() - start_time
log_message(f"Job {job_id} timed out waiting for result ({elapsed:.0f} seconds)")
# Try to cancel the job on the worker before marking as failed
job_info = self.active_jobs.get(job_id)
if job_info:
await self._cancel_job_processing(job_id)
await self._handle_job_result({
'job_id': job_id,
'result': {'status': 'failed', 'error': f'Job timed out after {elapsed:.0f} seconds'}
......
@@ -382,7 +382,10 @@ def get_all_settings() -> dict:
'web_port': int(config.get('web_port', '5000')),
'backend_host': config.get('backend_host', 'localhost'),
'backend_web_port': int(config.get('backend_web_port', '5001')),
'backend_worker_port': int(config.get('backend_worker_port', '5002'))
'backend_worker_port': int(config.get('backend_worker_port', '5002')),
'job_timeout_base': int(config.get('job_timeout_base', '180')),
'job_timeout_max': int(config.get('job_timeout_max', '7200')),
'job_ping_interval': int(config.get('job_ping_interval', '30'))
}
@@ -575,3 +578,34 @@ def get_redis_password() -> str:
def set_redis_password(password: str) -> None:
"""Set Redis password."""
set_config('redis_password', password)
# Job timeout settings
def get_job_timeout_base() -> int:
"""Get base job timeout in seconds."""
return int(get_config('job_timeout_base', '180'))
def set_job_timeout_base(timeout: int) -> None:
"""Set base job timeout in seconds."""
set_config('job_timeout_base', str(timeout))
def get_job_timeout_max() -> int:
"""Get maximum job timeout in seconds."""
return int(get_config('job_timeout_max', '7200'))
def set_job_timeout_max(timeout: int) -> None:
"""Set maximum job timeout in seconds."""
set_config('job_timeout_max', str(timeout))
def get_job_ping_interval() -> int:
"""Get job ping interval in seconds."""
return int(get_config('job_ping_interval', '30'))
def set_job_ping_interval(interval: int) -> None:
"""Set job ping interval in seconds."""
set_config('job_ping_interval', str(interval))
\ No newline at end of file
@@ -92,7 +92,10 @@ DEFAULTS = {
'redis_port': '6379',
'redis_db': '0',
'redis_password': '',
'jwt_secret_key': 'vidai-jwt-secret-key-change-in-production'
'jwt_secret_key': 'vidai-jwt-secret-key-change-in-production',
'job_timeout_base': '180', # Base timeout in seconds (3 minutes)
'job_timeout_max': '7200', # Maximum timeout in seconds (2 hours)
'job_ping_interval': '30' # Ping interval in seconds
}
......
@@ -252,6 +252,7 @@ def analyze_media(media_path, prompt, model_path, interval=10, job_id=None, comm
descriptions = []
frame_start_time = time.time()
for i, (frame_path, ts) in enumerate(frames):
if get_debug():
log_message(f"DEBUG: Processing frame {i+1}/{total_frames} at {ts:.2f}s for job {job_id_int}")
@@ -286,12 +287,30 @@ def analyze_media(media_path, prompt, model_path, interval=10, job_id=None, comm
pass
return "Job cancelled by user", total_tokens
# Watchdog: Check if previous frame took too long (more than 5 minutes)
if i > 0 and time.time() - frame_start_time > 300: # 5 minutes per frame
log_message(f"WATCHDOG: Frame {i} processing took too long ({time.time() - frame_start_time:.0f}s), aborting job {job_id_int}")
# Clean up and return error
for fp, _ in frames[i:]:
try:
os.unlink(fp)
except:
pass
if output_dir:
try:
import shutil
shutil.rmtree(output_dir)
except:
pass
return f"Job aborted: Frame processing timeout (frame {i} took {time.time() - frame_start_time:.0f}s)", total_tokens
desc, tokens = analyze_single_image(frame_path, full_prompt, model)
total_tokens += tokens
if get_debug():
log_message(f"DEBUG: Frame {i+1} analyzed for job {job_id_int}")
descriptions.append(f"At {ts:.2f}s: {desc}")
os.unlink(frame_path)
frame_start_time = time.time() # Reset timer for next frame
# Send progress update after processing
if comm and (i + 1) % max(1, total_frames // 10) == 0: # Update every 10% or at least every frame for small videos
@@ -306,8 +325,10 @@ def analyze_media(media_path, prompt, model_path, interval=10, job_id=None, comm
comm.send_message(progress_msg)
log_message(f"PROGRESS: Job {job_id_int} - {progress_percent}% - Completed frame {i+1}/{total_frames}")
# Send ping every 30 seconds to keep connection alive
if comm and (i + 1) % max(1, total_frames // (total_frames // 30 + 1)) == 0:
# Send ping at configurable intervals to keep connection alive
from .config import get_job_ping_interval
ping_interval = get_job_ping_interval()
if comm and (i + 1) % max(1, total_frames // (total_frames // ping_interval + 1)) == 0:
ping_msg = Message('ping', f'ping_{job_id_int}_{i+1}', {
'job_id': job_id,
'timestamp': time.time()
......