front: drain in-flight requests before bouncing an engine

An engine restart (admin button / config change) previously SIGTERM'd the process immediately, severing any active SSE stream mid-response — the client saw httpcore.RemoteProtocolError "peer closed connection without sending complete message body".

Now restart_engine marks the engine `draining` first: the router stops routing NEW requests to it (Engine.is_alive() reports false while draining, and the poll loop can't flip it back to healthy), and the supervisor waits up to server.engine_restart_drain_grace seconds (default 30, 0 = immediate) for the in-flight count to reach zero before killing the process. Stragglers past the grace window are still bounced.

In-flight is tracked per engine in the front proxy: proxy() increments just before the send and decrements once the streamed response is fully drained (or the send failed).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

front: drain in-flight requests before bouncing an engine
An engine restart (admin button / config change) previously SIGTERM'd the process immediately, severing any active SSE stream mid-response — the client saw httpcore.RemoteProtocolError "peer closed connection without sending complete message body".

Now restart_engine marks the engine `draining` first: the router stops routing NEW requests to it (Engine.is_alive() reports false while draining, and the poll loop can't flip it back to healthy), and the supervisor waits up to server.engine_restart_drain_grace seconds (default 30, 0 = immediate) for the in-flight count to reach zero before killing the process. Stragglers past the grace window are still bounced.

In-flight is tracked per engine in the front proxy: proxy() increments just before the send and decrements once the streamed response is fully drained (or the send failed).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
34d666d6 · Stefy Lanza (nextime / spora ) · 0a7d343a · 34d666d6 · 34d666d6 · 34d666d6
Commit 34d666d6 authored Jun 20, 2026 by Stefy Lanza (nextime / spora )
5 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,6 @@ coderai-runtime/
 # Video editor sessions + generated media (runtime artifacts)
 video_editor/sessions/
 tools/coderai_media/
+.oci-build.log
+.oci-rebuild.sh
+.oci-*.log
--- a/codai/config.py
+++ b/codai/config.py
@@ -52,6 +52,9 @@ class ServerConfig:
    engine_gpus: Optional[list] = None  # explicit GPU indices, e.g. [0, 1]; None = auto
    proxy_status_timeout: float = 2.0   # short timeout for UI/status proxying (seconds)
    proxy_max_inflight: int = 64        # max concurrent proxied requests through the front
+    engine_restart_drain_grace: float = 30.0  # on engine restart, wait this many seconds
+                                              # for in-flight requests to finish before
+                                              # killing the process (0 = bounce immediately)
    # Explicit, heterogeneous engine declarations. Auto GPU detection only finds
    # NVIDIA cards and assumes one backend, and CUDA vs Vulkan device enumeration is
    # inconsistent — so for mixed setups (e.g. an NVIDIA + a Radeon card, where the
@@ -559,6 +562,7 @@ class ConfigManager:
                "engine_gpus": self.config.server.engine_gpus,
                "proxy_status_timeout": self.config.server.proxy_status_timeout,
                "proxy_max_inflight": self.config.server.proxy_max_inflight,
+                "engine_restart_drain_grace": self.config.server.engine_restart_drain_grace,
                "engine_specs": self.config.server.engine_specs,
                "default_engine": self.config.server.default_engine,
            },

--- a/codai/frontproxy/app.py
+++ b/codai/frontproxy/app.py
@@ -578,19 +578,29 @@ class FrontProxy:
        rp_req = self._long.build_request(
            method, url, headers=headers, params=request.query_params,
            content=content)
+        # Count this as in-flight on the chosen engine so a restart can drain it:
+        # decremented only once the response is fully streamed (or send failed).
+        engine.enter_request()
        try:
            rp_resp = await self._long.send(rp_req, stream=True)
        except Exception as exc:
+            engine.exit_request()
            return JSONResponse(
                {"error": f"Engine#{engine.id} unreachable: {exc}"}, status_code=502)

+        async def _release():
+            try:
+                await rp_resp.aclose()
+            finally:
+                engine.exit_request()
+
        resp_headers = self._filter_headers(rp_resp.headers, _DROP_RESP)
        return StreamingResponse(
            rp_resp.aiter_raw(),
            status_code=rp_resp.status_code,
            headers=dict(resp_headers),
            media_type=rp_resp.headers.get("content-type"),
-            background=BackgroundTask(rp_resp.aclose),
+            background=BackgroundTask(_release),
        )

    # ----------------------------------------------------------------- status

--- a/codai/frontproxy/engine_supervisor.py
+++ b/codai/frontproxy/engine_supervisor.py
@@ -31,6 +31,7 @@ import subprocess
 import sys
 import threading
 import time
+from typing import Optional

 import httpx

@@ -528,31 +529,62 @@ class EngineSupervisor:
            time.sleep(1.0)   # avoid a tight crash loop
            self._spawn(engine)

-    def restart_engine(self, engine_id: int) -> bool:
+    def restart_engine(self, engine_id: int, drain_grace: Optional[float] = None) -> bool:
        """Forcibly kill and respawn one engine (e.g. it's stuck in a loop).

+        Before killing, mark the engine ``draining`` so the router stops sending it
+        NEW requests, and wait up to ``drain_grace`` seconds for in-flight (streaming)
+        requests to finish — so a config-triggered bounce doesn't sever active SSE
+        streams mid-response. After the grace window any stragglers are dropped.
+
        Holds the restart lock so the poll loop's own respawn can't double-spawn."""
        engine = self.registry.get(engine_id)
        if engine is None:
            return False
+        if drain_grace is None:
+            drain_grace = float(getattr(self.config.server,
+                                        "engine_restart_drain_grace", 30.0) or 0.0)
        with self._restart_lock:
            proc = engine.proc
-            if proc is not None and proc.poll() is None:
-                try:
-                    proc.terminate()
-                    proc.wait(timeout=8)
-                except Exception:
-                    pass
-                if proc.poll() is None:
+            if proc is not None and proc.poll() is None and drain_grace > 0:
+                engine.draining = True
+                self.registry.update_state(engine_id, healthy=False)
+                deadline = time.time() + drain_grace
+                waited = False
+                while engine.inflight > 0 and time.time() < deadline \
+                        and not self._stopped.is_set():
+                    if not waited:
+                        print(f"[front] draining engine#{engine_id} ({engine.name}): "
+                              f"waiting for {engine.inflight} in-flight request(s) "
+                              f"(up to {drain_grace:.0f}s)", flush=True)
+                        waited = True
+                    time.sleep(0.25)
+                if engine.inflight > 0:
+                    print(f"[front] drain grace elapsed; bouncing engine#{engine_id} "
+                          f"with {engine.inflight} request(s) still in flight",
+                          flush=True)
+                elif waited:
+                    print(f"[front] engine#{engine_id} drained cleanly", flush=True)
+            try:
+                proc = engine.proc
+                if proc is not None and proc.poll() is None:
                    try:
-                        proc.kill()
-                        proc.wait(timeout=3)
+                        proc.terminate()
+                        proc.wait(timeout=8)
                    except Exception:
                        pass
-            self.registry.update_state(engine_id, healthy=False)
-            print(f"[front] restarting engine#{engine_id} ({engine.name}) on request",
-                  flush=True)
-            self._spawn(engine)
+                    if proc.poll() is None:
+                        try:
+                            proc.kill()
+                            proc.wait(timeout=3)
+                        except Exception:
+                            pass
+                self.registry.update_state(engine_id, healthy=False)
+                print(f"[front] restarting engine#{engine_id} ({engine.name}) on request",
+                      flush=True)
+                self._spawn(engine)
+            finally:
+                engine.draining = False
        return True

    def wait_ready(self, timeout: float = 1800.0) -> bool:

--- a/codai/frontproxy/registry.py
+++ b/codai/frontproxy/registry.py
@@ -71,6 +71,10 @@ class Engine:
                                    # event loop is GIL-blocked and can't be polled
    last_ok: float = 0.0           # monotonic time of last successful poll
    proc: object = None            # subprocess.Popen (set by the supervisor)
+    draining: bool = False         # restart pending: stop routing NEW requests here
+                                    # and let in-flight ones finish (drain grace period)
+    inflight: int = 0              # proxied requests currently streaming through
+    _inflight_lock: object = field(default_factory=threading.Lock, repr=False, compare=False)

    def __post_init__(self):
        if not self.url:
@@ -89,13 +93,27 @@ class Engine:
        An engine mid-generation can't answer the health poll and reads as
        unhealthy, but it's the right place to send a request pinned/assigned to
        it — the request queues on its gen-lock instead of duplicating the model
-        elsewhere. A None proc means externally managed; assume alive."""
+        elsewhere. A None proc means externally managed; assume alive.
+
+        While draining (a restart is pending) it reports not-alive so the router
+        diverts new traffic elsewhere and the existing requests can finish."""
+        if self.draining:
+            return False
        p = self.proc
        try:
            return p is None or p.poll() is None
        except Exception:
            return True

+    def enter_request(self) -> None:
+        with self._inflight_lock:
+            self.inflight += 1
+
+    def exit_request(self) -> None:
+        with self._inflight_lock:
+            if self.inflight > 0:
+                self.inflight -= 1
+

 class EngineRegistry:
    def __init__(self):
@@ -151,6 +169,8 @@ class EngineRegistry:
            e = self._engines.get(engine_id)
            if not e:
                return
+            if e.draining:        # a restart is pending — stay out of rotation
+                healthy = False
            e.healthy = healthy
            if healthy:
                e.last_ok = time.monotonic()