Commit 005dfd46 authored by Your Name

Add --whisper-server support for audio transcription

- Add WhisperServerManager class to manage whisper-server subprocess
- Add --whisper-server argument to specify whisper-server binary path
- Add --whisper-server-port argument for port configuration (default 8081)
- Modify audio transcription endpoint to proxy to whisper-server
- Add cleanup on shutdown to stop whisper-server
- Model can stay loaded in VRAM as long as the server runs
parent 1ca724e8
......@@ -1941,6 +1941,155 @@ class ModelManager:
self.backend = None
# =============================================================================
# Whisper Server Manager - manages whisper-server subprocess
# =============================================================================
import subprocess
import signal
import requests
import time
import threading
class WhisperServerManager:
    """Manage a whisper-server subprocess used for audio transcription.

    The manager launches the binary at ``server_path`` bound to
    ``127.0.0.1:<port>``, polls its ``/health`` endpoint until it answers,
    and forwards transcription requests to its ``/inference`` endpoint.
    Calling :meth:`start` (or :meth:`restart`) while a server is running
    stops it first, which is how the loaded model is swapped.
    """

    def __init__(self, server_path: Optional[str] = None, port: int = 8081):
        """Store configuration; no process is started here.

        Args:
            server_path: Path to the whisper-server executable. ``start()``
                refuses to run while this is empty.
            port: TCP port the server is told to listen on.
        """
        self.server_path = server_path
        self.port = port
        self.process = None
        self.current_model = None
        self.base_url = f"http://127.0.0.1:{port}"
        # Serialises start()/stop(). This is a plain (non-reentrant) Lock, so
        # code that already holds it must call _stop_locked(), never stop().
        self.lock = threading.Lock()
        self._health_check_thread = None
        self._running = False

    def is_running(self) -> bool:
        """Return True while the managed subprocess exists and has not exited."""
        if self.process is None:
            return False
        return self.process.poll() is None

    def start(self, model_path: Optional[str] = None, gpu_device: int = 0) -> bool:
        """Start whisper-server, replacing any instance that is already running.

        Args:
            model_path: Model file passed to the server with ``-m``; omitted
                from the command line when empty.
            gpu_device: Device index passed to the server with ``-dev``.

        Returns:
            True once the server answered its health check, False if the
            server path is unset, the process could not be launched, or it
            did not become healthy within 30 seconds.
        """
        with self.lock:
            # We already hold self.lock: calling the public stop() here would
            # try to take the same non-reentrant lock and hang forever.
            if self.is_running():
                self._stop_locked()

            if not self.server_path:
                print("Error: whisper-server path not set")
                return False

            cmd = [self.server_path]
            if model_path:
                cmd.extend(["-m", model_path])
            cmd.extend(["-dev", str(gpu_device)])
            cmd.extend(["--host", "127.0.0.1"])
            cmd.extend(["--port", str(self.port)])

            print(f"Starting whisper-server: {' '.join(cmd)}")

            try:
                self.process = subprocess.Popen(
                    cmd,
                    # Nothing ever reads the child's output. With PIPE the
                    # child would block as soon as the OS pipe buffer filled,
                    # so discard the output explicitly instead.
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    # Give the child the default SIGTERM disposition so that
                    # terminate() in _stop_locked() can end it.
                    preexec_fn=lambda: signal.signal(signal.SIGTERM, signal.SIG_DFL),
                )
                self.current_model = model_path

                if self._wait_for_server(30):
                    print(f"whisper-server started on {self.base_url}")
                    self._running = True
                    return True

                print("Error: whisper-server failed to start")
            except Exception as e:
                print(f"Error starting whisper-server: {e}")

            # Failure path: make sure a half-started child is not left behind.
            self._stop_locked()
            return False

    def stop(self):
        """Stop whisper-server if it is running and clear the tracked state."""
        with self.lock:
            self._stop_locked()

    def _stop_locked(self):
        """Terminate the subprocess; the caller must already hold ``self.lock``."""
        self._running = False
        if self.process:
            try:
                self.process.terminate()
                try:
                    self.process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    # Graceful shutdown took too long; force it.
                    self.process.kill()
                    self.process.wait()
            except Exception as e:
                print(f"Error stopping whisper-server: {e}")
        self.process = None
        self.current_model = None

    def restart(self, model_path: Optional[str] = None, gpu_device: int = 0) -> bool:
        """Restart whisper-server with a new model; see :meth:`start`."""
        print(f"Restarting whisper-server with model: {model_path}")
        return self.start(model_path, gpu_device)

    def transcribe(
        self,
        audio_data: bytes,
        language: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> dict:
        """Send audio to the server's ``/inference`` endpoint.

        Args:
            audio_data: Raw audio bytes, uploaded as the multipart field
                ``file`` with filename ``audio.wav``.
            language: Optional value for the ``language`` form field.
            prompt: Optional value for the ``prompt`` form field.

        Returns:
            The server's decoded JSON body on HTTP 200. Otherwise a dict with
            an ``"error"`` key (plus ``"detail"`` for non-200 responses).
            This method reports failures through that dict and does not raise.
        """
        if not self.is_running():
            return {"error": "whisper-server not running"}

        try:
            files = {"file": ("audio.wav", audio_data, "audio/wav")}
            data = {}
            if language:
                data["language"] = language
            if prompt:
                data["prompt"] = prompt

            response = requests.post(
                f"{self.base_url}/inference",
                files=files,
                data=data,
                timeout=300,
            )
            if response.status_code == 200:
                return response.json()
            return {"error": f"Server error: {response.status_code}", "detail": response.text}
        except Exception as e:
            # Deliberate catch-all: callers rely on an error dict, not a raise.
            return {"error": str(e)}

    def _wait_for_server(self, timeout: int = 30) -> bool:
        """Poll ``/health`` until it returns 200 or ``timeout`` seconds pass.

        Returns False immediately if the subprocess has already exited, so a
        crashed server does not cost the full timeout.
        """
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.process is not None and self.process.poll() is not None:
                return False
            try:
                response = requests.get(f"{self.base_url}/health", timeout=2)
                if response.status_code == 200:
                    return True
            except requests.RequestException:
                # Not accepting connections yet; keep polling. Narrow on
                # purpose so Ctrl-C / SystemExit are not swallowed.
                pass
            time.sleep(0.5)
        return False

    def get_status(self) -> dict:
        """Return a dict with the running flag, current model and base URL."""
        return {
            "running": self.is_running(),
            "model": self.current_model,
            "url": self.base_url,
        }
# =============================================================================
# Multi-Model Manager (supports audio transcription and image generation)
# =============================================================================
......@@ -1971,6 +2120,8 @@ class MultiModelManager:
self.active_in_vram: Optional[str] = None # Which model is currently in VRAM
# Model aliases: alias -> actual model name mapping
self.model_aliases: Dict[str, str] = {}
# Whisper server manager
self.whisper_server: Optional[WhisperServerManager] = None
@property
def audio_model(self) -> Optional[str]:
......@@ -2333,6 +2484,9 @@ async def lifespan(app: FastAPI):
# Shutdown
multi_model_manager.cleanup()
model_manager.cleanup()
# Stop whisper-server if running
if multi_model_manager.whisper_server:
multi_model_manager.whisper_server.stop()
app = FastAPI(
......@@ -2502,14 +2656,31 @@ async def create_transcription(
Supports:
- OpenAI's whisper-1 model (via OpenAI API)
- Local faster-whisper models (when --audio-model is specified)
- whisper.cpp server (when --whisper-server is specified)
"""
audio_model = multi_model_manager.audio_model
# If no audio model configured, return an error
if not audio_model:
# Check if whisper-server is available
if multi_model_manager.whisper_server and multi_model_manager.whisper_server.is_running():
# Use whisper-server - read file and send to server
file_content = await file.read()
result = multi_model_manager.whisper_server.transcribe(
file_content,
language=language,
prompt=prompt
)
if "error" in result:
raise HTTPException(status_code=500, detail=result["error"])
# Convert whisper-server response to OpenAI format
text = result.get("text", "")
return {
"text": text
}
raise HTTPException(
status_code=400,
detail="Audio transcription not configured. Use --audio-model to specify a model."
detail="Audio transcription not configured. Use --audio-model or --whisper-server to specify a model."
)
# Determine model to use - always use the configured audio model
......@@ -3909,6 +4080,18 @@ def parse_args():
default=None,
help="Path to whisper.cpp CLI executable (e.g., ~/whisper.cpp/build/bin/whisper-cli). Uses Vulkan if available.",
)
parser.add_argument(
"--whisper-server",
type=str,
default=None,
help="Path to whisper.cpp server executable (e.g., ~/whisper.cpp/build/bin/whisper-server). Keeps model loaded in VRAM.",
)
parser.add_argument(
"--whisper-server-port",
type=int,
default=8081,
help="Port for whisper-server (default: 8081).",
)
parser.add_argument(
"--vision-ctx",
type=int,
......@@ -4224,7 +4407,26 @@ def main():
# - Using loadall or loadswap mode, OR
# - No main model is specified (only audio model configured)
should_preload = load_mode in ("loadall", "loadswap") or (not model_names and audio_models)
if should_preload:
# Initialize whisper-server if specified
if args.whisper_server:
print(f"\nWhisper server: {args.whisper_server}")
print(f" Port: {args.whisper_server_port}")
whisper_server_mgr = WhisperServerManager(
server_path=args.whisper_server,
port=args.whisper_server_port
)
multi_model_manager.whisper_server = whisper_server_mgr
# Start whisper-server if we should preload or if it's the only audio option
if audio_models and (should_preload or not args.whisper_cpp):
model_to_use = audio_models[0] if audio_models else None
gpu_device = getattr(args, 'audio_vulkan_device', 0) or 0
if whisper_server_mgr.start(model_path=model_to_use, gpu_device=gpu_device):
print(f"Whisper server started with model: {model_to_use}")
else:
print("Warning: Failed to start whisper-server, falling back to other backends")
elif should_preload:
print(f"Pre-loading audio model... {audio_models[0]}")
# Use first audio model for pre-loading
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment