Add video and pipelines

parent 74e32470
......@@ -139,7 +139,20 @@ if [ "$BACKEND" = "nvidia" ]; then
pip install -r requirements-nvidia.txt || {
echo -e "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
}
# Extended modality dependencies (video/audio/embeddings/upscaling)
# Every install below is best-effort: a failure prints a warning and the
# script carries on, because these features are optional.
echo -e "${YELLOW}Installing extended modality dependencies...${NC}"
pip install "imageio[ffmpeg]" scipy soundfile sentence-transformers \
    openai-whisper argostranslate edge-tts kokoro-tts timm || {
    echo -e "${YELLOW}Warning: Some optional modality packages failed${NC}"
}
pip install realesrgan basicsr || {
    echo -e "${YELLOW}Warning: realesrgan/basicsr failed (image upscaling optional)${NC}"
}
# audiocraft (MusicGen/AudioGen) — Meta package, may fail on some Python versions
pip install audiocraft 2>/dev/null || {
    echo -e "${YELLOW}Note: audiocraft not installed (audio generation with MusicGen optional)${NC}"
    echo -e "${YELLOW} Install manually: pip install audiocraft${NC}"
}
# Install Flash Attention 2 if requested
if [ "$FLASH" = true ]; then
echo ""
......@@ -510,7 +523,22 @@ elif [ "$BACKEND" = "all" ]; then
pip install -r requirements-nvidia.txt || {
echo -e "${YELLOW}Warning: Some NVIDIA packages failed to install${NC}"
}
# Extended modality dependencies (video/audio/embeddings/upscaling)
# Each pip call is followed by `|| { ... }`, so a failed install only prints
# a warning instead of being left unhandled.
echo -e "${YELLOW}Installing extended modality dependencies...${NC}"
pip install "imageio[ffmpeg]" scipy soundfile sentence-transformers \
openai-whisper argostranslate edge-tts kokoro-tts timm || {
echo -e "${YELLOW}Warning: Some optional modality packages failed${NC}"
}
pip install realesrgan basicsr || {
echo -e "${YELLOW}Warning: realesrgan/basicsr failed (image upscaling optional)${NC}"
}
# audiocraft (MusicGen/AudioGen) — Meta package, may fail on some Python versions
# stderr is discarded here, so only the short note below is shown on failure.
pip install audiocraft 2>/dev/null || {
echo -e "${YELLOW}Note: audiocraft not installed (audio generation with MusicGen optional)${NC}"
echo -e "${YELLOW} Install manually: pip install audiocraft${NC}"
}
# Check for Vulkan development libraries
VULKAN_AVAILABLE=false
if pkg-config --exists vulkan 2>/dev/null; then
......
......@@ -252,7 +252,8 @@ async def api_status(username: str = Depends(require_auth)):
try:
if config_manager:
md = config_manager.models_data
for cat in ("text_models", "image_models", "audio_models", "vision_models", "tts_models"):
for cat in ("text_models", "image_models", "audio_models", "vision_models", "tts_models",
"video_models", "audio_gen_models", "embedding_models"):
for m in md.get(cat, []):
mid = (m.get("path") or m.get("id") or m) if isinstance(m, dict) else m
if mid and mid not in enabled_models:
......@@ -712,7 +713,8 @@ def _scan_caches() -> dict:
if config_manager:
md = config_manager.models_data
for cat in ("text_models", "image_models", "audio_models",
"gguf_models", "tts_models", "vision_models"):
"gguf_models", "tts_models", "vision_models", "video_models",
"audio_gen_models", "embedding_models"):
for m in md.get(cat, []):
if isinstance(m, str):
p = m
......@@ -1011,7 +1013,8 @@ async def api_model_enable(request: Request, username: str = Depends(require_adm
data = await request.json()
path = data.get("path") or data.get("model_id", "")
model_type = data.get("model_type", "text_models")
valid = {"text_models", "image_models", "audio_models", "gguf_models", "tts_models", "vision_models"}
valid = {"text_models", "image_models", "audio_models", "gguf_models", "tts_models", "vision_models",
"video_models", "audio_gen_models", "embedding_models"}
if model_type not in valid:
raise HTTPException(status_code=400, detail=f"model_type must be one of {valid}")
lst = config_manager.models_data.setdefault(model_type, [])
......@@ -1030,7 +1033,8 @@ async def api_model_disable(request: Request, username: str = Depends(require_ad
path = data.get("path") or data.get("model_id", "")
changed = False
for cat in ("text_models", "image_models", "audio_models",
"gguf_models", "tts_models", "vision_models"):
"gguf_models", "tts_models", "vision_models", "video_models",
"audio_gen_models", "embedding_models"):
lst = config_manager.models_data.get(cat, [])
new_lst = [m for m in lst
if (m if isinstance(m, str) else m.get("path", m.get("id", ""))) != path]
......@@ -1063,7 +1067,10 @@ async def api_model_load(request: Request, username: str = Depends(require_admin
if config_manager:
md = config_manager.models_data
for cat, mtype in (("image_models", "image"), ("audio_models", "audio"),
("vision_models", "vision"), ("tts_models", "tts")):
("vision_models", "vision"), ("tts_models", "tts"),
("video_models", "video"),
("audio_gen_models", "audio_gen"),
("embedding_models", "embedding")):
for m in md.get(cat, []):
mid = m if isinstance(m, str) else m.get("path") or m.get("id") or ""
if mid == path:
......@@ -1158,7 +1165,8 @@ async def api_model_configure(request: Request, username: str = Depends(require_
# Treat legacy gguf_models as text_models (GGUF is a format, not a type)
if model_type == "gguf_models":
model_type = "text_models"
valid = {"text_models", "image_models", "audio_models", "tts_models", "vision_models"}
valid = {"text_models", "image_models", "audio_models", "tts_models", "vision_models", "video_models",
"audio_gen_models", "embedding_models"}
if not path:
raise HTTPException(status_code=400, detail="path is required")
if model_type not in valid:
......
......@@ -22,7 +22,7 @@
</a>
<div class="nav-links">
<a href="/admin" class="nav-link {% if request.url.path == '/admin' %}active{% endif %}">Overview</a>
<a href="/chat" class="nav-link {% if request.url.path == '/chat' %}active{% endif %}">Chat</a>
<a href="/chat" class="nav-link {% if request.url.path == '/chat' %}active{% endif %}">Studio</a>
{% if is_admin|default(false) %}
<a href="/admin/models" class="nav-link {% if '/models' in request.url.path %}active{% endif %}">Models</a>
<a href="/admin/tokens" class="nav-link {% if '/tokens' in request.url.path %}active{% endif %}">Tokens</a>
......
This diff is collapsed.
......@@ -127,6 +127,8 @@
<option value="image-to-text">Image-to-text</option>
<option value="automatic-speech-recognition">Speech recog.</option>
<option value="text-to-speech">TTS</option>
<option value="text-to-video">Text-to-video</option>
<option value="image-to-video">Image-to-video</option>
<option value="feature-extraction">Embeddings</option>
</select>
</div>
......@@ -296,9 +298,12 @@
<select id="cfg-type" class="form-input">
<option value="text_models">Text (LLM)</option>
<option value="image_models">Image generation</option>
<option value="audio_models">Audio</option>
<option value="tts_models">TTS</option>
<option value="vision_models">Vision</option>
<option value="video_models">Video generation</option>
<option value="audio_models">Audio transcription (STT)</option>
<option value="tts_models">Text-to-speech (TTS)</option>
<option value="vision_models">Vision / VLM</option>
<option value="audio_gen_models">Audio generation (Music/SFX)</option>
<option value="embedding_models">Embeddings</option>
</select>
</div>
<div class="form-row" style="margin:0">
......
......@@ -70,6 +70,9 @@ from codai.api.transcriptions import router as transcriptions_router
from codai.api.images import router as images_router
from codai.api.tts import router as tts_router
from codai.api.text import router as text_router
from codai.api.video import router as video_router
from codai.api.audio_gen import router as audio_gen_router
from codai.api.embeddings import router as embeddings_router
from codai.admin.routes import router as admin_router
# Import and add middleware
......@@ -88,6 +91,9 @@ app.include_router(transcriptions_router)
app.include_router(images_router)
app.include_router(tts_router)
app.include_router(text_router)
app.include_router(video_router)
app.include_router(audio_gen_router)
app.include_router(embeddings_router)
app.include_router(admin_router)
......
"""
Audio generation endpoints for the codai API.
Supports music, sound effects, and ambient audio via MusicGen, AudioLDM2, StableAudio, etc.
POST /v1/audio/generate
"""
import asyncio
import base64
import io
import os
import time
import uuid
from fastapi import APIRouter, HTTPException, Request
from codai.models.manager import multi_model_manager
from codai.pydantic.audiogenrequest import AudioGenerationRequest, AudioGenerationResponse
router = APIRouter()
global_args = None
global_file_path = None
def set_global_args(args):
    """Remember the server arguments object for the helpers in this module.

    Args:
        args: Object whose attributes (``vulkan_device``, ``url``, ``https``,
            ``port``) are later read with ``getattr`` by this module.
    """
    global global_args
    global_args = args
def set_global_file_path(path):
    """Remember the directory in which generated audio files are written.

    Args:
        path: Directory path; when it is falsy, responses carry the audio
            inline as base64 instead of a file URL.
    """
    global global_file_path
    global_file_path = path
def _derive_device() -> str:
    """Choose the torch device string used to load audio generation models.

    Returns:
        ``"cuda:<n>"`` when ``global_args.vulkan_device`` is set to ``<n>``.
        Otherwise ``"cuda:0"`` if torch reports a usable CUDA device and
        ``"cpu"`` if it does not.
    """
    # An explicitly configured index always wins and is passed through as-is.
    configured = getattr(global_args, 'vulkan_device', None) if global_args else None
    if configured is not None:
        return f"cuda:{configured}"
    try:
        import torch
    except ImportError:
        # Cannot probe without torch; keep the historical default.
        return "cuda:0"
    # Previously "cuda:0" was returned unconditionally, which makes every
    # model load fail on hosts without CUDA.
    return "cuda:0" if torch.cuda.is_available() else "cpu"
def _strip_port(host: str) -> str:
    """Return *host* without a trailing ``:<port>`` suffix, if it has one."""
    if host.startswith('['):
        # Bracketed IPv6 literal, e.g. "[::1]:8000" -> "[::1]".
        closing = host.find(']')
        return host[:closing + 1] if closing != -1 else host
    name, sep, port = host.rpartition(':')
    if sep and port.isdigit() and ':' not in name:
        return name
    return host


def _save_audio_response(audio_data: bytes, ext: str, http_request: "Request") -> dict:
    """Persist generated audio and describe where the client can get it.

    Args:
        audio_data: Encoded audio bytes.
        ext: File extension without the dot (also used in the base64 key).
        http_request: Incoming request; only its ``Host`` header is read, and
            only when the ``url`` setting is ``'auto'``. May be ``None``.

    Returns:
        ``{"url": ...}`` when a file directory is configured (the bytes are
        written there under a random name), otherwise ``{"b64_<ext>": ...}``
        with the audio inlined as base64.
    """
    if not global_file_path:
        return {f"b64_{ext}": base64.b64encode(audio_data).decode()}

    filename = f"{uuid.uuid4().hex}.{ext}"
    os.makedirs(global_file_path, exist_ok=True)
    with open(os.path.join(global_file_path, filename), 'wb') as f:
        f.write(audio_data)

    url_setting = getattr(global_args, 'url', 'auto') if global_args else 'auto'
    if url_setting == 'auto':
        host = http_request.headers.get('host', '127.0.0.1') if http_request else '127.0.0.1'
        # The client-supplied port is dropped and the configured one appended.
        # The old split(':') check missed bracketed IPv6 hosts and produced
        # URLs with two ports.
        host = _strip_port(host)
        use_https = getattr(global_args, 'https', False) if global_args else False
        proto = 'https' if use_https else 'http'
        port = getattr(global_args, 'port', 8000) if global_args else 8000
        base_url = f"{proto}://{host}:{port}"
    else:
        base_url = url_setting.rstrip('/')
    return {"url": f"{base_url}/v1/files/{filename}"}
def _load_musicgen(model_name: str, device: str):
    """Load an audiocraft MusicGen or AudioGen checkpoint on *device*.

    Args:
        model_name: Checkpoint name; names containing ``audiogen``
            (case-insensitive) are loaded with ``AudioGen``, everything else
            with ``MusicGen``.
        device: Device string handed to ``get_pretrained``.

    Returns:
        The loaded model with a default generation duration of 30 seconds.
    """
    from audiocraft.models import MusicGen, AudioGen
    loader = AudioGen if 'audiogen' in model_name.lower() else MusicGen
    # `device` used to be ignored, so audiocraft picked its own device
    # regardless of the configured one.
    model = loader.get_pretrained(model_name, device=device)
    model.set_generation_params(duration=30)
    return model
def _load_audioldm(model_name: str, device: str):
    """Load a diffusers ``AudioLDM2Pipeline`` and move it to *device*.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        device: Target torch device string.

    Returns:
        The pipeline, already moved to *device*.
    """
    import torch
    from diffusers import AudioLDM2Pipeline
    # Half precision is only used on accelerators; CPU loads stay in float32.
    dtype = torch.float32 if str(device).startswith("cpu") else torch.float16
    # NOTE(review): names containing "stable-audio" are also routed here by
    # the type detection; confirm AudioLDM2Pipeline can load those checkpoints.
    pipe = AudioLDM2Pipeline.from_pretrained(model_name, torch_dtype=dtype)
    return pipe.to(device)
def _detect_audio_gen_type(model_name: str) -> str:
    """Classify a model name as ``'audioldm'``, ``'audiogen'`` or ``'musicgen'``.

    Matching is a case-insensitive substring test; names containing
    ``audioldm`` or ``stable-audio`` map to ``'audioldm'`` and anything
    unrecognised falls back to ``'musicgen'``.
    """
    n = model_name.lower()
    if 'audioldm' in n or 'stable-audio' in n:
        return 'audioldm'
    if 'audiogen' in n:
        return 'audiogen'
    return 'musicgen'


def _generate_audio(pipe, model_name: str, request: "AudioGenerationRequest"):
    """Run generation and return (audio_bytes, ext).

    Args:
        pipe: Loaded model; an audiocraft model for the musicgen/audiogen
            types, a callable diffusers pipeline otherwise.
        model_name: Name used to choose the generation code path.
        request: Generation parameters (prompt, duration, sampling settings,
            optional seed and melody reference).

    Returns:
        Tuple of 16-bit PCM WAV bytes and the extension ``'wav'``.
    """
    import io as _io
    import numpy as np
    import scipy.io.wavfile as wavfile

    # Honour the request's seed, which was previously accepted but never used.
    if request.seed is not None:
        import torch
        torch.manual_seed(request.seed)

    model_type = _detect_audio_gen_type(model_name)
    if model_type in ('musicgen', 'audiogen'):
        pipe.set_generation_params(
            duration=request.duration,
            top_k=request.top_k,
            top_p=request.top_p,
            temperature=request.temperature,
            cfg_coef=request.cfg_coef,
        )
        if request.melody and model_type == 'musicgen':
            import torchaudio
            raw = _decode_b64_or_url(request.melody)
            melody_wav, melody_sr = torchaudio.load(_io.BytesIO(raw))
            wav = pipe.generate_with_chroma([request.prompt], melody_wav.unsqueeze(0), melody_sr)
        else:
            wav = pipe.generate([request.prompt])
        # First item of the batch, first channel.
        audio_np = wav[0, 0].cpu().numpy()
        sr = pipe.sample_rate
    else:
        # The detector only returns three values, so this is the 'audioldm' case.
        result = pipe(
            request.prompt,
            num_inference_steps=50,
            audio_length_in_s=request.duration,
        )
        audio_np = result.audios[0]
        # NOTE(review): fixed 16 kHz output rate; confirm for every pipeline
        # that is routed here.
        sr = 16000

    # Clamp before scaling: samples outside [-1, 1] would otherwise wrap
    # around when cast to int16 and come out as loud clicks.
    samples = np.clip(np.asarray(audio_np, dtype=np.float32), -1.0, 1.0)
    audio_int16 = (samples * 32767).astype(np.int16)
    buf = _io.BytesIO()
    wavfile.write(buf, sr, audio_int16)
    return buf.getvalue(), 'wav'
def _decode_b64_or_url(data: str) -> bytes:
    """Turn a media reference into raw bytes.

    Args:
        data: A ``data:`` URI, an ``http://`` / ``https://`` URL, or a bare
            base64 string.

    Returns:
        The decoded or downloaded bytes.
    """
    if data.startswith("data:"):
        _, enc = data.split(",", 1)
        return base64.b64decode(enc)
    # Require a full scheme: a bare "http" prefix is also a valid start of a
    # base64 payload, which used to be sent to urlopen by mistake.
    if data.startswith(("http://", "https://")):
        import urllib.request
        # SECURITY(review): this fetches a caller-supplied URL from the
        # server's network position; consider an allow-list.
        with urllib.request.urlopen(data, timeout=30) as r:
            return r.read()
    return base64.b64decode(data)
@router.post("/v1/audio/generate", response_model=AudioGenerationResponse)
async def audio_generate(request: AudioGenerationRequest, http_request: Request = None):
    """
    Generate music, sound effects, or ambient audio.
    Compatible models: MusicGen, AudioGen, AudioLDM2, StableAudio.

    Args:
        request: Generation parameters, including the model to use.
        http_request: Incoming HTTP request, used to build file URLs.

    Returns:
        An ``AudioGenerationResponse`` with one entry holding either a file
        URL or inline base64 audio.

    Raises:
        HTTPException: 404 when the model cannot be resolved, 500 when
            loading or generation fails.
    """
    model_info = multi_model_manager.request_model(request.model, model_type="audio_gen")
    model_name = model_info.get('model_name')
    if not model_name:
        err = model_info.get('error', f"Model '{request.model}' not found")
        raise HTTPException(status_code=404, detail=err)
    model_key = model_info['model_key']
    pipe = model_info.get('model_object')

    # Inside a coroutine the running loop is the documented way to reach the
    # executor; loading and generation are blocking, so they run in a thread.
    loop = asyncio.get_running_loop()

    if pipe is None:
        device = _derive_device()
        if _detect_audio_gen_type(model_name) in ('musicgen', 'audiogen'):
            loader = _load_musicgen
        else:
            loader = _load_audioldm
        try:
            pipe = await loop.run_in_executor(None, loader, model_name, device)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load audio gen model: {e}") from e
        # Cache the loaded model so later requests reuse it.
        multi_model_manager.models[model_key] = pipe
        multi_model_manager.current_model_key = model_key

    try:
        audio_bytes, ext = await loop.run_in_executor(
            None, _generate_audio, pipe, model_name, request)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Audio generation failed: {e}") from e

    # Honour an explicit base64 response_format; it used to be ignored
    # whenever a file directory was configured. Only the produced container
    # (currently wav) is available, so the key reflects the actual format.
    wanted = (request.response_format or "url").lower()
    if wanted.startswith("b64"):
        result = {f"b64_{ext}": base64.b64encode(audio_bytes).decode()}
    else:
        result = _save_audio_response(audio_bytes, ext, http_request)
    return AudioGenerationResponse(created=int(time.time()), data=[result])
"""
Embeddings endpoint — OpenAI-compatible.
POST /v1/embeddings
Supports sentence-transformers, BGE, E5, nomic-embed, etc.
"""
import asyncio
import base64
import time
from typing import List
from fastapi import APIRouter, HTTPException, Request
from codai.models.manager import multi_model_manager
from codai.pydantic.embedrequest import EmbeddingsRequest, EmbeddingsResponse, EmbeddingObject
router = APIRouter()
global_args = None
def set_global_args(args):
    """Remember the server arguments object for the helpers in this module.

    Args:
        args: Object whose ``vulkan_device`` attribute is later read with
            ``getattr`` when a device is chosen.
    """
    global global_args
    global_args = args
def _derive_device() -> str:
    """Choose the torch device string used to load embedding models.

    Returns:
        ``"cuda:<n>"`` when ``global_args.vulkan_device`` is set to ``<n>``.
        Otherwise ``"cuda:0"`` if torch reports a usable CUDA device and
        ``"cpu"`` if it does not.
    """
    # An explicitly configured index always wins and is passed through as-is.
    configured = getattr(global_args, 'vulkan_device', None) if global_args else None
    if configured is not None:
        return f"cuda:{configured}"
    try:
        import torch
    except ImportError:
        # Cannot probe without torch; keep the historical default.
        return "cuda:0"
    # Previously "cuda:0" was returned unconditionally, which makes every
    # model load fail on hosts without CUDA.
    return "cuda:0" if torch.cuda.is_available() else "cpu"
def _load_embedding_model(model_name: str, device: str):
    """Load an embedding model, preferring sentence-transformers.

    Args:
        model_name: Model identifier handed to the loader.
        device: Device string the model is placed on.

    Returns:
        ``('sentence_transformers', model)`` when that package is importable,
        otherwise ``('transformers', (tokenizer, model, device))``.

    Raises:
        RuntimeError: When the transformers fallback cannot load the model.
    """
    try:
        from sentence_transformers import SentenceTransformer
        st_model = SentenceTransformer(model_name, device=device)
    except ImportError:
        # Package not available: fall through to plain transformers.
        pass
    else:
        return ('sentence_transformers', st_model)

    try:
        from transformers import AutoTokenizer, AutoModel
        import torch  # a missing torch is reported as RuntimeError below
        hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
        hf_model = AutoModel.from_pretrained(model_name).to(device)
    except Exception as e:
        raise RuntimeError(f"Cannot load embedding model '{model_name}': {e}")
    return ('transformers', (hf_tokenizer, hf_model, device))
def _embed_texts(model_obj, texts: List[str], dimensions=None) -> List[List[float]]:
    """Embed *texts* and return one unit-length vector per text.

    Args:
        model_obj: ``(backend, model)`` pair as produced by the loader;
            backend is ``'sentence_transformers'`` or anything else for the
            transformers ``(tokenizer, model, device)`` triple.
        texts: Strings to embed.
        dimensions: When truthy, keep only the first N components of each
            vector.

    Returns:
        A list of float lists, in the same order as *texts*.
    """
    backend, model = model_obj
    if backend == 'sentence_transformers':
        vecs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
        results = [v.tolist() for v in vecs]
    else:
        import torch
        import torch.nn.functional as F
        tokenizer, hf_model, device = model
        encoded = tokenizer(texts, padding=True, truncation=True,
                            return_tensors='pt', max_length=512)
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            out = hf_model(**encoded)
        # Mean-pool the last hidden state over non-padding tokens.
        token_embs = out.last_hidden_state
        attention = encoded['attention_mask'].unsqueeze(-1).float()
        # Clamp the token count so an all-padding row cannot divide by zero.
        mean_emb = (token_embs * attention).sum(1) / attention.sum(1).clamp(min=1e-9)
        mean_emb = F.normalize(mean_emb, dim=-1)
        results = [row.cpu().tolist() for row in mean_emb]

    if dimensions:
        import math
        shortened = []
        for vec in results:
            head = vec[:dimensions]
            if len(head) < len(vec):
                # A truncated vector is no longer unit-length; rescale it so
                # dot-product similarity keeps working for callers.
                norm = math.sqrt(math.fsum(x * x for x in head))
                if norm > 0.0:
                    head = [x / norm for x in head]
            shortened.append(head)
        results = shortened
    return results
@router.post("/v1/embeddings", response_model=EmbeddingsResponse)
async def create_embeddings(request: EmbeddingsRequest, http_request: Request = None):
    """
    OpenAI-compatible embeddings endpoint.

    Args:
        request: Model name, input text(s), optional ``dimensions`` and
            ``encoding_format``.
        http_request: Incoming HTTP request (not read by this handler).

    Returns:
        An ``EmbeddingsResponse`` with one ``EmbeddingObject`` per input.

    Raises:
        HTTPException: 404 when the model cannot be resolved, 500 when
            loading or embedding fails.
    """
    model_info = multi_model_manager.request_model(request.model, model_type="embedding")
    model_name = model_info.get('model_name')
    if not model_name:
        err = model_info.get('error', f"Model '{request.model}' not found")
        raise HTTPException(status_code=404, detail=err)
    model_key = model_info['model_key']
    model_obj = model_info.get('model_object')

    # Inside a coroutine the running loop is the documented way to reach the
    # executor; loading and encoding are blocking, so they run in a thread.
    loop = asyncio.get_running_loop()

    if model_obj is None:
        device = _derive_device()
        try:
            model_obj = await loop.run_in_executor(
                None, _load_embedding_model, model_name, device)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Failed to load embedding model: {e}") from e
        # Cache the loaded model so later requests reuse it.
        multi_model_manager.models[model_key] = model_obj
        multi_model_manager.current_model_key = model_key

    texts = [request.input] if isinstance(request.input, str) else request.input
    try:
        vectors = await loop.run_in_executor(
            None, _embed_texts, model_obj, texts, request.dimensions)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding failed: {e}") from e

    if request.encoding_format == 'base64':
        import struct
        # Pack as explicit little-endian float32 so the payload does not
        # depend on the byte order of the host running the server.
        data = [EmbeddingObject(
            index=i,
            embedding=base64.b64encode(struct.pack(f'<{len(v)}f', *v)).decode()
        ) for i, v in enumerate(vectors)]
    else:
        data = [EmbeddingObject(index=i, embedding=v) for i, v in enumerate(vectors)]

    # Whitespace word count is reported as the token usage (an approximation,
    # not a tokenizer count).
    total_tokens = sum(len(t.split()) for t in texts)
    return EmbeddingsResponse(
        data=data,
        model=request.model,
        usage={"prompt_tokens": total_tokens, "total_tokens": total_tokens},
    )
This diff is collapsed.
This diff is collapsed.
......@@ -376,6 +376,27 @@ def main():
if mid:
multi_model_manager.set_tts_model(mid, config=_model_cfg(m, "tts") if isinstance(m, dict) else {})
# Video generation models
video_models = models_config.get("video_models", [])
for m in video_models:
mid = _model_id(m)
if mid:
multi_model_manager.set_video_model(mid, config=_model_cfg(m, "video") if isinstance(m, dict) else {})
# Audio generation models (MusicGen, AudioLDM2, …)
audio_gen_models = models_config.get("audio_gen_models", [])
for m in audio_gen_models:
mid = _model_id(m)
if mid:
multi_model_manager.set_audio_gen_model(mid, config=_model_cfg(m, "audio_gen") if isinstance(m, dict) else {})
# Embedding models
embedding_models = models_config.get("embedding_models", [])
for m in embedding_models:
mid = _model_id(m)
if mid:
multi_model_manager.set_embedding_model(mid, config=_model_cfg(m, "embedding") if isinstance(m, dict) else {})
# Register aliases
aliases = models_config.get("aliases", {})
for alias, model in aliases.items():
......@@ -387,7 +408,10 @@ def main():
[("audio", m) for m in audio_models] +
[("image", m) for m in image_models] +
[("vision", m) for m in vision_models] +
[("tts", m) for m in tts_models]
[("tts", m) for m in tts_models] +
[("video", m) for m in video_models] +
[("audio_gen", m) for m in audio_gen_models] +
[("embedding", m) for m in embedding_models]
)
for mtype, m in all_model_entries:
mid = _model_id(m)
......@@ -498,6 +522,22 @@ def main():
from codai.api.images import set_global_args as set_images_global_args
set_images_global_args(global_args)
# Set video module global args
from codai.api.video import set_global_args as set_video_global_args, set_global_file_path as set_video_file_path
set_video_global_args(global_args)
if global_file_path:
set_video_file_path(global_file_path)
# Set audio_gen module global args
from codai.api.audio_gen import set_global_args as set_audiogen_global_args, set_global_file_path as set_audiogen_file_path
set_audiogen_global_args(global_args)
if global_file_path:
set_audiogen_file_path(global_file_path)
# Set embeddings module global args
from codai.api.embeddings import set_global_args as set_embed_global_args
set_embed_global_args(global_args)
# Pre-load image models marked as load_mode == "load"
for m in image_models:
mid = _model_id(m)
......
This diff is collapsed.
This diff is collapsed.
"""Pydantic models for audio generation API."""
from typing import Dict, List, Optional
from pydantic import BaseModel, ConfigDict
class AudioGenerationRequest(BaseModel):
    """Request parameters for the audio generation API.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    model: str  # model to generate with
    prompt: str  # text description of the audio to produce
    duration: Optional[float] = 10.0  # seconds
    # Sampling controls
    top_k: Optional[int] = 250
    top_p: Optional[float] = 0.0
    temperature: Optional[float] = 1.0
    cfg_coef: Optional[float] = 3.0  # classifier-free guidance coefficient
    seed: Optional[int] = None
    # Reference audio for melody conditioning (MusicGen Melody)
    melody: Optional[str] = None  # base64/URL
    # Output
    response_format: Optional[str] = "url"  # url | b64_wav | b64_mp3
    user: Optional[str] = None
    model_config = ConfigDict(extra="allow")
class AudioGenerationResponse(BaseModel):
    """Response envelope for the audio generation API.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    created: int  # presumably a Unix timestamp in seconds; verify against caller
    data: List[Dict]  # one dict per generated item
    model_config = ConfigDict(extra="allow")
"""Pydantic models for embeddings API."""
from typing import Dict, List, Optional, Union
from pydantic import BaseModel, ConfigDict
class EmbeddingsRequest(BaseModel):
    """Request parameters for the embeddings API.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    model: str
    input: Union[str, List[str]]  # text(s) to embed
    # NOTE(review): confirm the endpoint actually consumes this field.
    image: Optional[Union[str, List[str]]] = None  # base64/URL image(s) for multimodal embed
    encoding_format: Optional[str] = "float"  # float | base64
    dimensions: Optional[int] = None  # truncate to N dims if supported
    user: Optional[str] = None
    model_config = ConfigDict(extra="allow")
class EmbeddingObject(BaseModel):
    """A single embedding entry inside an embeddings response."""
    object: str = "embedding"  # fixed type tag
    index: int  # position of this entry in the response list
    embedding: Union[List[float], str]  # float list or base64
class EmbeddingsResponse(BaseModel):
    """Response envelope for the embeddings API.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    object: str = "list"  # fixed type tag
    data: List[EmbeddingObject]
    model: str
    usage: Dict  # token accounting; keys are set by the endpoint
    model_config = ConfigDict(extra="allow")
......@@ -10,14 +10,15 @@ class ImageGenerationRequest(BaseModel):
prompt: str
n: int = 1
size: Optional[str] = "1024x1024"
steps: Optional[int] = None # Number of inference steps (overrides quality-based default)
guidance_scale: Optional[float] = None # CFG scale (overrides quality-based default)
steps: Optional[int] = None
guidance_scale: Optional[float] = None
quality: Optional[str] = "standard"
style: Optional[str] = None
response_format: Optional[str] = "url"
seed: Optional[int] = None
user: Optional[str] = None
disable_safety_checker: Optional[bool] = False
model_config = ConfigDict(extra="allow")
......
......@@ -103,6 +103,8 @@ class ModelInfo(BaseModel):
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "huggingface"
type: Optional[str] = None # e.g. "text", "image", "video", "audio", "tts", "vision", "embedding"
capabilities: Optional[List[str]] = None # list of capability strings
class ModelList(BaseModel):
......
"""Pydantic models for video generation API."""
from typing import Dict, List, Optional
from pydantic import BaseModel, ConfigDict
class VideoGenerationRequest(BaseModel):
    """Request parameters for the video generation API.

    Groups generation settings, input media, optional audio, subtitle and
    dubbing options, post-processing switches and output format in one
    model. Unknown extra fields are accepted (``extra="allow"``).
    """
    model: str
    prompt: str = ""
    negative_prompt: Optional[str] = None
    # Dimensions
    width: Optional[int] = 512
    height: Optional[int] = 512
    # Temporal
    num_frames: Optional[int] = None  # model default if None
    fps: Optional[int] = None  # output FPS
    # Diffusion
    num_inference_steps: Optional[int] = None
    guidance_scale: Optional[float] = None
    seed: Optional[int] = None
    # Mode
    # t2v – text-to-video
    # i2v – image-to-video (init_image required)
    # v2v – video-to-video (video required)
    # ti2v – text + init image → video (like i2v but prompt is primary driver)
    # interp – frame interpolation (init_image + end_image)
    mode: Optional[str] = "t2v"
    # Input media (base64 or URL)
    image: Optional[str] = None  # alias for init_image
    init_image: Optional[str] = None  # first/reference frame
    end_image: Optional[str] = None  # last frame (for interp mode)
    video: Optional[str] = None  # input video (v2v / audio manipulation)
    strength: Optional[float] = None  # denoising strength for v2v
    # Camera motion hint
    camera_motion: Optional[str] = None  # zoom-in | zoom-out | pan-left | pan-right | tilt-up | tilt-down | rotate
    # ── Character consistency ─────────────────────────────────────────────
    character_references: Optional[List[str]] = None  # list of base64/URL reference images
    character_strength: Optional[float] = 0.8
    character_names: Optional[List[str]] = None  # optional names per reference
    # ── Audio generation / manipulation ──────────────────────────────────
    add_audio: Optional[bool] = False
    audio_type: Optional[str] = None  # music | speech | sfx | ambient
    audio_prompt: Optional[str] = None  # prompt for music/sfx generation
    audio_file: Optional[str] = None  # existing audio to add (base64/URL)
    tts_text: Optional[str] = None  # text for speech synthesis
    tts_voice: Optional[str] = None  # TTS voice id
    tts_speed: Optional[float] = 1.0
    sync_audio: Optional[bool] = False  # sync audio timing to video
    lip_sync: Optional[bool] = False  # warp mouth to match audio
    lip_sync_method: Optional[str] = "wav2lip"  # wav2lip | sadtalker
    # ── Subtitles ────────────────────────────────────────────────────────
    generate_subtitles: Optional[bool] = False
    burn_subtitles: Optional[bool] = False
    subtitle_language: Optional[str] = None  # source language hint
    translate_subtitles: Optional[bool] = False
    subtitle_target_lang: Optional[str] = None
    subtitle_style: Optional[str] = "default"  # default | karaoke | minimal
    whisper_model: Optional[str] = None  # which whisper variant to use
    # ── Video dubbing ─────────────────────────────────────────────────────
    dub_video: Optional[bool] = False
    dub_target_lang: Optional[str] = None
    dub_source_lang: Optional[str] = None
    voice_clone: Optional[bool] = False  # clone original speaker voice
    # ── Post-processing ───────────────────────────────────────────────────
    upscale_output: Optional[bool] = False
    upscale_factor: Optional[int] = 2
    interpolate_output: Optional[bool] = False  # increase FPS after generation
    fps_multiplier: Optional[int] = 2  # e.g. 2 → 2× FPS via frame interp
    convert_to_3d: Optional[bool] = False
    depth_method: Optional[str] = "midas"  # midas | zoe | depth-anything
    # ── Memory / offload ─────────────────────────────────────────────────
    offload_strategy: Optional[str] = None  # sequential | model | none
    # Nulls pipeline safety_checker / safety_concept so uncensored fine-tunes
    # are not blocked. Has no effect on models without a safety checker.
    disable_safety_checker: Optional[bool] = False
    # ── Output ───────────────────────────────────────────────────────────
    response_format: Optional[str] = "url"  # url | b64_mp4
    n: int = 1  # presumably the number of videos to produce; verify against the endpoint
    user: Optional[str] = None
    model_config = ConfigDict(extra="allow")
class VideoGenerationResponse(BaseModel):
    """Response envelope for the video generation API.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    created: int  # presumably a Unix timestamp in seconds; verify against caller
    data: List[Dict]  # one dict per generated item
    model_config = ConfigDict(extra="allow")
# ── Standalone operation requests ─────────────────────────────────────────────
class VideoUpscaleRequest(BaseModel):
    """Parameters for a standalone video upscaling request.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    model: str
    video: str  # base64/URL input video
    upscale_factor: Optional[int] = 2
    response_format: Optional[str] = "url"
    model_config = ConfigDict(extra="allow")
class VideoSubtitleRequest(BaseModel):
    """Parameters for a standalone subtitle request on an existing video.

    Unknown extra fields are accepted (``extra="allow"``).
    """
    model: str
    video: str  # base64/URL input video
    language: Optional[str] = None
    translate: Optional[bool] = False
    target_lang: Optional[str] = None
    burn: Optional[bool] = False
    style: Optional[str] = "default"
    response_format: Optional[str] = "srt"  # srt | vtt | json | burned_video
    model_config = ConfigDict(extra="allow")
class VideoInterpolateRequest(BaseModel):
    """Parameters for a standalone frame interpolation request.

    Input is either a whole video or a first/last frame pair. Unknown extra
    fields are accepted (``extra="allow"``).
    """
    model: str
    video: Optional[str] = None  # base64/URL input video (mutually exclusive with init/end)
    init_image: Optional[str] = None  # first frame
    end_image: Optional[str] = None  # last frame
    fps_multiplier: Optional[int] = 2
    response_format: Optional[str] = "url"
    model_config = ConfigDict(extra="allow")
class VideoDubRequest(BaseModel):
    """Parameters for a standalone video dubbing request.

    ``video`` and ``target_lang`` are required. Unknown extra fields are
    accepted (``extra="allow"``).
    """
    model: str
    video: str  # presumably base64/URL like the other video inputs; confirm
    target_lang: str
    source_lang: Optional[str] = None
    voice_clone: Optional[bool] = False
    burn_subtitles: Optional[bool] = False
    response_format: Optional[str] = "url"
    model_config = ConfigDict(extra="allow")
......@@ -16,6 +16,21 @@ psutil>=5.9.0
# Optional: Audio transcription dependencies
faster-whisper>=0.10.0 # For NVIDIA/CUDA whisper transcription
whispercpp>=0.0.17 # Alternative whisper library (works without PyTorch)
openai-whisper>=20231117 # Whisper for subtitle generation
# Image/video/audio utilities
Pillow>=10.0.0
numpy>=1.24.0
imageio[ffmpeg]>=2.33.0 # frame I/O + ffmpeg bridge for video generation
scipy>=1.11.0
sentence-transformers>=2.7.0 # /v1/embeddings
argostranslate>=1.9.0 # subtitle translation
edge-tts>=6.1.9 # TTS dubbing (primary)
kokoro-tts>=0.9.0 # TTS dubbing (fallback)
soundfile>=0.12.0
realesrgan>=0.3.0
basicsr>=1.4.2
timm>=0.9.0
# Optional: for better performance with NVIDIA GPUs
bitsandbytes>=0.41.0
......
......@@ -49,6 +49,32 @@ whispercpp>=0.0.17 # Alternative whisper library (works without PyTorch)
# LiteLLM for standardized API responses
litellm>=1.40.0
# Image/video processing utilities
Pillow>=10.0.0
numpy>=1.24.0
imageio[ffmpeg]>=2.33.0 # frame I/O + ffmpeg bridge for video generation
scipy>=1.11.0 # audio/signal processing (wav export in audio_gen)
# Embeddings
sentence-transformers>=2.7.0 # /v1/embeddings with sentence-transformer models
# Video/audio post-processing (all optional – features degrade gracefully if absent)
openai-whisper>=20231117 # subtitle generation via Whisper transcription
argostranslate>=1.9.0 # subtitle translation
edge-tts>=6.1.9 # TTS for video dubbing (primary)
kokoro-tts>=0.9.0 # TTS for video dubbing (fallback)
soundfile>=0.12.0 # audio file I/O for kokoro TTS output
# Image upscaling / restoration
realesrgan>=0.3.0 # Real-ESRGAN upscaler
basicsr>=1.4.2 # backbone required by realesrgan
timm>=0.9.0 # vision model backbones (depth/segment endpoints)
# Audio generation (optional – only needed for /v1/audio/generate)
# audiocraft is Meta's MusicGen/AudioGen library; install separately if desired:
# pip install audiocraft
# AudioLDM2 is available via diffusers (already listed above)
# Optional: for better performance
# bitsandbytes>=0.41.0 # for 4-bit/8-bit quantization
# sentencepiece>=0.1.99 # for some tokenizers
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment