Add acceleration/distillation support (Lightning/Turbo/LCM/Hyper-SD)

Per-model `acceleration` config block fuses a distillation LoRA into the
pipeline at load and supplies low step-count / guidance defaults at
generation time, for a 5-10x speedup. Covers video (Wan), image diffusers
(SD/SDXL), and sd.cpp (step/cfg defaults + <lora:> prompt injection).

- New codai/models/acceleration.py: preset catalog (ACCEL_PRESETS),
  resolve_acceleration(), apply_accel_to_pipeline() (load->fuse->unload so
  it stays orthogonal to per-request character/env LoRAs), accel_call_defaults().
- video.py: fuse accel LoRA after load; _generate_video / _generate_sdcpp_video
  use preset steps/guidance (request always wins).
- images.py: _apply_image_acceleration on both diffusers load paths;
  _generate_image and _generate_with_sdcpp honour preset steps/guidance.
- main.py: surface `acceleration` as a first-class runtime kwarg.
- admin: persist `acceleration`; new GET /admin/api/accel-presets; models.html
  Acceleration/Distillation card (preset dropdown + manual override).

Also fix a latent null-trap: float(cfg.get('balanced_gpu_percent', 80))
crashed when the config stored an explicit null (written by the admin UI for
blank fields) since .get(key, default) returns the stored None. Use `or 80`.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
parent bf50d8a1
......@@ -1939,7 +1939,7 @@ async def api_model_configure(request: Request, username: str = Depends(require_
"lora_train_base_model",
"max_vram", "sdcpp_flash_attn", "sdcpp_diffusion_flash_attn", "vae_tiling",
"component_quantization", "output_crf", "force_vram_update",
"balanced_gpu_percent"):
"balanced_gpu_percent", "acceleration"):
if key in data:
entry[key] = data[key]
......@@ -1970,6 +1970,18 @@ async def api_model_configure(request: Request, username: str = Depends(require_
return {"success": True, "applied_live": applied}
@router.get("/admin/api/accel-presets")
async def api_accel_presets(username: str = Depends(require_admin)):
    """Return the acceleration/distillation preset catalog.

    Serves ``ACCEL_PRESETS`` from ``codai.models.acceleration`` (Lightning /
    Turbo / LCM / Hyper-SD) so the model-config UI dropdown stays in sync with
    the Python source of truth.

    Returns:
        A dict of the form ``{"presets": ACCEL_PRESETS}``.

    Raises:
        HTTPException: status 500 when the preset catalog cannot be imported;
            the original error is chained as the cause and its message is used
            as the response detail.
    """
    try:
        # Imported at call time; only the import is guarded so that nothing
        # else is accidentally reported as a preset-loading failure.
        from codai.models.acceleration import ACCEL_PRESETS
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"presets": ACCEL_PRESETS}
# --- System endpoints ---
@router.post("/admin/api/system/reload")
......
......@@ -689,6 +689,49 @@ window.__DEFAULT_WHISPER_SERVER_PATH__ = {{ default_whisper_server_path|tojson }
<span class="form-hint">When the model won't fit entirely in VRAM, fill this % of free VRAM then spill the rest to CPU RAM. Also the cap used by auto-balanced fallback.</span>
</div>
<!-- Acceleration / distillation (Lightning / Turbo / LCM / Hyper-SD) — image & video -->
<div id="cfg-accel-section" style="display:none">
<div class="card-title" style="margin-top:1.25rem">Acceleration / Distillation
<span class="muted" style="font-weight:normal">(Lightning / Turbo / LCM / Hyper-SD — image &amp; video)</span></div>
<label style="display:flex;align-items:center;gap:.5rem;cursor:pointer;font-size:13px;margin:.4rem 0">
<input type="checkbox" id="cfg-accel-enabled" onchange="onAccelToggle()"> Enable acceleration
<span class="muted">fuse a distill LoRA and run in few steps at low guidance</span></label>
<div id="cfg-accel-fields" style="display:none">
<div class="form-row" style="max-width:420px">
<label class="form-label">Preset</label>
<select id="cfg-accel-preset" class="form-input" onchange="onAccelPreset()">
<option value="custom">Custom (set fields manually)</option>
</select>
</div>
<div class="form-row" style="max-width:560px">
<label class="form-label">Distill LoRA <span class="muted">(path or HF repo, optionally repo:weight_name.safetensors; blank for turbo full-models)</span></label>
<input type="text" id="cfg-accel-lora" class="form-input" placeholder="e.g. ByteDance/SDXL-Lightning:sdxl_lightning_4step_lora.safetensors">
</div>
<div style="display:flex;gap:1rem;flex-wrap:wrap">
<div class="form-row" style="max-width:130px">
<label class="form-label">LoRA weight</label>
<input type="number" id="cfg-accel-weight" class="form-input" min="0" max="2" step="0.05" placeholder="1.0">
</div>
<div class="form-row" style="max-width:120px">
<label class="form-label">Steps</label>
<input type="number" id="cfg-accel-steps" class="form-input" min="1" max="50" step="1" placeholder="4">
</div>
<div class="form-row" style="max-width:130px">
<label class="form-label">Guidance</label>
<input type="number" id="cfg-accel-guidance" class="form-input" min="0" max="15" step="0.5" placeholder="1.0">
</div>
<div class="form-row" style="max-width:150px">
<label class="form-label">Flow shift <span class="muted">(Wan)</span></label>
<input type="number" id="cfg-accel-flowshift" class="form-input" min="0" max="20" step="0.5" placeholder="(none)">
</div>
<div class="form-row" style="max-width:200px">
<label class="form-label">Scheduler <span class="muted">(optional)</span></label>
<input type="text" id="cfg-accel-scheduler" class="form-input" placeholder="e.g. LCMScheduler">
</div>
</div>
</div>
</div>
<!-- components -->
<div class="card-title" style="margin-top:1.25rem">Components</div>
<div class="form-row">
......@@ -2225,6 +2268,10 @@ async function refreshLocal(){
loadGlobalSettings();
refreshLocal();
// Toggle the acceleration section as image/video model types are checked/unchecked.
document.querySelectorAll('.cfg-type-cb').forEach(cb =>
cb.addEventListener('change', () => _refreshAccelVisibility()));
// ── Deep-link from Studio: /admin/models?tab=search&q=...&pipeline=...&gguf=...
// ── or: /admin/models?local_cap=CAPABILITY — highlight local models with that capability
(function applyDeepLink(){
......@@ -2630,9 +2677,97 @@ function openCfgModal(idx, cfgIdx){
_setCompField('lora', s.lora_path || '');
document.getElementById('cfg-lora-dir').value = s.lora_model_dir || '';
document.getElementById('cfg-lora-train-base').value = s.lora_train_base_model || '';
_populateAccel(s.acceleration);
openModal('cfg-modal');
}
// ---- Acceleration / distillation (Lightning / Turbo / LCM / Hyper-SD) ----
// Preset catalog keyed by preset id; stays null until a load succeeds.
let _accelPresets = null;
// Fetch the preset catalog from the admin API and cache it after the first
// successful load. Resolves to the catalog object, or to an empty object when
// the request fails. Failures are NOT cached, so the next call retries instead
// of leaving the dropdown permanently empty after one transient error.
async function _loadAccelPresets(){
  if (_accelPresets) return _accelPresets;
  try {
    const r = await fetch(ROOT_PATH + '/admin/api/accel-presets');
    // A non-2xx reply may still carry a JSON error body; never treat it as a catalog.
    if (!r.ok) throw new Error('HTTP ' + r.status);
    const d = await r.json();
    _accelPresets = d.presets || {};
    return _accelPresets;
  } catch(e){
    console.warn('Failed to load acceleration presets:', e);
    return {};
  }
}
function _accelAppliesTo(){
  // Translate the checked model-type checkboxes into the preset families
  // ('image' / 'video') that the current selection allows.
  const checkedTypes = Array.from(
    document.querySelectorAll('.cfg-type-cb:checked'), box => box.value);
  const families = new Set();
  const typeToFamily = [['image_models', 'image'], ['video_models', 'video']];
  typeToFamily.forEach(([modelType, family]) => {
    if (checkedTypes.includes(modelType)) families.add(family);
  });
  return families;
}
function _refreshAccelVisibility(){
  // Show the acceleration card only when an image/video model type is checked,
  // then rebuild the preset dropdown for the applicable families.
  const sectionEl = document.getElementById('cfg-accel-section');
  if (!sectionEl) return;
  const families = _accelAppliesTo();
  sectionEl.style.display = families.size > 0 ? '' : 'none';

  const dropdown = document.getElementById('cfg-accel-preset');
  if (!dropdown) return;
  const previous = dropdown.value;
  // Start again from the always-present "custom" entry.
  dropdown.innerHTML = '<option value="custom">Custom (set fields manually)</option>';
  for (const [key, preset] of Object.entries(_accelPresets || {})){
    const targets = preset.applies_to || [];
    // With no family selected every preset is listed; otherwise only matching ones.
    const wanted = families.size === 0 || targets.some(t => families.has(t));
    if (!wanted) continue;
    const opt = document.createElement('option');
    opt.value = key;
    opt.textContent = preset.label || key;
    dropdown.appendChild(opt);
  }
  // Keep the earlier selection when it survived the rebuild.
  const stillListed = Array.from(dropdown.options).some(o => o.value === previous);
  if (stillListed) dropdown.value = previous;
}
function onAccelToggle(){
  // Reveal the detail fields only while the "Enable acceleration" box is ticked.
  const enabled = document.getElementById('cfg-accel-enabled').checked;
  const fields = document.getElementById('cfg-accel-fields');
  if (enabled) {
    fields.style.display = '';
  } else {
    fields.style.display = 'none';
  }
}
function onAccelPreset(){
  // Copy the chosen preset's values into the form fields. Selecting "custom"
  // (or any key missing from the catalog) leaves the fields untouched.
  const selected = document.getElementById('cfg-accel-preset').value;
  const preset = (_accelPresets || {})[selected];
  if (!preset) return;
  const setField = (id, value) => { document.getElementById(id).value = value; };
  // null/undefined become an empty field; 0 is kept as a real value.
  const orBlank = value => (value != null ? value : '');
  setField('cfg-accel-lora', preset.lora || '');
  setField('cfg-accel-weight', orBlank(preset.lora_weight));
  setField('cfg-accel-steps', orBlank(preset.steps));
  setField('cfg-accel-guidance', orBlank(preset.guidance_scale));
  setField('cfg-accel-flowshift', orBlank(preset.flow_shift));
  setField('cfg-accel-scheduler', preset.scheduler || '');
}
async function _populateAccel(a){
  // Fill the acceleration card from a saved config object (may be null/undefined).
  // Presets are loaded and the dropdown rebuilt first, so the saved preset can
  // be matched against the available options.
  await _loadAccelPresets();
  _refreshAccelVisibility();
  const cfg = a || {};
  const byId = id => document.getElementById(id);
  // null/undefined become an empty field; 0 is kept as a real value.
  const orBlank = value => (value != null ? value : '');

  byId('cfg-accel-enabled').checked = Boolean(cfg.enabled);

  // Fall back to "custom" when the saved preset is not in the dropdown.
  const dropdown = byId('cfg-accel-preset');
  const wanted = cfg.preset || '';
  const known = Array.from(dropdown.options).some(o => o.value === wanted);
  dropdown.value = known ? cfg.preset : 'custom';

  byId('cfg-accel-lora').value = cfg.lora || '';
  byId('cfg-accel-weight').value = orBlank(cfg.lora_weight);
  byId('cfg-accel-steps').value = orBlank(cfg.steps);
  byId('cfg-accel-guidance').value = orBlank(cfg.guidance_scale);
  byId('cfg-accel-flowshift').value = orBlank(cfg.flow_shift);
  byId('cfg-accel-scheduler').value = cfg.scheduler || '';
  // Sync the visibility of the detail fields with the checkbox state.
  onAccelToggle();
}
function _collectAccel(){
  // Build the `acceleration` config object from the form, or return null when
  // acceleration is disabled. Blank or unparsable numeric fields become null.
  if (!document.getElementById('cfg-accel-enabled').checked) return null;
  const text = id => document.getElementById(id).value.trim();
  // Parse a float field; '' and anything non-numeric map to null rather than NaN.
  const num = id => {
    const v = parseFloat(text(id));
    return Number.isFinite(v) ? v : null;
  };
  // The step count must be a positive whole number; a fractional entry is
  // rounded so a float never reaches the generation call.
  const posInt = id => {
    const v = num(id);
    return v === null ? null : Math.max(1, Math.round(v));
  };
  return {
    enabled: true,
    preset: document.getElementById('cfg-accel-preset').value || 'custom',
    lora: text('cfg-accel-lora') || null,
    lora_weight: num('cfg-accel-weight'),
    steps: posInt('cfg-accel-steps'),
    guidance_scale: num('cfg-accel-guidance'),
    flow_shift: num('cfg-accel-flowshift'),
    scheduler: text('cfg-accel-scheduler') || '',
  };
}
function _updatePreloadAllVisibility() {
const loadMode = document.getElementById('cfg-load-mode').value;
const maxInst = parseInt(document.getElementById('cfg-max-instances').value) || 1;
......@@ -2730,6 +2865,7 @@ async function saveModelConfig(){
? null : parseInt(document.getElementById('cfg-output-crf').value)),
balanced_gpu_percent: (document.getElementById('cfg-balanced-gpu-pct').value.trim() === ''
? null : parseFloat(document.getElementById('cfg-balanced-gpu-pct').value)),
acceleration: _collectAccel(),
};
try{
const r = await fetch(ROOT_PATH + '/admin/api/model-configure',{
......
......@@ -331,6 +331,22 @@ def _disable_safety_checker(pipe):
return pipe
def _apply_image_acceleration(pipeline, model_config):
    """Fuse a configured acceleration/distillation LoRA into an image pipeline.

    Intended for a freshly loaded diffusers image pipeline (Lightning / Turbo /
    LCM / Hyper-SD presets). This is a best-effort step: any error raised while
    resolving or applying the acceleration is printed and swallowed so that
    model loading itself is never failed by it.

    Args:
        pipeline: The loaded pipeline object, or ``None`` when loading did not
            produce one.
        model_config: Per-model config passed to ``resolve_acceleration``.

    Returns:
        The same ``pipeline`` object that was passed in (``None`` stays ``None``).
    """
    # Nothing to fuse into: skip quietly instead of logging a misleading
    # "applying ..." line and attempting to patch a missing pipeline.
    if pipeline is None:
        return pipeline
    try:
        from codai.models.acceleration import resolve_acceleration, apply_accel_to_pipeline
        accel = resolve_acceleration(model_config)
        if accel:
            print(f" [image][accel] applying {accel.get('preset')} "
                  f"(steps={accel.get('steps')}, guidance={accel.get('guidance_scale')})")
            apply_accel_to_pipeline(pipeline, accel)
    except Exception as e:
        # Deliberate best-effort: report and continue with the unaccelerated pipeline.
        print(f" [image][accel] skipped: {e}")
    return pipeline
def _load_diffusers_pipeline(model_name: str, global_args, model_config: dict = None):
"""
Try to load a model using the diffusers library.
......@@ -443,7 +459,7 @@ def _load_diffusers_pipeline(model_name: str, global_args, model_config: dict =
if _img_quant_config is None:
raise # only quantized pipelines may reject .to()
print(f"--no-ram: Diffusers model loaded on {cuda_device}")
return pipeline
return _apply_image_acceleration(pipeline, _mc)
except Exception as e:
raise RuntimeError(
f"--no-ram: Failed to load diffusers model entirely on GPU ({cuda_device}). "
......@@ -591,7 +607,7 @@ def _load_diffusers_pipeline(model_name: str, global_args, model_config: dict =
raise
pipeline = None
return pipeline
return _apply_image_acceleration(pipeline, _mc)
async def _apply_vae_override(pipeline, vae_model_id: str):
......@@ -725,9 +741,17 @@ async def _generate_with_diffusers(pipeline, request, global_args, http_request=
generator = torch.Generator(device=pipeline.device).manual_seed(seed)
quality = request.quality or "standard"
num_steps = request.steps if request.steps else (30 if quality == "standard" else 50)
# Acceleration/distillation defaults (Lightning / Turbo / LCM / Hyper-SD): when
# the loaded pipeline has a fused distill LoRA, default to its low step-count /
# guidance. The request always wins if it specified steps/guidance.
_accel = getattr(pipeline, '_coderai_accel', None)
_accel_steps = _accel.get('steps') if _accel else None
_accel_cfg = _accel.get('guidance_scale') if _accel else None
num_steps = request.steps if request.steps else (
_accel_steps if _accel_steps else (30 if quality == "standard" else 50))
cfg_scale = request.guidance_scale if request.guidance_scale else (
getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0
_accel_cfg if _accel_cfg is not None else
(getattr(global_args, 'image_cfg_scale', 7.5) if quality == "standard" else 9.0)
)
_progress_reset(num_steps)
......@@ -950,7 +974,8 @@ async def _generate_with_diffusers(pipeline, request, global_args, http_request=
}
async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None):
async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None,
model_config=None):
"""Generate images using stable-diffusion-cpp-python."""
import time
......@@ -965,8 +990,27 @@ async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None
except ValueError:
pass
# Acceleration/distillation defaults (Lightning / Turbo / LCM): sd.cpp can't
# fuse a diffusers LoRA, but honour the preset's low step-count / guidance and
# inject the distill LoRA via the "<lora:name:weight>" prompt syntax when a
# lora_model_dir is configured.
from codai.models.acceleration import resolve_acceleration
_accel = resolve_acceleration(model_config)
_accel_steps = _accel.get('steps') if _accel else None
_accel_cfg = _accel.get('guidance_scale') if _accel else None
# Use default steps for fast generation
steps = request.steps if request.steps else 4
steps = request.steps if request.steps else (_accel_steps or 4)
cfg_scale = request.guidance_scale or _accel_cfg or get_cfg_scale()
prompt = request.prompt
if _accel and _accel.get('lora') and (model_config or {}).get('lora_model_dir'):
from codai.models.acceleration import _split_lora_ref
_repo, _wn = _split_lora_ref(_accel['lora'])
_lname = (_wn or _repo).rsplit('/', 1)[-1]
for _suf in ('.safetensors', '.ckpt', '.pt', '.bin'):
if _lname.endswith(_suf):
_lname = _lname[: -len(_suf)]
prompt = f"{prompt} <lora:{_lname}:{_accel.get('lora_weight') or 1.0}>"
_progress_reset(steps)
......@@ -979,11 +1023,11 @@ async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None
try:
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
prompt=prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
cfg_scale=cfg_scale,
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
......@@ -992,11 +1036,11 @@ async def _generate_with_sdcpp(sd_model, request, global_args, http_request=None
except TypeError:
result = await asyncio.to_thread(
sd_model.generate_image,
prompt=request.prompt,
prompt=prompt,
negative_prompt='',
width=width,
height=height,
cfg_scale=get_cfg_scale(),
cfg_scale=cfg_scale,
sample_steps=steps,
seed=seed if seed is not None else 42,
batch_count=request.n if request.n else 1,
......@@ -1266,7 +1310,10 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
if pipeline is not None:
if is_sdcpp:
print(f"Using cached sd.cpp model for generation")
return await _generate_with_sdcpp(pipeline, request, global_args, http_request)
_sdcpp_cfg = (multi_model_manager.config.get(model_key)
or multi_model_manager.config.get(model_name) or {})
return await _generate_with_sdcpp(pipeline, request, global_args,
http_request, model_config=_sdcpp_cfg)
else:
# Assume it's a diffusers pipeline
print(f"Using cached diffusers pipeline for generation")
......@@ -1335,7 +1382,8 @@ async def create_image_generation(request: ImageGenerationRequest, http_request:
pass
print(f"Loaded sd.cpp model: {model_name}")
return await _generate_with_sdcpp(sd_model, request, global_args, http_request)
return await _generate_with_sdcpp(sd_model, request, global_args,
http_request, model_config=cfg)
else:
sdcpp_error = f"Model '{model_name}' is not a local file, cannot use sd.cpp"
print(sdcpp_error)
......
......@@ -603,7 +603,27 @@ def _generate_sdcpp_video(sd_model, request, model_cfg=None):
mode = request.mode or 't2v'
fps = request.fps or 8
num_frames = request.num_frames or 25
steps = request.num_inference_steps or 20
# Acceleration/distillation defaults (sd.cpp can't fuse a diffusers LoRA, but
# we can still honour the preset's low step-count / guidance, and inject the
# distill LoRA via sd.cpp's "<lora:name:weight>" prompt syntax when a
# lora_model_dir is configured).
from codai.models.acceleration import resolve_acceleration
_accel = resolve_acceleration(model_cfg)
_accel_steps = _accel.get('steps') if _accel else None
_accel_cfg = _accel.get('guidance_scale') if _accel else None
steps = request.num_inference_steps or _accel_steps or 20
cfg_scale = request.guidance_scale or _accel_cfg or 7.0
prompt = request.prompt or ''
if _accel and _accel.get('lora') and (model_cfg or {}).get('lora_model_dir'):
from codai.models.acceleration import _split_lora_ref
_repo, _wn = _split_lora_ref(_accel['lora'])
_lname = (_wn or _repo).rsplit('/', 1)[-1]
for _suf in ('.safetensors', '.ckpt', '.pt', '.bin'):
if _lname.endswith(_suf):
_lname = _lname[: -len(_suf)]
prompt = f"{prompt} <lora:{_lname}:{_accel.get('lora_weight') or 1.0}>"
_vid_progress_reset(steps)
......@@ -611,13 +631,13 @@ def _generate_sdcpp_video(sd_model, request, model_cfg=None):
_vid_progress_step(step)
kw = {
'prompt': request.prompt or '',
'prompt': prompt,
'negative_prompt': request.negative_prompt or '',
'width': request.width or 512,
'height': request.height or 512,
'video_frames': num_frames,
'sample_steps': steps,
'cfg_scale': request.guidance_scale or 7.0,
'cfg_scale': cfg_scale,
'seed': request.seed if request.seed is not None else -1,
'progress_callback': _progress_cb,
}
......@@ -933,7 +953,7 @@ def _load_video_pipeline(model_name: str, device: str, mode: str, offload: str =
# RAM (and disk if needed). This is the preferred strategy when the
# model won't fit entirely in VRAM but should maximise GPU utilisation.
# `gpu_percent` (0–100) controls what fraction of FREE VRAM to occupy.
_gpu_pct = float((model_cfg or {}).get('balanced_gpu_percent', 80))
_gpu_pct = float((model_cfg or {}).get('balanced_gpu_percent') or 80)
try:
if torch.cuda.is_available():
_free_v, _ = torch.cuda.mem_get_info()
......@@ -1346,6 +1366,16 @@ def _generate_video(pipe, request: VideoGenerationRequest):
else 'v2v' if request.video else 't2v')
fps = request.fps or 8
kw = _build_call_kwargs(request)
# Acceleration/distillation defaults (Lightning / Lightx2v): when the model has
# a fused distill LoRA, default to its low step-count / guidance instead of the
# standard 25 steps / 7.5 CFG. The request always wins if it set these — note
# _build_call_kwargs only populates them when the request specified them, so
# setdefault below correctly leaves an explicit request value untouched.
_accel = getattr(pipe, '_coderai_accel', None)
if _accel:
from codai.models.acceleration import accel_call_defaults
for _k, _v in accel_call_defaults(_accel).items():
kw.setdefault(_k, _v)
kw.setdefault('num_inference_steps', 25)
kw.setdefault('guidance_scale', 7.5)
kw.setdefault('num_frames', 16)
......@@ -1940,7 +1970,7 @@ async def video_generations(request: VideoGenerationRequest,
_need_gb = multi_model_manager._get_model_used_vram_gb(
model_key, model_name)
if _need_gb > 0 and _free_gb < _need_gb:
_gpu_pct = float(_model_cfg.get('balanced_gpu_percent', 80))
_gpu_pct = float(_model_cfg.get('balanced_gpu_percent') or 80)
print(f" VRAM insufficient for full-GPU load "
f"({_need_gb:.1f} GB needed, {_free_gb:.1f} GB free) "
f"— auto-selecting balanced strategy "
......@@ -1961,6 +1991,24 @@ async def video_generations(request: VideoGenerationRequest,
"video model. Restart coderai to recover. "
f"Original error: {str(e).splitlines()[0]}"))
raise HTTPException(status_code=500, detail=f"Failed to load video model: {e}")
# Fuse any configured acceleration/distillation LoRA (Lightning / Lightx2v /
# LCM) into the freshly loaded pipeline. Done once at load; cached pipes keep
# it. No-op for sd.cpp pipes and when no acceleration is configured.
try:
from codai.models.acceleration import resolve_acceleration, apply_accel_to_pipeline
_accel = resolve_acceleration(_model_cfg)
_accel_is_sdcpp = False
try:
from stable_diffusion_cpp import StableDiffusion as _SDc
_accel_is_sdcpp = isinstance(pipe, _SDc)
except ImportError:
pass
if _accel and not _accel_is_sdcpp:
print(f" [video][accel] applying {_accel.get('preset')} "
f"(steps={_accel.get('steps')}, guidance={_accel.get('guidance_scale')})")
apply_accel_to_pipeline(pipe, _accel)
except Exception as _e:
print(f" [video][accel] skipped: {_e}")
multi_model_manager.models[model_key] = pipe
multi_model_manager.current_model_key = model_key
# Record the real VRAM used. record_vram_delta only persists when no
......
......@@ -160,6 +160,7 @@ def build_runtime_kwargs(model_cfg, model_type):
kwargs['seed'] = model_cfg.get('seed')
kwargs['vae_tiling'] = model_cfg.get('vae_tiling', False)
kwargs['clip_on_cpu'] = model_cfg.get('clip_on_cpu', False)
kwargs['acceleration'] = model_cfg.get('acceleration')
elif model_type == "audio":
kwargs['ctx'] = model_cfg.get('context_ms')
kwargs['offload'] = model_cfg.get('offload') or model_cfg.get('offload_strategy')
......@@ -172,6 +173,7 @@ def build_runtime_kwargs(model_cfg, model_type):
kwargs['vae_tiling'] = model_cfg.get('vae_tiling', True)
kwargs['balanced_gpu_percent'] = model_cfg.get('balanced_gpu_percent', 80)
kwargs['output_crf'] = model_cfg.get('output_crf')
kwargs['acceleration'] = model_cfg.get('acceleration')
return kwargs
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment