Better model VRAM requirements estimation

parent 71ba3d0d
......@@ -1823,7 +1823,7 @@ async def api_hf_search(
if effective_q:
pairs.append(("search", effective_q))
pairs.extend(filter_pairs)
pairs += [("sort", sort), ("direction", "-1"), ("limit", limit), ("full", "false")]
pairs += [("sort", sort), ("direction", "-1"), ("limit", limit), ("full", "true")]
url = "https://huggingface.co/api/models?" + urllib.parse.urlencode(pairs)
rq = urllib.request.Request(url, headers={"User-Agent": "coderai-admin/1.0"})
def _fetch():
......@@ -1857,12 +1857,14 @@ async def api_hf_search(
merged = [m for m in merged if "gguf" not in (m.get("modelId") or m.get("id", "")).lower()]
# Get VRAM info
vram_gb = None
vram_total_gb = None
vram_free_gb = None
try:
import torch
if torch.cuda.is_available():
free, total = torch.cuda.mem_get_info()
vram_gb = round(free / 1e9, 2)
vram_total_gb = round(total / 1e9, 2)
vram_free_gb = round(free / 1e9, 2)
except Exception:
pass
......@@ -1876,12 +1878,24 @@ async def api_hf_search(
# Only cache when pipeline_tag gave us authoritative information
if m.get("pipeline_tag"):
update_capability_cache(mid, caps)
# Estimate size from safetensors metadata when available
safetensors_size_gb = None
sf = m.get("safetensors") or {}
total_params = sf.get("total", 0)
if total_params:
params_by_dtype = sf.get("parameters") or {}
dominant = max(params_by_dtype, key=params_by_dtype.get) if params_by_dtype else "BF16"
bpp = {"F32": 4, "F16": 2, "BF16": 2, "F8_E4M3": 1, "F8_E5M2": 1, "I8": 1, "I4": 0.5, "U8": 1}.get(dominant, 2)
safetensors_size_gb = round(total_params * bpp / 1e9, 2)
results.append({
"id": mid,
"downloads": m.get("downloads", 0),
"likes": m.get("likes", 0),
"pipeline_tag": m.get("pipeline_tag", ""),
"vram_available": vram_gb,
"vram_total": vram_total_gb,
"vram_free": vram_free_gb,
"safetensors_size_gb": safetensors_size_gb,
"capabilities": caps.to_list(),
})
return results
......
......@@ -674,16 +674,33 @@ let _filesCache = {};
// NOTE(review): presumably the quantization filters currently selected in the UI — confirm against usage.
let _activeQuants = new Set();
let _cachedSearchIds = new Set(); // HF repo IDs (and GGUF source_repos) cached locally
/**
 * Estimate how much memory a model's weights need, in GB.
 *
 * Sources are tried from most to least reliable:
 *   1. the safetensors-derived size supplied by the caller,
 *   2. an "<N>b" parameter count in the model id, scaled by quantization hints,
 *   3. size keywords (tiny, base, large, ...) in the model id.
 *
 * @param {string} modelId - Model/repo id, e.g. "org/name-7b-instruct".
 * @param {?number} [safetensorsSizeGb] - Size in GB taken from safetensors
 *   metadata; returned unchanged whenever it is not null/undefined.
 * @returns {?number} Estimated size in GB, or null when the id carries no
 *   usable hint (callers should then skip the VRAM indicator).
 */
function estimateModelSize(modelId, safetensorsSizeGb){
  // 1. Safetensors metadata from HF API (most accurate)
  if(safetensorsSizeGb != null) return safetensorsSizeGb;

  // Coerce defensively so a missing id yields "no estimate" instead of throwing.
  const id = String(modelId ?? '').toLowerCase();

  // 2. Xb parameter count in model name (e.g. 7b, 13b, 1.5b).
  // The lookahead rejects a following letter so quantization suffixes such as
  // "8bit"/"4bit" are not mistaken for an 8B/4B parameter count.
  const match = id.match(/(\d+(?:\.\d+)?)b(?![a-z])/);
  if(match){
    const params = parseFloat(match[1]);
    // Rough bytes per parameter: 4-bit ≈ 0.5, 8-bit ≈ 1, FP16 ≈ 2.
    if(id.includes('q4') || id.includes('4bit')) return params * 0.5;
    if(id.includes('q8') || id.includes('8bit')) return params * 1.0;
    return params * 2; // assume FP16
  }

  // 3. Keyword heuristics for models without a param count in their name.
  // Order matters: the more specific patterns are checked before generic ones.
  if(/\b(tiny|nano|micro)\b/.test(id)) return 0.1;
  if(/\bsmall\b/.test(id)) return 0.3;
  if(/\bbase\b/.test(id)) return 0.5;
  if(/\bmedium\b/.test(id)) return 1.0;
  if(/\blarge[-_v]/.test(id) && /v[23]/.test(id)) return 3.0; // large-v2/v3
  if(/\blarge\b/.test(id)) return 2.0;
  if(/\bxxl\b/.test(id)) return 10.0;
  if(/\bxl\b/.test(id)) return 5.0;
  if(/\bhuge\b/.test(id)) return 7.0;

  // Nothing to go on: report "unknown" rather than inventing a number.
  return null;
}
// Pressing Enter inside the #search-q element runs doSearch().
document.getElementById('search-q').addEventListener('keydown',e=>{if(e.key==='Enter')doSearch()});
......@@ -727,14 +744,18 @@ async function doSearch(){
if(!_results.length){out.innerHTML='<span class="muted small">No results. Try different keywords or fewer filters.</span>';return}
const vramAvail = _results[0]?.vram_available;
const vramTotal = _results[0]?.vram_total;
const vramFree = _results[0]?.vram_free;
out.innerHTML = _results.map((m,i)=>{
let vramDot = '';
if(vramAvail){
const estSize = estimateModelSize(m.id);
const color = estSize <= vramAvail*0.8 ? '#10b981' : estSize <= vramAvail*0.95 ? '#f59e0b' : '#ef4444';
vramDot = `<span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:${color};margin-right:.35rem" title="Est. ${estSize}GB / ${vramAvail}GB available"></span>`;
if(vramTotal){
const estSize = estimateModelSize(m.id, m.safetensors_size_gb ?? null);
if(estSize !== null){
const color = estSize <= vramTotal*0.8 ? '#10b981' : estSize <= vramTotal*0.95 ? '#f59e0b' : '#ef4444';
const freeHint = vramFree != null ? ` — ${vramFree}GB free now` : '';
vramDot = `<span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:${color};margin-right:.35rem" title="Est. ~${estSize}GB / ${vramTotal}GB total${freeHint}"></span>`;
}
}
const capBadges = fmtCapabilities(m.capabilities||[]);
const isDownloaded = _cachedSearchIds.has(m.id);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment