Add auto-detection of model capabilities from Hugging Face

- Added detect_model_capabilities function that analyzes HF model tags and pipeline tag
- Auto-detects capabilities for Hugging Face models when adding via admin or --model
- Special handling for known models like Qwen VL, LLaVA, Whisper, etc.
- Updated UI with hints about auto-detection

Add auto-detection of model capabilities from Hugging Face
- Added detect_model_capabilities function that analyzes HF model tags and pipeline tag
- Auto-detects capabilities for Hugging Face models when adding via admin or --model
- Special handling for known models like Qwen VL, LLaVA, Whisper, etc.
- Updated UI with hints about auto-detection
4a86491b · Stefy Lanza (nextime / spora ) · 7d9685c9 · 4a86491b · 4a86491b · 4a86491b
Commit 4a86491b authored Oct 08, 2025 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 7 deletions

models.html templates/admin/models.html +4 -2

admin.py vidai/admin.py +7 -2

database.py vidai/database.py +9 -2

utils.py vidai/utils.py +70 -1

No files found.
--- a/templates/admin/models.html
+++ b/templates/admin/models.html
@@ -110,7 +110,8 @@

                <div class="form-group">
                    <label for="capabilities">Capabilities</label>
-                    <input type="text" id="capabilities" name="capabilities" placeholder="e.g., video to text, image to text, audio to text">
+                    <input type="text" id="capabilities" name="capabilities" placeholder="Auto-detected for Hugging Face models, or enter manually">
+                    <small style="color: #64748b;">For Hugging Face models, capabilities are auto-detected. For local models, enter manually.</small>
                </div>

                <div class="form-group">
@@ -158,7 +159,8 @@
                    </div>
                    <div class="form-group">
                        <label for="editCapabilities">Capabilities</label>
-                        <input type="text" id="editCapabilities" name="capabilities" placeholder="e.g., video to text, image to text, audio to text">
+                        <input type="text" id="editCapabilities" name="capabilities" placeholder="Auto-detected for Hugging Face models, or enter manually">
+                        <small style="color: #64748b;">For Hugging Face models, capabilities are auto-detected. For local models, enter manually.</small>
                    </div>
                    <div class="form-group">
                        <label for="editVram">VRAM Estimate (GB)</label>

--- a/vidai/admin.py
+++ b/vidai/admin.py
@@ -510,10 +510,15 @@ def add_model():
        flash('Invalid model type', 'error')
        return redirect(url_for('admin.models'))

-    # If Hugging Face, download the model
+    # If Hugging Face, download the model and detect capabilities
    if model_type == 'huggingface':
        try:
-            from .utils import download_huggingface_model
+            from .utils import download_huggingface_model, detect_model_capabilities
+
+            # Auto-detect capabilities if not provided
+            if not capabilities:
+                capabilities = detect_model_capabilities(path)
+
            local_path = download_huggingface_model(path)
            if local_path:
                path = local_path

--- a/vidai/database.py
+++ b/vidai/database.py
@@ -2009,6 +2009,8 @@ def get_model_by_id(model_id: int) -> Optional[Dict[str, Any]]:

 def ensure_model_exists(name: str, model_type: str, path: str, vram_estimate: int = 0, available: bool = True) -> None:
    """Ensure a model exists in the database, create if not."""
+    from .utils import detect_model_capabilities
+
    conn = get_db_connection()
    cursor = conn.cursor()

@@ -2017,9 +2019,14 @@ def ensure_model_exists(name: str, model_type: str, path: str, vram_estimate: in
    existing = cursor.fetchone()

    if not existing:
+        # Auto-detect capabilities for Hugging Face models
+        capabilities = ''
+        if model_type == 'huggingface':
+            capabilities = detect_model_capabilities(path)
+
        # Create the model
-        cursor.execute('INSERT INTO models (name, type, path, vram_estimate, available) VALUES (?, ?, ?, ?, ?)',
-                       (name, model_type, path, vram_estimate, 1 if available else 0))
+        cursor.execute('INSERT INTO models (name, type, path, vram_estimate, available, capabilities) VALUES (?, ?, ?, ?, ?, ?)',
+                       (name, model_type, path, vram_estimate, 1 if available else 0, capabilities))
        conn.commit()
    else:
        # Update availability if it's not already available

--- a/vidai/utils.py
+++ b/vidai/utils.py
@@ -162,4 +162,73 @@ def download_huggingface_model(model_id: str) -> str:
        return local_path
    except Exception as e:
        print(f"Failed to download model {model_id}: {e}")
-        return None
\ No newline at end of file
+        return None
+
+
+def detect_model_capabilities(model_id: str) -> str:
+    """Detect model capabilities from Hugging Face model info."""
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi()
+
+        # Get model info
+        model_info = api.model_info(model_id)
+
+        capabilities = []
+
+        # Check tags for capabilities
+        tags = model_info.tags or []
+        tags_lower = [tag.lower() for tag in tags]
+
+        # Vision capabilities
+        if any(tag in tags_lower for tag in ['vision', 'image', 'ocr', 'object-detection', 'image-classification']):
+            capabilities.append('image to text')
+
+        # Video capabilities (inferred from vision + temporal tags)
+        if 'vision' in tags_lower and any(tag in tags_lower for tag in ['video', 'temporal', 'action-recognition']):
+            capabilities.append('video to text')
+
+        # Audio capabilities
+        if any(tag in tags_lower for tag in ['audio', 'speech', 'asr', 'automatic-speech-recognition']):
+            capabilities.append('audio to text')
+
+        # Text generation capabilities
+        if any(tag in tags_lower for tag in ['text-generation', 'causal-lm', 'text2text-generation']):
+            capabilities.append('text generation')
+
+        # Check pipeline tag
+        pipeline = getattr(model_info, 'pipeline_tag', None)
+        if pipeline:
+            pipeline_lower = pipeline.lower()
+            if pipeline_lower == 'text-generation':
+                capabilities.append('text generation')
+            elif pipeline_lower == 'image-classification':
+                capabilities.append('image to text')
+            elif pipeline_lower == 'object-detection':
+                capabilities.append('image to text')
+            elif pipeline_lower == 'automatic-speech-recognition':
+                capabilities.append('audio to text')
+            elif pipeline_lower == 'text-to-image':
+                capabilities.append('text to image')
+            elif pipeline_lower == 'image-to-text':
+                capabilities.append('image to text')
+
+        # Special handling for known multimodal models
+        model_id_lower = model_id.lower()
+        if 'qwen' in model_id_lower and ('vl' in model_id_lower or 'vision' in model_id_lower):
+            capabilities = ['video to text', 'image to text']
+        elif 'llava' in model_id_lower:
+            capabilities = ['image to text']
+        elif 'whisper' in model_id_lower:
+            capabilities = ['audio to text']
+        elif 'wav2vec' in model_id_lower:
+            capabilities = ['audio to text']
+        elif 'clap' in model_id_lower:
+            capabilities = ['audio to text']
+
+        # Remove duplicates and return
+        return ', '.join(sorted(set(capabilities)))
+
+    except Exception as e:
+        print(f"Failed to detect capabilities for {model_id}: {e}")
+        return ''
\ No newline at end of file