Add GGUF audio model support with llama.cpp (Vulkan)

When the audio model is in GGUF format, use llama.cpp instead of faster-whisper for pre-loading. This allows using the Vulkan backend for audio transcription.

Add GGUF audio model support with llama.cpp (Vulkan)
When the audio model is in GGUF format, use llama.cpp instead of faster-whisper for pre-loading. This allows using the Vulkan backend for audio transcription.
3daca858 · Stefy Lanza (nextime / spora ) · 833a4ff3 · 3daca858
Commit 3daca858 authored Mar 08, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 120 additions and 57 deletions

coderai coderai +120 -57

No files found.
--- a/coderai
+++ b/coderai
@@ -3550,67 +3550,130 @@ def main():
        if should_preload:
            print(f"Pre-loading audio model...")
            try:
-                from faster_whisper import WhisperModel
-                import torch
-                
                model_to_use = args.audio_model
                model_path = None
                
-                # Check if model is a URL - handle caching
-                if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
-                    cached_path = get_cached_model_path(model_to_use)
-                    if cached_path:
-                        model_path = cached_path
-                        print(f"Using cached model: {model_path}")
-                    else:
-                        # Download and cache
-                        print(f"Downloading audio model: {model_to_use}")
-                        import requests
-                        import hashlib
-                        
-                        cache_dir = get_model_cache_dir()
-                        url_path = model_to_use.split('?')[0]
-                        filename = os.path.basename(url_path)
-                        
-                        if not filename.endswith('.bin') and not filename.endswith('.ggml'):
-                            filename = "whisper-model.bin"
-                        
-                        url_hash = hashlib.sha256(model_to_use.encode()).hexdigest()
-                        cached_filename = f"{url_hash}_{filename}"
-                        model_path = os.path.join(cache_dir, cached_filename)
-                        
-                        response = requests.get(model_to_use, stream=True)
-                        response.raise_for_status()
-                        
-                        total_size = int(response.headers.get('content-length', 0))
-                        downloaded = 0
-                        
-                        with open(model_path, 'wb') as f:
-                            for chunk in response.iter_content(chunk_size=8192*1024):
-                                if chunk:
-                                    f.write(chunk)
-                                    downloaded += len(chunk)
-                                    if total_size > 0:
-                                        percent = (downloaded / total_size) * 100
-                                        print(f"Downloaded: {percent:.1f}%", end='\r')
-                        
-                        print(f"\nDownloaded and cached to: {model_path}")
-                        model_to_use = model_path
-                
-                # Determine compute type
-                compute_type = "float16" if torch.cuda.is_available() else "int8"
-                
-                # Load the model
-                whisper_model = WhisperModel(
-                    model_to_use,
-                    device="cuda" if torch.cuda.is_available() else "cpu",
-                    compute_type=compute_type
-                )
+                # Check if model is a GGUF file - use llama.cpp (Vulkan) instead of faster-whisper
+                is_gguf = model_to_use.endswith('.gguf') or '.gguf?' in model_to_use or 'gguf' in model_to_use.lower()
                
-                # Store in multi_model_manager
-                model_key = f"audio:{args.audio_model}"
-                multi_model_manager.add_model(model_key, whisper_model)
-                print(f"Audio model loaded successfully")
+                if is_gguf:
+                    # Use llama.cpp for GGUF audio models (works with Vulkan)
+                    print(f"Using GGUF format with llama.cpp (Vulkan)...")
+                    from llama_cpp import Llama
+                    
+                    # Check if model is a URL - handle caching
+                    if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
+                        cached_path = get_cached_model_path(model_to_use)
+                        if cached_path:
+                            model_path = cached_path
+                            print(f"Using cached model: {model_path}")
+                        else:
+                            # Download and cache
+                            print(f"Downloading audio model: {model_to_use}")
+                            import requests
+                            import hashlib
+                            
+                            cache_dir = get_model_cache_dir()
+                            url_path = model_to_use.split('?')[0]
+                            filename = os.path.basename(url_path)
+                            
+                            if not filename.endswith('.gguf'):
+                                filename = "whisper-model.gguf"
+                            
+                            url_hash = hashlib.sha256(model_to_use.encode()).hexdigest()
+                            cached_filename = f"{url_hash}_{filename}"
+                            model_path = os.path.join(cache_dir, cached_filename)
+                            
+                            response = requests.get(model_to_use, stream=True)
+                            response.raise_for_status()
+                            
+                            total_size = int(response.headers.get('content-length', 0))
+                            downloaded = 0
+                            
+                            with open(model_path, 'wb') as f:
+                                for chunk in response.iter_content(chunk_size=8192*1024):
+                                    if chunk:
+                                        f.write(chunk)
+                                        downloaded += len(chunk)
+                                        if total_size > 0:
+                                            percent = (downloaded / total_size) * 100
+                                            print(f"Downloaded: {percent:.1f}%", end='\r')
+                            
+                            print(f"\nDownloaded and cached to: {model_path}")
+                            model_to_use = model_path
+                    
+                    # Load with llama.cpp (Vulkan)
+                    audio_model = Llama(
+                        model_path=model_to_use,
+                        n_gpu_layers=-1,  # All layers to GPU
+                        n_ctx=2048,
+                        verbose=False
+                    )
+                    
+                    # Store in multi_model_manager
+                    model_key = f"audio:{args.audio_model}"
+                    multi_model_manager.add_model(model_key, audio_model)
+                    print(f"Audio model loaded successfully (GGUF/Vulkan)")
+                else:
+                    # Use faster-whisper for non-GGUF models
+                    from faster_whisper import WhisperModel
+                    import torch
+                    
+                    # Check if model is a URL - handle caching
+                    if model_to_use.startswith('http://') or model_to_use.startswith('https://'):
+                        cached_path = get_cached_model_path(model_to_use)
+                        if cached_path:
+                            model_path = cached_path
+                            print(f"Using cached model: {model_path}")
+                        else:
+                            # Download and cache
+                            print(f"Downloading audio model: {model_to_use}")
+                            import requests
+                            import hashlib
+                            
+                            cache_dir = get_model_cache_dir()
+                            url_path = model_to_use.split('?')[0]
+                            filename = os.path.basename(url_path)
+                            
+                            if not filename.endswith('.bin') and not filename.endswith('.ggml'):
+                                filename = "whisper-model.bin"
+                            
+                            url_hash = hashlib.sha256(model_to_use.encode()).hexdigest()
+                            cached_filename = f"{url_hash}_{filename}"
+                            model_path = os.path.join(cache_dir, cached_filename)
+                            
+                            response = requests.get(model_to_use, stream=True)
+                            response.raise_for_status()
+                            
+                            total_size = int(response.headers.get('content-length', 0))
+                            downloaded = 0
+                            
+                            with open(model_path, 'wb') as f:
+                                for chunk in response.iter_content(chunk_size=8192*1024):
+                                    if chunk:
+                                        f.write(chunk)
+                                        downloaded += len(chunk)
+                                        if total_size > 0:
+                                            percent = (downloaded / total_size) * 100
+                                            print(f"Downloaded: {percent:.1f}%", end='\r')
+                            
+                            print(f"\nDownloaded and cached to: {model_path}")
+                            model_to_use = model_path
+                    
+                    # Determine compute type
+                    compute_type = "float16" if torch.cuda.is_available() else "int8"
+                    
+                    # Load the model
+                    whisper_model = WhisperModel(
+                        model_to_use,
+                        device="cuda" if torch.cuda.is_available() else "cpu",
+                        compute_type=compute_type
+                    )
+                    
+                    # Store in multi_model_manager
+                    model_key = f"audio:{args.audio_model}"
+                    multi_model_manager.add_model(model_key, whisper_model)
+                    print(f"Audio model loaded successfully")
                
            except Exception as e:
                print(f"Warning: Could not pre-load audio model: {e}")