Commit 8e072ebb authored by Your Name

Fix VulkanBackend to accept original_backend parameter

parent 059999f7
......@@ -8,10 +8,19 @@ from codai.backends.base import ModelBackend
class VulkanBackend(ModelBackend):
"""Backend for Vulkan GPU inference using llama.cpp."""
def __init__(self, original_backend: Optional[str] = None):
    """Initialise an unloaded backend with its default inference settings.

    Args:
        original_backend: Name of the backend that was originally
            requested, or None. When it is exactly "nvidia" or "cuda"
            (case-sensitive), ``force_cuda`` is set to True and a debug
            line is printed; any other value leaves it False.
    """
    # Nothing is loaded yet; these are reset to their empty state here.
    self.model = None
    self.model_name = None
    self.device = None

    # Default inference settings.
    # NOTE(review): -1 presumably means "offload all layers" per the
    # llama.cpp convention -- confirm against load_model.
    self.n_gpu_layers = -1
    self.n_ctx = 2048
    self.verbose = True
    self.main_gpu = 0

    # Optional prompt-formatting helpers, unset until configured elsewhere.
    self.chat_template = None
    self.hf_tokenizer = None

    # A None value (the default) is simply not in the tuple, so the flag
    # is False unless the caller explicitly asked for an NVIDIA backend.
    self.force_cuda = original_backend in ("nvidia", "cuda")
    if self.force_cuda:
        print("DEBUG: GGUF model will use CUDA backend (forced by --backend nvidia)")
def load_model(self, model_name: str, **kwargs) -> None:
"""Load the model."""
......@@ -23,6 +32,20 @@ class VulkanBackend(ModelBackend):
"""Generate text non-streaming."""
pass
def generate_chat(self, messages: List[Dict], max_tokens: Optional[int] = None,
                  temperature: float = 0.7, top_p: float = 1.0,
                  stop: Optional[List[str]] = None, tools: Optional[List] = None,
                  response_format: Optional[Dict] = None) -> str:
    """Generate chat completion non-streaming.

    As written this is a stub: every argument is accepted and ignored,
    and the call returns None even though the signature is annotated
    ``-> str``. Callers must not rely on a string result yet.

    Args:
        messages: Chat messages to complete.
        max_tokens: Optional cap on generated tokens.
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling threshold (default 1.0).
        stop: Optional list of stop strings.
        tools: Optional list of tool definitions.
        response_format: Optional response-format specification.
    """
    pass
async def generate_chat_stream(self, messages: List[Dict], max_tokens: Optional[int] = None,
                               temperature: float = 0.7, top_p: float = 1.0,
                               stop: Optional[List[str]] = None, tools: Optional[List] = None,
                               response_format: Optional[Dict] = None):
    """Generate chat completion streaming.

    As written this is a stub: the body contains no ``yield``, so calling
    it produces a plain coroutine that resolves to None rather than an
    async generator. It must be awaited; ``async for`` over the result
    will not work until a real implementation is provided.

    Args:
        messages: Chat messages to complete.
        max_tokens: Optional cap on generated tokens.
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling threshold (default 1.0).
        stop: Optional list of stop strings.
        tools: Optional list of tool definitions.
        response_format: Optional response-format specification.
    """
    pass
def generate_stream(self, prompt: str, max_tokens: Optional[int] = None,
temperature: float = 0.7, top_p: float = 1.0,
stop: Optional[list] = None):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment