Use Qwen2.5-Omni-7B for transcription with 4-bit quantization

parent 3c9ba76d
```diff
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
 ## Features
 - Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
```
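A note on the "(4-bit quantized)" bullet: quantization goes through `transformers`' `BitsAndBytesConfig`, which stores the weights at about a quarter of their fp16 footprint. The commit itself passes only `load_in_4bit=True`; the extra options below are common bitsandbytes tuning knobs shown as a hedged sketch, not settings this change makes.

```python
import torch
from transformers import BitsAndBytesConfig

# Sketch of the 4-bit setup behind the README bullet. Only load_in_4bit=True
# is what this commit actually sets; the rest are optional bitsandbytes knobs.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4-bit via bitsandbytes
    bnb_4bit_quant_type="nf4",             # NormalFloat4 usually outperforms plain fp4
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for compute at runtime
)
```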
```diff
 import argparse
 import torch
-from transformers import pipeline
+from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig
 from resemblyzer import VoiceEncoder
 from sklearn.cluster import AgglomerativeClustering
 import webrtcvad
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
     return merged

 def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
     parser.add_argument('audio_file', help='Path to the audio file')
     args = parser.parse_args()

```
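Invocation is unchanged by this commit; only the `--help` description differs. Assuming the script is named `transcribe.py` (file names are collapsed out of this diff view):

```
python transcribe.py path/to/recording.wav
```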
```diff
@@ -73,9 +73,10 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return

-    # Load Whisper for transcription
-    device = 0 if torch.cuda.is_available() else -1
-    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+    # Load Qwen2.5-Omni-7B model with 4-bit quantization
+    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+    model = AutoModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", quantization_config=quantization_config, device_map="auto")

     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
```
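One caveat worth flagging here: depending on the pinned `transformers` version, `AutoModel`/`AutoProcessor` may resolve Qwen2.5-Omni to base classes without a usable `generate()`. The Hugging Face model card documents dedicated classes instead; below is a hedged sketch combining those with this commit's 4-bit config (class names taken from the model card, not verified against this repo's environment):

```python
from transformers import BitsAndBytesConfig, Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

# Same 4-bit config as the commit, loaded through the classes the model card documents.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place the quantized weights
)
model.disable_talker()  # per the model card: drop the speech-output head when only text is needed
```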
```diff
@@ -94,9 +95,19 @@ def main():
         if len(audio_chunk) == 0:
             continue

-        # Transcribe with Whisper
-        result = transcriber(audio_chunk, return_timestamps=False)
-        text = result['text'].strip()
+        # Prepare inputs for Qwen-Omni
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+            ]}
+        ]
+        inputs = processor(conversation=conversation, return_tensors="pt").to(model.device)
+
+        # Generate transcription
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

         # Format timestamps
         start_min, start_sec = divmod(start, 60)
```
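Two caveats about the hunk above: `processor(conversation=...)` is not the call signature the model card documents, and `batch_decode` on the full `generated_ids` will prepend the echoed prompt to the transcript. The model card's flow renders the chat template first, extracts the media with `qwen_omni_utils`, and trims the prompt tokens before decoding. A hedged sketch of that documented path for one diarized segment; it passes the audio by file path rather than the in-memory waveform, and `transcribe_chunk`/`chunk_path` are hypothetical names:

```python
import torch
from qwen_omni_utils import process_mm_info  # helper package from the model card

def transcribe_chunk(model, processor, chunk_path: str) -> str:
    """Transcribe one diarized segment following the model card's flow.

    chunk_path is a hypothetical per-segment WAV (e.g. written with soundfile);
    the committed code passes the raw waveform dict instead.
    """
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": chunk_path},
            {"type": "text", "text": "Transcribe this audio segment exactly as spoken."},
        ]}
    ]
    # Render the chat template to a prompt string, then pull out the media it references.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    inputs = processor(text=prompt, audio=audios, images=images, videos=videos,
                       return_tensors="pt", padding=True).to(model.device)

    with torch.no_grad():
        # return_audio=False skips speech synthesis, so generate() returns token ids only.
        generated_ids = model.generate(**inputs, max_new_tokens=200,
                                       do_sample=False, return_audio=False)
    # Drop the echoed prompt tokens before decoding; decoding the full sequence
    # (as the committed code does) would include the instruction text as well.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
```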