Use Qwen2.5-Omni-7B for transcription with 4-bit quantization

parent 3c9ba76d
# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
## Features
+- Automatic speech recognition with Qwen2.5-Omni-7B (4-bit quantized)
......
import argparse
import librosa
import torch
-from transformers import pipeline
+from transformers import AutoProcessor, Qwen2_5OmniForConditionalGeneration, BitsAndBytesConfig  # needs a transformers release with Qwen2.5-Omni support
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
    return merged
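
# ---------------------------------------------------------------------------
# Reference sketch, not part of this commit: the body of get_diarization is
# collapsed in the diff above. Going by the imports (VoiceEncoder,
# AgglomerativeClustering, webrtcvad), a minimal diarizer in the same spirit
# could look like the following; the window sizes, the fixed two-speaker
# default, and the function name are illustrative assumptions only.
# ---------------------------------------------------------------------------
import numpy as np

def diarize_sketch(audio, sr, n_speakers=2, win_s=1.5, hop_s=0.75):
    """Label sliding windows of 16 kHz mono audio by speaker cluster."""
    encoder = VoiceEncoder()
    win, hop = int(win_s * sr), int(hop_s * sr)
    starts = list(range(0, max(len(audio) - win, 1), hop))
    # One d-vector per window, then cluster the embeddings into speakers
    embeds = np.array([encoder.embed_utterance(audio[s:s + win]) for s in starts])
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(embeds)
    # Merge consecutive same-label windows into (start_s, end_s, speaker) tuples
    merged, seg_start = [], 0.0
    for i in range(1, len(labels) + 1):
        if i == len(labels) or labels[i] != labels[i - 1]:
            merged.append((seg_start, (i - 1) * hop_s + win_s, int(labels[i - 1])))
            seg_start = i * hop_s
    return merged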
def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
    parser.add_argument('audio_file', help='Path to the audio file')
    args = parser.parse_args()
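    # Usage (script name illustrative): python transcribe.py path/to/audio.wav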
@@ -73,9 +73,10 @@ def main():
print(f"Error: Audio file '{audio_file}' not found.")
return
-    # Load Whisper for transcription
-    device = 0 if torch.cuda.is_available() else -1
-    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+    # Load Qwen2.5-Omni-7B with 4-bit quantization (bitsandbytes).
+    # AutoModel has no mapping for this architecture, so use the dedicated class.
+    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2.5-Omni-7B", quantization_config=quantization_config, device_map="auto"
+    )
+    model.disable_talker()  # text-only use: drop the speech-synthesis head to save VRAM
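    # (Reference, not part of the original commit: a more explicit 4-bit setup.
    #  NF4 with double quantization and fp16 compute is a common bitsandbytes
    #  choice; it can be swapped in for the one-line config above.)
    # quantization_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_quant_type="nf4",             # NormalFloat4 data type
    #     bnb_4bit_use_double_quant=True,        # also quantize the quant constants
    #     bnb_4bit_compute_dtype=torch.float16,  # dtype used for runtime matmuls
    # )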
    # Load audio as 16 kHz mono, the rate both models expect
    audio, sr = librosa.load(audio_file, sr=16000)
@@ -94,9 +95,19 @@ def main():
        if len(audio_chunk) == 0:
            continue
-        # Transcribe with Whisper
-        result = transcriber(audio_chunk, return_timestamps=False)
-        text = result['text'].strip()
+        # Prepare inputs for Qwen2.5-Omni: a chat-style conversation carrying the
+        # raw 16 kHz audio chunk plus a transcription instruction. (This follows
+        # the pattern on the Qwen2.5-Omni model card; exact processor kwargs can
+        # differ across transformers releases.)
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio": audio_chunk},
+                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+            ]}
+        ]
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+        inputs = processor(text=prompt, audio=[audio_chunk], return_tensors="pt", padding=True).to(model.device)
+        # Generate transcription (greedy decoding; no spoken output needed)
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False, return_audio=False)
+        # Drop the prompt tokens so only the newly generated text is decoded
+        new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
+        text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
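        # Note (editor's assumption, not in the commit): the audio front-end is
        # Whisper-style, so very long diarization segments may need splitting
        # into roughly 30-second pieces before this call.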
        # Format timestamps as minutes:seconds
        start_min, start_sec = divmod(start, 60)
......