Implement Qwen2.5-Omni-7B using AutoModel as per Hugging Face documentation

parent 85466098
Pipeline #203 canceled
# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
## Features
- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
...
import argparse
import torch
-from transformers import pipeline
+from transformers import AutoProcessor, AutoModel
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
...
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
    return merged


def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
    parser.add_argument('audio_file', help='Path to the audio file')
    args = parser.parse_args()
...
@@ -73,9 +73,9 @@ def main():
        print(f"Error: Audio file '{audio_file}' not found.")
        return

-    # Load Whisper for transcription
-    device = 0 if torch.cuda.is_available() else -1
-    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+    # Load Qwen2.5-Omni-7B model
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+    model = AutoModel.from_pretrained("Qwen/Qwen2.5-Omni-7B")

    # Load audio
    audio, sr = librosa.load(audio_file, sr=16000)
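One note on the loading block above: the README's feature list says the model is "4-bit quantized", but `AutoModel.from_pretrained` with no quantization config loads full-precision weights. Below is a hedged sketch of 4-bit loading with bitsandbytes; it also uses the `Qwen2_5OmniForConditionalGeneration` and `Qwen2_5OmniProcessor` classes that the Hugging Face model card documents for this checkpoint. Whether `AutoModel` resolves to an equivalent class depends on the installed transformers version, so treat the class names and the quantization settings as assumptions to verify, not as the commit's code.

```python
# Hedged sketch: 4-bit loading per the README's claim (not the code in this commit).
import torch
from transformers import (
    BitsAndBytesConfig,
    Qwen2_5OmniForConditionalGeneration,  # class name per the HF model card; verify against your transformers version
    Qwen2_5OmniProcessor,
)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # bitsandbytes 4-bit weights
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    quantization_config=quant_config,
    device_map="auto",
)
```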
...
@@ -94,9 +94,19 @@ def main():
        if len(audio_chunk) == 0:
            continue

-        # Transcribe with Whisper
-        result = transcriber(audio_chunk, return_timestamps=False)
-        text = result['text'].strip()
+        # Prepare inputs for Qwen-Omni
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+            ]}
+        ]
+        inputs = processor(conversation=conversation, return_tensors="pt")
+
+        # Generate transcription
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        # Format timestamps
        start_min, start_sec = divmod(start, 60)
...
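Two caveats on the new transcription block: Hugging Face processors are not normally called with a `conversation=` keyword, and passing a raw waveform dict inside the chat message is not the input path the model card documents. The card's examples instead render the conversation with `processor.apply_chat_template`, extract the audio with `process_mm_info` from the `qwen_omni_utils` package, and feed both to the processor. The sketch below follows that pattern and is an assumption about how this step could be written, not the commit's code; the prompt text and generation settings mirror the diff, and the temp-file handoff is simply a safe way to give `process_mm_info` an audio source.

```python
# Hedged sketch of per-segment transcription following the Qwen2.5-Omni model card.
import os
import tempfile

import soundfile as sf
import torch
from qwen_omni_utils import process_mm_info  # helper package used in the model card's examples
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
)  # the 4-bit loading sketch above should also work here

def transcribe_chunk(audio_chunk, sr):
    # Write the segment to a temporary wav so process_mm_info has a concrete audio source
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        sf.write(tmp.name, audio_chunk, sr)
        chunk_path = tmp.name
    try:
        conversation = [
            {"role": "user", "content": [
                {"type": "audio", "audio": chunk_path},
                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."},
            ]},
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
        inputs = processor(
            text=prompt, audio=audios, images=images, videos=videos,
            return_tensors="pt", padding=True,
        ).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs, max_new_tokens=200, do_sample=False,
                return_audio=False,  # text-only output; skip the talker's speech generation
            )
        # Drop the prompt tokens, then decode only the newly generated text
        new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
        return processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
    finally:
        os.remove(chunk_path)
```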