Implement Qwen2.5-Omni-7B using AutoModel as per Hugging Face documentation

parent 85466098
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
 ## Features
+- Automatic speech recognition with Qwen2.5-Omni-7B (4-bit quantized)
...
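Usage (a sketch for the README; the script's filename does not appear in this diff, so transcribe.py is assumed):

```
python transcribe.py path/to/recording.wav
```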
 import argparse
 import torch
-from transformers import pipeline
+from transformers import AutoProcessor, AutoModel
 from resemblyzer import VoiceEncoder
 from sklearn.cluster import AgglomerativeClustering
 import webrtcvad
...
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
     return merged

 def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
     parser.add_argument('audio_file', help='Path to the audio file')
     args = parser.parse_args()
@@ -73,9 +73,9 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return

-    # Load Whisper for transcription
-    device = 0 if torch.cuda.is_available() else -1
-    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+    # Load Qwen2.5-Omni-7B model
+    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+    model = AutoModel.from_pretrained("Qwen/Qwen2.5-Omni-7B")

     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
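Note on the loading hunk above: depending on the transformers release, AutoModel can resolve Qwen/Qwen2.5-Omni-7B to the base model class rather than a generation-ready one; the Hugging Face model card documents dedicated classes instead. A minimal loading sketch following that model card, with the README's "4-bit quantized" bullet covered by a bitsandbytes config (the quantization settings here are an assumption, not something this commit sets up):

```python
# Sketch per the Qwen2.5-Omni model card; needs a recent transformers
# release plus accelerate and bitsandbytes installed.
import torch
from transformers import (
    BitsAndBytesConfig,
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniProcessor,
)

# Assumed 4-bit NF4 setup to match the README's "4-bit quantized" claim;
# drop quantization_config to load in full precision instead.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    device_map="auto",
    quantization_config=bnb_config,
)
model.disable_talker()  # text-only transcription: skip the speech-output weights
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
```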
@@ -94,9 +94,19 @@ def main():
         if len(audio_chunk) == 0:
             continue

-        # Transcribe with Whisper
-        result = transcriber(audio_chunk, return_timestamps=False)
-        text = result['text'].strip()
+        # Prepare inputs for Qwen-Omni
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+            ]}
+        ]
+        inputs = processor(conversation=conversation, return_tensors="pt")
+
+        # Generate transcription
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

         # Format timestamps
         start_min, start_sec = divmod(start, 60)
...
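Note on the transcription hunk: the flow documented on the model card differs from the processor(conversation=...) call above. It renders the conversation with apply_chat_template, passes the raw waveform through the processor's audio argument (16 kHz expected), and calls generate with return_audio=False so that only text ids come back; decoding the full output would otherwise echo the prompt, so the prompt tokens are sliced off first. A sketch of the per-segment loop under that documented flow; the segments iterable, the variable names, and the [MM:SS] output format are assumptions carried over from this script, and model/processor are loaded as in the sketch above:

```python
import torch

# Assumed shape: segments yields (start, end, speaker, audio_chunk) with
# audio_chunk a 16 kHz mono numpy float32 array, as produced by librosa.
for start, end, speaker, audio_chunk in segments:
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio": audio_chunk},
            {"type": "text", "text": "Transcribe this audio segment exactly as spoken."},
        ]},
    ]
    # Render the chat template to a prompt string with the audio placeholder.
    prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )
    # The waveform goes in via the processor's audio argument, not the chat dict.
    inputs = processor(
        text=prompt, audio=[audio_chunk], return_tensors="pt", padding=True
    ).to(model.device)

    with torch.no_grad():
        # return_audio=False: text ids only, no talker speech output.
        generated_ids = model.generate(
            **inputs, max_new_tokens=200, do_sample=False, return_audio=False
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

    # [MM:SS] timestamps, e.g. divmod(75.0, 60) -> (1.0, 15.0) -> "01:15.00"
    start_min, start_sec = divmod(start, 60)
    print(f"[{int(start_min):02d}:{start_sec:05.2f}] {speaker}: {text}")
```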