Revert to Whisper for transcription to fix execution errors

parent 7d3a3238
Pipeline #211 canceled
# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
## Features
- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
...
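The diff below only touches the transcription path; the Resemblyzer-based diarization mentioned in the README is unchanged (and the script also imports webrtcvad, suggesting VAD-driven segmentation). As a rough sketch of how that side typically works, assuming fixed analysis windows and a fixed speaker count since the real `get_diarization()` body is not shown in this diff:

```python
# Illustrative only: embed fixed-length windows with Resemblyzer and group them
# into speakers with agglomerative clustering. The window size and n_clusters
# are assumed values, not taken from the actual get_diarization() code.
import numpy as np
import librosa
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

audio, sr = librosa.load("example.wav", sr=16000)  # hypothetical input file
encoder = VoiceEncoder()

win = int(1.5 * sr)  # 1.5 s analysis windows (assumption)
frames = [audio[i:i + win] for i in range(0, len(audio) - win, win)]
embeds = np.array([encoder.embed_utterance(f) for f in frames])

labels = AgglomerativeClustering(n_clusters=2).fit_predict(embeds)  # 2 speakers assumed
for idx, spk in enumerate(labels):
    print(f"{idx * 1.5:6.1f}s  SPEAKER_{spk}")
```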
import argparse
import torch
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from transformers import pipeline
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
    return merged

def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
    parser.add_argument('audio_file', help='Path to the audio file')
    args = parser.parse_args()
@@ -73,11 +73,9 @@ def main():
        print(f"Error: Audio file '{audio_file}' not found.")
        return

-    # Load Qwen2.5-Omni-7B model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
-    processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
-    # Note: qwen_omni_utils not available, using direct processing
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

    # Load audio
    audio, sr = librosa.load(audio_file, sr=16000)
@@ -96,30 +94,9 @@ def main():
        if len(audio_chunk) == 0:
            continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        # Preparation for inference
-        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-        audios = [audio_chunk]
-        images = None
-        videos = None
-        inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
-        inputs = inputs.to(model.device).to(model.dtype)
-        # Inference: Generation of the output text
-        text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
-        full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        # Extract only the assistant's response
-        if "assistant\n" in full_text:
-            text = full_text.split("assistant\n")[-1].strip()
-        else:
-            text = full_text.strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

        # Format timestamps
        start_min, start_sec = divmod(start, 60)
...
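For reference, the Whisper path this commit reverts to reduces to a single transformers ASR pipeline call per diarized segment. A minimal self-contained sketch, assuming a 16 kHz mono file and placeholder segment bounds standing in for the diarization output:

```python
# Minimal sketch of the new transcription path: the transformers ASR pipeline
# accepts a raw 16 kHz float array directly. The file name and segment bounds
# below are placeholders, not values from the actual script.
import librosa
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline("automatic-speech-recognition",
                       model="openai/whisper-large-v3", device=device)

audio, sr = librosa.load("example.wav", sr=16000)  # hypothetical input file
start, end = 0.0, 5.0                              # placeholder segment bounds
audio_chunk = audio[int(start * sr):int(end * sr)]

if len(audio_chunk) > 0:
    text = transcriber(audio_chunk, return_timestamps=False)["text"].strip()
    start_min, start_sec = divmod(start, 60)
    print(f"[{int(start_min):02d}:{start_sec:05.2f}] {text}")
```

Compared with the Qwen-Omni path, this drops the chat-template, generate, and decode round-trip, which is presumably where the execution errors mentioned in the commit message arose.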