Revert to Whisper for transcription to fix execution errors

parent 7d3a3238
# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
## Features
-- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
......
import argparse
import torch
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from transformers import pipeline
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
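For context, the diarization stack these imports imply (Resemblyzer speaker embeddings clustered with scikit-learn) can be sketched as below. The fixed 3-second windows and two-speaker cluster count are illustrative assumptions, and the script's webrtcvad-based segmentation is omitted here:

```python
import numpy as np
import librosa
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def simple_diarize(audio_path, window_s=3.0, n_speakers=2):
    # Mono 16 kHz audio, the rate both Resemblyzer and Whisper expect.
    wav, sr = librosa.load(audio_path, sr=16000)

    # Embed fixed-length windows with Resemblyzer's speaker encoder.
    encoder = VoiceEncoder()
    hop = int(window_s * sr)
    starts, embeds = [], []
    for i in range(0, max(len(wav) - hop, 0), hop):
        embeds.append(encoder.embed_utterance(wav[i:i + hop]))
        starts.append(i / sr)

    if len(embeds) < n_speakers:
        return [(0.0, 0)]  # too little audio to cluster

    # Cluster the window embeddings into a fixed number of speakers.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(np.stack(embeds))
    return list(zip(starts, labels.tolist()))
```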
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
return merged
def main():
-parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
+parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
parser.add_argument('audio_file', help='Path to the audio file')
args = parser.parse_args()
@@ -73,11 +73,9 @@ def main():
print(f"Error: Audio file '{audio_file}' not found.")
return
-# Load Qwen2.5-Omni-7B model
-model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
-processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
-# Note: qwen_omni_utils not available, using direct processing
+# Load Whisper for transcription
+device = 0 if torch.cuda.is_available() else -1
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
# Load audio
audio, sr = librosa.load(audio_file, sr=16000)
@@ -96,30 +94,9 @@ def main():
if len(audio_chunk) == 0:
continue
-# Prepare inputs for Qwen-Omni
-conversation = [
-{"role": "user", "content": [
-{"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-{"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-]}
-]
-# Preparation for inference
-text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-audios = [audio_chunk]
-images = None
-videos = None
-inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
-inputs = inputs.to(model.device).to(model.dtype)
-# Inference: Generation of the output text
-text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
-full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-# Extract only the assistant's response
-if "assistant\n" in full_text:
-text = full_text.split("assistant\n")[-1].strip()
-else:
-text = full_text.strip()
+# Transcribe with Whisper
+result = transcriber(audio_chunk, return_timestamps=False)
+text = result['text'].strip()
# Format timestamps
start_min, start_sec = divmod(start, 60)
......
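Put together, the replacement transcription path added in this commit amounts to roughly the sketch below. The audio file name, segment boundaries, and speaker labels are placeholders; only the pipeline loading, per-chunk transcription, and divmod timestamp formatting are taken from the diff:

```python
import torch
import librosa
from transformers import pipeline

# Whisper ASR pipeline on GPU when available, mirroring the lines added above.
device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

# Placeholder input and segments; in the script the segments come from diarization.
audio, sr = librosa.load("example.wav", sr=16000)
segments = [(0.0, 5.0, "SPEAKER_0"), (5.0, 9.5, "SPEAKER_1")]

for start, end, speaker in segments:
    chunk = audio[int(start * sr):int(end * sr)]
    if len(chunk) == 0:
        continue
    # Whisper accepts the raw 16 kHz waveform directly.
    text = transcriber(chunk, return_timestamps=False)["text"].strip()

    # mm:ss timestamps via divmod, as in the script.
    start_min, start_sec = divmod(start, 60)
    end_min, end_sec = divmod(end, 60)
    print(f"[{int(start_min):02d}:{start_sec:05.2f}-{int(end_min):02d}:{end_sec:05.2f}] {speaker}: {text}")
```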