Final version using Whisper for transcription, Resemblyzer for diarization

# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
## Features
-- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
......
import argparse
import torch
-from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import pipeline
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
......
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
    return merged


def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
    parser.add_argument('audio_file', help='Path to the audio file')
    args = parser.parse_args()
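
The body of `get_diarization` is collapsed in this diff; only its final `return merged` is visible above. Judging from the imports (`webrtcvad`, `VoiceEncoder`, `AgglomerativeClustering`), a minimal sketch of such a function could look like the following. The window and hop sizes, the fixed two-speaker count, the `SPEAKER_n` labels, and the merge logic are illustrative assumptions, and the VAD step is omitted for brevity; this is not the commit's actual implementation.

# Illustrative sketch only: stands in for the collapsed get_diarization body.
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def get_diarization_sketch(audio, sr, win_s=1.5, hop_s=0.75, n_speakers=2):
    encoder = VoiceEncoder()
    win, hop = int(win_s * sr), int(hop_s * sr)

    # Embed overlapping windows of the 16 kHz waveform with Resemblyzer.
    starts, embeds = [], []
    for offset in range(0, max(len(audio) - win, 1), hop):
        chunk = audio[offset:offset + win]
        if len(chunk) < win // 2:
            continue
        starts.append(offset / sr)
        embeds.append(encoder.embed_utterance(chunk))

    # Cluster the window embeddings into a fixed number of speakers.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(np.stack(embeds))

    # Merge consecutive windows with the same label into (start, end, speaker) segments.
    merged = []
    for t, label in zip(starts, labels):
        speaker = f"SPEAKER_{label}"
        if merged and merged[-1][2] == speaker:
            merged[-1][1] = t + win_s
        else:
            merged.append([t, t + win_s, speaker])
    return [tuple(seg) for seg in merged]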
......
@@ -73,10 +73,9 @@ def main():
        print(f"Error: Audio file '{audio_file}' not found.")
        return

-    # Load Qwen2.5-Omni-7B model with 4-bit quantization
-    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
-    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Omni-7B", quantization_config=quantization_config, device_map="auto")
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

    # Load audio
    audio, sr = librosa.load(audio_file, sr=16000)
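
For reference (not part of the commit): `librosa.load(..., sr=16000)` already delivers audio at Whisper's expected 16 kHz, so the bare NumPy array passed to the pipeline in the loop below is interpreted at the correct rate. If the sampling rate were ever different, the pipeline's documented dict input form makes the rate explicit and lets the pipeline resample if needed:

# Two equivalent ways to call the pipeline loaded above on a float32 waveform.
# A bare array is assumed to already be at the model's 16 kHz rate (true here,
# because librosa.load is pinned to sr=16000); the dict form states the rate explicitly.
result = transcriber(audio_chunk)
result = transcriber({"raw": audio_chunk, "sampling_rate": sr})
text = result["text"].strip()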
......
@@ -95,19 +94,9 @@ def main():
        if len(audio_chunk) == 0:
            continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        inputs = processor(conversation=conversation, return_tensors="pt").to(model.device)
-        # Generate transcription
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

        # Format timestamps
        start_min, start_sec = divmod(start, 60)
......
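
The remainder of the loop is collapsed. Below is a small self-contained sketch of how the `divmod` results, the speaker label, and the transcribed text might be assembled into an output line; the exact format string and the `end` and `speaker` names are assumptions, not the commit's code.

def format_segment(start, end, speaker, text):
    # Illustrative formatting helper (hypothetical, not from the repository).
    start_min, start_sec = divmod(int(start), 60)
    end_min, end_sec = divmod(int(end), 60)
    return f"[{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}] {speaker}: {text}"

# format_segment(75.2, 80.9, "SPEAKER_0", "Hello there.")
# -> '[01:15 - 01:20] SPEAKER_0: Hello there.'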