Switch to Whisper for transcription; remove Qwen-Omni (model repo not found)

parent 7f6696b6
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using the Qwen-Omni-7B model.
+This Python application transcribes audio files with speaker diarization and timestamps using the Whisper and Resemblyzer models.
 ## Features
-- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
+- Automatic speech recognition with Whisper (openai/whisper-large-v3)
...
 import argparse
 import torch
-from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+from transformers import pipeline
 from resemblyzer import VoiceEncoder
 from sklearn.cluster import AgglomerativeClustering
 import webrtcvad
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
     return merged

 def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
     args = parser.parse_args()
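The body of `get_diarization` is outside this hunk, so only its `return merged` is visible. For orientation, here is a minimal sketch of the Resemblyzer-plus-clustering approach the imports point at. This is an illustration, not the repo's code: it assumes 16 kHz mono audio and a fixed speaker count, and it skips the webrtcvad silence filtering the real script imports.

```python
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def sketch_diarization(audio, sr, n_speakers=2, win_s=1.5, hop_s=0.75):
    """Illustrative stand-in for get_diarization; returns (start, end, speaker) tuples."""
    encoder = VoiceEncoder()
    win, hop = int(win_s * sr), int(hop_s * sr)
    # Embed overlapping windows of the signal with Resemblyzer.
    starts = list(range(0, max(len(audio) - win, 1), hop))
    embeds = np.stack([encoder.embed_utterance(audio[s:s + win]) for s in starts])
    # Cluster the window embeddings into a fixed number of speakers.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(embeds)
    # Merge consecutive windows that share a speaker label into segments.
    merged = []
    for s, label in zip(starts, labels):
        t0, t1 = s / sr, (s + win) / sr
        if merged and merged[-1][2] == label:
            merged[-1][1] = max(merged[-1][1], t1)
        else:
            merged.append([t0, t1, label])
    return [(a, b, f"SPEAKER_{l}") for a, b, l in merged]
```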
@@ -73,14 +73,9 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return

-    # Load Qwen-Omni-7B model with 4-bit quantization
-    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        "Qwen/Qwen-Omni-7B",
-        quantization_config=quantization_config,
-        device_map="auto"
-    )
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen-Omni-7B")
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
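A note on the replacement API: the transformers ASR pipeline accepts a raw NumPy array and assumes it is already at the model's 16 kHz rate; passing a dict with an explicit sampling_rate is the unambiguous form. A standalone sanity check along those lines (sample.wav is a placeholder path, not a file in this repo):

```python
import torch
import librosa
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
asr = pipeline("automatic-speech-recognition",
               model="openai/whisper-large-v3", device=device)

# "sample.wav" is a placeholder; librosa resamples to Whisper's 16 kHz.
audio, sr = librosa.load("sample.wav", sr=16000)
# Supplying sampling_rate explicitly avoids relying on the 16 kHz default.
print(asr({"raw": audio, "sampling_rate": sr})["text"])
```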
@@ -99,19 +94,9 @@ def main():
         if len(audio_chunk) == 0:
             continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        inputs = processor(conversation=conversation, return_tensors="pt").to(model.device)
-
-        # Generate transcription
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

         # Format timestamps
         start_min, start_sec = divmod(start, 60)
...
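Two small caveats on the new loop body, shown together in a hedged variant below: near-empty chunks tend to make Whisper hallucinate text, and divmod returns floats that need int casting before zero-padded formatting. The 0.2 s floor and both helper names are illustrative, not part of the commit.

```python
def transcribe_segment(transcriber, audio_chunk, sr, min_seconds=0.2):
    """Hedged variant of the per-segment call above; min_seconds is illustrative."""
    if len(audio_chunk) < int(min_seconds * sr):
        return ""  # skip slivers that Whisper tends to hallucinate on
    result = transcriber({"raw": audio_chunk, "sampling_rate": sr},
                         return_timestamps=False)
    return result["text"].strip()

def format_stamp(seconds):
    """divmod(125.3, 60) -> (2.0, 5.3); cast before zero-padding -> '[02:05]'."""
    minutes, secs = divmod(seconds, 60)
    return f"[{int(minutes):02d}:{int(secs):02d}]"
```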