Add --whisper option, use Qwen-Omni by default

parent 1cfee90a
Pipeline #212 canceled
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models by default. Pass --whisper to use Whisper instead.
 ## Features
+- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
......
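For reference, a typical invocation looks like this (the script name `transcribe.py` is assumed here; the diff does not show the actual file name):

```
python transcribe.py recording.wav            # default backend: Qwen2.5-Omni-7B
python transcribe.py recording.wav --whisper  # use Whisper large-v3 instead
```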
@@ -64,6 +64,7 @@ def get_diarization(audio, sr):
 def main():
     parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
+    parser.add_argument('--whisper', action='store_true', help='Use Whisper instead of Qwen-Omni for transcription')
     args = parser.parse_args()
     audio_file = args.audio_file
@@ -73,10 +74,6 @@ def main():
print(f"Error: Audio file '{audio_file}' not found.")
return
# Load Whisper for transcription
device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
# Load audio
audio, sr = librosa.load(audio_file, sr=16000)
@@ -94,9 +91,35 @@ def main():
         if len(audio_chunk) == 0:
             continue
-        # Transcribe with Whisper
+        if args.whisper:
+            # Use Whisper
+            device = 0 if torch.cuda.is_available() else -1
+            transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+            result = transcriber(audio_chunk, return_timestamps=False)
+            text = result['text'].strip()
+        else:
+            # Use Qwen-Omni
+            from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+            model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
+            processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
+            conversation = [
+                {"role": "user", "content": [
+                    {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                    {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+                ]}
+            ]
+            text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+            audios = [audio_chunk]
+            images = None
+            videos = None
+            inputs = processor(text=text_prompt, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
+            inputs = inputs.to(model.device).to(model.dtype)
+            text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
+            full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+            if "assistant\n" in full_text:
+                text = full_text.split("assistant\n")[-1].strip()
+            else:
+                text = full_text.strip()
         # Format timestamps
         start_min, start_sec = divmod(start, 60)
......
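As written, both branches reload their backend on every pass through the chunk loop: the Whisper pipeline and the Qwen-Omni model/processor are constructed per segment. Below is a minimal sketch of hoisting that load out of the loop; the `build_transcriber` helper and the `transcribe(audio_chunk)` callable it returns are illustrative names, not part of the repository.

```python
# Sketch only: build the chosen backend once, before the chunk loop, instead of
# reloading it for every segment as in the diff above. Audio chunks are assumed
# to be 16 kHz mono float arrays, as produced by librosa.load(..., sr=16000).
import torch
from transformers import (Qwen2_5OmniForConditionalGeneration,
                          Qwen2_5OmniProcessor, pipeline)


def build_transcriber(use_whisper: bool):
    if use_whisper:
        # Whisper: one ASR pipeline, reused for every chunk.
        device = 0 if torch.cuda.is_available() else -1
        asr = pipeline("automatic-speech-recognition",
                       model="openai/whisper-large-v3", device=device)

        def transcribe(audio_chunk):
            return asr(audio_chunk, return_timestamps=False)["text"].strip()
    else:
        # Qwen-Omni: load model and processor once, reuse for every chunk.
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
        processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        def transcribe(audio_chunk):
            conversation = [{"role": "user", "content": [
                {"type": "audio", "audio": audio_chunk},
                {"type": "text",
                 "text": "Transcribe this audio segment exactly as spoken."},
            ]}]
            prompt = processor.apply_chat_template(
                conversation, add_generation_prompt=True, tokenize=False)
            inputs = processor(text=prompt, audio=[audio_chunk],
                               return_tensors="pt", padding=True,
                               use_audio_in_video=False)
            inputs = inputs.to(model.device).to(model.dtype)
            text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
            full_text = processor.batch_decode(
                text_ids, skip_special_tokens=True,
                clean_up_tokenization_spaces=False)[0]
            # Keep only the assistant turn, as in the diff above.
            return full_text.split("assistant\n")[-1].strip()
    return transcribe
```

main() would then call `transcribe = build_transcriber(args.whisper)` once after argument parsing and use `text = transcribe(audio_chunk)` inside the existing segment loop.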