Add --whisper option, use Qwen-Omni by default

parent 1cfee90a
Pipeline #212 canceled
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models by default. Use --whisper to use Whisper instead.
 ## Features
 - Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
...
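The hunks below touch only main(); the Resemblyzer-based diarization the README describes lives in get_diarization, which this commit does not modify. For orientation only, a generic sketch of the usual Resemblyzer diarization pattern (function name and parameters are illustrative, not code from this repository):

```python
# Sketch only: a common Resemblyzer diarization pattern, not this repo's get_diarization.
# Assumes a 16 kHz mono waveform and a known speaker count.
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def diarize_sketch(wav, sr=16000, n_speakers=2):
    encoder = VoiceEncoder()
    # Partial embeddings over sliding windows (about 16 windows per second at rate=16).
    _, partial_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    # Group the window embeddings into speaker clusters.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(np.array(partial_embeds))
    # Map each window back to (start_s, end_s, speaker_label); windows are not merged here.
    return [(s.start / sr, s.stop / sr, f"SPEAKER_{l}") for s, l in zip(wav_splits, labels)]
```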
@@ -64,6 +64,7 @@ def get_diarization(audio, sr):
 def main():
     parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
+    parser.add_argument('--whisper', action='store_true', help='Use Whisper instead of Qwen-Omni for transcription')
     args = parser.parse_args()
     audio_file = args.audio_file
@@ -73,10 +74,6 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return
-    # Load Whisper for transcription
-    device = 0 if torch.cuda.is_available() else -1
-    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
@@ -94,9 +91,35 @@ def main():
         if len(audio_chunk) == 0:
             continue
-        # Transcribe with Whisper
-        result = transcriber(audio_chunk, return_timestamps=False)
-        text = result['text'].strip()
+        if args.whisper:
+            # Use Whisper
+            device = 0 if torch.cuda.is_available() else -1
+            transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+            result = transcriber(audio_chunk, return_timestamps=False)
+            text = result['text'].strip()
+        else:
+            # Use Qwen-Omni
+            from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+            model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
+            processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
+            conversation = [
+                {"role": "user", "content": [
+                    {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                    {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+                ]}
+            ]
+            text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+            audios = [audio_chunk]
+            images = None
+            videos = None
+            inputs = processor(text=text_prompt, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
+            inputs = inputs.to(model.device).to(model.dtype)
+            text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
+            full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+            if "assistant\n" in full_text:
+                text = full_text.split("assistant\n")[-1].strip()
+            else:
+                text = full_text.strip()
         # Format timestamps
         start_min, start_sec = divmod(start, 60)
...
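One thing to note about the change as written: both back-ends are constructed inside the per-chunk loop, so the Whisper pipeline or the 7B Qwen-Omni checkpoint is re-initialized for every chunk. A minimal sketch of building the selected back-end once, before the loop, mirroring the from_pretrained/pipeline calls already used in the diff (the helper name is hypothetical):

```python
# Sketch: construct the chosen back-end a single time, outside the chunk loop.
# Mirrors the calls used in the diff; load_backend is a made-up helper name.
import torch
from transformers import pipeline, Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

def load_backend(use_whisper: bool):
    if use_whisper:
        device = 0 if torch.cuda.is_available() else -1
        return pipeline("automatic-speech-recognition",
                        model="openai/whisper-large-v3", device=device)
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
    processor = Qwen2_5OmniProcessor.from_pretrained(
        "Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
    return model, processor
```

The per-chunk body would then only run the transcription call against the preloaded objects.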