Add --whisper option, use Qwen-Omni by default

parent 1cfee90a
Pipeline #212 canceled
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models by default. Pass --whisper to use Whisper instead.
 ## Features
+- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
......
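For reference, a typical invocation looks like this (the script name `transcribe.py` is assumed here; the diff does not show the actual file name):

```
python transcribe.py recording.wav            # default backend: Qwen2.5-Omni-7B
python transcribe.py recording.wav --whisper  # use Whisper large-v3 instead
```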
@@ -64,6 +64,7 @@ def get_diarization(audio, sr):
 def main():
     parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
+    parser.add_argument('--whisper', action='store_true', help='Use Whisper instead of Qwen-Omni for transcription')
     args = parser.parse_args()
     audio_file = args.audio_file
@@ -73,10 +74,6 @@ def main():
print(f"Error: Audio file '{audio_file}' not found.")
return
# Load Whisper for transcription
device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
# Load audio
audio, sr = librosa.load(audio_file, sr=16000)
@@ -94,9 +91,35 @@ def main():
         if len(audio_chunk) == 0:
             continue
-        # Transcribe with Whisper
+        if args.whisper:
+            # Use Whisper
+            device = 0 if torch.cuda.is_available() else -1
+            transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
+            result = transcriber(audio_chunk, return_timestamps=False)
+            text = result['text'].strip()
+        else:
+            # Use Qwen-Omni
+            from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+            model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
+            processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
+            conversation = [
+                {"role": "user", "content": [
+                    {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
+                    {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
+                ]}
+            ]
+            text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+            audios = [audio_chunk]
+            images = None
+            videos = None
+            inputs = processor(text=text_prompt, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
+            inputs = inputs.to(model.device).to(model.dtype)
+            text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
+            full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+            if "assistant\n" in full_text:
+                text = full_text.split("assistant\n")[-1].strip()
+            else:
+                text = full_text.strip()
         # Format timestamps
         start_min, start_sec = divmod(start, 60)
......
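As written, both branches reload their backend on every pass through the chunk loop: the Whisper pipeline and the Qwen-Omni model/processor are constructed per segment. Below is a minimal sketch of hoisting that load out of the loop; the `build_transcriber` helper and the `transcribe(audio_chunk)` callable it returns are illustrative names, not part of the repository.

```python
# Sketch only: build the chosen backend once, before the chunk loop, instead of
# reloading it for every segment as in the diff above. Audio chunks are assumed
# to be 16 kHz mono float arrays, as produced by librosa.load(..., sr=16000).
import torch
from transformers import (Qwen2_5OmniForConditionalGeneration,
                          Qwen2_5OmniProcessor, pipeline)


def build_transcriber(use_whisper: bool):
    if use_whisper:
        # Whisper: one ASR pipeline, reused for every chunk.
        device = 0 if torch.cuda.is_available() else -1
        asr = pipeline("automatic-speech-recognition",
                       model="openai/whisper-large-v3", device=device)

        def transcribe(audio_chunk):
            return asr(audio_chunk, return_timestamps=False)["text"].strip()
    else:
        # Qwen-Omni: load model and processor once, reuse for every chunk.
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
        processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        def transcribe(audio_chunk):
            conversation = [{"role": "user", "content": [
                {"type": "audio", "audio": audio_chunk},
                {"type": "text",
                 "text": "Transcribe this audio segment exactly as spoken."},
            ]}]
            prompt = processor.apply_chat_template(
                conversation, add_generation_prompt=True, tokenize=False)
            inputs = processor(text=prompt, audio=[audio_chunk],
                               return_tensors="pt", padding=True,
                               use_audio_in_video=False)
            inputs = inputs.to(model.device).to(model.dtype)
            text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
            full_text = processor.batch_decode(
                text_ids, skip_special_tokens=True,
                clean_up_tokenization_spaces=False)[0]
            # Keep only the assistant turn, as in the diff above.
            return full_text.split("assistant\n")[-1].strip()
    return transcribe
```

main() would then call `transcribe = build_transcriber(args.whisper)` once after argument parsing and use `text = transcribe(audio_chunk)` inside the existing segment loop.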