SexHackMe / airtanscropt · Commit 1cfee90a
authored Dec 11, 2025 by Stefy Lanza (nextime / spora)
Revert to Whisper for transcription to fix execution errors
parent 7d3a3238
Pipeline #211 canceled with stages
Showing 2 changed files with 9 additions and 32 deletions (+9 −32)
README.md      +1 −1
transcript.py  +8 −31
README.md

 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
 ## Features
 - Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
 ...
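For context on the pipeline the README describes, the sketch below shows one way the speaker-diarization side can be assembled from the libraries transcript.py imports (Resemblyzer's VoiceEncoder plus scikit-learn's AgglomerativeClustering). It is an illustration only, not the repository's get_diarization(): the function name rough_diarization, the window/hop sizes, and the fixed speaker count are assumptions made for the example.

import librosa
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def rough_diarization(audio_path, n_speakers=2, window_s=2.0, hop_s=1.0):
    # Load mono audio at 16 kHz, the rate both Resemblyzer and Whisper expect
    audio, sr = librosa.load(audio_path, sr=16000)
    encoder = VoiceEncoder()

    # Embed overlapping windows of the signal with Resemblyzer
    win, hop = int(window_s * sr), int(hop_s * sr)
    starts = list(range(0, max(1, len(audio) - win), hop))
    embeds = np.array([encoder.embed_utterance(audio[s:s + win]) for s in starts])

    # Group the windows by speaker with agglomerative clustering
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(embeds)

    # Return (start_seconds, end_seconds, speaker_label) per window
    return [(s / sr, (s + win) / sr, int(lab)) for s, lab in zip(starts, labels)]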
transcript.py

 import argparse
 import torch
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from transformers import pipeline
 from resemblyzer import VoiceEncoder
 from sklearn.cluster import AgglomerativeClustering
 import webrtcvad
 ...
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
     return merged

 def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
     args = parser.parse_args()
 ...
@@ -73,11 +73,9 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return

-    # Load Qwen2.5-Omni-7B model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
-    processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
-    # Note: qwen_omni_utils not available, using direct processing
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
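The hunk above replaces the Qwen-Omni model loading with a transformers ASR pipeline. A minimal standalone sketch of that reverted code path is below, useful for checking that openai/whisper-large-v3 loads and transcribes outside the full script; the sample.wav path is a placeholder, and segments much longer than 30 seconds may additionally need the pipeline's chunking options.

import librosa
import torch
from transformers import pipeline

# Pick GPU 0 when available, otherwise CPU, as the new transcript.py does
device = 0 if torch.cuda.is_available() else -1
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

# Whisper's feature extractor expects 16 kHz mono float audio
audio, sr = librosa.load("sample.wav", sr=16000)  # placeholder file name
result = transcriber(audio, return_timestamps=False)
print(result["text"].strip())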
 ...
@@ -96,30 +94,9 @@ def main():
         if len(audio_chunk) == 0:
             continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        # Preparation for inference
-        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-        audios = [audio_chunk]
-        images = None
-        videos = None
-        inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
-        inputs = inputs.to(model.device).to(model.dtype)
-        # Inference: Generation of the output text
-        text_ids, _ = model.generate(**inputs, use_audio_in_video=False)
-        full_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        # Extract only the assistant's response
-        if "assistant\n" in full_text:
-            text = full_text.split("assistant\n")[-1].strip()
-        else:
-            text = full_text.strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

         # Format timestamps
         start_min, start_sec = divmod(start, 60)
 ...
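Putting the pieces of this hunk together, the per-segment loop that the commit leaves in place roughly amounts to the sketch below. The transcriber, audio, and sr variables are the ones set up earlier in main(); the example segment list and the SPEAKER_ label format are illustrative assumptions, and start/end are truncated to whole seconds only to keep the MM:SS formatting simple.

# Hypothetical (start_s, end_s, speaker) tuples standing in for get_diarization() output
segments = [(0.0, 4.2, 0), (4.2, 9.8, 1)]

for start, end, speaker in segments:
    audio_chunk = audio[int(start * sr):int(end * sr)]
    if len(audio_chunk) == 0:
        continue

    # Transcribe the segment with Whisper, as in the new code path
    result = transcriber(audio_chunk, return_timestamps=False)
    text = result["text"].strip()

    # Format the segment boundaries as MM:SS with divmod, as transcript.py does
    start_min, start_sec = divmod(int(start), 60)
    end_min, end_sec = divmod(int(end), 60)
    print(f"[{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}] SPEAKER_{speaker}: {text}")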