Switch to Whisper for transcription; remove Qwen-Omni (model repo not found)

parent 7f6696b6
 # Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using the Qwen-Omni-7B model.
+This Python application transcribes audio files with speaker diarization and timestamps using the Whisper and Resemblyzer models.
 ## Features
-- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
+- Automatic speech recognition with Whisper (openai/whisper-large-v3)
...
 import argparse
 import torch
-from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+from transformers import pipeline
 from resemblyzer import VoiceEncoder
 from sklearn.cluster import AgglomerativeClustering
 import webrtcvad
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
     return merged

 def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
     parser.add_argument('audio_file', help='Path to the audio file')
     args = parser.parse_args()
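The body of `get_diarization` is outside this hunk, so only its `return merged` is visible. For orientation, here is a minimal sketch of the Resemblyzer-plus-clustering approach the imports point at. This is an illustration, not the repo's code: it assumes 16 kHz mono audio and a fixed speaker count, and it skips the webrtcvad silence filtering the real script imports.

```python
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def sketch_diarization(audio, sr, n_speakers=2, win_s=1.5, hop_s=0.75):
    """Illustrative stand-in for get_diarization; returns (start, end, speaker) tuples."""
    encoder = VoiceEncoder()
    win, hop = int(win_s * sr), int(hop_s * sr)
    # Embed overlapping windows of the signal with Resemblyzer.
    starts = list(range(0, max(len(audio) - win, 1), hop))
    embeds = np.stack([encoder.embed_utterance(audio[s:s + win]) for s in starts])
    # Cluster the window embeddings into a fixed number of speakers.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(embeds)
    # Merge consecutive windows that share a speaker label into segments.
    merged = []
    for s, label in zip(starts, labels):
        t0, t1 = s / sr, (s + win) / sr
        if merged and merged[-1][2] == label:
            merged[-1][1] = max(merged[-1][1], t1)
        else:
            merged.append([t0, t1, label])
    return [(a, b, f"SPEAKER_{l}") for a, b, l in merged]
```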
@@ -73,14 +73,9 @@ def main():
         print(f"Error: Audio file '{audio_file}' not found.")
         return

-    # Load Qwen-Omni-7B model with 4-bit quantization
-    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        "Qwen/Qwen-Omni-7B",
-        quantization_config=quantization_config,
-        device_map="auto"
-    )
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen-Omni-7B")
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
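A note on the replacement API: the transformers ASR pipeline accepts a raw NumPy array and assumes it is already at the model's 16 kHz rate; passing a dict with an explicit sampling_rate is the unambiguous form. A standalone sanity check along those lines (sample.wav is a placeholder path, not a file in this repo):

```python
import torch
import librosa
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
asr = pipeline("automatic-speech-recognition",
               model="openai/whisper-large-v3", device=device)

# "sample.wav" is a placeholder; librosa resamples to Whisper's 16 kHz.
audio, sr = librosa.load("sample.wav", sr=16000)
# Supplying sampling_rate explicitly avoids relying on the 16 kHz default.
print(asr({"raw": audio, "sampling_rate": sr})["text"])
```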
@@ -99,19 +94,9 @@ def main():
         if len(audio_chunk) == 0:
             continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        inputs = processor(conversation=conversation, return_tensors="pt").to(model.device)
-
-        # Generate transcription
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

         # Format timestamps
         start_min, start_sec = divmod(start, 60)
...
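Two small caveats on the new loop body, shown together in a hedged variant below: near-empty chunks tend to make Whisper hallucinate text, and divmod returns floats that need int casting before zero-padded formatting. The 0.2 s floor and both helper names are illustrative, not part of the commit.

```python
def transcribe_segment(transcriber, audio_chunk, sr, min_seconds=0.2):
    """Hedged variant of the per-segment call above; min_seconds is illustrative."""
    if len(audio_chunk) < int(min_seconds * sr):
        return ""  # skip slivers that Whisper tends to hallucinate on
    result = transcriber({"raw": audio_chunk, "sampling_rate": sr},
                         return_timestamps=False)
    return result["text"].strip()

def format_stamp(seconds):
    """divmod(125.3, 60) -> (2.0, 5.3); cast before zero-padding -> '[02:05]'."""
    minutes, secs = divmod(seconds, 60)
    return f"[{int(minutes):02d}:{int(secs):02d}]"
```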