Final version using Whisper for transcription, Resemblyzer for diarization

# Audio Transcription App
-This Python application transcribes audio files with speaker diarization and timestamps using Qwen2.5-Omni-7B and Resemblyzer models.
+This Python application transcribes audio files with speaker diarization and timestamps using Whisper and Resemblyzer models.
## Features
-- Automatic speech recognition with Qwen-Omni-7B (4-bit quantized)
......
import argparse
import torch
-from transformers import AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import pipeline
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
......
@@ -62,7 +62,7 @@ def get_diarization(audio, sr):
    return merged


def main():
-    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen2.5-Omni-7B')
+    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps')
    parser.add_argument('audio_file', help='Path to the audio file')
    args = parser.parse_args()
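
The body of `get_diarization` is collapsed in this diff; only its final `return merged` is visible above. Judging from the imports (`webrtcvad`, `VoiceEncoder`, `AgglomerativeClustering`), a minimal sketch of such a function could look like the following. The window and hop sizes, the fixed two-speaker count, the `SPEAKER_n` labels, and the merge logic are illustrative assumptions, and the VAD step is omitted for brevity; this is not the commit's actual implementation.

# Illustrative sketch only: stands in for the collapsed get_diarization body.
import numpy as np
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering

def get_diarization_sketch(audio, sr, win_s=1.5, hop_s=0.75, n_speakers=2):
    encoder = VoiceEncoder()
    win, hop = int(win_s * sr), int(hop_s * sr)

    # Embed overlapping windows of the 16 kHz waveform with Resemblyzer.
    starts, embeds = [], []
    for offset in range(0, max(len(audio) - win, 1), hop):
        chunk = audio[offset:offset + win]
        if len(chunk) < win // 2:
            continue
        starts.append(offset / sr)
        embeds.append(encoder.embed_utterance(chunk))

    # Cluster the window embeddings into a fixed number of speakers.
    labels = AgglomerativeClustering(n_clusters=n_speakers).fit_predict(np.stack(embeds))

    # Merge consecutive windows with the same label into (start, end, speaker) segments.
    merged = []
    for t, label in zip(starts, labels):
        speaker = f"SPEAKER_{label}"
        if merged and merged[-1][2] == speaker:
            merged[-1][1] = t + win_s
        else:
            merged.append([t, t + win_s, speaker])
    return [tuple(seg) for seg in merged]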
......
@@ -73,10 +73,9 @@ def main():
        print(f"Error: Audio file '{audio_file}' not found.")
        return

-    # Load Qwen2.5-Omni-7B model with 4-bit quantization
-    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
-    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Omni-7B", quantization_config=quantization_config, device_map="auto")
+    # Load Whisper for transcription
+    device = 0 if torch.cuda.is_available() else -1
+    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)

    # Load audio
    audio, sr = librosa.load(audio_file, sr=16000)
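
For reference (not part of the commit): `librosa.load(..., sr=16000)` already delivers audio at Whisper's expected 16 kHz, so the bare NumPy array passed to the pipeline in the loop below is interpreted at the correct rate. If the sampling rate were ever different, the pipeline's documented dict input form makes the rate explicit and lets the pipeline resample if needed:

# Two equivalent ways to call the pipeline loaded above on a float32 waveform.
# A bare array is assumed to already be at the model's 16 kHz rate (true here,
# because librosa.load is pinned to sr=16000); the dict form states the rate explicitly.
result = transcriber(audio_chunk)
result = transcriber({"raw": audio_chunk, "sampling_rate": sr})
text = result["text"].strip()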
......
@@ -95,19 +94,9 @@ def main():
        if len(audio_chunk) == 0:
            continue

-        # Prepare inputs for Qwen-Omni
-        conversation = [
-            {"role": "user", "content": [
-                {"type": "audio", "audio": {"waveform": audio_chunk, "sample_rate": sr}},
-                {"type": "text", "text": "Transcribe this audio segment exactly as spoken."}
-            ]}
-        ]
-        inputs = processor(conversation=conversation, return_tensors="pt").to(model.device)
-        # Generate transcription
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        # Transcribe with Whisper
+        result = transcriber(audio_chunk, return_timestamps=False)
+        text = result['text'].strip()

        # Format timestamps
        start_min, start_sec = divmod(start, 60)
......
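
The remainder of the loop is collapsed. Below is a small self-contained sketch of how the `divmod` results, the speaker label, and the transcribed text might be assembled into an output line; the exact format string and the `end` and `speaker` names are assumptions, not the commit's code.

def format_segment(start, end, speaker, text):
    # Illustrative formatting helper (hypothetical, not from the repository).
    start_min, start_sec = divmod(int(start), 60)
    end_min, end_sec = divmod(int(end), 60)
    return f"[{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}] {speaker}: {text}"

# format_segment(75.2, 80.9, "SPEAKER_0", "Hello there.")
# -> '[01:15 - 01:20] SPEAKER_0: Hello there.'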