Replace pyannote with resemblyzer for token-free diarization

parent 460eaa23
......@@ -27,10 +27,7 @@ This Python application transcribes audio files with speaker diarization and tim
```bash
pip install -r requirements.txt
```
5. Optional: Set Hugging Face token for pyannote models (required for first download):
```bash
export HF_TOKEN=your_huggingface_token
```
5. No additional setup required - models download automatically on first run.
## Usage
```bash
......
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
from pyannote.audio import Pipeline
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
import webrtcvad
import librosa
import numpy as np
import os
def get_diarization(audio, sr):
    """Segment speech with webrtcvad, embed each segment with resemblyzer,
    and cluster the embeddings to assign speaker labels."""
    encoder = VoiceEncoder()
    vad = webrtcvad.Vad(3)  # aggressiveness 3 = most aggressive filtering of non-speech
    frame_duration = 0.01  # 10 ms frames; webrtcvad accepts only 10, 20 or 30 ms
    frame_length = int(sr * frame_duration)
    segments = []
    start = None
    for i in range(0, len(audio) - frame_length, frame_length):
        frame = audio[i:i + frame_length]
        # webrtcvad expects 16-bit PCM bytes, so scale the float waveform first
        is_speech = vad.is_speech((frame * 32767).astype(np.int16).tobytes(), sr)
        if is_speech and start is None:
            start = i / sr
        elif not is_speech and start is not None:
            end = i / sr
            segments.append((start, end))
            start = None
    if start is not None:
        segments.append((start, len(audio) / sr))
    if not segments:
        return []
    # Embed every voiced segment, keeping segments and embeddings aligned
    embeddings = []
    voiced_segments = []
    for start, end in segments:
        chunk = audio[int(start * sr):int(end * sr)]
        if len(chunk) == 0:
            continue
        embeddings.append(encoder.embed_utterance(chunk))
        voiced_segments.append((start, end))
    segments = voiced_segments
    if len(embeddings) <= 1:
        labels = [0] * len(segments)
    else:
        # Let the distance threshold decide how many speakers there are
        clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5).fit(embeddings)
        labels = clustering.labels_
        n_speakers = len(set(labels))
        if n_speakers > 1:
            # Re-fit with the inferred speaker count for a cleaner assignment
            clustering = AgglomerativeClustering(n_clusters=n_speakers).fit(embeddings)
            labels = clustering.labels_
    diarization = []
    for (start, end), label in zip(segments, labels):
        diarization.append((start, end, f"SPEAKER_{label:02d}"))
    # Merge consecutive segments from the same speaker separated by a small gap
    merged = []
    if diarization:
        current_start, current_end, current_speaker = diarization[0]
        for start, end, speaker in diarization[1:]:
            if speaker == current_speaker and abs(start - current_end) < 0.1:  # gap under 100 ms
                current_end = end
            else:
                merged.append((current_start, current_end, current_speaker))
                current_start, current_end, current_speaker = start, end, speaker
        merged.append((current_start, current_end, current_speaker))
    return merged
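
# Minimal usage sketch for get_diarization (assumes a 16 kHz mono recording at the
# placeholder path "example.wav"; main() below does the same thing end to end):
#
#     audio, sr = librosa.load("example.wav", sr=16000)
#     for start, end, speaker in get_diarization(audio, sr):
#         print(f"{start:.2f}s - {end:.2f}s {speaker}")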
def main():
    parser = argparse.ArgumentParser(description='Transcribe audio with speakers and timestamps using Qwen-Omni-7B')
    parser.add_argument('audio_file', help='Path to the audio file')
......@@ -26,27 +82,23 @@ def main():
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen-Omni-7B")
    # Load diarization pipeline
    # Optional: Set HF_TOKEN environment variable for pyannote authentication (required for first download)
    diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
    # Load audio
    audio, sr = librosa.load(audio_file, sr=16000)
    # Diarize first
    diarization = diarization_pipeline(audio_file)
    # Diarize
    diarization = get_diarization(audio, sr)
    # Transcribe each speaker segment
    output_lines = []
    for turn, _, speaker_id in diarization.itertracks(yield_label=True):
        start = turn.start
        end = turn.end
    for start, end, speaker_id in diarization:
        # Extract audio chunk
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        audio_chunk = audio[start_sample:end_sample]
        if len(audio_chunk) == 0:
            continue
        # Prepare inputs for Qwen-Omni
        conversation = [
            {"role": "user", "content": [
......