Add --audio-chunk option for audio/video chunking strategies

Added --audio-chunk argument with 3 modes:
- overlap (default): overlapping chunks like [0-60], [58-118]
- word-boundary: uses Whisper timestamps to split at word boundaries
- vad: uses Voice Activity Detection to skip silence

Also added --audio-chunk-overlap to control overlap duration.

New functions added:
- process_video_with_vad(): VAD-based chunking
- process_video_word_boundary(): Word-boundary chunking using Whisper

Modified:
- transcribe_video_audio(): accepts audio_chunk_type and audio_chunk_overlap params
- _transcribe_chunked(): accepts chunk_type and overlap params
parent caf3c707
...@@ -532,6 +532,244 @@ def process_long_video_in_chunks(video_path, process_func, chunk_duration=60, ...@@ -532,6 +532,244 @@ def process_long_video_in_chunks(video_path, process_func, chunk_duration=60,
if os.path.exists(temp_dir): if os.path.exists(temp_dir):
shutil.rmtree(temp_dir, ignore_errors=True) shutil.rmtree(temp_dir, ignore_errors=True)
def process_video_with_vad(video_path, process_func, chunk_duration=60,
overlap=2, progress_callback=None, **kwargs):
"""Process video using Voice Activity Detection to skip silence
Only processes segments with actual speech, reducing processing time.
Args:
video_path: Path to video file
process_func: Function to call for each chunk
chunk_duration: Max duration of each chunk
overlap: Overlap between chunks
progress_callback: Optional callback for progress
**kwargs: Additional arguments
Returns:
Combined results from all chunks
"""
video_info = get_video_info(video_path)
if not video_info:
print("❌ Could not get video info")
return None
total_duration = video_info["duration"]
if total_duration <= chunk_duration:
return process_func(video_path, 0, **kwargs)
# Extract full audio for VAD analysis
import tempfile
temp_dir = tempfile.mkdtemp(prefix="videogen_vad_")
full_audio = os.path.join(temp_dir, "full_audio.wav")
try:
# Extract audio
subprocess.run([
'ffmpeg', '-y', '-i', video_path,
'-vn', '-acodec', 'pcm_s16le',
'-ar', '16000', '-ac', '1', full_audio
], capture_output=True)
# Try to use VAD
try:
import webrtcvad
vad = webrtcvad.Vad(2) # Moderate aggressiveness
# Read audio and detect speech segments
import wave
with wave.open(full_audio, 'rb') as wf:
sample_rate = wf.getframerate()
num_channels = wf.getnchannels()
frames = wf.readframes(wf.getnframes())
# Convert to 16-bit PCM
import struct
audio_data = struct.unpack(f"{len(frames)//2}h", frames)
# Detect speech segments (10ms frames)
frame_duration = 10 # ms
frame_size = int(sample_rate * frame_duration / 1000)
speech_segments = []
for i in range(0, len(audio_data) - frame_size, frame_size):
frame = audio_data[i:i+frame_size]
if vad.is_speech(struct.pack(f"{len(frame)}h", *frame), sample_rate):
start_sec = i / sample_rate
end_sec = (i + frame_size) / sample_rate
if not speech_segments or start_sec - speech_segments[-1][1] > 0.5:
speech_segments.append([start_sec, end_sec])
else:
speech_segments[-1][1] = end_sec
if not speech_segments:
print("⚠️ No speech detected, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
print(f"\n🎤 VAD found {len(speech_segments)} speech segments")
# Merge short segments into chunks
chunks = []
for start, end in speech_segments:
if not chunks or start - chunks[-1][1] > overlap:
chunks.append([start, min(end, start + chunk_duration)])
else:
chunks[-1][1] = min(end, chunks[-1][0] + chunk_duration)
except ImportError:
print("⚠️ webrtcvad not available, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
results = []
temp_chunk_dir = tempfile.mkdtemp(prefix="videogen_chunks_")
for idx, (start, end) in enumerate(chunks):
duration = end - start
chunk_audio = os.path.join(temp_chunk_dir, f"chunk_{idx}.wav")
if not extract_audio_chunk(video_path, start, duration, chunk_audio):
continue
try:
chunk_result = process_func(chunk_audio, start, **kwargs)
if chunk_result:
results.append(chunk_result)
clear_memory(clear_cuda=True)
except Exception as e:
print(f" ⚠️ Chunk processing failed: {e}")
if os.path.exists(chunk_audio):
os.remove(chunk_audio)
shutil.rmtree(temp_chunk_dir, ignore_errors=True)
return results
finally:
if os.path.exists(full_audio):
os.remove(full_audio)
shutil.rmtree(temp_dir, ignore_errors=True)
def process_video_word_boundary(video_path, process_func, chunk_duration=60,
overlap=2, progress_callback=None, **kwargs):
"""Process video using word-boundary detection from Whisper
Uses Whisper word timestamps to split at word boundaries,
preserving complete words at chunk edges.
Args:
video_path: Path to video file
process_func: Function to call for each chunk
chunk_duration: Max duration of each chunk
overlap: Overlap between chunks (for context)
progress_callback: Optional callback
**kwargs: Additional arguments
"""
video_info = get_video_info(video_path)
if not video_info:
print("❌ Could not get video info")
return None
total_duration = video_info["duration"]
if total_duration <= chunk_duration:
return process_func(video_path, 0, **kwargs)
import tempfile
import shutil
temp_dir = tempfile.mkdtemp(prefix="videogen_word_")
full_audio = os.path.join(temp_dir, "full_audio.wav")
try:
# Extract audio
subprocess.run([
'ffmpeg', '-y', '-i', video_path,
'-vn', '-acodec', 'pcm_s16le',
'-ar', '16000', '-ac', '1', full_audio
], capture_output=True)
# Use Whisper for word timestamps
try:
import whisper
print("\n🔍 Detecting word boundaries with Whisper...")
model = whisper.load_model("base")
result = model.transcribe(full_audio, word_timestamps=True)
words = result.get("words", [])
if not words:
# Fallback: try segment-based
words = [{"start": s["start"], "end": s["end"]} for s in result.get("segments", [])]
if not words:
print("⚠️ No word segments detected, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
print(f" Found {len(words)} words/segments")
# Group into chunks at word boundaries
chunks = []
chunk_start = None
chunk_end = None
for word in words:
start = word.get("start", 0)
end = word.get("end", start + 0.1)
if chunk_start is None:
chunk_start = start
chunk_end = end
elif end - chunk_start >= chunk_duration:
# Current chunk is full, save it and start new
chunks.append((chunk_start, chunk_end))
# Start new chunk with overlap for context
chunk_start = max(overlap, chunk_end - overlap)
chunk_end = end
else:
chunk_end = end
# Add last chunk
if chunk_start is not None:
chunks.append((chunk_start, chunk_end))
print(f" Created {len(chunks)} word-boundary chunks")
except ImportError:
print("⚠️ Whisper not available, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
results = []
temp_chunk_dir = tempfile.mkdtemp(prefix="videogen_chunks_")
for idx, (start, end) in enumerate(chunks):
duration = end - start
chunk_audio = os.path.join(temp_chunk_dir, f"chunk_{idx}.wav")
if not extract_audio_chunk(video_path, start, duration, chunk_audio):
continue
try:
chunk_result = process_func(chunk_audio, start, **kwargs)
if chunk_result:
results.append(chunk_result)
clear_memory(clear_cuda=True)
except Exception as e:
print(f" ⚠️ Chunk processing failed: {e}")
if os.path.exists(chunk_audio):
os.remove(chunk_audio)
shutil.rmtree(temp_chunk_dir, ignore_errors=True)
return results
finally:
if os.path.exists(full_audio):
os.remove(full_audio)
shutil.rmtree(temp_dir, ignore_errors=True)
# NSFW text classification # NSFW text classification
TRANSFORMERS_AVAILABLE = False TRANSFORMERS_AVAILABLE = False
NSFW_CLASSIFIER = None NSFW_CLASSIFIER = None
...@@ -6448,7 +6686,7 @@ TRANSLATION_LANGUAGES = { ...@@ -6448,7 +6686,7 @@ TRANSLATION_LANGUAGES = {
} }
def transcribe_video_audio(video_path, model_size="base", language=None, auto_chunk=True): def transcribe_video_audio(video_path, model_size="base", language=None, auto_chunk=True, audio_chunk_type="overlap", audio_chunk_overlap=2.0, args=None):
"""Transcribe audio from video using Whisper with memory management """Transcribe audio from video using Whisper with memory management
Args: Args:
...@@ -6456,10 +6694,17 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6456,10 +6694,17 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
model_size: Whisper model size (tiny, base, small, medium, large) model_size: Whisper model size (tiny, base, small, medium, large)
language: Source language code (optional, auto-detected if not provided) language: Source language code (optional, auto-detected if not provided)
auto_chunk: Automatically chunk long videos (default: True) auto_chunk: Automatically chunk long videos (default: True)
audio_chunk_type: Chunking strategy - "overlap", "word-boundary", or "vad"
audio_chunk_overlap: Overlap duration in seconds for overlap mode
args: Optional argparse args object for direct access to audio_chunk settings
Returns: Returns:
List of segments with text, start, end times List of segments with text, start, end times
""" """
# Override with args if provided
if args is not None:
audio_chunk_type = getattr(args, 'audio_chunk', 'overlap')
audio_chunk_overlap = getattr(args, 'audio_chunk_overlap', 2.0)
if not WHISPER_AVAILABLE: if not WHISPER_AVAILABLE:
print("❌ Whisper not available. Install with: pip install openai-whisper") print("❌ Whisper not available. Install with: pip install openai-whisper")
return None return None
...@@ -6499,7 +6744,8 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6499,7 +6744,8 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
if should_chunk and video_info: if should_chunk and video_info:
# Process in chunks for long videos # Process in chunks for long videos
result = _transcribe_chunked(video_path, model, video_info, chunk_duration, language) # Get chunk type from args, default to overlap
result = _transcribe_chunked(video_path, model, video_info, chunk_duration, language, audio_chunk_type, audio_chunk_overlap)
else: else:
# Process entire video at once # Process entire video at once
transcribe_options = {} transcribe_options = {}
...@@ -6537,7 +6783,7 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6537,7 +6783,7 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
return None return None
def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=None): def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=None, chunk_type="overlap", overlap=2):
"""Internal function to transcribe long videos in chunks """Internal function to transcribe long videos in chunks
Args: Args:
...@@ -6546,12 +6792,13 @@ def _transcribe_chunked(video_path, model, video_info, chunk_duration, language= ...@@ -6546,12 +6792,13 @@ def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=
video_info: Video information dict video_info: Video information dict
chunk_duration: Duration of each chunk in seconds chunk_duration: Duration of each chunk in seconds
language: Optional language code language: Optional language code
chunk_type: Chunking strategy - "overlap", "word-boundary", or "vad"
overlap: Overlap duration in seconds (for overlap mode)
Returns: Returns:
Combined transcription result Combined transcription result
""" """
total_duration = video_info["duration"] total_duration = video_info["duration"]
overlap = 5 # 5 second overlap for continuity
print(f"\n 📦 Processing in chunks ({chunk_duration}s each)") print(f"\n 📦 Processing in chunks ({chunk_duration}s each)")
...@@ -9858,6 +10105,10 @@ List TTS voices: ...@@ -9858,6 +10105,10 @@ List TTS voices:
# Music generation arguments # Music generation arguments
parser.add_argument("--music_model", choices=["small", "medium", "large"], default="medium", parser.add_argument("--music_model", choices=["small", "medium", "large"], default="medium",
help="MusicGen model size (larger = better quality, slower)") help="MusicGen model size (larger = better quality, slower)")
parser.add_argument("--audio-chunk", choices=["overlap", "word-boundary", "vad"], default="overlap",
help="Audio chunking strategy for long videos: overlap (default), word-boundary (uses Whisper timestamps), vad (skip silence)")
parser.add_argument("--audio-chunk-overlap", type=float, default=2.0,
help="Overlap duration in seconds for overlap mode (default: 2)")
# Audio sync arguments # Audio sync arguments
parser.add_argument("--sync_audio", action="store_true", parser.add_argument("--sync_audio", action="store_true",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment