Add --audio-chunk option for audio/video chunking strategies

Added --audio-chunk argument with 3 modes:
- overlap (default): overlapping chunks like [0-60], [58-118]
- word-boundary: uses Whisper timestamps to split at word boundaries
- vad: uses Voice Activity Detection to skip silence

Also added --audio-chunk-overlap to control overlap duration.

New functions added:
- process_video_with_vad(): VAD-based chunking
- process_video_word_boundary(): Word-boundary chunking using Whisper

Modified:
- transcribe_video_audio(): accepts audio_chunk_type and audio_chunk_overlap params
- _transcribe_chunked(): accepts chunk_type and overlap params
parent caf3c707
...@@ -532,6 +532,244 @@ def process_long_video_in_chunks(video_path, process_func, chunk_duration=60, ...@@ -532,6 +532,244 @@ def process_long_video_in_chunks(video_path, process_func, chunk_duration=60,
if os.path.exists(temp_dir): if os.path.exists(temp_dir):
shutil.rmtree(temp_dir, ignore_errors=True) shutil.rmtree(temp_dir, ignore_errors=True)
def process_video_with_vad(video_path, process_func, chunk_duration=60,
overlap=2, progress_callback=None, **kwargs):
"""Process video using Voice Activity Detection to skip silence
Only processes segments with actual speech, reducing processing time.
Args:
video_path: Path to video file
process_func: Function to call for each chunk
chunk_duration: Max duration of each chunk
overlap: Overlap between chunks
progress_callback: Optional callback for progress
**kwargs: Additional arguments
Returns:
Combined results from all chunks
"""
video_info = get_video_info(video_path)
if not video_info:
print("❌ Could not get video info")
return None
total_duration = video_info["duration"]
if total_duration <= chunk_duration:
return process_func(video_path, 0, **kwargs)
# Extract full audio for VAD analysis
import tempfile
temp_dir = tempfile.mkdtemp(prefix="videogen_vad_")
full_audio = os.path.join(temp_dir, "full_audio.wav")
try:
# Extract audio
subprocess.run([
'ffmpeg', '-y', '-i', video_path,
'-vn', '-acodec', 'pcm_s16le',
'-ar', '16000', '-ac', '1', full_audio
], capture_output=True)
# Try to use VAD
try:
import webrtcvad
vad = webrtcvad.Vad(2) # Moderate aggressiveness
# Read audio and detect speech segments
import wave
with wave.open(full_audio, 'rb') as wf:
sample_rate = wf.getframerate()
num_channels = wf.getnchannels()
frames = wf.readframes(wf.getnframes())
# Convert to 16-bit PCM
import struct
audio_data = struct.unpack(f"{len(frames)//2}h", frames)
# Detect speech segments (10ms frames)
frame_duration = 10 # ms
frame_size = int(sample_rate * frame_duration / 1000)
speech_segments = []
for i in range(0, len(audio_data) - frame_size, frame_size):
frame = audio_data[i:i+frame_size]
if vad.is_speech(struct.pack(f"{len(frame)}h", *frame), sample_rate):
start_sec = i / sample_rate
end_sec = (i + frame_size) / sample_rate
if not speech_segments or start_sec - speech_segments[-1][1] > 0.5:
speech_segments.append([start_sec, end_sec])
else:
speech_segments[-1][1] = end_sec
if not speech_segments:
print("⚠️ No speech detected, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
print(f"\n🎤 VAD found {len(speech_segments)} speech segments")
# Merge short segments into chunks
chunks = []
for start, end in speech_segments:
if not chunks or start - chunks[-1][1] > overlap:
chunks.append([start, min(end, start + chunk_duration)])
else:
chunks[-1][1] = min(end, chunks[-1][0] + chunk_duration)
except ImportError:
print("⚠️ webrtcvad not available, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
results = []
temp_chunk_dir = tempfile.mkdtemp(prefix="videogen_chunks_")
for idx, (start, end) in enumerate(chunks):
duration = end - start
chunk_audio = os.path.join(temp_chunk_dir, f"chunk_{idx}.wav")
if not extract_audio_chunk(video_path, start, duration, chunk_audio):
continue
try:
chunk_result = process_func(chunk_audio, start, **kwargs)
if chunk_result:
results.append(chunk_result)
clear_memory(clear_cuda=True)
except Exception as e:
print(f" ⚠️ Chunk processing failed: {e}")
if os.path.exists(chunk_audio):
os.remove(chunk_audio)
shutil.rmtree(temp_chunk_dir, ignore_errors=True)
return results
finally:
if os.path.exists(full_audio):
os.remove(full_audio)
shutil.rmtree(temp_dir, ignore_errors=True)
def process_video_word_boundary(video_path, process_func, chunk_duration=60,
overlap=2, progress_callback=None, **kwargs):
"""Process video using word-boundary detection from Whisper
Uses Whisper word timestamps to split at word boundaries,
preserving complete words at chunk edges.
Args:
video_path: Path to video file
process_func: Function to call for each chunk
chunk_duration: Max duration of each chunk
overlap: Overlap between chunks (for context)
progress_callback: Optional callback
**kwargs: Additional arguments
"""
video_info = get_video_info(video_path)
if not video_info:
print("❌ Could not get video info")
return None
total_duration = video_info["duration"]
if total_duration <= chunk_duration:
return process_func(video_path, 0, **kwargs)
import tempfile
import shutil
temp_dir = tempfile.mkdtemp(prefix="videogen_word_")
full_audio = os.path.join(temp_dir, "full_audio.wav")
try:
# Extract audio
subprocess.run([
'ffmpeg', '-y', '-i', video_path,
'-vn', '-acodec', 'pcm_s16le',
'-ar', '16000', '-ac', '1', full_audio
], capture_output=True)
# Use Whisper for word timestamps
try:
import whisper
print("\n🔍 Detecting word boundaries with Whisper...")
model = whisper.load_model("base")
result = model.transcribe(full_audio, word_timestamps=True)
words = result.get("words", [])
if not words:
# Fallback: try segment-based
words = [{"start": s["start"], "end": s["end"]} for s in result.get("segments", [])]
if not words:
print("⚠️ No word segments detected, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
print(f" Found {len(words)} words/segments")
# Group into chunks at word boundaries
chunks = []
chunk_start = None
chunk_end = None
for word in words:
start = word.get("start", 0)
end = word.get("end", start + 0.1)
if chunk_start is None:
chunk_start = start
chunk_end = end
elif end - chunk_start >= chunk_duration:
# Current chunk is full, save it and start new
chunks.append((chunk_start, chunk_end))
# Start new chunk with overlap for context
chunk_start = max(overlap, chunk_end - overlap)
chunk_end = end
else:
chunk_end = end
# Add last chunk
if chunk_start is not None:
chunks.append((chunk_start, chunk_end))
print(f" Created {len(chunks)} word-boundary chunks")
except ImportError:
print("⚠️ Whisper not available, falling back to overlap mode")
return process_long_video_in_chunks(video_path, process_func, chunk_duration, overlap, progress_callback, **kwargs)
results = []
temp_chunk_dir = tempfile.mkdtemp(prefix="videogen_chunks_")
for idx, (start, end) in enumerate(chunks):
duration = end - start
chunk_audio = os.path.join(temp_chunk_dir, f"chunk_{idx}.wav")
if not extract_audio_chunk(video_path, start, duration, chunk_audio):
continue
try:
chunk_result = process_func(chunk_audio, start, **kwargs)
if chunk_result:
results.append(chunk_result)
clear_memory(clear_cuda=True)
except Exception as e:
print(f" ⚠️ Chunk processing failed: {e}")
if os.path.exists(chunk_audio):
os.remove(chunk_audio)
shutil.rmtree(temp_chunk_dir, ignore_errors=True)
return results
finally:
if os.path.exists(full_audio):
os.remove(full_audio)
shutil.rmtree(temp_dir, ignore_errors=True)
# NSFW text classification # NSFW text classification
TRANSFORMERS_AVAILABLE = False TRANSFORMERS_AVAILABLE = False
NSFW_CLASSIFIER = None NSFW_CLASSIFIER = None
...@@ -6448,7 +6686,7 @@ TRANSLATION_LANGUAGES = { ...@@ -6448,7 +6686,7 @@ TRANSLATION_LANGUAGES = {
} }
def transcribe_video_audio(video_path, model_size="base", language=None, auto_chunk=True): def transcribe_video_audio(video_path, model_size="base", language=None, auto_chunk=True, audio_chunk_type="overlap", audio_chunk_overlap=2.0, args=None):
"""Transcribe audio from video using Whisper with memory management """Transcribe audio from video using Whisper with memory management
Args: Args:
...@@ -6456,10 +6694,17 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6456,10 +6694,17 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
model_size: Whisper model size (tiny, base, small, medium, large) model_size: Whisper model size (tiny, base, small, medium, large)
language: Source language code (optional, auto-detected if not provided) language: Source language code (optional, auto-detected if not provided)
auto_chunk: Automatically chunk long videos (default: True) auto_chunk: Automatically chunk long videos (default: True)
audio_chunk_type: Chunking strategy - "overlap", "word-boundary", or "vad"
audio_chunk_overlap: Overlap duration in seconds for overlap mode
args: Optional argparse args object for direct access to audio_chunk settings
Returns: Returns:
List of segments with text, start, end times List of segments with text, start, end times
""" """
# Override with args if provided
if args is not None:
audio_chunk_type = getattr(args, 'audio_chunk', 'overlap')
audio_chunk_overlap = getattr(args, 'audio_chunk_overlap', 2.0)
if not WHISPER_AVAILABLE: if not WHISPER_AVAILABLE:
print("❌ Whisper not available. Install with: pip install openai-whisper") print("❌ Whisper not available. Install with: pip install openai-whisper")
return None return None
...@@ -6499,7 +6744,8 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6499,7 +6744,8 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
if should_chunk and video_info: if should_chunk and video_info:
# Process in chunks for long videos # Process in chunks for long videos
result = _transcribe_chunked(video_path, model, video_info, chunk_duration, language) # Get chunk type from args, default to overlap
result = _transcribe_chunked(video_path, model, video_info, chunk_duration, language, audio_chunk_type, audio_chunk_overlap)
else: else:
# Process entire video at once # Process entire video at once
transcribe_options = {} transcribe_options = {}
...@@ -6537,7 +6783,7 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch ...@@ -6537,7 +6783,7 @@ def transcribe_video_audio(video_path, model_size="base", language=None, auto_ch
return None return None
def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=None): def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=None, chunk_type="overlap", overlap=2):
"""Internal function to transcribe long videos in chunks """Internal function to transcribe long videos in chunks
Args: Args:
...@@ -6546,12 +6792,13 @@ def _transcribe_chunked(video_path, model, video_info, chunk_duration, language= ...@@ -6546,12 +6792,13 @@ def _transcribe_chunked(video_path, model, video_info, chunk_duration, language=
video_info: Video information dict video_info: Video information dict
chunk_duration: Duration of each chunk in seconds chunk_duration: Duration of each chunk in seconds
language: Optional language code language: Optional language code
chunk_type: Chunking strategy - "overlap", "word-boundary", or "vad"
overlap: Overlap duration in seconds (for overlap mode)
Returns: Returns:
Combined transcription result Combined transcription result
""" """
total_duration = video_info["duration"] total_duration = video_info["duration"]
overlap = 5 # 5 second overlap for continuity
print(f"\n 📦 Processing in chunks ({chunk_duration}s each)") print(f"\n 📦 Processing in chunks ({chunk_duration}s each)")
...@@ -9858,6 +10105,10 @@ List TTS voices: ...@@ -9858,6 +10105,10 @@ List TTS voices:
# Music generation arguments # Music generation arguments
parser.add_argument("--music_model", choices=["small", "medium", "large"], default="medium", parser.add_argument("--music_model", choices=["small", "medium", "large"], default="medium",
help="MusicGen model size (larger = better quality, slower)") help="MusicGen model size (larger = better quality, slower)")
parser.add_argument("--audio-chunk", choices=["overlap", "word-boundary", "vad"], default="overlap",
help="Audio chunking strategy for long videos: overlap (default), word-boundary (uses Whisper timestamps), vad (skip silence)")
parser.add_argument("--audio-chunk-overlap", type=float, default=2.0,
help="Overlap duration in seconds for overlap mode (default: 2)")
# Audio sync arguments # Audio sync arguments
parser.add_argument("--sync_audio", action="store_true", parser.add_argument("--sync_audio", action="store_true",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment