Remove qwen_omni_utils import and set audio inputs directly

parent 6b2dfd39
Pipeline #208 canceled
@@ -77,8 +77,7 @@ def main():
     model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True)
     processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", trust_remote_code=True)
-    # Import utils after loading model
-    from qwen_omni_utils import process_mm_info
+    # Note: qwen_omni_utils not available, using direct processing
     # Load audio
     audio, sr = librosa.load(audio_file, sr=16000)
@@ -107,7 +106,9 @@ def main():
     # Preparation for inference
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
+    audios = [audio_chunk]
+    images = []
+    videos = []
     inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
     inputs = inputs.to(model.device).to(model.dtype)
...
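For context, below is a minimal end-to-end sketch of the direct-processing path this commit switches to. The `audio_file` path, the `conversation` layout, and the step producing `audio_chunk` are collapsed out of the diff, so those parts are illustrative assumptions; the model, processor, and `processor(...)` calls mirror the lines shown above.

```python
# Minimal sketch of running Qwen2.5-Omni on audio without qwen_omni_utils.
# Assumptions (not shown in the diff): the audio_file path, the conversation
# structure, and the chunking step that yields audio_chunk.
import librosa
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto", trust_remote_code=True
)
processor = Qwen2_5OmniProcessor.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", trust_remote_code=True
)

# Load audio at the 16 kHz sample rate the processor expects.
audio_file = "sample.wav"  # assumed path
audio, sr = librosa.load(audio_file, sr=16000)
audio_chunk = audio  # the real script slices the audio; the whole clip is used here

# Assumed conversation layout, following the Qwen2.5-Omni chat format.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": audio_file},
            {"type": "text", "text": "Transcribe this audio."},
        ],
    },
]

# Preparation for inference: pass the raw waveform directly instead of
# routing the conversation through process_mm_info.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios = [audio_chunk]
images = []
videos = []
inputs = processor(text=text, audio=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=False)
inputs = inputs.to(model.device).to(model.dtype)

with torch.no_grad():
    # return_audio=False keeps generate() to text token ids only.
    output_ids = model.generate(**inputs, max_new_tokens=256, return_audio=False)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```

The key point of the change: for audio-only input, the processor accepts the raw 16 kHz waveform list directly, so the `process_mm_info` helper from `qwen_omni_utils` is not required.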