feat: add multimodal client doubt modes

fbdcc1d5 · Stefy Lanza (nextime / spora ) · 49c2f682 · fbdcc1d5 · fbdcc1d5
Commit fbdcc1d5 authored May 06, 2026 by Stefy Lanza (nextime / spora )
Hide whitespace changes
Inline Side-by-side

Showing with 80 additions and 0 deletions

test_manual_multimodal_test_client.py tests/test_manual_multimodal_test_client.py +46 -0

manual_multimodal_test_client.py tools/manual_multimodal_test_client.py +34 -0

No files found.
--- a/tests/test_manual_multimodal_test_client.py
+++ b/tests/test_manual_multimodal_test_client.py
@@ -234,6 +234,52 @@ def test_build_request_spec_for_video_generation_uses_json_payload(tmp_path):
    }


+def test_build_request_spec_for_video_doubt_uses_text_endpoint_with_video_context(tmp_path):
+    video_path = tmp_path / "clip.mp4"
+    video_path.write_bytes(b"video-bytes")  # placeholder content; the assertions below only look at the path
+    config = {
+        "mode": "video-doubt",  # mode under test
+        "url": "http://127.0.0.1:6745",
+        "model": "vision:test",
+        "prompt": "What happens in this clip?",
+        "output_dir": tmp_path,
+        "token": None,
+        "audio_file": None,
+        "video_file": str(video_path),  # real file on disk; presumably validated by the builder - confirm against _require_file
+        "response_format": None,
+    }
+
+    spec = build_request_spec(config)
+
+    assert spec["url"].endswith("/v1/chat/completions")  # request must target the chat-completions endpoint
+    assert spec["json"]["model"] == "vision:test"  # configured model is passed through unchanged
+    assert str(video_path) in spec["json"]["messages"][0]["content"]  # video is referenced by path inside the message text
+    assert "What happens in this clip?" in spec["json"]["messages"][0]["content"]  # prompt travels in the same first message
+
+
+def test_build_request_spec_for_music_audio_doubt_uses_text_endpoint_with_audio_context(tmp_path):
+    audio_path = tmp_path / "clip.wav"
+    audio_path.write_bytes(b"audio-bytes")  # placeholder content; the assertions below only look at the path
+    config = {
+        "mode": "music-audio-doubt",  # mode under test
+        "url": "http://127.0.0.1:6745",
+        "model": "audio:test",
+        "prompt": "Describe the music.",
+        "output_dir": tmp_path,
+        "token": None,
+        "audio_file": str(audio_path),  # real file on disk; presumably validated by the builder - confirm against _require_file
+        "video_file": None,
+        "response_format": None,
+    }
+
+    spec = build_request_spec(config)
+
+    assert spec["url"].endswith("/v1/chat/completions")  # request must target the chat-completions endpoint
+    assert spec["json"]["model"] == "audio:test"  # configured model is passed through unchanged
+    assert str(audio_path) in spec["json"]["messages"][0]["content"]  # audio is referenced by path inside the message text
+    assert "Describe the music." in spec["json"]["messages"][0]["content"]  # prompt travels in the same first message
+
+
 def test_build_request_spec_for_transcription_requires_audio_file_flag(tmp_path):
    config = {
        "mode": "transcription",

--- a/tools/manual_multimodal_test_client.py
+++ b/tools/manual_multimodal_test_client.py
@@ -145,6 +145,40 @@ def build_request_spec(config: dict) -> dict:
            },
        }

+    if mode == "video-doubt":
+        video_path = _require_file(config.get("video_file"), "--video-file")
+        content = (
+            f"Video file: {video_path}\n"
+            f"Question: {config['prompt']}\n"
+            "Answer based on the referenced video input if the model/backend supports it."
+        )
+        return {
+            "method": "POST",
+            "url": f"{config['url']}/v1/chat/completions",
+            "headers": headers,
+            "json": {
+                "model": config["model"],
+                "messages": [{"role": "user", "content": content}],
+            },
+        }
+
+    if mode == "music-audio-doubt":
+        audio_path = _require_file(config.get("audio_file"), "--audio-file")
+        content = (
+            f"Audio file: {audio_path}\n"
+            f"Question: {config['prompt']}\n"
+            "Answer based on the referenced audio input if the model/backend supports it."
+        )
+        return {
+            "method": "POST",
+            "url": f"{config['url']}/v1/chat/completions",
+            "headers": headers,
+            "json": {
+                "model": config["model"],
+                "messages": [{"role": "user", "content": content}],
+            },
+        }
+
    raise ValueError(f"Unsupported mode for this task: {mode}")