
Commit f0998c3

Merge pull request #164 from llm-jp/165-feat-vllm
Support for inference with vLLM
2 parents c09c074 + 8ad9679 · commit f0998c3

File tree

7 files changed: +497 −5 lines

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -139,3 +139,6 @@ uv.lock
 
 # vscode
 .vscode/
+
+# cache
+.cache/

eval_with_vllm.sh

Lines changed: 59 additions & 0 deletions

#!/bin/bash
set -eux  # Stop the script as soon as any command fails

# Set CUDA devices
#export CUDA_VISIBLE_DEVICES=0

# Model name to group name mapping
declare -A MODEL_GROUP_MAP=(
    ["Qwen/Qwen2.5-VL-3B-Instruct"]="normal"
    ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
    ["Qwen/Qwen2.5-VL-32B-Instruct"]="normal"
    # ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
    ["google/gemma-3-4b-it"]="normal"
    ["google/gemma-3-12b-it"]="normal"
    ["google/gemma-3-27b-it"]="normal"
)

# Task list
declare -a task_list=(
    "japanese-heron-bench"
)

# Define metrics per task
declare -A METRIC_MAP=(
    ["japanese-heron-bench"]="heron-bench"
    ["ja-vlm-bench-in-the-wild"]="llm-as-a-judge,rougel"
    ["ja-vg-vqa-500"]="llm-as-a-judge,rougel"
    ["jmmmu"]="jmmmu"
    ["ja-multi-image-vqa"]="llm-as-a-judge,rougel"
    ["jdocqa"]="jdocqa,llm-as-a-judge"
    ["mmmu"]="mmmu"
    ["llava-bench-in-the-wild"]="llm-as-a-judge,rougel"
    ["jic-vqa"]="jic-vqa"
    ["mecha-ja"]="mecha-ja"
)

# Result directories
declare -a result_dir_list=(
    "result"
)

# Main evaluation loop
for RESULT_DIR in "${result_dir_list[@]}"; do
    for task in "${task_list[@]}"; do
        METRIC=${METRIC_MAP[$task]}
        for model_name in "${!MODEL_GROUP_MAP[@]}"; do
            # All models currently map to the "normal" group, so the
            # vllm_normal dependency group is used directly below.
            model_group=${MODEL_GROUP_MAP[$model_name]}
            uv sync --group vllm_normal
            uv run --group vllm_normal python examples/sample_vllm.py \
                --model_id "$model_name" \
                --task_id "$task" \
                --metrics "$METRIC" \
                --judge_model "gpt-4o-2024-11-20" \
                --result_dir "$RESULT_DIR" \
                --inference_only
        done
    done
done

echo "All evaluations are done."

examples/base_vllm.py

Lines changed: 91 additions & 0 deletions

from vllm import LLM, SamplingParams
from PIL import Image
from tqdm import tqdm
from utils import GenerationConfig
from base_vlm import BaseVLM
from vllm_registry import VLLMModelRegistry


class VLLM(BaseVLM):
    def __init__(self, model_id: str = "google/gemma-3-4b-it") -> None:
        self.model_id = model_id
        self.registry = VLLMModelRegistry(self.model_id)
        self.processor = self.registry.processor
        self.vllm_loader = self.registry.loader_map[self.model_id]

        engine_config = self.registry.get_engine_config(self.model_id)
        self.engine_args_dict = {
            "model": self.model_id,
            # Number of GPUs to shard across; the model's attention head
            # count (40 here) must be divisible by tensor_parallel_size.
            "tensor_parallel_size": 2,
            "download_dir": "./.cache/vllm",
            **engine_config,
        }
        self.model = LLM(**self.engine_args_dict)

    def generate(
        self,
        images: list[Image.Image] | None,
        text: str,
        gen_kwargs: GenerationConfig = GenerationConfig(),
    ) -> str:
        if images is None:
            images = []
        req_data = self.vllm_loader(text, images)
        sampling_params = SamplingParams(
            temperature=gen_kwargs.temperature,
            max_tokens=gen_kwargs.max_new_tokens,
            stop_token_ids=req_data.stop_token_ids,
        )
        outputs = self.model.generate(
            {
                "prompt": req_data.prompt,
                "multi_modal_data": {"image": req_data.image_data},
            },
            sampling_params=sampling_params,
            lora_request=req_data.lora_requests,
        )
        return outputs[0].outputs[0].text

    def batch_generate(
        self,
        images_list: list[list[Image.Image]] | None,
        text_list: list[str],
        gen_kwargs: GenerationConfig = GenerationConfig(),
    ) -> list[str]:
        if images_list is None:
            images_list = [[] for _ in range(len(text_list))]

        assert len(images_list) == len(text_list)

        req_data_list = [
            self.vllm_loader(text, images)
            for text, images in tqdm(zip(text_list, images_list))
        ]

        sampling_params = SamplingParams(
            temperature=gen_kwargs.temperature,
            max_tokens=gen_kwargs.max_new_tokens,
        )

        print(f"Prepared {len(req_data_list)} requests")

        outputs = self.model.generate(
            [
                {
                    "prompt": req_data.prompt,
                    "multi_modal_data": {"image": req_data.image_data},
                }
                for req_data in req_data_list
            ],
            sampling_params=sampling_params,
        )
        return [output.outputs[0].text for output in outputs]


if __name__ == "__main__":
    print("=== Qwen/Qwen2.5-VL-3B-Instruct ===")
    vllm = VLLM("Qwen/Qwen2.5-VL-3B-Instruct")
    vllm.test_vlm()
    vllm.test_vlm_batch_100()
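
base_vllm.py leans on vllm_registry.py (also changed in this commit, but not shown here) for per-model prompt construction. Judging only from the attributes read above (req_data.prompt, req_data.image_data, req_data.stop_token_ids, req_data.lora_requests), each loader plausibly returns something shaped like the following; this dataclass is an assumption for illustration, not the repository's actual definition.

# Hypothetical shape of the per-request object returned by each loader in
# VLLMModelRegistry, inferred from the attributes base_vllm.py reads above.
# vllm_registry.py itself is not shown in this excerpt.
from dataclasses import dataclass
from typing import Any

from PIL import Image


@dataclass
class RequestData:
    prompt: str                              # chat-templated prompt with image placeholders
    image_data: list[Image.Image]            # passed to vLLM via multi_modal_data
    stop_token_ids: list[int] | None = None  # forwarded to SamplingParams
    lora_requests: Any = None                # optional LoRA request for the engine, if any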

examples/base_vlm.py

Lines changed: 56 additions & 5 deletions

@@ -17,6 +17,15 @@ def generate(
         """Generate a response given an image (or list of images) and a prompt."""
         raise NotImplementedError
 
+    def batch_generate(
+        self,
+        images_list: list[list[Image.Image]] | None,
+        text_list: list[str],
+        gen_kwargs: GenerationConfig = GenerationConfig(),
+    ) -> list[str]:
+        """Generate responses given a list of image lists and a list of prompts."""
+        raise NotImplementedError
+
     def test_vlm(self):
         """Test the model with one or two images."""
         image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
@@ -35,8 +44,50 @@ def test_vlm(self):
             output, str
         ), f"Expected output to be a string, but got {type(output)}"
 
-        output = self.generate([], "画像には何が映っていますか?")
-        logger.info(f"Output: {output}")
-        assert isinstance(
-            output, str
-        ), f"Expected output to be a string, but got {type(output)}"
+        # --- No-image case (currently disabled) ---
+        # output = self.generate([], "画像には何が映っていますか?")
+        # logger.info(f"Output: {output}")
+        # assert isinstance(
+        #     output, str
+        # ), f"Expected output to be a string, but got {type(output)}"
+
+    def test_vlm_100(self):
+        """Time 100 sequential generate() calls on a single image."""
+        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(image_file, stream=True).raw)
+
+        import time
+
+        start_time = time.time()
+        for _ in range(100):
+            output = self.generate([image], "画像には何が映っていますか?")
+            logger.info(f"Output: {output}")
+            assert isinstance(
+                output, str
+            ), f"Expected output to be a string, but got {type(output)}"
+        end_time = time.time()
+        logger.info(f"Time taken: {end_time - start_time} seconds for 100 calls")
+
+    def test_vlm_batch_100(self):
+        """Time one batch_generate() call over 100 identical requests."""
+        print("=== Batch 100 test ===")
+        print(f"Model: {self.model_id}")
+
+        image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(image_file, stream=True).raw)
+
+        import time
+
+        image_list = [[image] for _ in range(100)]
+        # batch_generate expects list[str], so pass plain strings here rather
+        # than single-element lists.
+        text_list = ["画像には何が映っていますか?" for _ in range(100)]
+
+        start_time = time.time()
+        outputs = self.batch_generate(image_list, text_list)
+        for output in outputs:
+            assert isinstance(
+                output, str
+            ), f"Expected output to be a string, but got {type(output)}"
+
+        end_time = time.time()
+        logger.info(f"Time taken: {end_time - start_time} seconds for a batch of 100")
