Skip to content

Commit 85df021

Browse files
authored
Merge pull request #175 from llm-jp/174-qwen3
feat: add Qwen3 VLLM evaluation support
2 parents 9a478f4 + 42f7b93 commit 85df021

File tree

7 files changed

+966
-244
lines changed

7 files changed

+966
-244
lines changed

create_env.fish

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
set -x CUDA_HOME /usr/local/cuda  # export the CUDA install root for tools that look it up
2+
set -x PATH /usr/local/cuda/bin $PATH  # put the CUDA binaries first on PATH
3+
set -x LD_LIBRARY_PATH /usr/local/cuda/lib64 $LD_LIBRARY_PATH  # put the CUDA libraries first on the loader path
4+
set -x ENV_NAME normal vllm_normal # groups to build; other group names seen so far: dev evovlm vilaja sarashina normal old stablevlm phi pixtral calm heron_nvila vllm_normal
5+
for env_name in $ENV_NAME
6+
uv venv .uv/$env_name-env --python python3.12  # one virtual environment per dependency group
7+
source .uv/$env_name-env/bin/activate.fish  # activate it so that `uv sync --active` targets this venv
8+
echo "===> Installing dependencies for $env_name"
9+
uv sync --group $env_name --active  # install the group's dependencies into the activated venv
10+
end
11+
12+
# See the following URL for how to handle flash-attn (uv build isolation):
13+
# https://docs.astral.sh/uv/concepts/projects/config/#build-isolation

eval_all.sh

Lines changed: 56 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,64 +5,82 @@ set -eux # エラーが発生したらスクリプトを停止する
55

66
# Model name to group name mapping
77
declare -A MODEL_GROUP_MAP=(
8-
# ["stabilityai/japanese-instructblip-alpha"]="normal"
9-
# ["stabilityai/japanese-stable-vlm"]="normal"
10-
# ["cyberagent/llava-calm2-siglip"]="calm"
11-
# ["llava-hf/llava-1.5-7b-hf"]="normal"
12-
# ["llava-hf/llava-v1.6-mistral-7b-hf"]="normal"
13-
# ["neulab/Pangea-7B-hf"]="sarashina"
14-
# ["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal"
15-
# ["meta-llama/Llama-3.2-90B-Vision-Instruct"]="normal"
16-
# ["OpenGVLab/InternVL2-8B"]="normal"
17-
# ["OpenGVLab/InternVL2-26B"]="normal"
18-
# ["Qwen/Qwen2-VL-7B-Instruct"]="normal"
19-
# ["Qwen/Qwen2-VL-72B-Instruct"]="normal"
20-
# ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
21-
# ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
22-
# ["gpt-4o-2024-11-20"]="normal"
23-
# ["mistralai/Pixtral-12B-2409"]="pixtral"
24-
# ["llm-jp/llm-jp-3-vila-14b"]="vilaja"
25-
# ["Efficient-Large-Model/VILA1.5-13b"]="vilaja"
26-
# ["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm"
27-
# ["google/gemma-3-4b-it"]="normal"
28-
# ["google/gemma-3-12b-it"]="normal"
29-
# ["google/gemma-3-27b-it"]="normal"
30-
# ["sbintuitions/sarashina2-vision-8b"]="sarashina"
31-
# ["sbintuitions/sarashina2-vision-14b"]="sarashina"
32-
# ["microsoft/Phi-4-multimodal-instruct"]="phi"
8+
["stabilityai/japanese-instructblip-alpha"]="normal"
9+
["stabilityai/japanese-stable-vlm"]="normal"
10+
["cyberagent/llava-calm2-siglip"]="calm"
11+
["llava-hf/llava-1.5-7b-hf"]="normal"
12+
["llava-hf/llava-v1.6-mistral-7b-hf"]="normal"
13+
["neulab/Pangea-7B-hf"]="sarashina"
14+
["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal"
15+
["meta-llama/Llama-3.2-90B-Vision-Instruct"]="normal"
16+
["OpenGVLab/InternVL2-8B"]="normal"
17+
["OpenGVLab/InternVL2-26B"]="normal"
18+
["Qwen/Qwen2-VL-7B-Instruct"]="normal"
19+
["Qwen/Qwen2-VL-72B-Instruct"]="normal"
20+
["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
21+
["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
22+
["gpt-4o-2024-11-20"]="normal"
23+
["mistralai/Pixtral-12B-2409"]="pixtral"
24+
["llm-jp/llm-jp-3-vila-14b"]="vilaja"
25+
["Efficient-Large-Model/VILA1.5-13b"]="vilaja"
26+
["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm"
27+
["google/gemma-3-4b-it"]="normal"
28+
["google/gemma-3-12b-it"]="normal"
29+
["google/gemma-3-27b-it"]="normal"
30+
["sbintuitions/sarashina2-vision-8b"]="sarashina"
31+
["sbintuitions/sarashina2-vision-14b"]="sarashina"
32+
["microsoft/Phi-4-multimodal-instruct"]="phi"
3333
["turing-motors/Heron-NVILA-Lite-15B"]="heron_nvila"
3434
)
3535

36-
# Task list
36+
3737
declare -a task_list=(
38-
# "japanese-heron-bench"
38+
"japanese-heron-bench"
3939
"ja-vlm-bench-in-the-wild"
40-
# "ja-vg-vqa-500"
40+
"ja-vg-vqa-500"
4141
"jmmmu"
4242
"ja-multi-image-vqa"
4343
"jdocqa"
4444
"mmmu"
4545
"llava-bench-in-the-wild"
46-
# "jic-vqa"
46+
"jic-vqa"
47+
"cvqa"
48+
"cc-ocr"
4749
"mecha-ja"
48-
# "cc-ocr"
49-
# "cvqa"
50+
"ai2d"
51+
"blink"
52+
"docvqa"
53+
"infographicvqa"
54+
"textvqa"
55+
"chartqa"
56+
"chartqapro"
57+
"mathvista"
58+
"okvqa"
5059
)
5160

52-
# Define metrics per task
61+
# === Metrics Mapping ===
5362
declare -A METRIC_MAP=(
5463
["japanese-heron-bench"]="heron-bench"
55-
["ja-vlm-bench-in-the-wild"]="llm-as-a-judge rougel"
56-
["ja-vg-vqa-500"]="llm-as-a-judge rougel"
64+
["ja-vlm-bench-in-the-wild"]="llm-as-a-judge"
65+
["ja-vg-vqa-500"]="llm-as-a-judge"
5766
["jmmmu"]="jmmmu"
58-
["ja-multi-image-vqa"]="llm-as-a-judge rougel"
59-
["jdocqa"]="jdocqa llm-as-a-judge"
67+
["ja-multi-image-vqa"]="llm-as-a-judge"
68+
["jdocqa"]="llm-as-a-judge"
6069
["mmmu"]="mmmu"
61-
["llava-bench-in-the-wild"]="llm-as-a-judge rougel"
70+
["llava-bench-in-the-wild"]="llm-as-a-judge"
6271
["jic-vqa"]="jic-vqa"
6372
["mecha-ja"]="mecha-ja"
6473
["cc-ocr"]="cc-ocr"
74+
["ai2d"]="ai2d"
75+
["blink"]="blink"
6576
["cvqa"]="substring-match"
77+
["docvqa"]="substring-match"
78+
["infographicvqa"]="substring-match"
79+
["textvqa"]="substring-match"
80+
["chartqa"]="substring-match"
81+
["chartqapro"]="substring-match"
82+
["mathvista"]="mathvista"
83+
["okvqa"]="substring-match"
6684
)
6785

6886
# Result directories
@@ -81,7 +99,7 @@ for RESULT_DIR in "${result_dir_list[@]}"; do
8199
--model_id "$model_name" \
82100
--task_id "$task" \
83101
--metrics $METRIC \
84-
--judge_model "gpt-4o-2024-11-20" \
102+
--judge_model "gpt-4.1-2025-04-14" \
85103
--result_dir "$RESULT_DIR"
86104
done
87105
done

eval_with_vllm.sh

Lines changed: 62 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,78 @@
11
# Set CUDA devices
22
set -eux # Abort on errors (-e) and on unset variables (-u); trace each command (-x)
33

4-
#export CUDA_VISIBLE_DEVICES=0
4+
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5
55

66
# Model name to group name mapping
77
declare -A MODEL_GROUP_MAP=(
8-
["Qwen/Qwen2.5-VL-3B-Instruct"]="normal"
9-
["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
10-
["Qwen/Qwen2.5-VL-32B-Instruct"]="normal"
11-
# ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
12-
["google/gemma-3-4b-it"]="normal"
13-
["google/gemma-3-12b-it"]="normal"
14-
["google/gemma-3-27b-it"]="normal"
8+
# ["Qwen/Qwen3-VL-30B-A3B-Instruct"]="vllm_normal"
9+
# ["moonshotai/Kimi-VL-A3B-Instruct"]="vllm_normal" # does not work at the moment
10+
# ["deepseek-ai/deepseek-vl2"]="vllm_normal"
11+
# ["openbmb/MiniCPM-o-2_6"]="vllm_normal"
12+
["zai-org/GLM-4.5V"]="vllm_normal"
13+
# ["AIDC-AI/Ovis2-1B"]="vllm_normal"
14+
# ["AIDC-AI/Ovis2-2B"]="vllm_normal"
15+
# ["AIDC-AI/Ovis2-4B"]="vllm_normal"
16+
# ["AIDC-AI/Ovis2-8B"]="vllm_normal"
17+
# ["AIDC-AI/Ovis2-16B"]="vllm_normal"
18+
# ["AIDC-AI/Ovis2-34B"]="vllm_normal"
19+
# ["AIDC-AI/Ovis2.5-2B"]="vllm_normal"
20+
# ["AIDC-AI/Ovis2.5-9B"]="vllm_normal"
21+
# ["OpenGVLab/InternVL3-1B"]="vllm_normal"
22+
# ["OpenGVLab/InternVL3-2B"]="vllm_normal"
23+
# ["OpenGVLab/InternVL3-8B"]="vllm_normal"
24+
# ["OpenGVLab/InternVL3-14B"]="vllm_normal"
25+
# ["OpenGVLab/InternVL3-38B"]="vllm_normal"
26+
# ["OpenGVLab/InternVL3-78B"]="vllm_normal"
1527
)
1628

17-
# Task list
1829
declare -a task_list=(
1930
"japanese-heron-bench"
31+
"ja-vlm-bench-in-the-wild"
32+
"ja-vg-vqa-500"
33+
"jmmmu"
34+
"ja-multi-image-vqa"
35+
"jdocqa"
36+
"mmmu"
37+
"llava-bench-in-the-wild"
38+
"jic-vqa"
39+
"cvqa"
40+
"cc-ocr"
41+
"mecha-ja"
42+
"ai2d"
43+
# "blink"
44+
"docvqa"
45+
"infographicvqa"
46+
"textvqa"
47+
"chartqa"
48+
# "chartqapro"
49+
# "mathvista"
50+
"okvqa"
2051
)
2152

22-
# Define metrics per task
53+
# === Metrics Mapping ===
2354
declare -A METRIC_MAP=(
2455
["japanese-heron-bench"]="heron-bench"
25-
["ja-vlm-bench-in-the-wild"]="llm-as-a-judge,rougel"
26-
["ja-vg-vqa-500"]="llm-as-a-judge,rougel"
56+
["ja-vlm-bench-in-the-wild"]="llm-as-a-judge"
57+
["ja-vg-vqa-500"]="llm-as-a-judge"
2758
["jmmmu"]="jmmmu"
28-
["ja-multi-image-vqa"]="llm-as-a-judge,rougel"
29-
["jdocqa"]="jdocqa,llm-as-a-judge"
59+
["ja-multi-image-vqa"]="llm-as-a-judge"
60+
["jdocqa"]="llm-as-a-judge"
3061
["mmmu"]="mmmu"
31-
["llava-bench-in-the-wild"]="llm-as-a-judge,rougel"
62+
["llava-bench-in-the-wild"]="llm-as-a-judge"
3263
["jic-vqa"]="jic-vqa"
3364
["mecha-ja"]="mecha-ja"
65+
["cc-ocr"]="cc-ocr"
66+
["ai2d"]="ai2d"
67+
["blink"]="blink"
68+
["cvqa"]="substring-match"
69+
["docvqa"]="substring-match"
70+
["infographicvqa"]="substring-match"
71+
["textvqa"]="substring-match"
72+
["chartqa"]="substring-match"
73+
["chartqapro"]="substring-match"
74+
["mathvista"]="mathvista"
75+
["okvqa"]="substring-match"
3476
)
3577

3678
# Result directories
@@ -44,13 +86,15 @@ for RESULT_DIR in "${result_dir_list[@]}"; do
4486
METRIC=${METRIC_MAP[$task]}
4587
for model_name in "${!MODEL_GROUP_MAP[@]}"; do
4688
model_group=${MODEL_GROUP_MAP[$model_name]}
47-
uv sync --group vllm_normal
48-
uv run --group vllm_normal python examples/sample_vllm.py \
89+
source .uv/vllm_normal-env/bin/activate
90+
uv pip list
91+
python examples/sample_vllm.py \
4992
--model_id "$model_name" \
5093
--task_id "$task" \
5194
--metrics "$METRIC" \
52-
--judge_model "gpt-4o-2024-11-20" \
95+
--judge_model "gpt-4.1-2025-04-14" \
5396
--result_dir "$RESULT_DIR" \
97+
--tensor_parallel_size 4 \
5498
--inference_only
5599
done
56100
done

0 commit comments

Comments
 (0)