@@ -5,64 +5,82 @@ set -eux # エラーが発生したらスクリプトを停止する
55
# Model name to group name mapping.
#
# Keys are model identifiers and values are group names. Both are stored
# without any surrounding whitespace so that a lookup by the exact
# identifier succeeds.
# NOTE(review): how the group name is consumed is not visible in this part
# of the file -- confirm against the evaluation loop below.
declare -A MODEL_GROUP_MAP=(
    ["stabilityai/japanese-instructblip-alpha"]="normal"
    ["stabilityai/japanese-stable-vlm"]="normal"
    ["cyberagent/llava-calm2-siglip"]="calm"
    ["llava-hf/llava-1.5-7b-hf"]="normal"
    ["llava-hf/llava-v1.6-mistral-7b-hf"]="normal"
    ["neulab/Pangea-7B-hf"]="sarashina"
    ["meta-llama/Llama-3.2-11B-Vision-Instruct"]="normal"
    ["meta-llama/Llama-3.2-90B-Vision-Instruct"]="normal"
    ["OpenGVLab/InternVL2-8B"]="normal"
    ["OpenGVLab/InternVL2-26B"]="normal"
    ["Qwen/Qwen2-VL-7B-Instruct"]="normal"
    ["Qwen/Qwen2-VL-72B-Instruct"]="normal"
    ["Qwen/Qwen2.5-VL-7B-Instruct"]="normal"
    ["Qwen/Qwen2.5-VL-72B-Instruct"]="normal"
    ["gpt-4o-2024-11-20"]="normal"
    ["mistralai/Pixtral-12B-2409"]="pixtral"
    ["llm-jp/llm-jp-3-vila-14b"]="vilaja"
    ["Efficient-Large-Model/VILA1.5-13b"]="vilaja"
    ["SakanaAI/Llama-3-EvoVLM-JP-v2"]="evovlm"
    ["google/gemma-3-4b-it"]="normal"
    ["google/gemma-3-12b-it"]="normal"
    ["google/gemma-3-27b-it"]="normal"
    ["sbintuitions/sarashina2-vision-8b"]="sarashina"
    ["sbintuitions/sarashina2-vision-14b"]="sarashina"
    ["microsoft/Phi-4-multimodal-instruct"]="phi"
    ["turing-motors/Heron-NVILA-Lite-15B"]="heron_nvila"
)
3535
# Task list.
#
# Indexed array of task identifiers, one element per task, stored without
# surrounding whitespace.
declare -a task_list=(
    "japanese-heron-bench"
    "ja-vlm-bench-in-the-wild"
    "ja-vg-vqa-500"
    "jmmmu"
    "ja-multi-image-vqa"
    "jdocqa"
    "mmmu"
    "llava-bench-in-the-wild"
    "jic-vqa"
    "cvqa"
    "cc-ocr"
    "mecha-ja"
    "ai2d"
    "blink"
    "docvqa"
    "infographicvqa"
    "textvqa"
    "chartqa"
    "chartqapro"
    "mathvista"
    "okvqa"
)
5160
# === Metrics Mapping ===
#
# Maps each task identifier to the metric name used for that task. Keys and
# values are stored without surrounding whitespace.
# NOTE(review): presumably the value is looked up per task and handed to the
# evaluation command's --metrics option -- confirm against the loop below.
declare -A METRIC_MAP=(
    ["japanese-heron-bench"]="heron-bench"
    ["ja-vlm-bench-in-the-wild"]="llm-as-a-judge"
    ["ja-vg-vqa-500"]="llm-as-a-judge"
    ["jmmmu"]="jmmmu"
    ["ja-multi-image-vqa"]="llm-as-a-judge"
    ["jdocqa"]="llm-as-a-judge"
    ["mmmu"]="mmmu"
    ["llava-bench-in-the-wild"]="llm-as-a-judge"
    ["jic-vqa"]="jic-vqa"
    ["mecha-ja"]="mecha-ja"
    ["cc-ocr"]="cc-ocr"
    ["ai2d"]="ai2d"
    ["blink"]="blink"
    ["cvqa"]="substring-match"
    ["docvqa"]="substring-match"
    ["infographicvqa"]="substring-match"
    ["textvqa"]="substring-match"
    ["chartqa"]="substring-match"
    ["chartqapro"]="substring-match"
    ["mathvista"]="mathvista"
    ["okvqa"]="substring-match"
)
6785
6886# Result directories
@@ -81,7 +99,7 @@ for RESULT_DIR in "${result_dir_list[@]}"; do
8199 --model_id " $model_name " \
82100 --task_id " $task " \
83101 --metrics $METRIC \
84- --judge_model " gpt-4o-2024-11-20 " \
102+ --judge_model " gpt-4.1-2025-04-14 " \
85103 --result_dir " $RESULT_DIR "
86104 done
87105 done
0 commit comments