Adds multimodal support and MMMU pro #675


Merged
merged 43 commits into from
May 19, 2025
Changes from 30 commits
Commits
43 commits
409b0c0
init
NathanHB Apr 15, 2025
ee334c5
init
NathanHB Apr 15, 2025
e988f6f
init
NathanHB Apr 15, 2025
5fddc82
Naive implementation
qubvel Apr 21, 2025
7ce9c97
Fix choices + change metric
qubvel Apr 22, 2025
e08731a
refactor prompt function
qubvel Apr 22, 2025
8d4543b
style
qubvel Apr 22, 2025
05df4b6
FIx typing
qubvel May 6, 2025
16a9e97
Merge branch 'main' into nathan-adds-multimodal
qubvel May 6, 2025
de60add
Update max length
qubvel May 6, 2025
5fd52f5
Remove docs
qubvel May 6, 2025
10b4e0b
Update auto processor
qubvel May 6, 2025
bc7610d
add quantization config, transformers config
qubvel May 6, 2025
49e4986
Update generation size
qubvel May 7, 2025
75c900c
Add batching
qubvel May 7, 2025
4e5fdd3
Style
qubvel May 7, 2025
d1ae8b7
Add images to requests
qubvel May 7, 2025
f855158
nit
qubvel May 7, 2025
641819e
nit
qubvel May 7, 2025
aa0acb7
Clean up a bit
qubvel May 7, 2025
56f962b
nit
qubvel May 7, 2025
8e99388
Fix batch size
qubvel May 7, 2025
418840d
Add images for Doc class
qubvel May 7, 2025
e35db98
clean-up prompt manager
qubvel May 7, 2025
57c18f7
Style
qubvel May 7, 2025
7cd35c2
Style
qubvel May 7, 2025
e13cac9
Clean up prompt manager
qubvel May 7, 2025
fa18ec2
Add dtype
qubvel May 7, 2025
c59e5af
Update prompt function
qubvel May 7, 2025
8f31f1b
Refactor to pass ruff check
qubvel May 7, 2025
3675066
fix the CI
NathanHB May 12, 2025
30e22ab
fix the CI
NathanHB May 12, 2025
924bf13
Fit typing
qubvel May 12, 2025
b909259
Fix system content
qubvel May 12, 2025
665474a
Split to vision and standard tasks
qubvel May 13, 2025
1a73dd0
Data parallel
qubvel May 13, 2025
b618af7
Clean up config docs, tokenizer -> processor
qubvel May 13, 2025
79e222d
Add fast image processor option
qubvel May 13, 2025
bd2c595
Fix style
qubvel May 13, 2025
831f95e
commit
NathanHB May 19, 2025
80568e7
commit
NathanHB May 19, 2025
9fb75a6
commit
NathanHB May 19, 2025
62165a8
commit
NathanHB May 19, 2025
2 changes: 1 addition & 1 deletion src/lighteval/data.py
@@ -260,7 +260,7 @@ def init_split_limits(self, num_dataset_splits):
splits_indices = [tuple(e) for e in splits_indices]
return num_dataset_splits, splits_indices

def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, bool, list, int, int]:
def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, bool, tuple, int, int]:
"""
Collate function for generating batches.

3 changes: 2 additions & 1 deletion src/lighteval/models/model_loader.py
@@ -42,6 +42,7 @@
from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig
from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig
from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
from lighteval.models.transformers.vlm_transformers import VLMTransformersModel
from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig
from lighteval.utils.imports import (
NO_LITELLM_ERROR_MSG,
@@ -198,7 +199,7 @@ def load_model_with_accelerate_or_default(
model = VLLMModel(config=config)
return model
else:
model = TransformersModel(config=config)
model = VLMTransformersModel(config=config)

return model

417 changes: 417 additions & 0 deletions src/lighteval/models/transformers/vlm_transformers.py
Member Author:
Needed work will mainly be here; the first step is to have the greedy_until function working.

Large diffs are not rendered by default.
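
Since this file's diff is collapsed, here is a minimal illustrative sketch of what a greedy-until generation step for a vision-language model typically looks like with transformers. It is not the code in vlm_transformers.py; the checkpoint name, message layout, and generation settings are assumptions made for the example.

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# Illustrative sketch only -- not the actual VLMTransformersModel implementation.
model_id = "HuggingFaceTB/SmolVLM-Instruct"  # assumed example checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

prompt = "Which option matches [image 1]?\nA. a cat\nB. a dog\nAnswer with the option letter."
images = [Image.new("RGB", (64, 64))]  # placeholder image

# Same message structure the prompt manager builds for multimodal docs (see get_examples below).
messages = [{
    "role": "user",
    "content": [{"type": "text", "text": prompt}] + [{"type": "image", "image": img} for img in images],
}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=[text], images=images, return_tensors="pt").to(model.device)

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=30, do_sample=False)  # greedy decoding

# Decode only the continuation, dropping the prompt tokens.
continuation = generated_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(continuation, skip_special_tokens=True)[0])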

49 changes: 49 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -26,6 +26,7 @@
import random
import re
import string
from typing import Optional

import numpy as np
import pycountry
@@ -43,6 +44,54 @@
# fmt: on


def mmmu_pro(line, task_name: Optional[str] = None):
# fmt: off
question = line["question"] # "What is the capital of France?"
choices_string = line["options"] # "[Paris, London, Berlin, Madrid]"
answer = line["answer"] # "A"
# fmt: on

# TODO: Should be different for "vision"/"standard (4 options)" subsets
instructions = (
"Answer with the option letter from the given choices directly. "
"The last line of your response should be of the following format: "
"'Answer: $LETTER' (without quotes) where LETTER is one of options."
)

# Preprocess choices
# "[Paris, London, Berlin, Madrid]" -> ["A. Paris", "B. London", "C. Berlin", "D. Madrid"]
choices = ast.literal_eval(str(choices_string))
choices_letters = [chr(ord("A") + i) for i in range(len(choices))] # ["A", "B", "C", "D"]
choices = [f"{letter}. {choice}" for letter, choice in zip(choices_letters, choices)]

# Construct prompt
formatted_choices = "\n".join(choices)
prompt = f"{question}\n{formatted_choices}\n{instructions}"

# Collect images
image_order = []
for num in re.findall(r"<image\s+(\d+)>", prompt):
num = int(num)
if num not in image_order:
image_order.append(num)
images = [line[f"image_{i}"] for i in image_order]

gold_index = string.ascii_uppercase.index(answer)

# Replace image placeholders in prompt <image 1>, <image 2>, ... with [image 1], [image 2], ...
prompt = re.sub(r"<image\s+(\d+)>", "[image \\1]", prompt)
choices = [re.sub(r"<image\s+(\d+)>", "[image \\1]", choice) for choice in choices]

return Doc(
task_name=task_name,
query=prompt,
choices=choices,
gold_index=gold_index,
images=images,
specific={"id": line["id"]},
)


def simpleqa(line, task_name: str = None):
query = line["problem"]
choices = [line["answer"]]
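
For clarity, a small illustrative example of what the mmmu_pro prompt function above produces for one row (assuming this branch is installed); only the field names match what mmmu_pro() reads, the values and task name are made up.

from PIL import Image
from lighteval.tasks.default_prompts import mmmu_pro

# Hypothetical row shaped like the fields mmmu_pro() reads above (values are made up).
line = {
    "id": "test_Art_1",
    "question": "Which style best describes <image 1>?",
    "options": "['Impressionism', 'Cubism', 'Baroque', 'Pop art']",
    "answer": "B",
    "image_1": Image.new("RGB", (32, 32)),
}

doc = mmmu_pro(line, task_name="mmmu_pro")  # task name is illustrative
# doc.query      -> question with "[image 1]", the lettered choices, then the instruction line
# doc.choices    -> ["A. Impressionism", "B. Cubism", "C. Baroque", "D. Pop art"]
# doc.gold_index -> 1
# doc.images     -> [the PIL image collected from the "<image 1>" placeholder]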
16 changes: 16 additions & 0 deletions src/lighteval/tasks/default_tasks.py
Expand Up @@ -24,6 +24,22 @@
from lighteval.tasks.lighteval_task import LightevalTaskConfig


mmmu_pro = LightevalTaskConfig(
name="mmmu_pro",
suite=["lighteval"],
prompt_function=prompt.mmmu_pro,
hf_repo="MMMU/MMMU_pro",
hf_subset="standard (4 options)",
hf_avail_splits=["test"],
evaluation_splits=["test"],
few_shots_split=None,
few_shots_select=None,
generation_size=30,  # expects an answer in the format 'Answer: B'
metric=[Metrics.gpqa_instruct_metric],
stop_sequence=None,
trust_dataset=True,
version=0,
)
abstract_narrative_understanding_bigbench = LightevalTaskConfig(
name="abstract_narrative_understanding",
suite=["bigbench", "bigbench_json"],
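
The mmmu_pro config above caps generation at 30 tokens because only a short reply ending in an "Answer: X" line is needed, and gpqa_instruct_metric scores the generative answer against the gold letter. A rough illustration of the expected output format (not the metric's actual implementation):

import re

# Illustration of the expected answer format only -- not the metric implementation.
model_output = "The brushwork uses fragmented geometric planes.\nAnswer: B"
match = re.search(r"Answer\s*:\s*([A-Z])", model_output)
predicted_letter = match.group(1) if match else None
print(predicted_letter)  # -> "B"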
20 changes: 19 additions & 1 deletion src/lighteval/tasks/lighteval_task.py
@@ -364,6 +364,7 @@ def construct_requests(
context=context,
choice=gold,
metric_categories=[MetricCategory.TARGET_PERPLEXITY],
images=formatted_doc.images,
)
for i, gold in enumerate(golds)
]
@@ -375,12 +376,13 @@
request_index=0,
context=context,
metric_categories=[MetricCategory.PERPLEXITY],
images=formatted_doc.images,
)
]
if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]:
# All the possible sampling tasks require the same generation process - we can do them in one step
# so we select the maximum number of samples and the metrics will select only the
# relevant number of tiems
# relevant number of items
requests[RequestType.GREEDY_UNTIL] += [
GreedyUntilRequest(
task_name=current_task_name,
@@ -394,6 +396,7 @@
do_sample=True,
use_logits=False,
metric_categories=[MetricCategory.GENERATIVE_SAMPLING],
images=formatted_doc.images,
)
]
if (
@@ -420,6 +423,7 @@
]
if self.has_metric_category[c]
],
images=formatted_doc.images,
)
]
if (
@@ -438,6 +442,7 @@
for c in [MetricCategory.MULTICHOICE, MetricCategory.MULTICHOICE_PMI]
if self.has_metric_category[c]
],
images=formatted_doc.images,
)
for i, choice in enumerate(formatted_doc.choices)
]
@@ -454,6 +459,7 @@
context=formatted_doc.unconditioned_query,
choice=choice,
metric_categories=[MetricCategory.MULTICHOICE_PMI],
images=formatted_doc.images,
)
for i, choice in enumerate(formatted_doc.choices)
]
@@ -466,6 +472,7 @@
context=context,
choices=formatted_doc.choices,
metric_categories=[MetricCategory.MULTICHOICE_ONE_TOKEN],
images=formatted_doc.images,
)
]
if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]:
@@ -478,6 +485,7 @@
stop_sequence=self.stop_sequence,
generation_size=self.generation_size,
metric_categories=[MetricCategory.LLM_AS_JUDGE_MULTI_TURN],
images=formatted_doc.images,
)
]
if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]:
@@ -492,6 +500,7 @@
generation_grammar=self.generation_grammar,
num_samples=1,
metric_categories=[MetricCategory.LLM_AS_JUDGE],
images=formatted_doc.images,
)
]

@@ -569,6 +578,15 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int =
],
)

# TODO: debug purpose, to remove later
import os

debug_samples = int(os.getenv("DATASET_SAMPLES", 0))
if debug_samples > 0:
for dataset in datasets:
for split in dataset.keys():
dataset[split] = dataset[split].select(range(debug_samples))

for task, dataset in zip(tasks, datasets):
task.dataset = dataset

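
The debug hook above truncates every split when the DATASET_SAMPLES environment variable is set, which is handy for smoke tests. For example (illustrative), setting it before datasets are loaded caps each split at 8 rows:

import os

# Illustrative: must be set before load_datasets() runs, e.g. at the top of a driver script.
os.environ["DATASET_SAMPLES"] = "8"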
42 changes: 31 additions & 11 deletions src/lighteval/tasks/prompt_manager.py
@@ -210,15 +210,20 @@ def _single_turn_context(
system_prompt=system_prompt,
use_chat_template=use_chat_template,
cot_prompt=cot_prompt,
doc=doc,
)
if not use_chat_template:
toks = self.model.tok_encode(output)
else:
toks = [self.model.tok_encode(msg["content"]) for msg in output]
toks = [t for ts in toks for t in ts]

if truncate_few_shots and doc.images is not None:
raise NotImplementedError("Few shot evaluation is not supported for multi-modal tasks yet.")

# If we need to truncate few-shots to fit in the context
if truncate_few_shots and self.model.max_length is not None and self.model.tokenizer is not None:
if not use_chat_template:
toks = self.model.tok_encode(output)
else:
toks = [self.model.tok_encode(msg["content"]) for msg in output]
toks = [t for ts in toks for t in ts]

# If self.generation_size is None, the maximum allowed generation size depends
# on the model maximum context length, not on the task - we don't take it into account here
# but we probably should
@@ -258,8 +263,27 @@ def get_examples(
system_prompt: Union[str | None],
use_chat_template: bool,
cot_prompt: Union[str | None],
doc: Doc,
):
is_multimodal = doc.images is not None

if is_multimodal and not use_chat_template:
raise NotImplementedError("Multi-modal tasks do not support formatting without chat template yet.")

if is_multimodal and fewshot_ex:
raise NotImplementedError("Multi-modal tasks do not support fewshot evaluation yet.")

content = example + cot_prompt if cot_prompt is not None else example

if is_multimodal:
text_content = [{"type": "text", "text": content}]
image_content = [{"type": "image", "image": image} for image in doc.images]
message = {"role": "user", "content": text_content + image_content}
return [message]

# Regular text (not multimodal)
examples = []

# Few shot examples
for ex in fewshot_ex:
if use_chat_template:
@@ -269,8 +293,6 @@
examples.append(self.doc_to_text(ex, return_instructions=False) + self.doc_to_target(ex))

# Actual example
content = example + cot_prompt if cot_prompt is not None else example

if use_chat_template:
examples.append({"role": "user", "content": content})
else:
@@ -284,10 +306,8 @@
examples[0]["content"] = instruction + examples[0]["content"]
return examples
else:
if system_prompt is not None:
output = system_prompt + instruction + "\n\n".join(examples)
else:
output = instruction + "\n\n".join(examples)
system_prompt = system_prompt if system_prompt is not None else ""
output = system_prompt + instruction + "\n\n".join(examples)
if output == "\n\n":
return ""
return output
8 changes: 8 additions & 0 deletions src/lighteval/tasks/requests.py
@@ -75,6 +75,7 @@ class LoglikelihoodRequest(Request):
request_type = RequestType.LOGLIKELIHOOD
tokenized_context: list[int] = None
tokenized_continuation: list[int] = None
images: Optional[list["PIL.Image.Image"]] = None # noqa F821


@dataclass
@@ -92,6 +93,7 @@ class LoglikelihoodSingleTokenRequest(Request):
request_type = RequestType.LOGLIKELIHOOD_SINGLE_TOKEN
tokenized_context: list[int] = None
tokenized_continuation: list[int] = None
images: Optional[list["PIL.Image.Image"]] = None # noqa F821


@dataclass
@@ -105,6 +107,7 @@ class LoglikelihoodRollingRequest(Request):
request_type = RequestType.LOGLIKELIHOOD_ROLLING
tokenized_context: list[int] = None
tokenized_continuation: list[int] = None
images: Optional[list["PIL.Image.Image"]] = None # noqa F821


@dataclass
@@ -128,6 +131,7 @@ class GreedyUntilRequest(Request):
num_samples: int = None
do_sample: bool = False
use_logits: bool = False
images: Optional[list["PIL.Image.Image"]] = None # noqa F821


@dataclass
@@ -145,6 +149,7 @@ class GreedyUntilMultiTurnRequest(Request):
generation_size: int
request_type = RequestType.GREEDY_UNTIL_MULTI_TURN
use_logits: bool = False
images: Optional[list["PIL.Image.Image"]] = None # noqa F821


class SampleUid(NamedTuple):
@@ -190,6 +195,9 @@ class Doc:
# The uncoditioned query shouldn't contain any information about the task, thus usually it's empty string or 'Answer:'.
unconditioned_query: Optional[str] = None

# For multi-modal tasks
images: Optional[list["PIL.Image.Image"]] = None # noqa F821

def __post_init__(self):
if self.instruction is None:
self.instruction = ""