
Commit 1d79f95
Author: silviase
Parent: d86fc42

refactor(tasks): add _prepare_test_dataset across tasks and stop relying on _maybe_slice_split; optimize MMMU early-stop; add HF caches cleanup in CI

23 files changed, +307 −95 lines

.github/workflows/test.yml
Lines changed: 8 additions & 1 deletion

@@ -11,6 +11,9 @@ jobs:
       # HF token is required to avoid 429 rate limits on HF Hub
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
       HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
+      HF_DATASETS_CACHE: ${{ runner.temp }}/hf-datasets
+      HUGGINGFACE_HUB_CACHE: ${{ runner.temp }}/hf-hub
+      TRANSFORMERS_CACHE: ${{ runner.temp }}/hf-transformers

     steps:
       - uses: actions/checkout@v4
@@ -29,8 +32,12 @@ jobs:
       - name: Run tests (metrics)
         run: uv run pytest src/eval_mm/metrics/*.py

+      - name: Clear HF caches before task tests
+        run: |
+          rm -rf "$HF_DATASETS_CACHE" "$HUGGINGFACE_HUB_CACHE" || true
+
       - name: Run tests (tasks)
-        run: uv run pytest src/eval_mm/tasks/*.py
+        run: uv run pytest src/eval_mm/tasks/*.py

       # Optional model smoke; enable when runners have resources
       # - name: Run model smoke tests
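Note: the three new env vars point every Hugging Face cache at the runner's temp directory, so the cleanup step only has to remove known paths. A quick sanity check that the datasets library honors the override (a minimal sketch; the /tmp path is illustrative, standing in for ${{ runner.temp }}/hf-datasets):

    import os

    # The override must be set before `datasets` is imported, because the
    # library reads HF_DATASETS_CACHE at import time.
    os.environ["HF_DATASETS_CACHE"] = "/tmp/hf-datasets"  # illustrative path

    from datasets import config

    print(config.HF_DATASETS_CACHE)  # -> /tmp/hf-datasets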

src/eval_mm/tasks/ai2d.py
Lines changed: 7 additions & 1 deletion

@@ -10,7 +10,13 @@ def __init__(self, config):
         super().__init__(config)

     def _prepare_dataset(self) -> Dataset:
-        ds = load_dataset("lmms-lab/ai2d", split=self._maybe_slice_split("test"))
+        ds = load_dataset("lmms-lab/ai2d", split="test")
+        ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
+        return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset("lmms-lab/ai2d", split=f"test[:{n}]")
         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
         return ds
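Note: the _prepare_dataset/_prepare_test_dataset pair above recurs in every task file below. The Task base class presumably picks between them and, per the comment in blink.py, applies a final length cap; that dispatch is not part of this diff, so the following is only a sketch of the assumed shape:

    import os

    from datasets import Dataset


    class Task:
        # Hypothetical base-class dispatch (not shown in this commit).

        def __init__(self, config):
            self.config = config

        def dataset(self) -> Dataset:
            # Under pytest, use the cheap per-task loader and cap its length,
            # mirroring the "final length cap" comment in blink.py below.
            if os.getenv("PYTEST_CURRENT_TEST"):
                ds = self._prepare_test_dataset()
                n = getattr(self.config, "max_dataset_len", 10)
                return ds.select(range(min(n, len(ds))))
            return self._prepare_dataset()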

src/eval_mm/tasks/blink.py
Lines changed: 6 additions & 3 deletions

@@ -35,9 +35,7 @@ def _prepare_dataset(self) -> Dataset:
         total = 0

         for config_name in BLINK.CONFIGS:
-            ds = load_dataset(
-                "BLINK-Benchmark/BLINK", config_name, split=self._maybe_slice_split("val")
-            )
+            ds = load_dataset("BLINK-Benchmark/BLINK", config_name, split="val")
             ds = ds.map(lambda x: {"config_name": config_name})
             all_datasets.append(ds)
             total += len(ds)
@@ -52,6 +50,11 @@ def _prepare_dataset(self) -> Dataset:
         )

         return combined_dataset
+
+    def _prepare_test_dataset(self) -> Dataset:
+        # Reuse the same incremental loading logic; Task base will apply
+        # final length cap if needed.
+        return self._prepare_dataset()

     @staticmethod
     def doc_to_text(doc) -> str:

src/eval_mm/tasks/cc_ocr.py
Lines changed: 26 additions & 29 deletions

@@ -23,35 +23,6 @@ class CCOCR(Task):
     default_metric = "ccocr"

     def _prepare_dataset(self) -> Dataset:
-        # Use streaming during tests to avoid empty slices after filtering
-        n = getattr(self.config, "max_dataset_len", None)
-        test_subset = os.getenv("PYTEST_CURRENT_TEST") or os.getenv("EVAL_MM_TEST_SUBSET") == "1"
-        if n is not None and test_subset:
-            stream = load_dataset(
-                "wulipc/CC-OCR", "multi_lan_ocr", split="test", streaming=True
-            )
-            buf = {
-                "index": [],
-                "question_id": [],
-                "question": [],
-                "answer": [],
-                "input_text": [],
-                "image": [],
-            }
-            count = 0
-            for ex in stream:
-                if ex.get("l2-category") == "Japanese":
-                    buf["index"].append(str(count))
-                    buf["question_id"].append(str(count))
-                    buf["question"].append(ex["question"])
-                    buf["answer"].append(ex["answer"])
-                    buf["input_text"].append(ex["question"])
-                    buf["image"].append(ex["image"])
-                    count += 1
-                    if count >= n:
-                        break
-            return Dataset.from_dict(buf)
-
         ds = load_dataset("wulipc/CC-OCR", "multi_lan_ocr", split="test")
         ds = ds.filter(lambda example: example["l2-category"] == "Japanese")
         ds = ds.map(
@@ -67,6 +38,32 @@ def _prepare_dataset(self) -> Dataset:
         )
         return ds

+    def _prepare_test_dataset(self) -> Dataset:
+        # Stream to collect first N Japanese samples without downloading full split
+        n = getattr(self.config, "max_dataset_len", 10)
+        stream = load_dataset("wulipc/CC-OCR", "multi_lan_ocr", split="test", streaming=True)
+        buf = {
+            "index": [],
+            "question_id": [],
+            "question": [],
+            "answer": [],
+            "input_text": [],
+            "image": [],
+        }
+        count = 0
+        for ex in stream:
+            if ex.get("l2-category") == "Japanese":
+                buf["index"].append(str(count))
+                buf["question_id"].append(str(count))
+                buf["question"].append(ex["question"])
+                buf["answer"].append(ex["answer"])
+                buf["input_text"].append(ex["question"])
+                buf["image"].append(ex["image"])
+                count += 1
+                if count >= n:
+                    break
+        return Dataset.from_dict(buf)
+
     @staticmethod
     def doc_to_text(doc) -> str:
         return doc["input_text"]

src/eval_mm/tasks/chartqa.py
Lines changed: 7 additions & 1 deletion

@@ -19,7 +19,13 @@ def __init__(self, config):
     def _prepare_dataset(self) -> Dataset:
         """Load ChartQA validation set."""
         # Load the ChartQA dataset from lmms-lab
-        ds = load_dataset("lmms-lab/ChartQA", split=self._maybe_slice_split("test"))
+        ds = load_dataset("lmms-lab/ChartQA", split="test")
+        ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
+        return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset("lmms-lab/ChartQA", split=f"test[:{n}]")
         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
         return ds
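Note: the simpler tasks use the datasets split-slicing syntax rather than streaming. The trade-off: a sliced split such as test[:10] still downloads the split's data files but materializes only the requested rows, which is why the tasks that must filter first (CC-OCR, CVQA) stream instead. A minimal illustration (the dataset id is taken from the diff above):

    from datasets import load_dataset

    # Python-style slice inside the split string: the files backing "test"
    # are still fetched, but only the first 10 rows become the Dataset.
    ds = load_dataset("lmms-lab/ChartQA", split="test[:10]")
    assert len(ds) == 10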

src/eval_mm/tasks/chartqapro.py
Lines changed: 6 additions & 1 deletion

@@ -21,9 +21,14 @@ def __init__(self, config):
     def _prepare_dataset(self) -> Dataset:
         """Load ChartQAPro test set."""
         # Load the ChartQAPro dataset from ahmed-masry
-        ds = load_dataset("ahmed-masry/ChartQAPro", split=self._maybe_slice_split("test"))
+        ds = load_dataset("ahmed-masry/ChartQAPro", split="test")

         return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset("ahmed-masry/ChartQAPro", split=f"test[:{n}]")
+        return ds

     @staticmethod
     def doc_to_text(doc) -> str:

src/eval_mm/tasks/cvqa.py
Lines changed: 32 additions & 34 deletions

@@ -37,40 +37,6 @@ class CVQA(Task):
     default_metric = "substring-match"

     def _prepare_dataset(self) -> Dataset:
-        # Use streaming during tests to ensure we pick N Japanese samples
-        # even if they are sparse early in the split.
-        n = getattr(self.config, "max_dataset_len", None)
-        test_subset = os.getenv("PYTEST_CURRENT_TEST") or os.getenv("EVAL_MM_TEST_SUBSET") == "1"
-        if n is not None and test_subset:
-            stream = load_dataset("afaji/cvqa", split="test", streaming=True)
-            buf = {
-                "index": [],
-                "question_id": [],
-                "question": [],
-                "question_en": [],
-                "options": [],
-                "translated_options": [],
-                "answer": [],
-                "answer_text": [],
-                "image": [],
-            }
-            count = 0
-            for ex in stream:
-                if ex.get("Subset") == "('Japanese', 'Japan')":
-                    buf["index"].append(str(count))
-                    buf["question_id"].append(str(count))
-                    buf["question"].append(ex["Question"])
-                    buf["question_en"].append(ex.get("Translated Question"))
-                    buf["options"].append(ex["Options"])
-                    buf["translated_options"].append(ex.get("Translated Options"))
-                    buf["answer"].append(ex["Label"])  # 0~3
-                    buf["answer_text"].append(OPTIONS_MAP[ex["Label"]])
-                    buf["image"].append(ex["image"])  # keep original to lazily decode later
-                    count += 1
-                    if count >= n:
-                        break
-            return Dataset.from_dict(buf)
-
         ds = load_dataset("afaji/cvqa", split="test")
         ds = ds.filter(lambda x: x["Subset"] == "('Japanese', 'Japan')")
         ds = ds.map(
@@ -89,6 +55,38 @@ def _prepare_dataset(self) -> Dataset:
         )
         return ds

+    def _prepare_test_dataset(self) -> Dataset:
+        # Stream to pick the first N Japanese samples and build a tiny Dataset
+        n = getattr(self.config, "max_dataset_len", 10)
+        stream = load_dataset("afaji/cvqa", split="test", streaming=True)
+        buf = {
+            "index": [],
+            "question_id": [],
+            "question": [],
+            "question_en": [],
+            "options": [],
+            "translated_options": [],
+            "answer": [],
+            "answer_text": [],
+            "image": [],
+        }
+        count = 0
+        for ex in stream:
+            if ex.get("Subset") == "('Japanese', 'Japan')":
+                buf["index"].append(str(count))
+                buf["question_id"].append(str(count))
+                buf["question"].append(ex["Question"])
+                buf["question_en"].append(ex.get("Translated Question"))
+                buf["options"].append(ex["Options"])
+                buf["translated_options"].append(ex.get("Translated Options"))
+                buf["answer"].append(ex["Label"])  # 0~3
+                buf["answer_text"].append(OPTIONS_MAP[ex["Label"]])
+                buf["image"].append(ex["image"])  # keep original to lazily decode later
+                count += 1
+                if count >= n:
+                    break
+        return Dataset.from_dict(buf)
+
     @staticmethod
     def doc_to_text(doc) -> str:
         # Lazily construct the prompt to reduce preprocessing cost

src/eval_mm/tasks/docvqa.py
Lines changed: 7 additions & 1 deletion

@@ -19,12 +19,18 @@ def __init__(self, config):
     def _prepare_dataset(self) -> Dataset:
         """Load DocVQA validation set."""
         # Load the DocVQA config from lmms-lab/DocVQA dataset
-        ds = load_dataset("lmms-lab/DocVQA", "DocVQA", split=self._maybe_slice_split("validation"))
+        ds = load_dataset("lmms-lab/DocVQA", "DocVQA", split="validation")

         # Rename questionId to question_id for consistency
         ds = ds.rename_column("questionId", "question_id")

         return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset("lmms-lab/DocVQA", "DocVQA", split=f"validation[:{n}]")
+        ds = ds.rename_column("questionId", "question_id")
+        return ds

     @staticmethod
     def doc_to_text(doc) -> str:

src/eval_mm/tasks/infographicvqa.py
Lines changed: 11 additions & 1 deletion

@@ -22,13 +22,23 @@ def _prepare_dataset(self) -> Dataset:
         ds = load_dataset(
             "lmms-lab/DocVQA",
             "InfographicVQA",
-            split=self._maybe_slice_split("validation"),
+            split="validation",
         )

         # Rename questionId to question_id for consistency
         ds = ds.rename_column("questionId", "question_id")

         return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset(
+            "lmms-lab/DocVQA",
+            "InfographicVQA",
+            split=f"validation[:{n}]",
+        )
+        ds = ds.rename_column("questionId", "question_id")
+        return ds

     @staticmethod
     def doc_to_text(doc) -> str:

src/eval_mm/tasks/ja_multi_image_vqa.py
Lines changed: 8 additions & 1 deletion

@@ -14,7 +14,14 @@ class JAMultiImageVQA(Task):
     default_metric = "rougel"

     def _prepare_dataset(self) -> Dataset:
-        ds = load_dataset("SakanaAI/JA-Multi-Image-VQA", split=self._maybe_slice_split("test"))
+        ds = load_dataset("SakanaAI/JA-Multi-Image-VQA", split="test")
+        ds = ds.rename_column("question", "input_text")
+        ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
+        return ds
+
+    def _prepare_test_dataset(self) -> Dataset:
+        n = getattr(self.config, "max_dataset_len", 10)
+        ds = load_dataset("SakanaAI/JA-Multi-Image-VQA", split=f"test[:{n}]")
         ds = ds.rename_column("question", "input_text")
         ds = ds.map(lambda example, idx: {"question_id": idx}, with_indices=True)
         return ds
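Note: together with the CI cache cleanup above, these per-task loaders keep `pytest src/eval_mm/tasks/*.py` down to a handful of rows per task. The tests themselves are not shown in this diff; a plausible shape, with an invented config stub and an assumed import path:

    from types import SimpleNamespace

    from eval_mm.tasks.ja_multi_image_vqa import JAMultiImageVQA  # module path assumed


    def test_prepare_test_dataset_is_small():
        config = SimpleNamespace(max_dataset_len=5)  # hypothetical config shape
        ds = JAMultiImageVQA(config)._prepare_test_dataset()
        assert len(ds) == 5
        assert {"question_id", "input_text"} <= set(ds.column_names)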
