Skip to content

Commit ac7e2f4

Browse files
author
silviase
committed
update
1 parent df3da4c commit ac7e2f4

File tree

1 file changed

+29
-27
lines changed

1 file changed

+29
-27
lines changed

src/eval_mm/tasks/textvqa.py

Lines changed: 29 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,25 @@ def _prepare_dataset(self) -> Dataset:
2323
return ds
2424

2525
def _prepare_test_dataset(self) -> Dataset:
26+
# Stream a tiny subset to avoid heavy downloads/cache writes in CI
2627
n = getattr(self.config, "max_dataset_len", 10)
27-
ds = load_dataset("lmms-lab/textvqa", split=f"validation[:{n}]")
28-
return ds
28+
stream = load_dataset("lmms-lab/textvqa", split="validation", streaming=True)
29+
buf = {
30+
"question_id": [],
31+
"question": [],
32+
"answers": [],
33+
"image": [],
34+
}
35+
count = 0
36+
for ex in stream:
37+
buf["question_id"].append(str(ex["question_id"]))
38+
buf["question"].append(ex["question"])
39+
buf["answers"].append(ex["answers"]) # list[str]
40+
buf["image"].append(ex["image"]) # keep image column for lazy decode
41+
count += 1
42+
if count >= n:
43+
break
44+
return Dataset.from_dict(buf)
2945

3046
@staticmethod
3147
def doc_to_text(doc) -> str:
@@ -56,31 +72,17 @@ def doc_to_answer(doc) -> list[str]:
5672

5773

5874
def test_textvqa_task():
59-
"""Test TextVQA task implementation."""
75+
"""Basic loader/type checks for TextVQA."""
6076
from eval_mm.tasks.task import TaskConfig
61-
62-
# Create task instance
77+
6378
task = TextVQA(TaskConfig(max_dataset_len=10))
64-
65-
# Load dataset
66-
print("Loading TextVQA dataset...")
6779
ds = task.dataset
68-
print(f"Dataset size: {len(ds)}")
69-
70-
# Test with first example
71-
example = ds[0]
72-
print(f"\nFirst example:")
73-
print(f" ID: {task.doc_to_id(example)}")
74-
print(f" Question: {task.doc_to_text(example)}")
75-
print(f" Image: {task.doc_to_visual(example)[0]}")
76-
print(f" Valid answers: {task.doc_to_answer(example)}")
77-
78-
# Verify data types
79-
assert isinstance(task.doc_to_text(example), str)
80-
assert isinstance(task.doc_to_visual(example), list)
81-
assert all(isinstance(img, Image.Image) for img in task.doc_to_visual(example))
82-
assert isinstance(task.doc_to_id(example), str)
83-
assert isinstance(task.doc_to_answer(example), list)
84-
assert all(isinstance(ans, str) for ans in task.doc_to_answer(example))
85-
86-
print("\nAll tests passed!")
80+
assert len(ds) <= 10
81+
ex = ds[0]
82+
# Verify data shapes/types without verbose prints
83+
assert isinstance(task.doc_to_text(ex), str)
84+
vis = task.doc_to_visual(ex)
85+
assert isinstance(vis, list) and isinstance(vis[0], Image.Image)
86+
assert isinstance(task.doc_to_id(ex), str)
87+
answers = task.doc_to_answer(ex)
88+
assert isinstance(answers, list) and all(isinstance(a, str) for a in answers)

0 commit comments

Comments
 (0)