Fix JIC-VQA dataset preparation

speed1313 · speed1313 · commit 200f46f448b7 · 2025-03-17T21:28:39.000+09:00
diff --git a/README.md b/README.md
@@ -32,6 +32,7 @@ For details on the data format and the list of supported data, please check [DAT
     - [How to Add Inference Code for a VLM Model](#how-to-add-inference-code-for-a-vlm-model)
     - [How to Add Dependencies](#how-to-add-dependencies)
     - [Formatting and Linting with ruff](#formatting-and-linting-with-ruff)
+    - [Testing](#testing)
     - [How to Release to PyPI](#how-to-release-to-pypi)
     - [How to Update the Website](#how-to-update-the-website)
   - [Acknowledgements](#acknowledgements)
@@ -206,6 +207,14 @@ uv run ruff format src
 uv run ruff check --fix src
 ```
 
+### Testing
+
+You can test task classes and metric classes with the following command:
+```
+bash test.sh
+```
+
+
 ### How to Release to PyPI
 
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "backoff>=2.2.1",
     "scipy>=1.15.1",
     "torch>=2.5.1",
+    "webdataset>=0.2.111",
 ]
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/scripts/consistency_mecha_ja.py b/scripts/consistency_mecha_ja.py
@@ -2,7 +2,7 @@
 import json
 import pandas as pd
 import matplotlib.pyplot as plt
-import japanize_matplotlib
+import japanize_matplotlib  # noqa
 import numpy as np
 
 # ======================================
diff --git a/scripts/prepare_jic_vqa.py b/scripts/prepare_jic_vqa.py
@@ -0,0 +1,109 @@
+from datasets import load_dataset
+import os
+import requests
+from PIL import Image
+from io import BytesIO
+import backoff
+import webdataset as wds
+from tqdm import tqdm
+
+
+# Download an image (retried with exponential backoff on request errors)
+@backoff.on_exception(
+    backoff.expo,  # 指数バックオフ
+    requests.exceptions.RequestException,  # 対象例外
+    max_tries=5,  # 最大リトライ回数
+)
+def download_image(image_url: str) -> Image:
+    user_agent_string = (
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
+    )
+    response = requests.get(
+        image_url, headers={"User-Agent": user_agent_string}, timeout=10
+    )
+    response.raise_for_status()
+    image = Image.open(BytesIO(response.content)).convert("RGB")
+    return image
+
+
+def download_image_wrap(image_url: str) -> Image:
+    try:
+        return download_image(image_url)
+    except Exception as e:
+        print(f"Failed to process {image_url}: {e}")
+        return None
+
+
+def get_domain_from_question(question: str) -> str:
+    for keyword, domain in domain_dict.items():
+        if keyword in question:
+            return domain
+
+
+ds = load_dataset("line-corporation/JIC-VQA", split="train")
+
+input_texts = []
+answers = []
+images = []
+question_ids = []
+domains = []
+
+domain_dict = {
+    "花": "jaflower30",
+    "食べ物": "jafood101",
+    "ランドマーク": "jalandmark10",
+    "施設": "jafacility20",
+}
+
+output_dir = "dataset/jic_vqa"
+os.makedirs(output_dir, exist_ok=True)
+if not os.path.exists(f"{output_dir}/images.tar"):
+    with wds.TarWriter(f"{output_dir}/images.tar") as sink:
+        for i, example in tqdm(enumerate(ds), total=len(ds)):
+            image_url = example["url"]
+            image = download_image_wrap(image_url)
+            # resize
+            if image is not None:
+                image = image.resize((224, 224))
+                image = image.convert("RGB")
+            if image is None:
+                continue
+            sample = {
+                "__key__": str(example["id"]),
+                "jpg": image,
+                "txt": example["category"],
+                "url.txt": image_url,
+                "question.txt": example["question"],
+            }
+            sink.write(sample)
+
+ds = load_dataset("webdataset", data_files=f"{output_dir}/images.tar", split="train")
+print(ds)
+print(ds[0])
+
+ds = ds.remove_columns(["__url__"])
+ds = ds.rename_columns(
+    {
+        "txt": "category",
+        "url.txt": "url",
+        "question.txt": "question",
+    }
+)
+
+# Decode the webdataset text fields and derive the final evaluation columns
+ds = ds.map(
+    lambda x: {
+        "input_text": x["question"].decode("utf-8"),
+        "url": x["url"].decode("utf-8").encode("utf-8"),
+        "answer": str(x["category"]),
+        "image": x["jpg"],
+        "question_id": int(x["__key__"]),
+        "domain": get_domain_from_question(str(x["question"].decode("utf-8"))),
+    }
+)
+ds = ds.remove_columns(["question", "__key__", "jpg"])
+
+print(ds)
+print(ds[0])
+# {'category': 'ガソリンスタンド', 'url': b'https://live.staticflickr.com/5536/11190751074_f97587084e_o.jpg', 'input_text': "この画像にはどの施設が映っていますか？次の四つの選択肢から正しいものを選んでください: ['スーパーマーケット', 'コンビニ', '駐車場', 'ガソリンスタンド']", 'answer': 'ガソリンスタンド', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224 at 0x7F83A660F710>, 'question_id': 11190751074, 'domain': 'jafacility20'}
+ds.to_parquet("dataset/jic_vqa.parquet")
diff --git a/src/eval_mm/tasks/jic_vqa.py b/src/eval_mm/tasks/jic_vqa.py
@@ -1,112 +1,25 @@
-import time
-import warnings
-from io import BytesIO
-
-import requests
 from PIL import Image
 from datasets import Dataset, load_dataset
-from huggingface_hub import cached_assets_path
 
 from ..api.registry import register_task
 from ..api.task import Task
 from eval_mm.metrics import ScorerRegistry
-from tqdm import tqdm
+import os
 
 
 @register_task("jic-vqa")
 class JICVQA(Task):
     @staticmethod
     def _prepare_dataset() -> Dataset:
-        cache_dir = cached_assets_path(
-            library_name="datasets", namespace="JICVQA", subfolder="download"
-        )
-
-        dataset = load_dataset("line-corporation/JIC-VQA")
-        input_texts = []
-        answers = []
-        images = []
-        question_ids = []
-        domains = []
-
-        domain_dict = {
-            "花": "jaflower30",
-            "食べ物": "jafood101",
-            "ランドマーク": "jalandmark10",
-            "施設": "jafacility20",
-        }
-
-        def get_domain_from_question(question):
-            for keyword, domain in domain_dict.items():
-                if keyword in question:
-                    return domain
-
-        def download_image(url, image_id):
-            # TODO: Multi-threading for faster download
-            img_format = url.split(".")[-1]
-            image_path = cache_dir / f"{image_id}.{img_format}"
-            if image_path.exists():
-                return
-
-            max_attempts = 5
-            attempt_errors = []
-            for _ in range(max_attempts):
-                try:
-                    response = requests.get(url)
-                    if response.status_code == 200:
-                        image = Image.open(BytesIO(response.content))
-                        image.save(image_path)
-                        print(f"Downloaded: {image_path}")
-                        wait_time = 1.0
-                        time.sleep(wait_time)
-                        return
-                    else:
-                        error_msg = f"Status code: {response.status_code}"
-                        attempt_errors.append(error_msg)
-
-                except Exception as e:
-                    error_msg = f"Exception: {e}"
-                    attempt_errors.append(error_msg)
-
-            warnings.warn(
-                f"Failed to download {url} after {max_attempts} attempts. Errors: {attempt_errors}"
+        if not os.path.exists("dataset/jic_vqa.parquet"):
+            raise FileNotFoundError(
+                "Dataset not found. Please run `scripts/prepare_jic_vqa.py` to prepare the dataset."
             )
 
-        # Phase 1: Download all images
-        for subset in dataset:
-            for entry in tqdm(dataset[subset], desc=f"Downloading {subset} images"):
-                url = entry["url"]
-                image_id = entry["id"]
-                download_image(url, image_id)
-
-        # Phase 2: Load images and populate data structures
-        for subset in dataset:
-            for entry in dataset[subset]:
-                image_id = entry["id"]
-                img_format = entry["url"].split(".")[-1]
-                image_path = cache_dir / f"{image_id}.{img_format}"
-
-                if not image_path.exists():
-                    warnings.warn(f"The image path {image_path} does not exist.")
-                    continue
-                try:
-                    image = Image.open(image_path)
-                except Exception as e:
-                    print(f"{e} : Failed to open {image_path}")
-                images.append(image)
-                input_texts.append(entry["question"])
-                answers.append(entry["category"])
-                question_ids.append(image_id)
-                domain = get_domain_from_question(entry["question"])
-                domains.append(domain)
-
-        data_dict = {
-            "input_text": input_texts,
-            "answer": answers,
-            "image": images,
-            "question_id": question_ids,
-            "domain": domains,
-        }
-        return Dataset.from_dict(data_dict)
+        dataset = load_dataset(
+            "parquet", data_files="dataset/jic_vqa.parquet", split="train"
+        )
+        return dataset
 
     @staticmethod
     def doc_to_text(doc) -> str:
diff --git a/test.sh b/test.sh
@@ -0,0 +1,2 @@
+uv run pytest src/eval_mm/tasks/*.py
+uv run pytest src/eval_mm/metrics/*.py

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@ dependencies = [`
`20`	`20`	`"backoff>=2.2.1",`
`21`	`21`	`"scipy>=1.15.1",`
`22`	`22`	`"torch>=2.5.1",`
	`23`	`+ "webdataset>=0.2.111",`
`23`	`24`	`]`
`24`	`25`	`readme = "README.md"`
`25`	`26`	`license = "Apache-2.0"`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+uv run pytest src/eval_mm/tasks/*.py`
	`2`	`+uv run pytest src/eval_mm/metrics/*.py`