
Commit aac33a6

Cards for the Real MM RAG datasets (#1795)
* extending the rag e2e task
* adding AddIncrementalId
* mm rag cards
* mm rag cards
* format
* format
* update
* update
* update
* Improve speed readability and unit-testability
* Revert naming
* Revert changes to rag files
* Fix hotpot qa

Signed-off-by: elronbandel <[email protected]>
Co-authored-by: elronbandel <[email protected]>
1 parent 8f3c446 commit aac33a6

18 files changed: +689 −134 lines

prepare/cards/rag/end_to_end/hotpotqa.py

Lines changed: 4 additions & 2 deletions
@@ -18,8 +18,9 @@
 # Benchmark
 benchmark_card = TaskCard(
     loader=LoadHF(
-        path="hotpotqa/hotpot_qa",
+        path="vincentkoc/hotpot_qa_archive",
         name="distractor",
+        revision="c060661",
         data_classification_policy=["public"],
     ),
     preprocess_steps=[
@@ -85,7 +86,8 @@
 # Documents
 documents_card = TaskCard(
     loader=LoadHF(
-        path="hotpotqa/hotpot_qa",
+        path="vincentkoc/hotpot_qa_archive",
+        revision="c060661",
         name="distractor",
         data_classification_policy=["public"],
     ),
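
For context, this change replaces the hotpotqa/hotpot_qa Hub dataset with an archived mirror and pins a dataset revision so the card keeps resolving to the same data. A minimal standalone sketch of the resulting loader (values copied from the diff above; shown as an illustration, not as additional changes in this commit):

from unitxt.loaders import LoadHF

# Pinning `revision` to a dataset-repo commit keeps the card reproducible
# even if the mirrored Hugging Face dataset is updated later.
loader = LoadHF(
    path="vincentkoc/hotpot_qa_archive",  # archived mirror of hotpotqa/hotpot_qa
    name="distractor",
    revision="c060661",                   # short commit hash on the Hub
    data_classification_policy=["public"],
)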
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import json

from unitxt import add_to_catalog
from unitxt.blocks import TaskCard
from unitxt.collections_operators import Wrap
from unitxt.image_operators import HashImage, ToImage
from unitxt.loaders import LoadHF
from unitxt.operators import (
    AddIncrementalId,
    Cast,
    Copy,
    Deduplicate,
    FilterByCondition,
)
from unitxt.splitters import RenameSplits, SplitRandomMix
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card

description = (
    "We introduced REAL-MM-RAG-Bench, a real-world multi-modal retrieval benchmark designed to evaluate "
    "retrieval models in reliable, challenging, and realistic settings. The benchmark was constructed using "
    "an automated pipeline, where queries were generated by a vision-language model (VLM), filtered by a large "
    "language model (LLM), and rephrased by an LLM to ensure high-quality retrieval evaluation. "
    "To simulate real-world retrieval challenges, we introduce multi-level query rephrasing, modifying queries "
    "at three distinct levels—from minor wording adjustments to significant structural changes—ensuring models "
    "are tested on their true semantic understanding rather than simple keyword matching."
)

datasets = [
    {"hf_name": "REAL-MM-RAG_FinSlides", "subset": "fin_slides"},
    {"hf_name": "REAL-MM-RAG_FinReport", "subset": "fin_report"},
    {"hf_name": "REAL-MM-RAG_TechReport", "subset": "tech_report"},
    {"hf_name": "REAL-MM-RAG_TechSlides", "subset": "tech_slides"},
]

hf_ibm_research = "ibm-research"
hf_url_base = "https://huggingface.co/datasets/"

for dataset in datasets:
    hf_name = dataset["hf_name"]
    hf_dataset_id = f"{hf_ibm_research}/{hf_name}"
    hf_url = f"{hf_url_base}/{hf_dataset_id}"
    subset = dataset["subset"]

    # first we create the card for the benchmark
    card = TaskCard(
        loader=LoadHF(
            path=hf_dataset_id,
            name="default",
            split="test",
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
            FilterByCondition(values={"query": None}, condition="ne"),
            HashImage(
                field="image",
                to_field="reference_context_ids",
            ),
            Copy(field="query", to_field="question"),
            AddIncrementalId(to_field="question_id"),
            Cast(field="question_id", to="str"),
            SplitRandomMix(
                {
                    "test": "test[30%]",
                    "train": "test[70%]",
                }
            ),
            Wrap(
                field="answer",
                inside="list",
                to_field="reference_answers",
            ),
            Wrap(
                field="reference_context_ids",
                inside="list",
                to_field="reference_context_ids",
            ),
        ],
        task="tasks.rag.end_to_end",
        templates={"default": "templates.rag.end_to_end.json_predictions"},
        __tags__={"license": "cdla-permissive-2.0", "url": hf_url},
        __title__=dataset["hf_name"].replace("-", "").replace("_", ": "),
        __description__=description,
    )

    wrong_answer = {
        "contexts": ["hi"],
        "is_answerable": True,
        "answer": "Don't know",
        "context_ids": ["id0"],
    }

    test_card(
        card,
        strict=True,
        full_mismatch_prediction_values=[json.dumps(wrong_answer)],
        debug=False,
    )

    add_to_catalog(card, f"cards.rag.benchmark.real_mm_rag_{subset}.en", overwrite=True)

    # next we create the card for the pages (documents)
    card = TaskCard(
        loader=LoadHF(
            path=hf_dataset_id,
            name="default",
            split="test",
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
            RenameSplits({"test": "train"}),
            HashImage(
                field="image",
                to_field="document_id",
            ),
            Deduplicate(by=["document_id"]),
            ToImage(field="image"),
            Wrap(field="image", inside="list", to_field="passages"),
        ],
        task="tasks.rag.corpora",
        templates={
            "empty": InputOutputTemplate(
                input_format="",
                output_format="",
            ),
        },
        __tags__={"license": "cdla-permissive-2.0", "url": hf_url},
        __title__=dataset["hf_name"].replace("-", "").replace("_", ": "),
        __description__=description,
    )
    # Not testing card, because documents are not evaluated.
    add_to_catalog(card, f"cards.rag.documents.real_mm_rag_{subset}.en", overwrite=True)
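
Once this script runs, each subset registers a benchmark card and a documents card, e.g. cards.rag.benchmark.real_mm_rag_fin_report.en and cards.rag.documents.real_mm_rag_fin_report.en. A minimal sketch of consuming one of the benchmark cards through unitxt's standard API (the loader_limit value is illustrative and not part of this commit):

from unitxt import load_dataset

# Render the REAL-MM-RAG FinReport benchmark card with its default
# JSON-predictions template, taking a small sample for a quick sanity check.
dataset = load_dataset(
    card="cards.rag.benchmark.real_mm_rag_fin_report.en",
    template="templates.rag.end_to_end.json_predictions",
    loader_limit=20,  # illustrative cap, not part of the card definition
)
print(dataset["test"][0])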

prepare/tasks/rag/rag_end_to_end.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 from unitxt import add_to_catalog
 from unitxt.blocks import Task
-from unitxt.types import Dialog, RagResponse
+from unitxt.types import Dialog, Image, RagResponse
 
 add_to_catalog(
     Task(
@@ -50,7 +50,7 @@
         input_fields={
            "document_id": str,
            "title": str,
-           "passages": List[str],
+           "passages": List[Union[str, Image]],
            "metadata_tags": Dict[str, str],
         },
         reference_fields={},
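
This widening lets corpora cards, such as the REAL-MM-RAG documents cards above, pass page images as passages while text-only corpora keep working. A hedged sketch of the corpora input schema after the change, with the typing imports it relies on (only the passages field is altered by this commit):

from typing import Dict, List, Union

from unitxt.types import Image

# Input fields of tasks.rag.corpora after this change: a passage may be
# either plain text or an Image (e.g. a scanned report page).
corpora_input_fields = {
    "document_id": str,
    "title": str,
    "passages": List[Union[str, Image]],
    "metadata_tags": Dict[str, str],
}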

src/unitxt/catalog/cards/rag/benchmark/hotpotqa/en.json

Lines changed: 2 additions & 1 deletion
@@ -2,8 +2,9 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_hf",
-        "path": "hotpotqa/hotpot_qa",
+        "path": "vincentkoc/hotpot_qa_archive",
         "name": "distractor",
+        "revision": "c060661",
         "data_classification_policy": [
             "public"
         ]
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_hf",
        "path": "ibm-research/REAL-MM-RAG_FinReport",
        "name": "default",
        "split": "test",
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "filter_by_condition",
            "values": {
                "query": null
            },
            "condition": "ne"
        },
        {
            "__type__": "hash_image",
            "field": "image",
            "to_field": "reference_context_ids"
        },
        {
            "__type__": "copy",
            "field": "query",
            "to_field": "question"
        },
        {
            "__type__": "add_incremental_id",
            "to_field": "question_id"
        },
        {
            "__type__": "cast",
            "field": "question_id",
            "to": "str"
        },
        {
            "__type__": "split_random_mix",
            "mix": {
                "test": "test[30%]",
                "train": "test[70%]"
            }
        },
        {
            "__type__": "wrap",
            "field": "answer",
            "inside": "list",
            "to_field": "reference_answers"
        },
        {
            "__type__": "wrap",
            "field": "reference_context_ids",
            "inside": "list",
            "to_field": "reference_context_ids"
        }
    ],
    "task": "tasks.rag.end_to_end",
    "templates": {
        "default": "templates.rag.end_to_end.json_predictions"
    },
    "__tags__": {
        "license": "cdla-permissive-2.0",
        "url": "https://huggingface.co/datasets//ibm-research/REAL-MM-RAG_FinReport"
    },
    "__title__": "REALMMRAG: FinReport",
    "__description__": "We introduced REAL-MM-RAG-Bench, a real-world multi-modal retrieval benchmark designed to evaluate retrieval models in reliable, challenging, and realistic settings. The benchmark was constructed using an automated pipeline, where queries were generated by a vision-language model (VLM), filtered by a large language model (LLM), and rephrased by an LLM to ensure high-quality retrieval evaluation. To simulate real-world retrieval challenges, we introduce multi-level query rephrasing, modifying queries at three distinct levels—from minor wording adjustments to significant structural changes—ensuring models are tested on their true semantic understanding rather than simple keyword matching."
}
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_hf",
        "path": "ibm-research/REAL-MM-RAG_FinSlides",
        "name": "default",
        "split": "test",
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "filter_by_condition",
            "values": {
                "query": null
            },
            "condition": "ne"
        },
        {
            "__type__": "hash_image",
            "field": "image",
            "to_field": "reference_context_ids"
        },
        {
            "__type__": "copy",
            "field": "query",
            "to_field": "question"
        },
        {
            "__type__": "add_incremental_id",
            "to_field": "question_id"
        },
        {
            "__type__": "cast",
            "field": "question_id",
            "to": "str"
        },
        {
            "__type__": "split_random_mix",
            "mix": {
                "test": "test[30%]",
                "train": "test[70%]"
            }
        },
        {
            "__type__": "wrap",
            "field": "answer",
            "inside": "list",
            "to_field": "reference_answers"
        },
        {
            "__type__": "wrap",
            "field": "reference_context_ids",
            "inside": "list",
            "to_field": "reference_context_ids"
        }
    ],
    "task": "tasks.rag.end_to_end",
    "templates": {
        "default": "templates.rag.end_to_end.json_predictions"
    },
    "__tags__": {
        "license": "cdla-permissive-2.0",
        "url": "https://huggingface.co/datasets//ibm-research/REAL-MM-RAG_FinSlides"
    },
    "__title__": "REALMMRAG: FinSlides",
    "__description__": "We introduced REAL-MM-RAG-Bench, a real-world multi-modal retrieval benchmark designed to evaluate retrieval models in reliable, challenging, and realistic settings. The benchmark was constructed using an automated pipeline, where queries were generated by a vision-language model (VLM), filtered by a large language model (LLM), and rephrased by an LLM to ensure high-quality retrieval evaluation. To simulate real-world retrieval challenges, we introduce multi-level query rephrasing, modifying queries at three distinct levels—from minor wording adjustments to significant structural changes—ensuring models are tested on their true semantic understanding rather than simple keyword matching."
}
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_hf",
        "path": "ibm-research/REAL-MM-RAG_TechReport",
        "name": "default",
        "split": "test",
        "data_classification_policy": [
            "public"
        ]
    },
    "preprocess_steps": [
        {
            "__type__": "filter_by_condition",
            "values": {
                "query": null
            },
            "condition": "ne"
        },
        {
            "__type__": "hash_image",
            "field": "image",
            "to_field": "reference_context_ids"
        },
        {
            "__type__": "copy",
            "field": "query",
            "to_field": "question"
        },
        {
            "__type__": "add_incremental_id",
            "to_field": "question_id"
        },
        {
            "__type__": "cast",
            "field": "question_id",
            "to": "str"
        },
        {
            "__type__": "split_random_mix",
            "mix": {
                "test": "test[30%]",
                "train": "test[70%]"
            }
        },
        {
            "__type__": "wrap",
            "field": "answer",
            "inside": "list",
            "to_field": "reference_answers"
        },
        {
            "__type__": "wrap",
            "field": "reference_context_ids",
            "inside": "list",
            "to_field": "reference_context_ids"
        }
    ],
    "task": "tasks.rag.end_to_end",
    "templates": {
        "default": "templates.rag.end_to_end.json_predictions"
    },
    "__tags__": {
        "license": "cdla-permissive-2.0",
        "url": "https://huggingface.co/datasets//ibm-research/REAL-MM-RAG_TechReport"
    },
    "__title__": "REALMMRAG: TechReport",
    "__description__": "We introduced REAL-MM-RAG-Bench, a real-world multi-modal retrieval benchmark designed to evaluate retrieval models in reliable, challenging, and realistic settings. The benchmark was constructed using an automated pipeline, where queries were generated by a vision-language model (VLM), filtered by a large language model (LLM), and rephrased by an LLM to ensure high-quality retrieval evaluation. To simulate real-world retrieval challenges, we introduce multi-level query rephrasing, modifying queries at three distinct levels—from minor wording adjustments to significant structural changes—ensuring models are tested on their true semantic understanding rather than simple keyword matching."
}
