
Commit af06a81

Merge branch 'main' into cache-key-and-lock
2 parents: 97c4b03 + 60c1c3e


41 files changed: +850 −329 lines

README.md

Lines changed: 4 additions & 4 deletions
@@ -31,11 +31,11 @@ https://github.com/IBM/unitxt/assets/23455264/baef9131-39d4-4164-90b2-05da52919f
 
 ### 🦄 Currently on Unitxt Catalog
 
-![Abstract Tasks](https://img.shields.io/badge/Abstract_Tasks-62-blue)
-![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-3025-blue)
+![Abstract Tasks](https://img.shields.io/badge/Abstract_Tasks-64-blue)
+![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-3174-blue)
 ![Templates](https://img.shields.io/badge/Templates-342-blue)
-![Benchmarks](https://img.shields.io/badge/Benchmarks-4-blue)
-![Metrics](https://img.shields.io/badge/Metrics-422-blue)
+![Benchmarks](https://img.shields.io/badge/Benchmarks-6-blue)
+![Metrics](https://img.shields.io/badge/Metrics-462-blue)
 
 ### 🦄 Run Unitxt Exploration Dashboard
 

docs/_static/custom.css

Lines changed: 5 additions & 0 deletions
@@ -206,3 +206,8 @@ div.document div.documentwrapper {
 .red {
   color: red;
 }
+
+#unitxtImports {
+  /* Display nothing for the element */
+  display: none;
+}

docs/catalog.py

Lines changed: 49 additions & 6 deletions
@@ -1,8 +1,10 @@
 import json
 import os
 import re
+from collections import defaultdict
 from functools import lru_cache
 from pathlib import Path
+from typing import List
 
 from docutils.core import publish_parts
 from pygments import highlight
@@ -43,6 +45,44 @@ def dict_to_syntax_highlighted_html(nested_dict):
     # Apply syntax highlighting
     return highlight(py_str, PythonLexer(), formatter)
 
+def imports_to_syntax_highlighted_html(subtypes: List[str]) -> str:
+    if len(subtypes) == 0:
+        return ""
+    module_to_class_names = defaultdict(list)
+    for subtype in subtypes:
+        subtype_class = Artifact._class_register.get(subtype)
+        module_to_class_names[subtype_class.__module__].append(subtype_class.__name__)
+
+    imports_txt = ""
+    for modu in sorted(module_to_class_names.keys()):
+        classes_string = ", ".join(sorted(module_to_class_names[modu]))
+        imports_txt += f"from {modu} import {classes_string}\n"
+
+    formatter = HtmlFormatter(nowrap=True)
+    htm = highlight(imports_txt, PythonLexer(), formatter)
+
+    imports_html = f'\n<p><div><pre><span id="unitxtImports">{htm}</span></pre>\n'
+    imports_html += """<button onclick="toggleText()" id="textButton">
+    Show Imports
+    </button>
+
+    <script>
+    function toggleText() {
+        let showImports = document.getElementById("unitxtImports");
+        let buttonText = document.getElementById("textButton");
+        if (showImports.style.display === "none" || showImports.style.display === "") {
+            showImports.style.display = "inline";
+            buttonText.innerHTML = "Close";
+        }
+
+        else {
+            showImports.style.display = "none";
+            buttonText.innerHTML = "Show Imports";
+        }
+    }
+    </script>
+    </div></p>\n"""
+    return imports_html
 
 def write_title(title, label):
     title = f"📁 {title}"
@@ -177,26 +217,29 @@ def make_content(artifact, label, all_labels):
 
     # Replacement function
     html_for_dict = re.sub(pattern, r"\1\2\3", html_for_dict)
+
+    subtypes = all_subtypes_of_artifact(artifact)
+    subtypes = list(set(subtypes))
+    subtypes.remove(artifact_type)  # this was already documented
+    html_for_imports = imports_to_syntax_highlighted_html(subtypes)
+
     source_link = f"""<a class="reference external" href="https://github.com/IBM/unitxt/blob/main/src/unitxt/catalog/{catalog_id.replace(".", "/")}.json"><span class="viewcode-link"><span class="pre">[source]</span></span></a>"""
-    html_for_dict = f"""<div class="admonition note">
+    html_for_element = f"""<div class="admonition note">
     <p class="admonition-title">{catalog_id}</p>
     <div class="highlight-json notranslate">
    <div class="highlight"><pre>
    {html_for_dict.strip()}
-    </pre>{source_link}</div></div>
+    </pre>{source_link}{html_for_imports.strip()}</div></div>
    </div>""".replace("\n", "\n    ")
 
-    result += "    " + html_for_dict + "\n"
+    result += "    " + html_for_element + "\n"
 
     if artifact_class.__doc__:
        explanation_str = f"Explanation about `{type_class_name}`"
        result += f"\n{explanation_str}\n"
        result += "+" * len(explanation_str) + "\n\n"
        result += artifact_class.__doc__ + "\n"
 
-    subtypes = all_subtypes_of_artifact(artifact)
-    subtypes = list(set(subtypes))
-    subtypes.remove(artifact_type)  # this was already documented
     for subtype in subtypes:
        subtype_class = Artifact._class_register.get(subtype)
        subtype_class_name = subtype_class.__name__
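
The new imports_to_syntax_highlighted_html helper groups each subtype's class name under its defining module and emits one import line per module, wrapped in the #unitxtImports span that the custom.css rule above keeps hidden until the Show Imports button toggles it. A minimal sketch of just the grouping step, using hypothetical module/class pairs in place of real Artifact._class_register lookups:

from collections import defaultdict

# Hypothetical (module, class-name) pairs standing in for resolved catalog subtypes.
resolved = [
    ("unitxt.operators", "Rename"),
    ("unitxt.operators", "Set"),
    ("unitxt.loaders", "LoadHF"),
]

module_to_class_names = defaultdict(list)
for module, class_name in resolved:
    module_to_class_names[module].append(class_name)

for module in sorted(module_to_class_names):
    print(f"from {module} import {', '.join(sorted(module_to_class_names[module]))}")

# Prints:
# from unitxt.loaders import LoadHF
# from unitxt.operators import Rename, Set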

examples/evaluate_existing_dataset_with_install.py

Lines changed: 4 additions & 3 deletions
@@ -23,8 +23,9 @@
 
 results = evaluate(predictions=predictions, data=dataset)
 
-print("Global Results:")
-print(results.global_scores.summary)
 
 print("Instance Results:")
-print(results.instance_scores.summary)
+print(results.instance_scores)
+
+print("Global Results:")
+print(results.global_scores.summary)
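
This reordering prints per-instance scores before the global summary. For context, a sketch of how a script of this shape typically starts, assuming the documented unitxt API; the card, template, and model below are placeholders rather than what this example file necessarily uses:

from unitxt import evaluate, load_dataset
from unitxt.inference import HFPipelineBasedInferenceEngine

# Placeholder recipe; the real example may load a different card/template.
dataset = load_dataset(
    card="cards.wnli",
    template="templates.classification.multi_class.relation.default",
    split="test",
    loader_limit=20,
)

model = HFPipelineBasedInferenceEngine(
    model_name="google/flan-t5-base", max_new_tokens=32
)
predictions = model.infer(dataset)

results = evaluate(predictions=predictions, data=dataset)
print("Instance Results:")
print(results.instance_scores)
print("Global Results:")
print(results.global_scores.summary)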

prepare/cards/ag_news.py

Lines changed: 4 additions & 15 deletions
@@ -1,4 +1,3 @@
-from datasets import load_dataset_builder
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
@@ -9,26 +8,16 @@
 )
 from unitxt.test_utils.card import test_card
 
-dataset_name = "ag_news"
-
-ds_builder = load_dataset_builder(dataset_name)
-classlabels = ds_builder.info.features["label"]
-
-mappers = {}
-for i in range(len(classlabels.names)):
-    mappers[str(i)] = classlabels.names[i]
-
-
 card = TaskCard(
-    loader=LoadHF(path=f"{dataset_name}"),
+    loader=LoadHF(path="fancyzhx/ag_news"),
     preprocess_steps=[
         SplitRandomMix(
             {"train": "train[87.5%]", "validation": "train[12.5%]", "test": "test"}
         ),
-        MapInstanceValues(mappers={"label": mappers}),
+        MapInstanceValues(mappers={"label": {"0": "World", "1": "Sports", "2": "Business", "3": "Sci/Tech"}}),
         Set(
             fields={
-                "classes": classlabels.names,
+                "classes": ["World", "Sports", "Business", "Sci/Tech"],
                 "text_type": "sentence",
             }
         ),
@@ -52,4 +41,4 @@
     ),
 )
 test_card(card, debug=False)
-add_to_catalog(card, f"cards.{dataset_name}", overwrite=True)
+add_to_catalog(card, "cards.ag_news", overwrite=True)
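
The card now pins the four AG News label names instead of deriving them via load_dataset_builder at preparation time, dropping the datasets-builder call and making the card deterministic. A toy sketch of what the MapInstanceValues step does to each instance's label field (plain dicts, not actual unitxt streams):

label_map = {"0": "World", "1": "Sports", "2": "Business", "3": "Sci/Tech"}

instances = [
    {"text": "Stocks rally on earnings", "label": "2"},
    {"text": "Champions league final tonight", "label": "1"},
]
for instance in instances:
    instance["label"] = label_map[instance["label"]]

print(instances)
# [{'text': 'Stocks rally on earnings', 'label': 'Business'},
#  {'text': 'Champions league final tonight', 'label': 'Sports'}]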

prepare/cards/head_qa.py

Lines changed: 6 additions & 3 deletions
@@ -6,13 +6,15 @@
     Set,
     TaskCard,
 )
+from unitxt.splitters import RenameSplits
 from unitxt.test_utils.card import test_card
 
 with unitxt.settings.context(allow_unverified_code=True):
-    for subset in ["es", "en"]:
+    for subset in ["es", "en", "gl", "it", "ru"]:
         card = TaskCard(
-            loader=LoadHF(path="dvilares/head_qa", name=subset),
+            loader=LoadHF(path="alesi12/head_qa_v2", name=subset),
             preprocess_steps=[
+                RenameSplits({"train": "test"}),
                 Rename(field_to_field={"qtext": "text", "category": "label"}),
                 Set(
                     fields={
@@ -46,5 +48,6 @@
                 "task_ids": "multiple-choice-qa",
             },
         )
-        test_card(card, debug=False)
+        if subset == "es":
+            test_card(card, debug=False)
         add_to_catalog(card, f"cards.head_qa.{subset}", overwrite=True)
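
Assuming RenameSplits follows unitxt's usual old-name-to-new-name mapper convention, the added step exposes the source's train split under the name test, presumably because the v2 loader ships its data as a single train split. A toy illustration of the intended renaming (plain dicts, not unitxt internals):

# Hypothetical split dict before and after RenameSplits({"train": "test"}).
splits = {"train": ["q1", "q2", "q3"]}
mapper = {"train": "test"}
splits = {mapper.get(name, name): data for name, data in splits.items()}
print(list(splits))  # ['test']
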
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import (
+    InputOutputTemplate,
+    Task,
+    TaskCard,
+)
+from unitxt.loaders import LoadCSV
+from unitxt.operators import Rename
+from unitxt.templates import TemplatesDict
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(
+    loader=LoadCSV(files={"test": "https://raw.githubusercontent.com/mlcommons/ailuminate/0fe054f3d34209a0a1acf1b95174c5e8926f5c57/airr_official_1.0_practice_prompt_set_release_public_subset.csv"},  # pragma: allowlist secret
+                   data_classification_policy=["public"]),
+    preprocess_steps=[
+        Rename(field_to_field={"prompt_text": "input",
+                               "hazard": "label"}),
+    ],
+    task=Task(
+        input_fields={"release_prompt_id": str, "input": str, "label": str, "persona": str, "locale": str, "prompt_hash": str},
+        reference_fields={"label": str},
+        prediction_type=str,
+        metrics=["metrics.llm_as_judge.safety.llamaguard"]
+    ),
+    templates=TemplatesDict({"default": InputOutputTemplate(input_format="{input}\n", output_format="")}),
+
+    __description__="The DEMO prompt library of the AILuminate 1.0 prompt dataset, created by MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that cover 12 different hazard categories. The dataset is a mix of adversarial and non-adversarial prompts. This dataset is a 10% subset of the full AILuminate training dataset.",
+    __tags__={
+        "languages": ["english"]
+    },
+)
+
+test_card(
+    card,
+    strict=False,
+    demos_taken_from="test",
+    num_demos=0,
+)
+
+add_to_catalog(card, "cards.safety.mlcommons_ailuminate", overwrite=True)
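
A sketch of pulling the new card for a quick look, assuming the standard unitxt load_dataset API: template_card_index selects the "default" entry of the TemplatesDict, and loader_limit just keeps the download small.

from unitxt import load_dataset

# Assumes the card registered above; "default" is its only template.
dataset = load_dataset(
    card="cards.safety.mlcommons_ailuminate",
    template_card_index="default",
    split="test",
    loader_limit=5,
)
print(dataset[0]["source"])  # the rendered "{input}\n" prompt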

prepare/cards/safety/simple_safety_tests.py

Lines changed: 13 additions & 8 deletions
@@ -1,6 +1,7 @@
 from unitxt import add_to_catalog
 from unitxt.blocks import (
     LoadHF,
+    Task,
     TaskCard,
 )
 from unitxt.operators import (
@@ -17,14 +18,19 @@
         Rename(field_to_field={"prompt": "input"}),
         Set(
             fields={
-                "output": "None",
-                "type_of_input": "question",
-                "type_of_output": "answer",
-                "prediction_type": "str",
+                "output": "",
             }
         ),
     ],
-    task="tasks.generation",
+    task=Task(
+        input_fields={"input": str, "id": str, "harm_area": str, "category": str},
+        reference_fields={"output": str},
+        prediction_type=str,
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=output,score_prefix=graniteguardian_]",
+            "metrics.llm_as_judge.safety.llamaguard[score_prefix=llamaguard_]"
+        ],
+    ),
     templates=["templates.generation.empty"],
     __description__="100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with.",
     __tags__={
@@ -33,7 +39,6 @@
     },
 )
 
-test_card(
-    card, format="formats.empty", strict=False, demos_taken_from="test", num_demos=0
-)
+test_card(card, strict=False, demos_taken_from="test", num_demos=0)
+
 add_to_catalog(card, "cards.safety.simple_safety_tests", overwrite=True)

prepare/cards/sst2.py

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 from unitxt.blocks import LoadHF, MapInstanceValues, TaskCard
 from unitxt.catalog import add_to_catalog
-from unitxt.operators import ExtractFieldValues, Rename, Set
+from unitxt.operators import Rename, Set
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadHF(path="glue", name="sst2"),
+    loader=LoadHF(path="stanfordnlp/sst2"),
     preprocess_steps=[
         "splitters.small_no_test",
         MapInstanceValues(mappers={"label": {"0": "negative", "1": "positive"}}),
@@ -13,9 +13,9 @@
             fields={
                 "text_type": "sentence",
                 "type_of_class": "sentiment",
+                "classes": ["negative", "positive"]
             }
         ),
-        ExtractFieldValues(field="label", to_field="classes", stream_name="train"),
     ],
     task="tasks.classification.multi_class",
     templates="templates.classification.multi_class.all",

prepare/cards/wnli.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadHF(path="glue", name="wnli", data_classification_policy=["public"]),
+    loader=LoadHF(path="nyu-mll/glue", name="wnli", data_classification_policy=["public"]),
     preprocess_steps=[
         SplitRandomMix(
             {"train": "train[95%]", "validation": "train[5%]", "test": "validation"}
@@ -57,7 +57,7 @@
 
 
 card = TaskCard(
-    loader=LoadHF(path="glue", name="wnli"),
+    loader=LoadHF(path="nyu-mll/glue", name="wnli"),
     preprocess_steps=[
         SplitRandomMix(
             {"train": "train[95%]", "validation": "train[5%]", "test": "validation"}
Lines changed: 28 additions & 27 deletions
@@ -1,34 +1,35 @@
 from unitxt import add_to_catalog
-from unitxt.inference import IbmGenAiInferenceEngine
+from unitxt.inference import CrossProviderInferenceEngine
 from unitxt.llm_as_judge_from_template import LLMAsJudge
 from unitxt.random_utils import get_seed
 
-model_list = [
-    "meta-llama/llama-3-8b-instruct",
-    "meta-llama/llama-3-70b-instruct",
-]  # will point to llamaguard2
-format = "formats.llama3_instruct"
-template = "templates.safety.unsafe_content"
+model = "llama-3-3-70b-instruct"
+format = "formats.chat_api"
+template = "templates.safety.llamaguard"
 task = "rating.single_turn"
 
-for model_id in model_list:
-    inference_model = IbmGenAiInferenceEngine(
-        model_name=model_id, max_new_tokens=252, random_seed=get_seed()
-    )
-    model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower()
-    model_label = f"{model_label}_ibm_genai"
-    template_label = template.split(".")[-1]
-    metric_label = f"{model_label}_template_{template_label}"
-    metric = LLMAsJudge(
-        inference_model=inference_model,
-        template=template,
-        task=task,
-        format=format,
-        main_score=metric_label,
-    )
+inference_model = CrossProviderInferenceEngine(
+    model=model, max_tokens=20, seed=get_seed(), temperature=1e-7
+)
 
-    add_to_catalog(
-        metric,
-        f"metrics.llm_as_judge.safety.{model_label}_template_{template_label}",
-        overwrite=True,
-    )
+model_label = (
+    model.replace("-", "_").replace(".", ",").lower() + "_cross_provider"
+)
+
+template_label = template.split(".")[-1]
+
+metric_label = f"{model_label}_template_{template_label}"
+
+metric = LLMAsJudge(
+    inference_model=inference_model,
+    template=template,
+    task=task,
+    format=format,
+    main_score=metric_label,
+)
+
+add_to_catalog(
+    metric,
+    "metrics.llm_as_judge.safety.llamaguard",
+    overwrite=True,
+)
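
With the judge registered under a fixed catalog name, cards can reference it directly (as simple_safety_tests above now does) instead of rebuilding a per-model label, optionally with bracketed overrides such as [score_prefix=llamaguard_]. A minimal sketch of attaching it to a task:

from unitxt.blocks import Task

# Sketch: a generation-style safety task scored by the relocated judge metric.
safety_task = Task(
    input_fields={"input": str},
    reference_fields={"output": str},
    prediction_type=str,
    metrics=["metrics.llm_as_judge.safety.llamaguard"],
)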
