Switch provoq to the Granite Guardian (GG) metric.

bnayahu · bnayahu · commit b970a1c7d731 · 2025-10-30T15:10:55.000+02:00
Signed-off-by: Jonathan Bnayahu <bnayahu@il.ibm.com>
diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
@@ -2,7 +2,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
 
-MAX_TEST_INSTANCES = 500
+MAX_TEST_INSTANCES = 1000
 
 benchmark = Benchmark(
     subsets={
@@ -13,7 +13,7 @@
             max_test_instances=MAX_TEST_INSTANCES,
         ),
         "provoq": DatasetRecipe(
-            card="cards.safety.provoq",
+            card="cards.safety.provoq_gg",
             template_card_index="default",
             group_by=["group"],
             max_test_instances=MAX_TEST_INSTANCES,
diff --git a/prepare/cards/safety/provoq_gg.py b/prepare/cards/safety/provoq_gg.py
@@ -0,0 +1,31 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(  # ProvoQ task card whose only metric is the Granite Guardian assistant-risk harm metric
+    loader=LoadHF(path="IBM/ProvoQ", data_classification_policy=["public"]),
+    task=Task(
+        input_fields={"question": str, "stigma": str, "group": str},
+        reference_fields={},  # no reference fields are declared for this task
+        prediction_type=str,
+        metrics=[  # metric is configured with question as the user message and prediction as the assistant message
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]",
+        ],
+    ),
+    templates=TemplatesDict(  # single template, keyed "default"
+        {"default": InputOutputTemplate(input_format="{question}", output_format="")}
+    ),
+    __description__="The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    __tags__={
+        "languages": ["english"],
+    },
+)
+
+test_card(  # NOTE(review): exact effect of strict=False is not visible here; confirm against unitxt test_card
+    card,
+    strict=False,
+    demos_taken_from="test",
+    num_demos=0,  # no demonstrations are requested
+)
+
+add_to_catalog(card, "cards.safety.provoq_gg", overwrite=True)  # catalog name referenced by the "provoq" subset of the safety benchmark
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
@@ -8,16 +8,16 @@
             "group_by": [
                 "label"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "provoq": {
             "__type__": "dataset_recipe",
-            "card": "cards.safety.provoq",
+            "card": "cards.safety.provoq_gg",
             "template_card_index": "default",
             "group_by": [
                 "group"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "airbench": {
             "__type__": "dataset_recipe",
@@ -26,7 +26,7 @@
             "group_by": [
                 "l2-name"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "ailuminate": {
             "__type__": "dataset_recipe",
@@ -35,7 +35,7 @@
             "group_by": [
                 "hazard"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         }
     }
 }
diff --git a/src/unitxt/catalog/cards/safety/provoq_gg.json b/src/unitxt/catalog/cards/safety/provoq_gg.json
@@ -0,0 +1,39 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "IBM/ProvoQ",
+        "data_classification_policy": [
+            "public"
+        ]
+    },
+    "task": {
+        "__type__": "task",
+        "input_fields": {
+            "question": "str",
+            "stigma": "str",
+            "group": "str"
+        },
+        "reference_fields": {},
+        "prediction_type": "str",
+        "metrics": [
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]"
+        ]
+    },
+    "templates": {
+        "__type__": "templates_dict",
+        "items": {
+            "default": {
+                "__type__": "input_output_template",
+                "input_format": "{question}",
+                "output_format": ""
+            }
+        }
+    },
+    "__description__": "The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    "__tags__": {
+        "languages": [
+            "english"
+        ]
+    }
+}