IBM · elronbandel · Jan 7, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/src/unitxt/task.py b/src/unitxt/task.py
@@ -75,6 +75,8 @@ class Task(InstanceOperator, ArtifactFetcherMixin):
 
     def prepare_args(self):
         super().prepare_args()
+        if isinstance(self.metrics, str):
+            self.metrics = [self.metrics]
 
         if self.input_fields is not None and self.inputs is not None:
             raise UnitxtError(

diff --git a/tests/library/test_tasks.py b/tests/library/test_tasks.py
@@ -1,5 +1,6 @@
 from typing import Any, Dict, List
 
+from unitxt.catalog import get_from_catalog
 from unitxt.error_utils import UnitxtError
 from unitxt.task import Task
 
@@ -26,6 +27,16 @@ def test_task_metrics_type_checking(self):
             str(e.exception),
         )
 
+    def test_single_metric_string_loading(self):
+        loaded = get_from_catalog("tasks.qa.with_context[metrics=metrics.rouge]")
+        self.assertListEqual(loaded.metrics, ["metrics.rouge"])  # lone name -> 1-item list
+
+    def test_multiple_metrics_string_loading(self):
+        # A bracketed metrics override is expected to load as a list of metric names.
+        query = "tasks.qa.with_context[metrics=[metrics.rouge, metrics.bleu]]"
+        loaded = get_from_catalog(query)
+        self.assertListEqual(loaded.metrics, ["metrics.rouge", "metrics.bleu"])
+
     def test_task_metrics_type_checking_with_inputs_outputs(self):
         operator = Task(
             inputs={"input": str},