Add text2sql tasks #1414

Merged · 148 commits · Jan 23, 2025

Changes from 1 commit (of 148)
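This PR adds text2sql cards, templates, serializers, and an execution-accuracy metric to unitxt. As a minimal sketch of how the new task might be run end to end with the usual unitxt API — the catalog identifiers and score key below are hypothetical, inferred from the commit messages, so check the catalog entries this PR adds for the exact names:

```python
from unitxt import evaluate, load_dataset

dataset = load_dataset(
    card="cards.text2sql.bird",            # hypothetical card name (the PR adds a BIRD card)
    template="templates.text2sql.default",  # hypothetical template name
    loader_limit=20,                        # the PR's example also limits the loader to keep runs fast
    split="validation",
)

# Stand-in predictions; in practice these come from a model.
predictions = ["SELECT 1" for _ in dataset]

results = evaluate(predictions=predictions, data=dataset)
# Global scores should include the execution-accuracy score added in this PR.
print(results[0]["score"]["global"])
```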
8b26a9e
add text2sql templates
perlitz Dec 4, 2024
4752c56
add data management utility for text2sql
perlitz Dec 4, 2024
0713ff3
add basic template
perlitz Dec 4, 2024
7909077
add sql execution accuracy metric
perlitz Dec 4, 2024
4fdab71
Merge branch 'main' into add-text2sql
perlitz Dec 13, 2024
61a9232
add text2sql execution accuracy metric
perlitz Dec 4, 2024
94f10c4
add text2sql task
perlitz Dec 6, 2024
9a90f90
condition download on presence of a cache dir
perlitz Dec 6, 2024
b37b467
add init file
perlitz Dec 6, 2024
8d4894d
add processors
perlitz Dec 6, 2024
bc0d165
add processors
perlitz Dec 6, 2024
a185342
add basic template
perlitz Dec 6, 2024
f93eee9
change id to int
perlitz Dec 13, 2024
97c1bef
change notations in templates
perlitz Dec 13, 2024
3927e20
push to catalog
perlitz Dec 13, 2024
6a50032
add evidence, remove SL
perlitz Dec 13, 2024
cec65fd
remove unused function, fix
perlitz Dec 13, 2024
94c9c1e
fix imports from unitxt.text2sql
perlitz Dec 13, 2024
e5eb4a3
push to catalog
perlitz Dec 13, 2024
77eab83
fix cache location
perlitz Dec 13, 2024
2239ed6
add example
perlitz Dec 13, 2024
982d54d
fix imports
perlitz Dec 16, 2024
9a321e1
Merge branch 'main' into add-text2sql
perlitz Dec 16, 2024
2e337ad
add func_timeout to test reqs
perlitz Dec 16, 2024
c132d7d
fix typing
perlitz Dec 16, 2024
0cec726
change template name
perlitz Dec 16, 2024
dfa1af8
push to catalog
perlitz Dec 16, 2024
c857513
add req
perlitz Dec 18, 2024
9c566cc
add local model option
perlitz Dec 18, 2024
57f41f1
Merge branch 'main' into add-text2sql
perlitz Dec 18, 2024
67c9b4e
fix databases download
perlitz Dec 18, 2024
4a013aa
fix databases download
perlitz Dec 18, 2024
2c5fe5d
add loader limit to make example faster
perlitz Dec 18, 2024
02f1b23
fix cache paths, avoid re-download
perlitz Dec 18, 2024
1854c25
add type schema
perlitz Dec 18, 2024
c83c319
remove imports from inits
perlitz Dec 18, 2024
d51a6d7
add text2sql to inits
perlitz Dec 18, 2024
82e1fe8
update card to use serializers
perlitz Dec 18, 2024
98bc231
add schema serializer
perlitz Dec 18, 2024
2bce256
add text2sql serializer to default template
perlitz Dec 18, 2024
3b4c23a
add schema to task
perlitz Dec 18, 2024
5d9112f
adjust templates to use serializer
perlitz Dec 18, 2024
3a9bccc
adjust templates to use serializer
perlitz Dec 18, 2024
9fda158
fix processor
perlitz Dec 18, 2024
ac3ebee
remove target prefix from template
perlitz Dec 19, 2024
f313a8b
add shuffle to bird
perlitz Dec 19, 2024
e333d27
add shuffle to bird
perlitz Dec 19, 2024
3e23e4c
edit template
perlitz Dec 19, 2024
0d18070
remove comment from init
perlitz Dec 19, 2024
9fce798
clear processors code
perlitz Dec 19, 2024
ce38e3a
add option with ticks
perlitz Dec 19, 2024
38639c1
add anls metric
perlitz Dec 19, 2024
2d7aa81
Merge branch 'main' into add-text2sql
perlitz Jan 6, 2025
40f3a56
add template
perlitz Dec 20, 2024
980556c
drop comment
perlitz Jan 6, 2025
84e4695
remove recursion limit
perlitz Jan 6, 2025
4793e7c
add loader_limit to example
perlitz Jan 6, 2025
a68ead5
fix recursion error
perlitz Jan 6, 2025
29f2505
move import to within metric
perlitz Jan 6, 2025
fccbfd3
remove catalog files without prepare
perlitz Jan 6, 2025
543f716
fix typing
perlitz Jan 6, 2025
5512c9e
change template in example
perlitz Jan 6, 2025
aa4cac5
moving text2sql implementation to the main src dir
perlitz Jan 6, 2025
92aec0c
fix imports
perlitz Jan 6, 2025
a1a197a
fix imports
perlitz Jan 6, 2025
0aaac1d
fix imports
perlitz Jan 6, 2025
b0a4c7b
fix imports
perlitz Jan 6, 2025
fe9cd1e
import data_utils
perlitz Jan 6, 2025
342b7c5
Merge branch 'main' into add-text2sql
perlitz Jan 7, 2025
b6da498
Merge branch 'main' into add-text2sql
perlitz Jan 8, 2025
3a8de12
fix formatting
perlitz Jan 8, 2025
89b0ce0
refactor names
perlitz Jan 8, 2025
52982a6
add processors tests
perlitz Jan 8, 2025
cac3983
Merge branch 'main' into add-text2sql
perlitz Jan 8, 2025
0966eb7
add more tests
perlitz Jan 8, 2025
f5f4b50
add tests
perlitz Jan 8, 2025
2ed595e
refactor: allow more data sources
perlitz Jan 9, 2025
32c834b
allow db source input
perlitz Jan 9, 2025
b499715
organize imports
perlitz Jan 9, 2025
d75ecc6
update example
perlitz Jan 9, 2025
2fdab7b
add db_type to task
perlitz Jan 10, 2025
c02096b
format
perlitz Jan 10, 2025
5d689b5
add db_type to task
perlitz Jan 10, 2025
1317158
add local db definition ability
perlitz Jan 10, 2025
c3d5a2a
add EE tests
perlitz Jan 10, 2025
0fbbbb3
Merge branch 'main' into add-text2sql
perlitz Jan 10, 2025
b21c124
add tests
perlitz Jan 14, 2025
1f95e5a
rename file
perlitz Jan 14, 2025
4b8f029
rename file
perlitz Jan 14, 2025
52d8b84
update sql metric
perlitz Jan 14, 2025
e7222bf
rename file
perlitz Jan 14, 2025
9bd79cf
refactor types, serializers and metric
perlitz Jan 15, 2025
d651e9a
Merge branch 'main' into add-text2sql
perlitz Jan 15, 2025
9d41b4b
remove format_table
perlitz Jan 15, 2025
afe4121
add get schema for remote connector
perlitz Jan 15, 2025
a7f9f13
add tests for LocalConnector
perlitz Jan 15, 2025
fbb42fa
add tests for InMemoryDatabaseConnector
perlitz Jan 15, 2025
0ee81ea
add serializer tests
perlitz Jan 15, 2025
74525cb
remove fp test
perlitz Jan 15, 2025
14ddb22
fix serializer
perlitz Jan 15, 2025
ed2a422
make remote connector more robust
perlitz Jan 15, 2025
c6dae8d
Add schema serializer
perlitz Jan 15, 2025
69db33e
fix tests
perlitz Jan 15, 2025
09ba273
change error
perlitz Jan 15, 2025
59cf7ca
add data to bird card
perlitz Jan 16, 2025
24eb41d
fix tests
perlitz Jan 16, 2025
e0e82d6
add tests for db utils
perlitz Jan 16, 2025
540d3f8
fix metric test
perlitz Jan 16, 2025
cbd1fd7
pre-commit
perlitz Jan 16, 2025
baff237
Merge branch 'main' into add-text2sql
perlitz Jan 16, 2025
49f7548
Merge branch 'main' into add-text2sql
perlitz Jan 16, 2025
a434e9d
delete temp
perlitz Jan 16, 2025
e27e996
make id a str
perlitz Jan 17, 2025
7a2fcd9
fix access to db
perlitz Jan 17, 2025
e274dbd
add empty template
perlitz Jan 17, 2025
22d9511
compare the results entry from the metric
perlitz Jan 17, 2025
daa28bb
reformat
perlitz Jan 17, 2025
f061bb8
reformat
perlitz Jan 17, 2025
6a72f74
make hint optional
perlitz Jan 20, 2025
7c4db9d
remove serializer exception
perlitz Jan 20, 2025
acd254f
change loggers
perlitz Jan 21, 2025
b2e3247
add API loader
perlitz Jan 21, 2025
69a3227
Merge branch 'main' into add-text2sql
perlitz Jan 21, 2025
e045339
fix examples
perlitz Jan 21, 2025
7eff16c
fix tests
perlitz Jan 21, 2025
e9366fd
pre-commit run
perlitz Jan 21, 2025
13f7d17
optimize EE metric
perlitz Jan 21, 2025
95bfeb8
Merge branch 'main' into add-text2sql
perlitz Jan 21, 2025
57a0eb1
fix test
perlitz Jan 21, 2025
be557ff
align bird card to string id
perlitz Jan 21, 2025
9ae0061
move metric tests
perlitz Jan 21, 2025
4fc611f
Merge branch 'main' into add-text2sql
elronbandel Jan 22, 2025
b6e84d2
add metric dependencies to pyproject
perlitz Jan 22, 2025
236d332
handle error in EE metric
perlitz Jan 22, 2025
d0b2201
Merge branch 'main' into add-text2sql
perlitz Jan 22, 2025
07ec859
Refactor database query execution with caching and improved error han…
perlitz Jan 22, 2025
b0098d8
fix template bug
perlitz Jan 22, 2025
dcc5223
fix template bug
perlitz Jan 22, 2025
f4b4f8f
remove anls metric
perlitz Jan 22, 2025
bd57f50
make result matching invariant to order
perlitz Jan 22, 2025
487c3dc
make it faster and restrict scores
perlitz Jan 22, 2025
700093f
fix remote metric
perlitz Jan 22, 2025
a704aac
fix tests
perlitz Jan 22, 2025
66adaf2
remove retry tests
perlitz Jan 22, 2025
b39b23d
format tests
perlitz Jan 22, 2025
ba6aa2d
remove some tests
perlitz Jan 22, 2025
6f85c44
return anls to metric
perlitz Jan 22, 2025
e576aa0
Merge branch 'main' into add-text2sql
elronbandel Jan 23, 2025
pre-commit run
Signed-off-by: Yotam-Perlitz <[email protected]>
perlitz committed Jan 21, 2025

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit e9366fdc42c33bb7a06577632f6b87968efc9fcb
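Every hunk in this commit applies the same mechanical change: the formatter rewraps multi-line assert statements, moving the parentheses between the asserted condition and the failure message. A minimal sketch of the two equivalent stylings that appear paired throughout (illustrative values, not lines from the diff):

```python
# Style A: the failure message is wrapped in parentheses.
assert value is not None, (
    "value must be set"
)

# Style B: the condition is wrapped; the message trails the closing paren.
assert (
    value is not None
), "value must be set"
```

Both forms are semantically identical; only the line wrapping changes, which is also why the .secrets.baseline line numbers at the end of the diff shift.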
6 changes: 3 additions & 3 deletions src/unitxt/db_utils.py
@@ -190,9 +190,9 @@ class RemoteDatabaseConnector(DatabaseConnector):
def __init__(self, db_config: SQLDatabase):
super().__init__(db_config)

assert db_config["db_id"], (
"db_id must be in db_config for RemoteDatabaseConnector"
)
assert db_config[
"db_id"
], "db_id must be in db_config for RemoteDatabaseConnector"
self.api_url, self.database_id = (
db_config["db_id"].split(",")[0],
db_config["db_id"].split("db_id=")[-1].split(",")[0],
19 changes: 9 additions & 10 deletions src/unitxt/loaders.py
@@ -572,15 +572,15 @@ def prepare(self):

def lazy_verify(self):
super().verify()
assert self.endpoint_url is not None, (
f"Please set the {self.endpoint_url_env} environmental variable"
)
assert self.aws_access_key_id is not None, (
f"Please set {self.aws_access_key_id_env} environmental variable"
)
assert self.aws_secret_access_key is not None, (
f"Please set {self.aws_secret_access_key_env} environmental variable"
)
assert (
self.endpoint_url is not None
), f"Please set the {self.endpoint_url_env} environmental variable"
assert (
self.aws_access_key_id is not None
), f"Please set {self.aws_access_key_id_env} environmental variable"
assert (
self.aws_secret_access_key is not None
), f"Please set {self.aws_secret_access_key_env} environmental variable"
if self.streaming:
raise NotImplementedError("LoadFromKaggle cannot load with streaming.")

@@ -1095,4 +1095,3 @@ def process(self) -> MultiStream:
self.__class__._loader_cache.max_size = settings.loader_cache_size
self.__class__._loader_cache[str(self)] = iterables
return MultiStream.from_iterables(iterables, copying=True)

202 changes: 99 additions & 103 deletions src/unitxt/metrics.py
@@ -1119,9 +1119,9 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato
)

for reduction, fields in self.reduction_map.items():
assert reduction in self.implemented_reductions, (
f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
)
assert (
reduction in self.implemented_reductions
), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"

if reduction == "mean":
for field_name in fields:
@@ -1390,12 +1390,12 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
def _validate_group_mean_task_data(self, instance):
# instances need to all have task_data field with field group_id
assert "task_data" in instance, "each instance must have an task_data field"
assert isinstance(instance["task_data"], dict), (
"each instance must have an task_data field that is a dict"
)
assert "group_id" in instance["task_data"], (
"each instance task_data dict must have a key group_id"
)
assert isinstance(
instance["task_data"], dict
), "each instance must have an task_data field that is a dict"
assert (
"group_id" in instance["task_data"]
), "each instance task_data dict must have a key group_id"

def _validate_group_mean_reduction(self):
"""Ensure that group_mean reduction_map is properly formatted.
@@ -1448,40 +1448,40 @@ def accuracy_diff(subgroup_scores_dict, expected_subgroup_types=['original', 'pa
2 'Why are ants eating my food?' 'original'
"""
# validate the reduction_map
assert "group_mean" in self.reduction_map, (
"reduction_map must have a 'group_mean' key"
)
assert (
"group_mean" in self.reduction_map
), "reduction_map must have a 'group_mean' key"
fields = self.reduction_map["group_mean"]
# for group_mean, expects a dict
assert isinstance(fields, dict)
assert "agg_func" in fields, (
"fields should have a key 'agg_func' whose value is a 3-element list of a function name, function definition, and a boolean indicator"
)
assert isinstance(fields["agg_func"], list), (
"fields['agg_func'] should be a list"
)
assert len(fields["agg_func"]) == 3, (
"fields['agg_func'] should be a 3-element list"
)
assert isinstance(fields["agg_func"][0], str), (
"first item in fields['agg_func'] should be a string name of a function"
)
assert callable(fields["agg_func"][1]), (
"second item in fields['agg_func'] should be a callable function"
)
assert isinstance(fields["agg_func"][2], bool), (
"third item in fields['agg_func'] should be a boolean value"
)
assert (
"agg_func" in fields
), "fields should have a key 'agg_func' whose value is a 3-element list of a function name, function definition, and a boolean indicator"
assert isinstance(
fields["agg_func"], list
), "fields['agg_func'] should be a list"
assert (
len(fields["agg_func"]) == 3
), "fields['agg_func'] should be a 3-element list"
assert isinstance(
fields["agg_func"][0], str
), "first item in fields['agg_func'] should be a string name of a function"
assert callable(
fields["agg_func"][1]
), "second item in fields['agg_func'] should be a callable function"
assert isinstance(
fields["agg_func"][2], bool
), "third item in fields['agg_func'] should be a boolean value"
if "score_fields" in fields:
assert isinstance(fields["score_fields"], list)

def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
instance_scores = self.compute_instance_scores(stream)
global_score = {"num_of_instances": len(instance_scores)}
for reduction_type, reduction_params in self.reduction_map.items():
assert reduction_type in self.implemented_reductions, (
f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"
)
assert (
reduction_type in self.implemented_reductions
), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"

field_name_full_prefix = ""
# used for passing to the bootstrapping, depends on whether the groups are fixed or not
@@ -1579,9 +1579,7 @@ def compute_instance_scores(
assert (
"task_data" in instance
and self.subgroup_column in instance["task_data"]
), (
f"each instance task_data dict must have a key {self.subgroup_column}"
)
), f"each instance task_data dict must have a key {self.subgroup_column}"

task_data = instance["task_data"] if "task_data" in instance else {}

@@ -2183,15 +2181,15 @@ def disable_confidence_interval_calculation(self):

def verify(self):
super().verify()
assert self.metric is not None, (
f"'metric' is not set in {self.get_metric_name()}"
)
assert self.main_score is not None, (
f"'main_score' is not set in {self.get_metric_name()}"
)
assert isinstance(self.metric, Metric), (
f"'metric' is not set to a Metric class in {self.get_metric_name()} (type{self.metric})"
)
assert (
self.metric is not None
), f"'metric' is not set in {self.get_metric_name()}"
assert (
self.main_score is not None
), f"'main_score' is not set in {self.get_metric_name()}"
assert isinstance(
self.metric, Metric
), f"'metric' is not set to a Metric class in {self.get_metric_name()} (type{self.metric})"
if self.postpreprocess_steps is not None:
depr_message = "Field 'postpreprocess_steps' is deprecated. Please use 'postprocess_steps' for the same purpose."
warnings.warn(depr_message, DeprecationWarning, stacklevel=2)
@@ -2212,9 +2210,9 @@ def prepare(self):
and isinstance(self.postprocess_steps, list)
and len(self.postprocess_steps) > 0
)
assert not (has_postpreprocess and has_postprocess), (
"Must define at most one of postpreprocess_steps (which is deprecated) and postprocess_steps (to be used from now on)"
)
assert not (
has_postpreprocess and has_postprocess
), "Must define at most one of postpreprocess_steps (which is deprecated) and postprocess_steps (to be used from now on)"
if has_postpreprocess:
self.postprocess_steps = self.postpreprocess_steps
self.prepare_score = SequentialOperator(
@@ -2289,16 +2287,14 @@ def verify(self):
Documentation.HUGGINGFACE_METRICS,
)

assert self.hf_additional_input_fields is None or isoftype(
self.hf_additional_input_fields, List[str]
), (
f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}."
)
assert self.hf_additional_input_fields_pass_one_value is None or isoftype(
self.hf_additional_input_fields_pass_one_value, List[str]
), (
f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}."
)
assert (
self.hf_additional_input_fields is None
or isoftype(self.hf_additional_input_fields, List[str])
), f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}."
assert (
self.hf_additional_input_fields_pass_one_value is None
or isoftype(self.hf_additional_input_fields_pass_one_value, List[str])
), f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}."

return super().verify()

@@ -2317,25 +2313,25 @@ def compute(
) -> dict:
passed_task_data = {}
for additional_input_field in self.hf_additional_input_fields:
assert additional_input_field in task_data[0], (
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
)
assert (
additional_input_field in task_data[0]
), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
passed_task_data[additional_input_field] = [
additional_input[additional_input_field]
for additional_input in task_data
]
for additional_input_field in self.hf_additional_input_fields_pass_one_value:
assert additional_input_field in task_data[0], (
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
)
assert (
additional_input_field in task_data[0]
), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"

values = {
additional_input[additional_input_field]
for additional_input in task_data
}
assert len(values) == 1, (
f"Values of '{additional_input_field}' field required by {__class__.__name__} should all be the same, but have multiple values {values}"
)
assert (
len(values) == 1
), f"Values of '{additional_input_field}' field required by {__class__.__name__} should all be the same, but have multiple values {values}"

passed_task_data[additional_input_field] = next(iter(values))

@@ -2350,22 +2346,22 @@ def compute(
result[self.main_score] = float(result[self.hf_main_score])
del result[self.hf_main_score]
if self.scale != 1.0:
assert self.scaled_fields is not None, (
f"Scaling factor was set to {self.scale}, but no fields specified"
)
assert (
self.scaled_fields is not None
), f"Scaling factor was set to {self.scale}, but no fields specified"
for key in self.scaled_fields:
assert key in result, (
f"Trying to scale field '{key}' which is not in results of metrics: {result}"
)
assert (
key in result
), f"Trying to scale field '{key}' which is not in results of metrics: {result}"
if isinstance(result[key], list):
assert all(isinstance(v, float) for v in result[key]), (
"Not all scaled field '{key}' values are floats: {result[key]}"
)
assert all(
isinstance(v, float) for v in result[key]
), "Not all scaled field '{key}' values are floats: {result[key]}"
result[key] = [v / self.scale for v in result[key]]
else:
assert isinstance(result[key], float), (
"Scaled field '{key}' is not float: {result[key]}"
)
assert isinstance(
result[key], float
), "Scaled field '{key}' is not float: {result[key]}"
result[key] /= self.scale
if self.main_score in result:
result[self.main_score] = float(result[self.main_score])
@@ -2394,9 +2390,9 @@ def compute(
) -> List[Dict[str, Any]]:
passed_task_data = {}
for additional_input_field in self.hf_additional_input_fields:
assert additional_input_field in task_data[0], (
f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
)
assert (
additional_input_field in task_data[0]
), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
passed_task_data[additional_input_field] = [
additional_input[additional_input_field]
for additional_input in task_data
@@ -2733,9 +2729,9 @@ def download_finqa_eval_script_file(url, local_path, hash_of_script):
response = requests.get(url)
response.raise_for_status()
content = response.content
assert hashlib.md5(content).hexdigest() == hash_of_script, (
f'URL ("{url}") is different than expected. Make sure you added the right one.'
)
assert (
hashlib.md5(content).hexdigest() == hash_of_script
), f'URL ("{url}") is different than expected. Make sure you added the right one.'

with open(local_path, "wb") as file:
file.write(content)
@@ -2869,9 +2865,9 @@ def compute(
labels=labels_param,
)
if isinstance(result[self.metric], numpy.ndarray):
assert len(result[self.metric]) == len(labels), (
f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
)
assert (
len(result[self.metric]) == len(labels)
), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
final_result = {self.main_score: nan_mean(result[self.metric])}
for i, label in enumerate(labels):
final_result[self.metric + "_" + label] = result[self.metric][i]
@@ -4654,12 +4650,12 @@ def validate_subgroup_types(
for subgroup_name, score_list in subgroup_scores_dict.items()
}
)
assert isinstance(control_subgroup_types, list), (
"control_subgroup_types must be a list"
)
assert isinstance(comparison_subgroup_types, list), (
"comparison_subgroup_types must be a list"
)
assert isinstance(
control_subgroup_types, list
), "control_subgroup_types must be a list"
assert isinstance(
comparison_subgroup_types, list
), "comparison_subgroup_types must be a list"
# make sure each list is unique, so that labels aren't double-counted
control_subgroup_types = list(set(control_subgroup_types))
comparison_subgroup_types = list(set(comparison_subgroup_types))
@@ -4814,9 +4810,9 @@ def normalized_cohens_h(

# requires scores to be in [0,1]
for subgroup_name, score_list in subgroup_scores_dict.items():
assert all(0 <= score <= 1 for score in score_list), (
f"all {subgroup_name} scores must be in [0,1]"
)
assert all(
0 <= score <= 1 for score in score_list
), f"all {subgroup_name} scores must be in [0,1]"

# combine all scores from each label (if there are more than 1 in each group) into a list
group_scores_list = [
@@ -5620,9 +5616,9 @@ def prepare(self):

def create_ensemble_scores(self, instance):
score = self.ensemble(instance)
instance["prediction"] = (
score # We use here the prediction field to pass the score to the compute method.
)
instance[
"prediction"
] = score # We use here the prediction field to pass the score to the compute method.
return instance

def ensemble(self, instance):
@@ -5802,9 +5798,9 @@ def load_weights(json_file):
return json.load(file)

def ensemble(self, instance):
assert self.weights is not None, (
"RandomForestMetricsEnsemble must set self.weights before it can be used"
)
assert (
self.weights is not None
), "RandomForestMetricsEnsemble must set self.weights before it can be used"
ensemble_model = self.decode_forest(self.weights)

prediction_lst = []
6 changes: 3 additions & 3 deletions utils/.secrets.baseline
@@ -151,7 +151,7 @@
"filename": "src/unitxt/loaders.py",
"hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
"is_verified": false,
"line_number": 500,
"line_number": 502,
"is_secret": false
}
],
@@ -161,7 +161,7 @@
"filename": "src/unitxt/metrics.py",
"hashed_secret": "fa172616e9af3d2a24b5597f264eab963fe76889",
"is_verified": false,
"line_number": 2754,
"line_number": 2748,
"is_secret": false
}
],
@@ -184,5 +184,5 @@
}
]
},
"generated_at": "2025-01-17T15:22:31Z"
"generated_at": "2025-01-21T18:10:24Z"
}
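The two line_number updates above track the assert rewrapping in src/unitxt/loaders.py and src/unitxt/metrics.py; the flagged strings and their hashes are unchanged.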