Skip to content

Commit

Permalink
changes from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
MarleneKress79789 committed Nov 20, 2024
1 parent 1763436 commit f234187
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ class ErrorNotCachedMultipleModelMultipleBatch:
make_output_row(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2,
score=None, start=None, end=None, word=None, entity=None,
error_msg="Traceback") * 1 * data_size#error on load_model -> only one output per input
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1,text_data=text1) * data_size + \
make_input_row_with_span(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2) * data_size

work_with_span_output_data1 = make_output_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1) * n_entities * data_size
work_with_span_output_data2 = [(bfs_conn2, subdir2, "non_existing_model", text_docid, text_start, text_end, agg_strategy_simple,
None, None, None, text_docid, None, None,"Traceback")] * 1 * data_size#error on load_model -> only one output per input
None, None, None, text_docid, None, None,"Traceback")] * 1 * data_size #error on load_model -> only one output per input
work_with_span_output_data = work_with_span_output_data1 + work_with_span_output_data2

tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=n_entities) * data_size]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class ErrorNotCachedSingleModelMultipleBatch:
input_data = make_input_row(model_name="non_existing_model") * data_size
output_data = make_output_row(model_name="non_existing_model",
score=None, start=None, end=None, word=None, entity=None,
error_msg="Traceback") * 1 * data_size#error on load_model -> only one output per input
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(model_name="non_existing_model") * data_size
work_with_span_output_data = [(bucketfs_conn, sub_dir, "non_existing_model",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ErrorOnPredictionMultipleModelMultipleBatch:
)]
tokenizer_model_output_df_model2_batch2 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
score=None, start=None, end=None, word=None, entity_group=None,
)] # do we expect error on prediction to happen once per input or once per entity? per input makes more sense right?
)]

tokenizer_models_output_df = [tokenizer_model_output_df_model1, tokenizer_model_output_df_model2_batch1, tokenizer_model_output_df_model2_batch2]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def make_input_row_with_span(device_id=device_id, bucketfs_conn=bucketfs_conn, s
text_data_char_begin=text_start, text_data_char_end=text_end,
aggregation_strategy=agg_strategy_simple):
"""
creates an input row for token classification with span usage as a list,
Creates an input row for token classification with span usage as a list,
using base params for all params that are not specified.
"""
return [(device_id, bucketfs_conn, sub_dir, model_name,text_data,text_data_docid,
Expand All @@ -75,9 +75,9 @@ def make_output_row_with_span(bucketfs_conn=bucketfs_conn, sub_dir=sub_dir,
entity_covered_text=token, entity_type=entity_type, score=score,
error_msg = error_msg):
"""
creates an output row for token classification with span usage as a list,
Creates an output row for token classification with span usage as a list,
using base params for all params that are not specified.
the found token is called "entity_covered_text" in our non span udf output.
The found token is called "entity_covered_text" in our non span udf output.
"""
entity_char_begin = text_start + token_start
entity_char_end = text_start + token_end
Expand All @@ -89,7 +89,7 @@ def make_output_row_with_span(bucketfs_conn=bucketfs_conn, sub_dir=sub_dir,

def make_number_of_strings(input_str: str, desired_number: int):
    """
    Yield ``desired_number`` numbered strings: ``input_str0``, ``input_str1``,
    ... up to ``input_str{desired_number - 1}``.
    """
    for suffix in range(desired_number):
        yield f"{input_str}{suffix}"

3 changes: 1 addition & 2 deletions tests/unit_tests/udfs/test_token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def create_mock_metadata():
)
return meta

# todo these functions should be reusable for the other unit tests. should we move them to a utils file or something?
def create_db_mocks(bfs_connection, model_input_data, mock_meta):
mock_ctx = StandaloneMockContext(inp=model_input_data, metadata=mock_meta)
mock_exa = MockExaEnvironment(
Expand Down Expand Up @@ -155,7 +154,7 @@ def create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_
"""
mock_pipeline: List[Union[AutoModel, MagicMock]] = [
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["word"]
else [Exception("Traceback mock_pipeline is throwing an error intentionally")] # todo we could probably put this exception into the tokenizer_models_output_df in the params files instead
else [Exception("Traceback mock_pipeline is throwing an error intentionally")]
for i in range(0, number_of_intended_used_models)
]

Expand Down

0 comments on commit f234187

Please sign in to comment.