Skip to content

Commit

Permalink
Bug: #275 fix error on no token found (#277)
Browse files Browse the repository at this point in the history
* added new test cases

* moved exception throw in model to test params, udf now drops input rows with empty result set, better error message

* updated type hints and made sure empty results are dealt with correctly
  • Loading branch information
MarleneKress79789 authored Dec 5, 2024
1 parent 4091c57 commit c1bf935
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 28 deletions.
1 change: 1 addition & 0 deletions doc/changes/changes_2.2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ n/a
### Bugs

- #272: Fixed unit tests assertions not working correctly
- #275: Fixed a bug where models returning unexpected results were not handled correctly

### Documentation

Expand Down
1 change: 1 addition & 0 deletions doc/user_guide/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ SELECT TE_TOKEN_CLASSIFICATION_UDF(
The inference results are presented with _START_POS_ indicating the index of the starting character of the token,
_END_POS_ indicating the index of the ending character of the token, _WORD_ indicating the token, predicted _ENTITY_, and
confidence _SCORE_ columns, combined with the inputs used when calling this UDF.
In case the model returns an empty result for an input row, the row is dropped entirely and is not part of the result set.
In case of any error during model loading or prediction, these new
columns are set to `null`, and column _ERROR_MESSAGE_ is set
to the stacktrace of the error. For example:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ def extract_unique_param_based_dataframes(

yield param_based_model_df

def execute_prediction(self, model_df: pd.DataFrame) -> List[Union[
Dict[str, Any], List[Dict[str, Any]]]]:
def execute_prediction(self, model_df: pd.DataFrame) -> List[List[Dict[str, Any]]]:
"""
Predict the given text list using recently loaded models, return
probability scores, entities and associated words
Expand All @@ -66,6 +65,7 @@ def execute_prediction(self, model_df: pd.DataFrame) -> List[Union[
aggregation_strategy = model_df['aggregation_strategy'].iloc[0]
results = self.last_created_pipeline(
text_data, aggregation_strategy=aggregation_strategy)

results = results if type(results[0]) == list else [results]

if aggregation_strategy == "none":
Expand Down Expand Up @@ -120,7 +120,8 @@ def append_predictions_to_input_dataframe(

# Concat predictions and model_df
pred_df = pd.concat(pred_df_list, axis=0).reset_index(drop=True)
model_df = pd.concat([model_df, pred_df], axis=1)
model_df = pd.concat([model_df, pred_df], axis=1, join='inner') # join='inner' -> drop rows where results are empty

if self.work_with_spans:
model_df = self.create_new_span_columns(model_df)
model_df[["entity_docid", "entity_char_begin", "entity_char_end"]] =\
Expand All @@ -129,8 +130,7 @@ def append_predictions_to_input_dataframe(
return model_df

def create_dataframes_from_predictions(
self, predictions: List[Union[
Dict[str, Any], List[Dict[str, Any]]]]) -> List[pd.DataFrame]:
self, predictions: List[List[Dict[str, Any]]]) -> List[pd.DataFrame]:
"""
Convert predictions to dataframe. Only score and answer fields are
presented.
Expand All @@ -141,12 +141,24 @@ def create_dataframes_from_predictions(
"""
results_df_list = []
for result in predictions:
result_df = pd.DataFrame(result)
result_df = result_df[self._desired_fields_in_prediction].rename(
columns={
"start": "start_pos",
"end": "end_pos",
"entity_group": "entity"})
if result and result[0]:
result_df = pd.DataFrame(result)
# need to save before trying to rename, otherwise they get lost and cant be printed in error message
result_df_column_names = result_df.columns
try:
result_df = result_df[self._desired_fields_in_prediction].rename(
columns={
"start": "start_pos",
"end": "end_pos",
"entity_group": "entity"})
except KeyError as e:
# adding more detailed error message
raise KeyError(f"Some expected column was not found in prediction results. "
f"Expected columns are: {self._desired_fields_in_prediction}. "
f"Prediction results contain columns: {result_df_column_names}") from e
else:
# if the result for an input is empty, just append an empty result df, and the input will be dropped during concatenation
result_df = pd.DataFrame({})
results_df_list.append(result_df)

return results_df_list
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch:
work_with_span_output_data = work_with_span_output_data1 + work_with_span_output_data2

tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=n_entities) * data_size]
tokenizer_model_output_df_model2_batch1 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
score=None, start=None, end=None, word=None, entity_group=None,
)]
tokenizer_model_output_df_model2_batch2 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
score=None, start=None, end=None, word=None, entity_group=None,
)]
tokenizer_model_output_df_model2_batch1 = [[Exception("Traceback mock_pipeline is throwing an error intentionally")]] #error on pred -> only one output per input

tokenizer_model_output_df_model2_batch2 = [[Exception("Traceback mock_pipeline is throwing an error intentionally")]]#error on pred -> only one output per input

tokenizer_models_output_df = [tokenizer_model_output_df_model1, tokenizer_model_output_df_model2_batch1, tokenizer_model_output_df_model2_batch2]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ class ErrorOnPredictionSingleModelMultipleBatch:

number_complete_batches = data_size // batch_size
number_remaining_data_entries_in_last_batch = data_size % batch_size
tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
score=None, start=None, end=None, word=None, entity_group=None) * batch_size] * \
tokenizer_model_output_df_model1 = [[Exception("Traceback mock_pipeline is throwing an error intentionally")] #error on pred -> only one output per input
* batch_size] * \
number_complete_batches + \
[make_model_output_for_one_input_row(number_entities=1,
score=None, start=None, end=None, word=None, entity_group=None,
) * number_remaining_data_entries_in_last_batch]
[[Exception("Traceback mock_pipeline is throwing an error intentionally")] * number_remaining_data_entries_in_last_batch]
tokenizer_models_output_df = [tokenizer_model_output_df_model1]

tmpdir_name = "_".join(("/tmpdir", __qualname__))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class ErrorPredictionOnlyContainsUnknownFields:
    """
    Test parameters: the mocked model output contains only unexpected keys.

    The expected UDF output has exactly one row per input row, with every
    prediction column set to None and an error message ("Traceback").
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 5
    n_entities = 3

    text_data = "error_result_contains_only_unknown fields"

    input_data = make_input_row(text_data=text_data) * data_size
    # Error case: only one output row per input row.
    output_data = make_output_row(
        text_data=text_data,
        score=None, start=None, end=None, word=None, entity=None,
        error_msg="Traceback",
    ) * data_size

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    # Error case: only one output row per input row.
    work_with_span_output_data = [
        (
            bucketfs_conn, sub_dir, model_name,
            text_docid, text_start, text_end, agg_strategy_simple,
            None, None, None, None, None, None,
            "Traceback",
        )
    ] * data_size

    # Split the inputs into full batches plus one (possibly shorter) last batch.
    number_complete_batches, number_remaining_data_entries_in_last_batch = divmod(
        data_size, batch_size)

    # Mocked model output: every entity dict carries only unknown keys.
    model_output_row_wrong_keys = [
        [{"unknown key": "some value", "diff unknown key": entity_index}]
        for entity_index in range(n_entities)
    ]
    tokenizer_model_output_df_model1 = (
        [model_output_row_wrong_keys * batch_size] * number_complete_batches
        + [model_output_row_wrong_keys * number_remaining_data_entries_in_last_batch]
    )
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}"),
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class ErrorPredictionMissingExpectedFields:
    """
    Test parameters: the mocked model output is missing the column "score".

    An error message about the missing column is returned, with empty output
    columns. Existing output columns are dropped for all rows where one
    output column is missing.
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 5
    n_entities = 3

    # The label names the field that is actually removed below ("score").
    text_data = "error_result_missing_field_'score'"

    input_data = make_input_row(text_data=text_data) * data_size
    # Error case: only one output row per input row.
    output_data = make_output_row(text_data=text_data, score=None,
                                  start=None, end=None, word=None, entity=None,
                                  error_msg="Traceback") * data_size

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    # Error case: only one output row per input row.
    # NOTE(review): the fourth prediction column is expected to hold
    # text_docid rather than None - presumably the entity doc id; confirm.
    work_with_span_output_data = [(bucketfs_conn, sub_dir, model_name,
                                   text_docid, text_start, text_end, agg_strategy_simple,
                                   None, None, None, text_docid, None, None,
                                   "Traceback")] * data_size

    number_complete_batches = data_size // batch_size
    number_remaining_data_entries_in_last_batch = data_size % batch_size

    model_output_rows = make_model_output_for_one_input_row(number_entities=n_entities)
    # Build copies of every entity dict without "score". This does not depend
    # on the entities of a row sharing one dict object, does not mutate the
    # helper's return value, and does not leak loop variables into the class
    # namespace (only the outermost iterable is evaluated in class scope).
    model_output_row_missing_key = [
        [{key: value for key, value in entity.items() if key != "score"}
         for entity in row]
        for row in model_output_rows
    ]

    tokenizer_model_output_df_model1 = [model_output_row_missing_key * batch_size] * \
                                       number_complete_batches + \
                                       [model_output_row_missing_key * number_remaining_data_entries_in_last_batch]
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name

class PredictionContainsAdditionalFields:
    """
    Test parameters: the mocked model output contains additional,
    unrecognized columns.

    These are ignored and the expected columns are returned as normal.
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 2
    n_entities = 3

    text_data = "result contains additional keys"

    # TODO: these are not filled out (note carried over from the original
    # author; unclear whether it still applies).
    input_data = make_input_row(text_data=text_data) * data_size
    output_data = make_output_row(text_data=text_data) * n_entities * data_size

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    work_with_span_output_data = make_output_row_with_span() * n_entities * data_size

    # Add the unknown keys to a copy of EVERY entity dict, so the result does
    # not depend on the entities of a row sharing one dict object, and no loop
    # variable leaks into the class namespace (only the outermost iterable of
    # the comprehension is evaluated in class scope).
    model_output_rows = [
        [{**entity, "unknown key": "some value", "diff unknown key": 1}
         for entity in row]
        for row in make_model_output_for_one_input_row(number_entities=n_entities)
    ]

    tokenizer_model_output_df_model1 = [model_output_rows * data_size]
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class PredictionReturnsEmptyResult:
    """
    Test parameters: the mocked model returns an empty result for some inputs.

    The respective input rows should be dropped and the remaining output
    returned normally. Several different formats of "empty" are exercised.
    """
    expected_model_counter = 1
    batch_size = 6
    data_size = 1
    n_entities = 3

    text_data = "error_result_empty"

    # Six input groups: the first and the last receive a regular model result,
    # the four in between receive one of the empty result formats below.
    input_data = (
        make_input_row() * data_size
        + make_input_row(text_data=text_data) * data_size
        + make_input_row(text_data=text_data) * data_size
        + make_input_row(text_data=text_data) * data_size
        + make_input_row(text_data=text_data) * data_size
        + make_input_row() * data_size
    )
    # Results of inputs #2 to #5 are empty, so those rows do not appear in the output.
    output_data = (
        make_output_row() * n_entities * data_size
        + make_output_row() * n_entities * data_size
    )

    work_with_span_input_data = (
        make_input_row_with_span() * data_size
        + make_input_row_with_span(text_data=text_data) * data_size
        + make_input_row_with_span(text_data=text_data) * data_size
        + make_input_row_with_span(text_data=text_data) * data_size
        + make_input_row_with_span(text_data=text_data) * data_size
        + make_input_row_with_span() * data_size
    )
    # Results of inputs #2 to #5 are empty, so those rows do not appear in the output.
    work_with_span_output_data = (
        make_output_row_with_span() * n_entities * data_size
        + make_output_row_with_span() * n_entities * data_size
    )

    # One batch: a regular result, four flavours of "empty", a regular result.
    tokenizer_model_output_df_model1 = (
        make_model_output_for_one_input_row(number_entities=n_entities) * data_size
        + [[], {}, [[]], [{}]]
        + make_model_output_for_one_input_row(number_entities=n_entities) * data_size
    )

    tokenizer_models_output_df = [[tokenizer_model_output_df_model1]]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}"),
    }
28 changes: 22 additions & 6 deletions tests/unit_tests/udfs/test_token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@

from exasol_transformers_extension.udfs.models.token_classification_udf import TokenClassificationUDF
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_containing_only_unknown_fields import \
ErrorPredictionOnlyContainsUnknownFields
from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_missing_expected_field import \
ErrorPredictionMissingExpectedFields
from tests.unit_tests.udf_wrapper_params.token_classification.prediction_returns_empty_result import \
PredictionReturnsEmptyResult
from tests.unit_tests.udf_wrapper_params.token_classification.prediction_contains_additional_keys import \
PredictionContainsAdditionalFields
from tests.unit_tests.udfs.output_matcher import Output, OutputMatcher
from tests.utils.mock_bucketfs_location import fake_bucketfs_location_from_conn_object, fake_local_bucketfs_path
from tests.utils.mock_cast import mock_cast
Expand Down Expand Up @@ -55,6 +63,7 @@
SingleModelSingleBatchIncomplete



def create_mock_metadata_with_span():
meta = MockMetaData(
script_code_wrapper_function=None,
Expand Down Expand Up @@ -153,10 +162,9 @@ def create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_
This mock_pipeline is feed into a mock_pipeline_factory.
"""
mock_pipeline: List[Union[AutoModel, MagicMock]] = [
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["word"]
else [Exception("Traceback mock_pipeline is throwing an error intentionally")]
for i in range(0, number_of_intended_used_models)
]
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i])
for i in range(0, number_of_intended_used_models)
]

mock_pipeline_factory: Union[Pipeline, MagicMock] = create_autospec(Pipeline,
side_effect=mock_pipeline)
Expand Down Expand Up @@ -198,7 +206,11 @@ def assert_result_matches_expected_output(result, expected_output_data, input_co
ErrorNotCachedSingleModelMultipleBatch,
ErrorNotCachedMultipleModelMultipleBatch,
ErrorOnPredictionMultipleModelMultipleBatch,
ErrorOnPredictionSingleModelMultipleBatch
ErrorOnPredictionSingleModelMultipleBatch,
PredictionReturnsEmptyResult,
ErrorPredictionMissingExpectedFields,
ErrorPredictionOnlyContainsUnknownFields,
PredictionContainsAdditionalFields
])

@patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
Expand Down Expand Up @@ -258,7 +270,11 @@ def test_token_classification_with_span(mock_local_path, mock_create_loc, params
ErrorNotCachedSingleModelMultipleBatch,
ErrorNotCachedMultipleModelMultipleBatch,
ErrorOnPredictionMultipleModelMultipleBatch,
ErrorOnPredictionSingleModelMultipleBatch
ErrorOnPredictionSingleModelMultipleBatch,
PredictionReturnsEmptyResult,
ErrorPredictionMissingExpectedFields,
ErrorPredictionOnlyContainsUnknownFields,
PredictionContainsAdditionalFields
])
@patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
@patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path')
Expand Down

0 comments on commit c1bf935

Please sign in to comment.