Skip to content

Commit

Permalink
added new test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
MarleneKress79789 committed Nov 19, 2024
1 parent e4c85a6 commit 6f37648
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/changes/changes_2.2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ n/a
### Bugs

- #272: Fixed unit tests assertions not working correctly
- #275: Fixed a bug where models returning unexpected results was not handled correctly

### Documentation

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class ErrorPredictionOnlyContainsUnknownFields:
    """
    Test parameters for the case where every entity dict returned by the
    model contains only unrecognized keys. The UDF is expected to emit error
    rows with all prediction columns set to None and a "Traceback" message.
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 5
    n_entities = 3

    text_data = "error_result_contains_only_unknown fields"

    input_data = make_input_row(text_data=text_data) * data_size
    # All prediction columns are None because none of the model's keys were usable.
    output_data = make_output_row(
        text_data=text_data,
        score=None, start=None, end=None, word=None, entity=None,
        error_msg="Traceback",
    ) * (n_entities * data_size)

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    work_with_span_output_data = [
        (bucketfs_conn, sub_dir, model_name,
         text_docid, text_start, text_end, agg_strategy_simple,
         None, None, None, None, None, None,
         "Traceback")
    ] * (n_entities * data_size)

    number_complete_batches = data_size // batch_size
    number_remaining_data_entries_in_last_batch = data_size % batch_size
    # One entity dict per entity, each carrying only unknown keys.
    model_output_row_wrong_keys = [
        [{"unknown key": "some value", "diff unknown key": i}] for i in range(n_entities)
    ]
    tokenizer_model_output_df_model1 = (
        [model_output_row_wrong_keys * batch_size] * number_complete_batches
        + [model_output_row_wrong_keys * number_remaining_data_entries_in_last_batch]
    )
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = f"/tmpdir_{__qualname__}"
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class ErrorPredictionMissingExpectedFields:
    """
    Test parameters for the case where each entity dict returned by the model
    is missing an expected field ("score"). The UDF is expected to emit error
    rows with score=None and a "Traceback" message.
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 5
    n_entities = 3

    # NOTE(review): the name mentions 'word' but the field removed below is
    # 'score' — consider renaming the test input string for clarity.
    text_data = "error_result_missing_field_'word'" #todo do we want tests for different combinations? seems like a lot of work
    # todo do we want tests for multiple models? multiple inputs where one works and one does not? how many test cases are too many test cases?
    # todo these should be moved to the base model tests together with the others

    input_data = make_input_row(text_data=text_data) * data_size
    output_data = make_output_row(text_data=text_data, score=None, error_msg="Traceback") * n_entities * data_size

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    work_with_span_output_data = make_output_row_with_span(score=None,
                                                           error_msg="Traceback") * n_entities * data_size

    number_complete_batches = data_size // batch_size
    number_remaining_data_entries_in_last_batch = data_size % batch_size

    # Build model output rows whose entity dicts lack the "score" key.
    # Bug fix: dict.pop("score") returns the popped *value*, so the previous
    # comprehension produced lists of score values instead of dicts with the
    # key removed. Delete the key in place and keep the (mutated) rows.
    model_output_row_missing_key = make_model_output_for_one_input_row(number_entities=n_entities)
    for _row in model_output_row_missing_key:
        del _row[0]["score"]

    tokenizer_model_output_df_model1 = [model_output_row_missing_key * batch_size] * \
                                       number_complete_batches + \
                                       [model_output_row_missing_key * number_remaining_data_entries_in_last_batch]
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name


class PredictionReturnsEmptyResult:
    """
    Test parameters for the case where the model returns an empty result for
    one of the inputs: the affected input produces no output rows at all.
    """
    expected_model_counter = 1
    batch_size = 4
    data_size = 5
    n_entities = 3

    text_data = "error_result_empty"
    # todo throws error but message could be better
    # TODO mention in docu: if the result is empty, the row is not in the output
    input_data = (make_input_row() * data_size
                  + make_input_row(text_data=text_data) * data_size)
    # Result of input #2 is empty, so the row does not appear in the output.
    output_data = make_output_row() * (n_entities * data_size)

    work_with_span_input_data = (make_input_row_with_span() * data_size
                                 + make_input_row_with_span(text_data=text_data) * data_size)
    # Result of input #2 is empty, so the row does not appear in the output.
    work_with_span_output_data = make_output_row_with_span() * (n_entities * data_size)

    # error on pred -> only one output per input
    tokenizer_model_output_df_model1 = make_model_output_for_one_input_row(number_entities=n_entities) * data_size
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = f"/tmpdir_{__qualname__}"
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, sub_dir, model_name

# todo do we want to throw in this case, or just ignore additional results?

class ErrorPredictionContainsAdditionalFields:
    """
    Test parameters for the case where the entity dicts returned by the model
    carry additional, unexpected keys on top of the expected ones. The UDF is
    expected to emit error rows with a "Traceback" message.
    """
    expected_model_counter = 1
    batch_size = 2
    data_size = 2
    n_entities = 3

    text_data = "result contains additional keys"

    # todo these are not filled out
    input_data = make_input_row(text_data=text_data) * data_size
    output_data = make_output_row(text_data=text_data, error_msg="Traceback") * n_entities * data_size

    work_with_span_input_data = make_input_row_with_span(text_data=text_data) * data_size
    work_with_span_output_data = make_output_row_with_span(error_msg="Traceback") * n_entities * data_size

    # Build model output rows that carry extra, unexpected keys.
    # Bug fix: dict.update() returns None, so the previous comprehension
    # produced [None, None, None] instead of the mutated rows. Update the
    # dicts in place and keep the rows themselves.
    model_output_rows = make_model_output_for_one_input_row(number_entities=n_entities)
    for _row in model_output_rows:
        _row[0].update({"unknown key": "some value", "diff unknown key": 1})
    model_output_row_wrong_keys = model_output_rows

    tokenizer_model_output_df_model1 = [model_output_row_wrong_keys * data_size]
    tokenizer_models_output_df = [tokenizer_model_output_df_model1]

    tmpdir_name = "_".join(("/tmpdir", __qualname__))
    base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
    bfs_connections = {
        bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
    }
33 changes: 24 additions & 9 deletions tests/unit_tests/udfs/test_token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@

from exasol_transformers_extension.udfs.models.token_classification_udf import TokenClassificationUDF
from exasol_transformers_extension.utils.model_factory_protocol import ModelFactoryProtocol
from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_containing_only_unknown_fields import \
ErrorPredictionOnlyContainsUnknownFields
from tests.unit_tests.udf_wrapper_params.token_classification.error_prediction_missing_expected_field import \
ErrorPredictionMissingExpectedFields
from tests.unit_tests.udf_wrapper_params.token_classification.prediction_returns_empty_result import \
PredictionReturnsEmptyResult
from tests.unit_tests.udf_wrapper_params.token_classification.result_contains_additional_keys import \
ErrorPredictionContainsAdditionalFields
from tests.unit_tests.udfs.output_matcher import Output, OutputMatcher
from tests.utils.mock_bucketfs_location import fake_bucketfs_location_from_conn_object, fake_local_bucketfs_path
from tests.utils.mock_cast import mock_cast
Expand Down Expand Up @@ -55,14 +63,10 @@
SingleModelSingleBatchIncomplete


def udf_wrapper_empty():
# placeholder to use for MockMetaData creation.
# todo replace with newer version and then delete this
pass

def create_mock_metadata_with_span():
meta = MockMetaData(
script_code_wrapper_function=udf_wrapper_empty,
script_code_wrapper_function=None,
input_type="SET",
input_columns=[
Column("device_id", int, "INTEGER"),
Expand Down Expand Up @@ -97,7 +101,7 @@ def create_mock_metadata_with_span():

def create_mock_metadata():
meta = MockMetaData(
script_code_wrapper_function=udf_wrapper_empty,
script_code_wrapper_function=None,
input_type="SET",
input_columns=[
Column("device_id", int, "INTEGER"),
Expand Down Expand Up @@ -159,7 +163,7 @@ def create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_
This mock_pipeline is feed into a mock_pipeline_factory.
"""
mock_pipeline: List[Union[AutoModel, MagicMock]] = [
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["word"]
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["start"]#todo fix
else [Exception("Traceback mock_pipeline is throwing an error intentionally")] # todo we could probably put this exception into the tokenizer_models_output_df in the params files instead
for i in range(0, number_of_intended_used_models)
]
Expand Down Expand Up @@ -204,7 +208,11 @@ def assert_result_matches_expected_output(result, expected_output_data, input_co
ErrorNotCachedSingleModelMultipleBatch,
ErrorNotCachedMultipleModelMultipleBatch,
ErrorOnPredictionMultipleModelMultipleBatch,
ErrorOnPredictionSingleModelMultipleBatch
ErrorOnPredictionSingleModelMultipleBatch,
PredictionReturnsEmptyResult,
ErrorPredictionMissingExpectedFields,
ErrorPredictionOnlyContainsUnknownFields,
ErrorPredictionContainsAdditionalFields
])

@patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
Expand Down Expand Up @@ -264,7 +272,11 @@ def test_token_classification_with_span(mock_local_path, mock_create_loc, params
ErrorNotCachedSingleModelMultipleBatch,
ErrorNotCachedMultipleModelMultipleBatch,
ErrorOnPredictionMultipleModelMultipleBatch,
ErrorOnPredictionSingleModelMultipleBatch
ErrorOnPredictionSingleModelMultipleBatch,
PredictionReturnsEmptyResult,
ErrorPredictionMissingExpectedFields,
ErrorPredictionOnlyContainsUnknownFields,
ErrorPredictionContainsAdditionalFields
])
@patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
@patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path')
Expand Down Expand Up @@ -296,6 +308,9 @@ def test_token_classification(mock_local_path, mock_create_loc, params):

udf.run(mock_ctx)
result = mock_ctx.output
print(model_input_data)
print("____________________________________________")
print(result)

assert_correct_number_of_results(result, mock_meta.output_columns, expected_output_data)
assert_result_matches_expected_output(result, expected_output_data, mock_meta.input_columns)
Expand Down

0 comments on commit 6f37648

Please sign in to comment.