Commit
changes in dummy implementation
MarleneKress79789 committed Dec 16, 2024
1 parent dd959ff commit c2c803a
Showing 6 changed files with 517 additions and 524 deletions.
907 changes: 424 additions & 483 deletions poetry.lock

Large diffs are not rendered by default.

28 changes: 9 additions & 19 deletions tests/unit_tests/udf_wrapper_params/base_udf/error_not_cached_multiple_model_multiple_batch.py
@@ -1,13 +1,13 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
sub_dir, text_docid, text_start, text_end, agg_strategy_simple, make_model_output_for_one_input_row, make_number_of_strings, text_data
from tests.unit_tests.udf_wrapper_params.base_udf.make_data_row_functions import make_input_row, \
make_output_row, bucketfs_conn, \
sub_dir, make_model_output_for_one_input_row, make_number_of_strings


class ErrorNotCachedMultipleModelMultipleBatch:
"""
not cached error, multiple model, multiple batch
not cached error, multiple model, multiple batch todo
"""
expected_model_counter = 1
batch_size = 3
@@ -16,27 +16,17 @@ class ErrorNotCachedMultipleModelMultipleBatch:

bfs_conn1, bfs_conn2 = make_number_of_strings(bucketfs_conn, 2)
subdir1, subdir2 = make_number_of_strings(sub_dir, 2)
text1, text2 = make_number_of_strings(text_data, 2)

input_data = make_input_row(bucketfs_conn=bfs_conn1, sub_dir=subdir1, text_data=text1) * data_size + \
input_data = make_input_row(bucketfs_conn=bfs_conn1, sub_dir=subdir1,) * data_size + \
make_input_row(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2) * data_size
model_name="non_existing_model",) * data_size

output_data = make_output_row(bucketfs_conn=bfs_conn1, sub_dir=subdir1, text_data=text1) * n_entities * data_size + \
output_data = make_output_row(bucketfs_conn=bfs_conn1, sub_dir=subdir1, ) * n_entities * data_size + \
make_output_row(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2,
score=None, start=None, end=None, word=None, entity=None,
model_name="non_existing_model",
score=None, answer=None,
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1,text_data=text1) * data_size + \
make_input_row_with_span(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2) * data_size

work_with_span_output_data1 = make_output_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1) * n_entities * data_size
work_with_span_output_data2 = [(bfs_conn2, subdir2, "non_existing_model", text_docid, text_start, text_end, agg_strategy_simple,
None, None, None, text_docid, None, None,"Traceback")] * 1 * data_size #error on load_model -> only one output per input
work_with_span_output_data = work_with_span_output_data1 + work_with_span_output_data2

tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=n_entities) * data_size]
tokenizer_model_output_df_model2_batch1 = [] # no model loaded so no model to output anything
tokenizer_model_output_df_model2_batch2 = [] # no model loaded so no model to output anything
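
These parameter classes lean on the row helpers returning one-element lists of tuples, so Python list multiplication repeats a row once per batch entry. A minimal sketch of that mechanic, assuming the defaults defined in make_data_row_functions.py (new in this commit, shown further below):

from tests.unit_tests.udf_wrapper_params.base_udf.make_data_row_functions import make_input_row

# make_input_row returns a one-element list, so "* data_size" repeats the tuple.
input_data = make_input_row(bucketfs_conn="bfs_conn0") * 3
# input_data == [(None, "model", "sub_dir", "bfs_conn0", None)] * 3
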
18 changes: 6 additions & 12 deletions tests/unit_tests/udf_wrapper_params/base_udf/error_not_cached_single_model_multiple_batch.py
@@ -1,8 +1,8 @@
from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection
from tests.unit_tests.udf_wrapper_params.token_classification.make_data_row_functions import make_input_row, \
make_output_row, make_input_row_with_span, make_output_row_with_span, bucketfs_conn, \
text_docid, text_start, text_end, agg_strategy_simple, sub_dir, model_name, make_model_output_for_one_input_row, make_number_of_strings
from tests.unit_tests.udf_wrapper_params.base_udf.make_data_row_functions import make_input_row, \
make_output_row, bucketfs_conn, \
sub_dir, model_name, make_model_output_for_one_input_row, make_number_of_strings


class ErrorNotCachedSingleModelMultipleBatch:
@@ -16,21 +16,15 @@ class ErrorNotCachedSingleModelMultipleBatch:

input_data = make_input_row(model_name="non_existing_model") * data_size
output_data = make_output_row(model_name="non_existing_model",
score=None, start=None, end=None, word=None, entity=None,
score=None, answer=None,
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(model_name="non_existing_model") * data_size
work_with_span_output_data = [(bucketfs_conn, sub_dir, "non_existing_model",
text_docid, text_start, text_end, agg_strategy_simple,
None, None, None, text_docid, None, None,
"Traceback")] * 1 * data_size #error on load_model -> only one output per input

tokenizer_models_output_df = [] # no model loaded so no model to output anything

tmpdir_name = "_".join(("/tmpdir", __qualname__))
base_cache_dir1 = PurePosixPath(tmpdir_name, "bfs_conn1")
base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
bfs_connections = {
"bfs_conn1": Connection(address=f"file://{base_cache_dir1}")
bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")
}


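Replacing the literal "bfs_conn1" with the shared bucketfs_conn default keeps the connection map keyed to the same name the input rows use. A short sketch of the resolved mapping, assuming the module defaults from make_data_row_functions.py (the tmpdir value follows from __qualname__ and is illustrative):

from pathlib import PurePosixPath
from exasol_udf_mock_python.connection import Connection

bucketfs_conn = "bfs_conn"  # module default from make_data_row_functions.py
tmpdir_name = "_".join(("/tmpdir", "ErrorNotCachedSingleModelMultipleBatch"))
base_cache_dir1 = PurePosixPath(tmpdir_name, bucketfs_conn)
bfs_connections = {bucketfs_conn: Connection(address=f"file://{base_cache_dir1}")}
# -> {"bfs_conn": Connection(address="file:///tmpdir_ErrorNotCachedSingleModelMultipleBatch/bfs_conn")}
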
51 changes: 51 additions & 0 deletions tests/unit_tests/udf_wrapper_params/base_udf/make_data_row_functions.py
@@ -0,0 +1,51 @@
# default values, used for input/output rows if no other params are given
device_id=None
bucketfs_conn="bfs_conn"
sub_dir="sub_dir"
model_name="model"
token_conn=None

answer="answer"
score=0.1
error_msg = None

def make_input_row(device_id=device_id, model_name=model_name, sub_dir=sub_dir,
bucketfs_conn=bucketfs_conn, token_conn=token_conn):
"""
Creates an input row for the base UDF tests as a list,
using default values for all parameters that are not specified.
"""
return [(device_id, model_name, sub_dir, bucketfs_conn, token_conn)]

def make_output_row(model_name=model_name, sub_dir=sub_dir,
bucketfs_conn=bucketfs_conn, token_conn=token_conn,
answer=answer, score=score, error_msg=error_msg):
"""
Creates an output row for the base UDF tests as a list,
using default values for all parameters that are not specified.
The row carries the prediction fields "answer" and "score"
plus an error_msg field for failed rows.
"""
return [( model_name, sub_dir, bucketfs_conn, token_conn, answer, score, error_msg)]

def make_model_output_for_one_input_row(number_entities:int, answer=answer, score=score):
"""
Makes the output the model returns to the udf for one input row.
Returns a list of number_entities copies of the model output row;
each model output row is a dictionary with "answer" and "score" keys.
"""
model_output_single_entities = {'answer': answer, 'score': score}

return [[model_output_single_entities] * number_entities] #todo test where this is not in list?


def make_number_of_strings(input_str: str, desired_number: int):#todo move this to utils?
"""
Returns a generator of desired_number strings "<input_str>X", where X counts from 0 to desired_number - 1.
"""
return (input_str + f"{i}" for i in range(desired_number))

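A hedged usage sketch of these helpers (values follow from the module defaults above; note that make_number_of_strings returns a generator, so unpacking consumes it):

bfs_conn1, bfs_conn2 = make_number_of_strings("bfs_conn", 2)
# bfs_conn1 == "bfs_conn0", bfs_conn2 == "bfs_conn1"

ok_rows = make_output_row(bucketfs_conn=bfs_conn1) * 2
# == [("model", "sub_dir", "bfs_conn0", None, "answer", 0.1, None)] * 2

error_row = make_output_row(score=None, answer=None, error_msg="Traceback")
# prediction fields None-ed out, only the error message filled

model_output = make_model_output_for_one_input_row(number_entities=2)
# == [[{'answer': 'answer', 'score': 0.1}, {'answer': 'answer', 'score': 0.1}]]
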
7 changes: 4 additions & 3 deletions tests/unit_tests/udfs/base_model_dummy_implementation.py
@@ -14,7 +14,7 @@ def __init__(self,
base_model=transformers.AutoModel,
tokenizer=transformers.AutoTokenizer):
super().__init__(exa, batch_size, pipeline, base_model,
tokenizer, 'dummy_task')
tokenizer, task_type='dummy_task')
self._desired_fields_in_prediction = ["answer", "score"]
self.new_columns = ["answer", "score", "error_message"]

@@ -24,8 +24,9 @@ def extract_unique_param_based_dataframes(

def execute_prediction(self, model_df: pd.DataFrame) -> \
List[Union[Dict[str, Any], List[Dict[str, Any]]]]:
dummy_result = [{"answer": True, "score": "1"}]
return dummy_result
dummy_result = [{"answer": True, "score": "1"}]#todo this should return model.from_pretrained
results = self.last_created_pipeline()
return results

def append_predictions_to_input_dataframe(
self, model_df: pd.DataFrame, pred_df_list: List[pd.DataFrame]) \
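
With this change execute_prediction delegates to the pipeline object instead of returning the hard-coded dummy_result, which is now dead code behind the todo. A minimal sketch of the call under test — the no-argument FakePipeline is an assumption standing in for whatever the mocked pipeline factory installs as last_created_pipeline:

class FakePipeline:
    # Hypothetical stand-in for the pipeline created by the mocked factory.
    def __call__(self):
        return [{"answer": "answer", "score": 0.1}]

pipeline = FakePipeline()
results = pipeline()  # shape of what execute_prediction now returns
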
30 changes: 23 additions & 7 deletions tests/unit_tests/udfs/test_base_udf.py
@@ -7,6 +7,12 @@
from exasol_udf_mock_python.connection import Connection
from exasol_udf_mock_python.mock_meta_data import MockMetaData

from tests.unit_tests.udf_wrapper_params.base_udf.error_not_cached_multiple_model_multiple_batch import \
ErrorNotCachedMultipleModelMultipleBatch
from tests.unit_tests.udf_wrapper_params.base_udf.error_not_cached_single_model_multiple_batch import \
ErrorNotCachedSingleModelMultipleBatch
from tests.unit_tests.udfs.test_token_classification import assert_result_matches_expected_output, \
assert_correct_number_of_results
from tests.unit_tests.utils_for_udf_tests import create_mock_exa_environment, create_mock_udf_context, \
create_mock_exa_environment_with_token_con, create_base_mock_model_factories, \
create_mock_model_factories_with_models, create_mock_pipeline_factory
@@ -29,6 +35,7 @@ def __repr__(self):

def create_mock_metadata() -> MockMetaData:
meta = MockMetaData(
script_code_wrapper_function=None,
input_type="SET",
input_columns=[
Column("device_id", int, "INTEGER"),
@@ -77,10 +84,9 @@ def setup_base_udf_tests_and_run(bfs_connections, input_data, number_of_intended
mock_base_model_factory, mock_tokenizer_factory = create_mock_model_factories_with_models(number_of_intended_used_models)
mock_meta = create_mock_metadata()
mock_exa = create_mock_exa_environment(mock_meta, bfs_connections)

mock_pipeline = create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_used_models)
mock_pipeline_factory = create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_used_models)
mock_ctx = create_mock_udf_context(input_data, mock_meta)
res = run_test(mock_exa, mock_base_model_factory, mock_tokenizer_factory, mock_pipeline, mock_ctx)
res = run_test(mock_exa, mock_base_model_factory, mock_tokenizer_factory, mock_pipeline_factory, mock_ctx)
return res, mock_meta

def setup_model_loader_tests_and_run(bucketfs_conn_name, bucketfs_conn, input_data):#todo do we need?
@@ -98,7 +104,10 @@ def setup_model_loader_tests_and_run(bucketfs_conn_name, bucketfs_conn, input_data
res = run_test(mock_exa, mock_base_model_factory, mock_tokenizer_factory, mock_pipeline, mock_ctx)
return res, mock_meta


@pytest.mark.parametrize("params", [
ErrorNotCachedSingleModelMultipleBatch,
ErrorNotCachedMultipleModelMultipleBatch
])
@patch('exasol.python_extension_common.connections.bucketfs_location.create_bucketfs_location_from_conn_object')
@patch('exasol_transformers_extension.utils.bucketfs_operations.get_local_bucketfs_path')
def test_base_model_udf(mock_local_path, mock_create_loc, params):
@@ -110,15 +119,22 @@ def test_base_model_udf(mock_local_path, mock_create_loc, params):
bfs_connections = params.bfs_connections
expected_model_counter = params.expected_model_counter
tokenizer_models_output_df = params.tokenizer_models_output_df
tokenizer_model_output_df_model1 = params.tokenizer_model_output_df_model1
print(tokenizer_model_output_df_model1)
print("___________________________")
#batch_size = params.batch_size
expected_output_data = params.output_data

res, mock_meta = setup_base_udf_tests_and_run(bfs_connections, input_data,
expected_model_counter,
tokenizer_models_output_df)
# check if no errors
# todo better asserts
assert res[0][-1] is None and len(res[0]) == len(mock_meta.output_columns)
print(res)

#todo move these out of token class tests
assert_correct_number_of_results(res, mock_meta.output_columns, expected_output_data)
assert_result_matches_expected_output(res, expected_output_data, mock_meta.input_columns)
#assert len(mock_pipeline_factory.mock_calls) == expected_model_counter


@pytest.mark.parametrize(["description", "bucketfs_conn_name", "bucketfs_conn",
"sub_dir", "model_name"], [
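
Unrolled, one parametrized case wires a param class through the new setup helper and the asserts borrowed from the token-classification tests; a sketch using only names visible in this diff:

params = ErrorNotCachedSingleModelMultipleBatch
res, mock_meta = setup_base_udf_tests_and_run(params.bfs_connections,
                                              params.input_data,
                                              params.expected_model_counter,
                                              params.tokenizer_models_output_df)
assert_correct_number_of_results(res, mock_meta.output_columns, params.output_data)
assert_result_matches_expected_output(res, params.output_data, mock_meta.input_columns)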
