Skip to content

Commit

Permalink
changes from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
MarleneKress79789 committed Nov 20, 2024
1 parent 1763436 commit f234187
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ class ErrorNotCachedMultipleModelMultipleBatch:
make_output_row(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2,
score=None, start=None, end=None, word=None, entity=None,
error_msg="Traceback") * 1 * data_size#error on load_model -> only one output per input
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1,text_data=text1) * data_size + \
make_input_row_with_span(bucketfs_conn=bfs_conn2, sub_dir=subdir2,
model_name="non_existing_model",text_data=text2) * data_size

work_with_span_output_data1 = make_output_row_with_span(bucketfs_conn=bfs_conn1, sub_dir=subdir1) * n_entities * data_size
work_with_span_output_data2 = [(bfs_conn2, subdir2, "non_existing_model", text_docid, text_start, text_end, agg_strategy_simple,
None, None, None, text_docid, None, None,"Traceback")] * 1 * data_size#error on load_model -> only one output per input
None, None, None, text_docid, None, None,"Traceback")] * 1 * data_size #error on load_model -> only one output per input
work_with_span_output_data = work_with_span_output_data1 + work_with_span_output_data2

tokenizer_model_output_df_model1 = [make_model_output_for_one_input_row(number_entities=n_entities) * data_size]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class ErrorNotCachedSingleModelMultipleBatch:
input_data = make_input_row(model_name="non_existing_model") * data_size
output_data = make_output_row(model_name="non_existing_model",
score=None, start=None, end=None, word=None, entity=None,
error_msg="Traceback") * 1 * data_size#error on load_model -> only one output per input
error_msg="Traceback") * 1 * data_size #error on load_model -> only one output per input

work_with_span_input_data = make_input_row_with_span(model_name="non_existing_model") * data_size
work_with_span_output_data = [(bucketfs_conn, sub_dir, "non_existing_model",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ErrorOnPredictionMultipleModelMultipleBatch:
)]
tokenizer_model_output_df_model2_batch2 = [make_model_output_for_one_input_row(number_entities=1, #error on pred -> only one output per input
score=None, start=None, end=None, word=None, entity_group=None,
)] # do we expect error on prediction to happen once per input or once per entity? per input makes more sense right?
)]

tokenizer_models_output_df = [tokenizer_model_output_df_model1, tokenizer_model_output_df_model2_batch1, tokenizer_model_output_df_model2_batch2]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def make_input_row_with_span(device_id=device_id, bucketfs_conn=bucketfs_conn, s
text_data_char_begin=text_start, text_data_char_end=text_end,
aggregation_strategy=agg_strategy_simple):
"""
creates an input row for token classification with span usage as a list,
Creates an input row for token classification with span usage as a list,
using base params for all params that are not specified.
"""
return [(device_id, bucketfs_conn, sub_dir, model_name,text_data,text_data_docid,
Expand All @@ -75,9 +75,9 @@ def make_output_row_with_span(bucketfs_conn=bucketfs_conn, sub_dir=sub_dir,
entity_covered_text=token, entity_type=entity_type, score=score,
error_msg = error_msg):
"""
creates an output row for token classification with span usage as a list,
Creates an output row for token classification with span usage as a list,
using base params for all params that are not specified.
the found token is called "entity_covered_text" in our non span udf output.
The found token is called "entity_covered_text" in our non span udf output.
"""
entity_char_begin = text_start + token_start
entity_char_end = text_start + token_end
Expand All @@ -89,7 +89,7 @@ def make_output_row_with_span(bucketfs_conn=bucketfs_conn, sub_dir=sub_dir,

def make_number_of_strings(input_str: str, desired_number: int):
    """
    Yield ``desired_number`` numbered strings: ``input_str0``, ``input_str1``,
    ... up to ``input_str{desired_number - 1}``.
    """
    for suffix in range(desired_number):
        yield f"{input_str}{suffix}"

3 changes: 1 addition & 2 deletions tests/unit_tests/udfs/test_token_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def create_mock_metadata():
)
return meta

# todo these functions should be reusable for the other unit tests. should we move them to a utils file or something?
def create_db_mocks(bfs_connection, model_input_data, mock_meta):
mock_ctx = StandaloneMockContext(inp=model_input_data, metadata=mock_meta)
mock_exa = MockExaEnvironment(
Expand Down Expand Up @@ -155,7 +154,7 @@ def create_mock_pipeline_factory(tokenizer_models_output_df, number_of_intended_
"""
mock_pipeline: List[Union[AutoModel, MagicMock]] = [
create_autospec(Pipeline, side_effect=tokenizer_models_output_df[i]) if tokenizer_models_output_df[i][0][0][0]["word"]
else [Exception("Traceback mock_pipeline is throwing an error intentionally")] # todo we could probably put this exception into the tokenizer_models_output_df in the params files instead
else [Exception("Traceback mock_pipeline is throwing an error intentionally")]
for i in range(0, number_of_intended_used_models)
]

Expand Down

0 comments on commit f234187

Please sign in to comment.