Skip to content

Commit

Permalink
#269: fixed token class span output (#270)
Browse files Browse the repository at this point in the history
* token_classification_udf with spans now returns input span
* started changing unit tests to use StandAloneUDFMock, and to be easier to maintain
* fixed error in unit tests asserts
* fix failing unit tests wrong assumptions
* version update of pandas and udf_mock_python
  • Loading branch information
MarleneKress79789 authored Nov 21, 2024
1 parent 594bf64 commit 4091c57
Show file tree
Hide file tree
Showing 147 changed files with 1,097 additions and 2,082 deletions.
1 change: 1 addition & 0 deletions doc/changes/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Changelog

* [2.2.0](changes_2.2.0.md)
* [2.1.0](changes_2.1.0.md)
* [2.0.0](changes_2.0.0.md)
* [1.0.1](changes_1.0.1.md)
Expand Down
27 changes: 27 additions & 0 deletions doc/changes/changes_2.2.0.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Transformers Extension 2.2.0, T.B.D

Code name: T.B.D

## Summary

T.B.D

### Features

n/a

### Bugs

- #272: Fixed unit test assertions not working correctly

### Documentation

n/a

### Refactorings

- #273: Refactored unit tests for token_classification_udf to use StandAloneUDFMock, made params files more maintainable

### Security

n/a
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ CREATE OR REPLACE {{ language_alias }} SET SCRIPT "TE_TOKEN_CLASSIFICATION_UDF_W
bucketfs_conn VARCHAR(2000000),
sub_dir VARCHAR(2000000),
model_name VARCHAR(2000000),
text_data_docid INTEGER,
text_data_char_begin INTEGER,
text_data_char_end INTEGER,
aggregation_strategy VARCHAR(2000000),
entity_covered_text VARCHAR(2000000),
entity_type VARCHAR(2000000),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ def __init__(self,
tokenizer, task_type='token-classification',
work_with_spans=work_with_spans)
self._default_aggregation_strategy = 'simple'
#self.work_with_spans = work_with_spans
self._desired_fields_in_prediction = [
"start", "end", "word", "entity", "score"]
self.new_columns = [
Expand Down Expand Up @@ -92,8 +91,7 @@ def create_new_span_columns(self, model_df: pd.DataFrame) -> pd.DataFrame:

def drop_old_data_for_span_execution(self, model_df: pd.DataFrame) -> pd.DataFrame:
# drop columns which are made superfluous by the spans to save data transfer
model_df = model_df.drop(columns=["text_data", "text_data_docid", "text_data_char_begin",
"text_data_char_end", "start_pos", "end_pos"])
model_df = model_df.drop(columns=["text_data", "start_pos", "end_pos"])
return model_df

def make_entity_span(self, df_row):
Expand Down
2 changes: 1 addition & 1 deletion exasol_transformers_extension/utils/load_local_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def load_models(self) -> transformers.pipelines.Pipeline:
loaded_tokenizer = self._tokenizer_factory.from_pretrained(str(self._bucketfs_model_cache_dir))

last_created_pipeline = self.pipeline_factory(
self.task_type,
task=self.task_type,
model=loaded_model,
tokenizer=loaded_tokenizer,
device=self.device,
Expand Down
133 changes: 92 additions & 41 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "exasol-transformers-extension"
version = "2.1.0"
version = "2.2.0"
description = "An Exasol extension for using state-of-the-art pretrained machine learning models via the Hugging Face Transformers API."

authors = [
Expand All @@ -17,7 +17,7 @@ keywords = ['exasol']

[tool.poetry.dependencies]
python = "^3.10.0"
pandas = ">=1.4.2,<2.0.0"
pandas = ">=1.4.2,<3.0.0"
torch = "^2.0.1"
transformers = {extras = ["torch"], version = "^4.36.2"}
Jinja2 = "^3.0.3"
Expand All @@ -31,7 +31,7 @@ exasol-python-extension-common = ">=0.8.0,<1"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
exasol-udf-mock-python = "^0.1.0"
exasol-udf-mock-python = ">=0.3.0"
toml = "^0.10.2"

[tool.poetry.group.dev.dependencies]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ def test_token_classification_script_with_span(
# assertions
assert result[0][-1] is None
# added_columns: entity_covered_text, entity_type, score, entity_docid, entity_char_begin, entity_char_end, error_message
# removed_columns: # device_id, text_data, text_data_docid, text_data_char_begin, text_data_char_end
assert_correct_number_of_results(7, 5,
# removed_columns: device_id, text_data
assert_correct_number_of_results(7, 2,
input_data[0], result, n_rows)
# lenient test for quality of results, will be replaced by deterministic test later
results = [[result[i][4], result[i][5]] for i in range(len(result))]
results = [[result[i][7], result[i][8]] for i in range(len(result))]
assert_lenient_check_of_output_quality(results)
2 changes: 1 addition & 1 deletion tests/integration_tests/without_db/udfs/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _is_rank_monotonic(self, score_rank_df: pd.DataFrame, row: int) -> bool:
return (
score_rank_df[row * self._results_per_row: self._results_per_row + row * self._results_per_row]
.sort_values(by='score', ascending=False)['rank']
.is_monotonic
.is_monotonic_increasing
)

def __eq__(self, other) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ def test_token_classification_udf_with_span(
'bucketfs_conn',
'sub_dir',
'model_name',
"text_data_docid",
"text_data_char_begin",
"text_data_char_end",
'aggregation_strategy'
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ class MockPipeline:
counter = 0

def __init__(self,
task_type: str,
task: str,
model: "MockFillingMaskModel",
tokenizer: MockSequenceTokenizer,
device: str,
framework: str):
self.task_type = task_type
self.task_type = task
self.model = model
self.tokenizer = tokenizer
self.device = device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ class ErrorNotCachedMultipleModelMultipleBatch:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class ErrorNotCachedSingleModelMultipleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class ErrorOnPredictionMultipleModelMultipleBatch:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class ErrorOnPredictionSingleModelMultipleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def __init__(self, answer: str, score: float, rank: int):
self.result = {"answer": answer, "score": score, "rank": rank}

@classmethod
def from_pretrained(cls, model_name, cache_dir, use_auth_token):
def from_pretrained(cls, model_path):
return cls


Expand All @@ -18,22 +18,21 @@ def __init__(self, mock_models: Dict[PurePosixPath,
MockQuestionAnsweringModel]):
self.mock_models = mock_models

def from_pretrained(self, model_name, cache_dir):
# the cache_dir path already has model_name
return self.mock_models[cache_dir]
def from_pretrained(self, model_path):
return self.mock_models[PurePosixPath(model_path)]


class MockPipeline:
ResultDict = NewType("ResultDict", Dict[str, Union[str, float]])
counter = 0

def __init__(self,
task_type: str,
task: str,
model: MockQuestionAnsweringModel,
tokenizer: MockSequenceTokenizer,
device: str,
framework: str):
self.task_type = task_type
self.task_type = task
self.model = model
self.tokenizer = tokenizer
self.device = device
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

class MockSequenceTokenizer:
@classmethod
def from_pretrained(cls, model_name, cache_dir, use_auth_token):
def from_pretrained(cls, model_path):
return cls
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameMultipleBatch:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir2, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleBucketFSConnSingleSubdirSingleModelNameSingleBatch:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir2, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleModelMultipleBatchComplete:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleModelMultipleBatchIncomplete:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ class MultipleModelMultipleBatchMultipleModelsPerBatch:
"bfs_conn4": Connection(address=f"file://{cache_dir4}")}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
PurePosixPath(cache_dir3, "sub_dir3", "model3"):
PurePosixPath(cache_dir3, "sub_dir3", "model3_question-answering"):
MockQuestionAnsweringModel(answer="answer 3", score=0.3, rank=1),
PurePosixPath(cache_dir4, "sub_dir4", "model4"):
PurePosixPath(cache_dir4, "sub_dir4", "model4_question-answering"):
MockQuestionAnsweringModel(answer="answer 4", score=0.4, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleModelSingleBatchComplete:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class MultipleModelSingleBatchIncomplete:
"bfs_conn2": Connection(address=f"file://{base_cache_dir2}")
}
mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir2, "sub_dir2", "model2"):
PurePosixPath(base_cache_dir2, "sub_dir2", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,13 @@ class MultipleTopkMultipleSizeSingleModelNameMultipleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model2"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model3"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model3_question-answering"):
MockQuestionAnsweringModel(answer="answer 3", score=0.3, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model4"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model4_question-answering"):
MockQuestionAnsweringModel(answer="answer 4", score=0.4, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,13 @@ class MultipleTopkMultipleSizeSingleModelNameSingleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model2"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model2_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model3"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model3_question-answering"):
MockQuestionAnsweringModel(answer="answer 3", score=0.3, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir1", "model4"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model4_question-answering"):
MockQuestionAnsweringModel(answer="answer 4", score=0.4, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class MultipleTopkSingleSizeSingleModelNameMultipleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class MultipleTopkSingleSizeSingleModelNameSingleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameMultipleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir2", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir2", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ class SingleBucketFSConnMultipleSubdirSingleModelNameSingleBatch:
}

mock_factory = MockQuestionAnsweringFactory({
PurePosixPath(base_cache_dir1, "sub_dir1", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir1", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 1", score=0.1, rank=1),
PurePosixPath(base_cache_dir1, "sub_dir2", "model1"):
PurePosixPath(base_cache_dir1, "sub_dir2", "model1_question-answering"):
MockQuestionAnsweringModel(answer="answer 2", score=0.2, rank=1),
})

Expand Down
Loading

0 comments on commit 4091c57

Please sign in to comment.