From 640ed909338fa6afa07f55e84d72b8682638f9de Mon Sep 17 00:00:00 2001 From: David Beauchemin Date: Mon, 20 Feb 2023 06:41:40 -0500 Subject: [PATCH] merge dev for release 0.9.4 (#177) * merge python3_10 integration * improve codecov script * improve codecov script with test verbosity * improve codecov script with test verbosity * add script to run tests on all python version supported * fix path management * change mod of executable file * add interactive shell to handle conda * remove shebang arg * fix contributing and minor typo in tests script * improv example code and remove dead example * squash handling from url branch * cleanup dead file * improve speed test code * add num_workers test fasttext under windows os condition * add tests case for num_workers test in parser * simplified tests case windows * update changelog * fix windows os failing test due to num workers gt 0 * fix missing lower cassing windows os name * add missing downlaod_from_url deprecated message and redirect to new refactored function * add major release todo list to track function to remove * update changelog * add pragma no cover to skip codecovv * improve variable naming * refactor position of non protected method * bump pylint and add django for codacy * fix deepparse tools pylint * fix network pylint * fix vectorizer modules * fix torch member and parser modules * refactor arguments init in cli and cycling import * fix circular import * fix last pylint errors * fix error in csv column names versus column name * fix list csv column names missing nargs * remove duplicate detection and fix with statement for temporary directory * fix oylint on test * push to 0.8.1 * simplification skipif test testing * bug fix issue 141 * fix missing csv dataset in test for csv integration test * merge improvement for error handling of retrain and test API * linting yml file * improve run all tests script * improve run tests python envs * fix naming of tests and some typos * add save_model_eights method 
(#147) * bumb actions version (checkout and setup-python * fixed actions/checkout setted to 4 instead of 3 * add dependabot * bump stale to v5 * add python 3.11 in linting * remove python 3.11 since not supported for now and add 3.10 in windows test to see if still fails * revert windoes python 3.10 since still fail * Add codeql (#148) * Create FUNDING.yml * Update README.md * Update FUNDING.yml * Create codeql-analysis.yml * add deprecated warnings class type on deprecated download_from_url_fn * refactored dataset containter creation into a factory * fix errors for parsing cases * moved arguments in dataset factory * add tests case for new factory tool fn * added val dataset handling * fixed tests and remove major release todo * added cleaning conda env * improved scirpt with warmup training * remove fine_tuning script since in branch * fixed tests * fixed test without clear num_workers arg * remove fn download_from_url * removed unecessary retrain in test api tests * added verbose for test and improved tests for retrain test integration * updated changelog * fixed missing hint typing, improved internal doc, fixed train_ratio arg error in code examples and in doc * add pylint step on code examples * added missing typing, uniformization of assertFileExist fn, added integration test and improved doc * remove comment in linting ci to bug fix if failling problem * fix dead verbose retrain api flag * add ini option for django * remove linting of code example since fail due to pylint-django and I am unable to make it work * fixed django settings * add steps to install depparse for code examples linting * remove install -e * reinstaller install -e . 
* add skip=no-member since it is mostly flase positive * removed no-member pylint disable * add docker image * formating * formated README * update changelog * merge uk example and fixes to doc * hot-fix choices handling in cli.download * linting and security template mv * improved deepparse server error handling * merge offline parsing * fix typo in all test run * fixed error in module name and refactored errors module * fixed reference packaging other deepparse module * added missing hint typing * add missing urllib3 dependancies * improve workflow * improve doc * add download_models, fix bug in cache path handling and fixed examples * update changelog * refactored test and add download_models tests * merge refactoring of download cli fn * moved code for licensing * fixed typo in doc * Update CHANGELOG.md * added factories and tests * added offline argument to model factory * added data padders & tests * black formatting * added data padder factory & tests * added docstring & preparing to refactor padder * refactored data padder to solve LSP issue * refactored vectorizer factory & temporarily removed type hinting from TrainVectorizer due to cyclic import * adjusted docstring * Hotfix `SSLError` when downloading model weights of model type: `bpemb` (#157) * :sparkles: add `no_ssl_verification()` context manager disables SSL for requests library within context * :bug: hotfix model factory for `model_type="bpemb"` Co-authored-by: David Beauchemin * moved context wrapper in bpemb embedding model * removed unused as err * added pylint skip for broad except to hotfix code * added pylint skip for broad except to hotfix code * bump version and changelog * added DataPadder docstring * applied refurb (#160) * wip - added DataProcessor and tests * tweaked process_for_training method * finished DataProcessor and tests * removed obsolete tests * added DataProcessor docstring * Bump docker/metadata-action from 4.0.1 to 4.1.1 (#161) Bumps 
[docker/metadata-action](https://github.com/docker/metadata-action) from 4.0.1 to 4.1.1. - [Release notes](https://github.com/docker/metadata-action/releases) - [Commits](https://github.com/docker/metadata-action/compare/69f6fc9d46f2f8bf0d5491e4aabe0bb8c6a4678a...57396166ad8aefe6098280995947635806a0e6ea) --- updated-dependencies: - dependency-name: docker/metadata-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump docker/login-action from 2.0.0 to 2.1.0 (#162) Bumps [docker/login-action](https://github.com/docker/login-action) from 2.0.0 to 2.1.0. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](https://github.com/docker/login-action/compare/49ed152c8eca782a232dede0303416e8f356c37b...f4ef78c080cd8ba55a85445d5b36e214a81df20a) --- updated-dependencies: - dependency-name: docker/login-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump pylint from 2.15.3 to 2.15.5 (#163) Bumps [pylint](https://github.com/PyCQA/pylint) from 2.15.3 to 2.15.5. - [Release notes](https://github.com/PyCQA/pylint/releases) - [Commits](https://github.com/PyCQA/pylint/compare/v2.15.3...v2.15.5) --- updated-dependencies: - dependency-name: pylint dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump docker/build-push-action from 3.1.1 to 3.2.0 (#164) Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 3.1.1 to 3.2.0. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/c84f38281176d4c9cdb1626ffafcd6b3911b5d94...c56af957549030174b10d6867f20e78cfd7debc5) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump black from 22.8.0 to 22.10.0 (#165) Bumps [black](https://github.com/psf/black) from 22.8.0 to 22.10.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/22.8.0...22.10.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Beauchemin * fix black dependancy pyproject.toml * added DataProcessorFactory and tests * fix error in arg train ratio example and added assert in deepparse.retrain to be more verbose * added error handling for macos and improved windows for case of num_worker and multiprocessing * fixed failling test and improved test for test_api * fixed windows tests * Update CHANGELOG.md * Feat/add new tags to retrain cli (#167) * add missing import in init * add feature to allow new_prediction_tags in retrain CLI API * bump version and changelog * fix typo in doc retrain CLI * fixed errors due to model naming conventions * added final docstring * fixed broken tests * removed broken test patching * cleaned-up parser after new changes integration * black formatting * remove accidental unused import * fixed linting * black formatting * removed unnecessary args * patching factories in AddressParser tests to memory optimise * fixed brocken tests * removed unused import * fixed windows tests * fixed windows test * removed unused modules after refactor * removed imports for removed modules * add tensorboard dependancies in test/requirements since it make test fail due to missing tensorboard for Poutyne import * Update deepparse/parser/address_parser.py Co-authored-by: David Beauchemin * added error handling to data processor factory * fixed linting * Update deepparse/converter/data_processor_factory.py * fixed broken tests * fixed broken test * Update CHANGELOG.md * Bump docker/metadata-action from 4.1.1 to 4.3.0 (#173) Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 4.1.1 to 4.3.0. 
- [Release notes](https://github.com/docker/metadata-action/releases) - [Commits](https://github.com/docker/metadata-action/compare/57396166ad8aefe6098280995947635806a0e6ea...507c2f2dc502c992ad446e3d7a5dfbe311567a96) --- updated-dependencies: - dependency-name: docker/metadata-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Beauchemin * Bump pylint from 2.15.9 to 2.15.10 (#174) Bumps [pylint](https://github.com/PyCQA/pylint) from 2.15.9 to 2.15.10. - [Release notes](https://github.com/PyCQA/pylint/releases) - [Commits](https://github.com/PyCQA/pylint/compare/v2.15.9...v2.15.10) --- updated-dependencies: - dependency-name: pylint dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Beauchemin * Bump docker/build-push-action from 3.2.0 to 4.0.0 (#175) Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 3.2.0 to 4.0.0. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/c56af957549030174b10d6867f20e78cfd7debc5...3b5e8027fcad23fda98b2e3ac259d8d67585f671) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Beauchemin * Bump black from 22.12.0 to 23.1.0 (#176) * Bump black from 22.12.0 to 23.1.0 Bumps [black](https://github.com/psf/black) from 22.12.0 to 23.1.0. 
- [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/22.12.0...23.1.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] * bump pyproject.toml --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Beauchemin * bump version * black formatting --------- Signed-off-by: dependabot[bot] Co-authored-by: Marouane Yassine Co-authored-by: Ajinkya Indulkar <26824103+AjinkyaIndulkar@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Marouane Yassine <46830666+MAYAS3@users.noreply.github.com> --- .github/workflows/docker-publish.yml | 4 +- CHANGELOG.md | 5 +- .../formatted_compared_addresses_raw.py | 3 - deepparse/converter/__init__.py | 5 +- deepparse/converter/data_padder.py | 204 +++ deepparse/converter/data_padding.py | 238 ---- deepparse/converter/data_processor.py | 86 ++ deepparse/converter/data_processor_factory.py | 38 + deepparse/converter/data_transform.py | 59 - deepparse/embeddings_models/__init__.py | 1 + .../bpemb_embeddings_model.py | 2 +- .../embeddings_model_factory.py | 43 + deepparse/network/__init__.py | 1 + deepparse/network/model_factory.py | 76 + deepparse/network/seq2seq.py | 2 +- deepparse/parser/address_parser.py | 107 +- deepparse/vectorizer/__init__.py | 2 +- deepparse/vectorizer/train_vectorizer.py | 37 - deepparse/vectorizer/vectorizer_factory.py | 40 + docs/source/cli.rst | 2 +- examples/single_country_retrain.ipynb | 1 + pyproject.toml | 2 +- styling_requirements.txt | 4 +- tests/converter/test_data_padder.py | 319 +++++ tests/converter/test_data_padding.py | 341 ----- tests/converter/test_data_processor.py | 320 +++++ 
.../converter/test_data_processor_factory.py | 58 + tests/converter/test_data_transform.py | 120 -- .../test_embeddings_model_factory.py | 63 + .../test_integration_seq2seq_model_cpu.py | 6 +- .../test_integration_seq2seq_model_gpu.py | 6 +- tests/network/test_model_factory.py | 36 + tests/parser/base.py | 4 +- tests/parser/test_address_parser.py | 1258 ++++++++--------- .../parser/test_address_parser_retrain_api.py | 921 ++++++------ tests/parser/test_address_parser_test_api.py | 245 ++-- tests/requirements.txt | 1 + tests/vectorizer/test_train_vectorizer.py | 31 - tests/vectorizer/test_vectorizer_factory.py | 55 + version.txt | 2 +- 40 files changed, 2554 insertions(+), 2194 deletions(-) create mode 100644 deepparse/converter/data_padder.py delete mode 100644 deepparse/converter/data_padding.py create mode 100644 deepparse/converter/data_processor.py create mode 100644 deepparse/converter/data_processor_factory.py delete mode 100644 deepparse/converter/data_transform.py create mode 100644 deepparse/embeddings_models/embeddings_model_factory.py create mode 100644 deepparse/network/model_factory.py delete mode 100644 deepparse/vectorizer/train_vectorizer.py create mode 100644 deepparse/vectorizer/vectorizer_factory.py create mode 100644 tests/converter/test_data_padder.py delete mode 100644 tests/converter/test_data_padding.py create mode 100644 tests/converter/test_data_processor.py create mode 100644 tests/converter/test_data_processor_factory.py delete mode 100644 tests/converter/test_data_transform.py create mode 100644 tests/embeddings_models/test_embeddings_model_factory.py create mode 100644 tests/network/test_model_factory.py delete mode 100644 tests/vectorizer/test_train_vectorizer.py create mode 100644 tests/vectorizer/test_vectorizer_factory.py diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 50e11dd2..ac3e8225 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ 
-28,12 +28,12 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@57396166ad8aefe6098280995947635806a0e6ea + uses: docker/metadata-action@507c2f2dc502c992ad446e3d7a5dfbe311567a96 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - name: Build and push Docker image - uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 + uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 with: context: . push: true diff --git a/CHANGELOG.md b/CHANGELOG.md index 83676a0a..c057a9fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -256,7 +256,6 @@ - Add Zenodo DOI ## 0.9 -- - Add `save_model_weights` method to `AddressParser` to save model weights (PyTorch state dictionary) - Improve CI @@ -288,4 +287,8 @@ - Bug-fix FastText error not handled in test API. - Add feature to allow new_prediction_tags to retrain CLI. +## 0.9.4 + + - Improve codebase. + ## dev diff --git a/deepparse/comparer/formatted_compared_addresses_raw.py b/deepparse/comparer/formatted_compared_addresses_raw.py index 37ba8804..614ee313 100644 --- a/deepparse/comparer/formatted_compared_addresses_raw.py +++ b/deepparse/comparer/formatted_compared_addresses_raw.py @@ -68,16 +68,13 @@ def _comparison_report_builder(self) -> str: str_formatted += "Parsed address: " + repr(self.first_address) + "\n" str_formatted += str(probs[0]) + "\n" if not self.identical: - str_formatted += "\nParsed address: " + repr(self.second_address) + "\n" str_formatted += str(probs[1]) + "\n" if self.equivalent: - str_formatted += "\n\nRaw differences between the two addresses: \n" str_formatted += self._get_raw_diff_color() else: - str_formatted += "\n\nAddresses tags differences between the two addresses: \n" str_formatted += self._get_tags_diff_color() diff --git a/deepparse/converter/__init__.py b/deepparse/converter/__init__.py index f0c2b331..6390b683 100644 --- a/deepparse/converter/__init__.py +++ b/deepparse/converter/__init__.py @@ -1,4 +1,5 @@ 
# pylint: disable=wildcard-import -from .data_padding import * from .target_converter import * -from .data_transform import * +from .data_padder import * +from .data_processor import * +from .data_processor_factory import * diff --git a/deepparse/converter/data_padder.py b/deepparse/converter/data_padder.py new file mode 100644 index 00000000..c0192105 --- /dev/null +++ b/deepparse/converter/data_padder.py @@ -0,0 +1,204 @@ +from typing import List, Tuple, Union + +import torch +from torch.nn.utils.rnn import pad_sequence +import numpy as np + + +class DataPadder: + """ + Class that handles the padding of vectorized sequences to the length of the longuest sequence. + Args: + padding_value (int): the value to use as padding to extend the shorter sequences. Default: -100. + """ + + def __init__(self, padding_value: int = -100) -> None: + self.padding_value = padding_value + + def pad_word_embeddings_batch( + self, batch: List[Tuple[List, List]], teacher_forcing: bool = False + ) -> Union[ + Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], + Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor], + ]: + """ + Method to pad a batch of word embeddings sequences and their targets to the length of the longuest one. + Args: + batch (List[Tuple[List, List]]): a list of tuples where the first element is a list + of word embeddings (the sequence) and the second is a list of targets. + teacher_forcing (bool): if True, the padded target vectors are returned twice, + once with the sequences and their lengths, and once on their own. This enables + the use of teacher forcing during the training of sequence to sequence models. + Return: + A tuple of two elements: + - a tuple containing either two :class:`~torch.Tensor` (the padded sequences and their + repective original lengths),or three :class:`~torch.Tensor` (the padded sequences + and their lengths, as well as the padded targets) if `teacher_forcing` is true. 
+ For details on the padding of sequences, + check out :meth:`~DataPadder.pad_word_embeddings_sequences` below. + The returned sequences are sorted in decreasing order. + - a :class:`~torch.Tensor` containing the padded targets. + """ + sequences_vectors, target_vectors = self._extract_word_embeddings_sequences_and_target(batch) + + padded_sequences, lengths = self.pad_word_embeddings_sequences(sequences_vectors) + padded_target_vectors = self.pad_targets(target_vectors) + + if teacher_forcing: + return (padded_sequences, lengths, padded_target_vectors), padded_target_vectors + + return (padded_sequences, lengths), padded_target_vectors + + def pad_word_embeddings_sequences(self, sequences_batch: Tuple[List, ...]) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Method to pad a batch of word embeddings sequences. + Args: + seuqnces_batch (Tuple[List, ...]): a tuple containing lists of word embeddings (the sequences) + Return: + A tuple of two elements: + - a :class:`~torch.Tensor` containing the padded sequcences. + - a :class:`~torch.Tensor` containing the respective original lengths of the padded sequences. + """ + sequences_vectors, lengths = zip( + *[ + ( + torch.FloatTensor(np.array(seq_vectors)), + len(seq_vectors), + ) + for seq_vectors in sequences_batch + ] + ) + + lengths = torch.tensor(lengths) + + padded_sequences_vectors = self._pad_tensors(sequences_vectors) + + return padded_sequences_vectors, lengths + + def pad_subword_embeddings_batch( + self, batch: List[Tuple[Tuple[List, List], List]], teacher_forcing: bool = False + ) -> Union[ + Tuple[Tuple[torch.Tensor, List, torch.Tensor], torch.Tensor], + Tuple[Tuple[torch.Tensor, List, torch.Tensor, torch.Tensor], torch.Tensor], + ]: + """ + Method to pad a batch of subword embeddings sequences and their targets to the length of the longuest one. 
+ Args: + batch (List[Tuple[Tuple[List, List], List]]): a list of tuples containing the two following elements: + - a tuple where the first element is a list of words represented as subword embeddings and the + second element is a list of the number of subword embeddings that each word is decomposed into. + - a list of targets. + teacher_forcing (bool): if True, the padded target vectors are returned twice, + once with the sequences and their lengths, and once on their own. This enables + the use of teacher forcing during the training of sequence to sequence models. + Return: + A tuple of two elements: + - A tuple (``x``, ``y`` , ``z``). The element ``x`` is a :class:`~torch.Tensor` of + padded subword vectors,``y`` is a list of padded decomposition lengths, + and ``z`` is a :class:`~torch.Tensor` of the original lengths of the sequences + before padding. If teacher_forcing is True, a fourth element is added which + corresponds to a :class:`~torch.Tensor` of the padded targets. For details + on the padding of sequences, check out :meth:`~DataPadder.pad_subword_embeddings_sequences` below. + The returned sequences are sorted in decreasing order. + - a :class:`~torch.Tensor` containing the padded targets. + """ + sequences_tuples, target_vectors = self._extract_subword_embeddings_sequences_and_targets(batch) + + padded_sequences, decomposition_lengths, sequence_lengths = self.pad_subword_embeddings_sequences( + sequences_tuples + ) + padded_target_vectors = self.pad_targets(target_vectors) + + if teacher_forcing: + return ( + padded_sequences, + decomposition_lengths, + sequence_lengths, + padded_target_vectors, + ), padded_target_vectors + + return (padded_sequences, decomposition_lengths, sequence_lengths), padded_target_vectors + + def pad_subword_embeddings_sequences( + self, sequences_batch: Tuple[Tuple[List, List], ...] + ) -> Tuple[torch.Tensor, List, torch.Tensor]: + """ + Method to pad a batch of subword embeddings sequences. 
+ Args: + sequences_batch (Tuple[Tuple[List, List], ...]): a tuple containing tuples of two elements: + - a list of lists representing words as lists of subword embeddings. + - a list of the number of subword embeddings that each word is decomposed into. + Return: + A tuple of three elements: + - a :class:`~torch.Tensor` containing the padded sequcences. + - a list containing the padded decomposition lengths of each word. When a word is + added as padding to elongate a sequence, we consider that the decomposition + length of the added word is 1. + - a :class:`~torch.Tensor` containing the respective original lengths (number of words) + of the padded sequences. + """ + sequences_vectors, decomp_len, lengths = zip( + *[ + ( + torch.tensor(np.array(vectors)), + word_decomposition_len, + len(vectors), + ) + for vectors, word_decomposition_len in sequences_batch + ] + ) + + padded_sequences_vectors = self._pad_tensors(sequences_vectors) + + lengths = torch.tensor(lengths) + max_sequence_length = lengths.max().item() + for decomposition_length in decomp_len: + if len(decomposition_length) < max_sequence_length: + decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length))) + + return padded_sequences_vectors, list(decomp_len), lengths + + def pad_targets(self, target_batch: Tuple[List, ...]) -> torch.Tensor: + """ + Method to pad a batch of target indices to the longuest one. + Args: + target_batch (Tuple[List, ...]): a tuple comtaining lists of target indices. + Return: + A :class:`~torch.Tensor` of padded targets. + """ + target_batch = map(torch.tensor, target_batch) + + return self._pad_tensors(target_batch) + + def _extract_word_embeddings_sequences_and_target(self, batch: List[Tuple[List, List]]) -> Tuple[List, List]: + """ + Method that takes a list of word embedding sequences and targets and zips the + sequences together and the targets together. 
+ """ + sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) + + sequence_batch, target_batch = zip(*sorted_batch) + + return sequence_batch, target_batch + + def _extract_subword_embeddings_sequences_and_targets( + self, batch: List[Tuple[Tuple[List, List], List]] + ) -> Tuple[List[Tuple[List, List]], List]: + """ + Method that takes a list of subword embedding sequences and targets + and zips the sequences together and the targets together. + """ + sorted_batch = sorted(batch, key=lambda x: len(x[0][1]), reverse=True) + + sequence_batch, target_batch = zip(*sorted_batch) + + return sequence_batch, target_batch + + def _pad_tensors(self, sequences_batch: Tuple[torch.Tensor, ...]) -> torch.Tensor: + """ + A method to pad and collate multiple :class:``torch.Tensor` representing sequences + into a single :class:``torch.Tensor`using :attr:`DataPadder.padding_value`. + The final :class:``torch.Tensor` is returned with batch first + """ + + return pad_sequence(sequences_batch, batch_first=True, padding_value=self.padding_value) diff --git a/deepparse/converter/data_padding.py b/deepparse/converter/data_padding.py deleted file mode 100644 index f23af767..00000000 --- a/deepparse/converter/data_padding.py +++ /dev/null @@ -1,238 +0,0 @@ -# Bug with PyTorch source code makes torch.tensor as not callable for pylint. -# pylint: disable=not-callable - -from typing import List, Tuple, Iterable - -import numpy as np -import torch -from torch.nn.utils.rnn import pad_sequence - -# By default, the loss and accuracy ignore the value of -100 -# we leverage that when padding elements -padding_value = -100 - - -def fasttext_data_padding(batch: List) -> Tuple: - """ - Function that adds padding to the sequences so all can have the same length as the longest one for fastText model. - - Args: - batch (List): The vectorized batch data. - - Returns: - A tuple (``x``, ``y``). 
The element ``x`` is a tensor of padded word vectors and ``y`` is their respective - lengths of the sequences. - """ - - # We convert into np.array before as per PyTorch optimization recommendation - sequences_vectors, lengths = zip( - *[(torch.FloatTensor(np.array(seq_vectors)), len(seq_vectors)) for seq_vectors in batch] - ) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - - return padded_sequences_vectors, lengths - - -def bpemb_data_padding(batch: List[Tuple]) -> Tuple: - """ - Function that add padding to the sequences and to the decomposition lengths so all can have the same length as - the longest one. - - Args: - batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings - and the second is the word decomposition lengths. - - Returns: - A tuple (``x``, ``y``, ``z``). The element ``x`` is a tensor of padded word vectors, ``y`` is the padded - decomposition lengths, and ``z`` is the original lengths of the sequences before padding. 
- """ - - # We convert into np.array before as per PyTorch optimization recommendation - sequences_vectors, decomp_len, lengths = zip( - *[ - (torch.tensor(np.array(vectors)), word_decomposition_len, len(vectors)) - for vectors, word_decomposition_len in batch - ] - ) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - - # pad decomposition length - max_sequence_length = lengths.max().item() - for decomposition_length in decomp_len: - if len(decomposition_length) < max_sequence_length: - decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length))) - - return padded_sequences_vectors, list(decomp_len), lengths - - -def fasttext_data_padding_teacher_forcing(batch: List) -> Tuple: - """ - Function that adds padding to the sequences so all can have the same length as the longest one, - using teacher forcing training (i.e. we also provide the target during training). - - Args: - batch (List): The vectorized batch data - - Returns: - A tuple ((``x``, ``y``, ``z``), ``z``). The element ``x`` is a tensor of padded word vectors, ``y`` is their - respective lengths of the sequences and ``z`` is a tensor of padded target idx. We use teacher forcing so we - also need to pass the target during training (``z``). 
- """ - - sequences_vectors, target_vectors, lengths = _convert_sequence_to_tensor(batch) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - padded_target_vectors = pad_sequence(target_vectors, batch_first=True, padding_value=padding_value) - - return ( - padded_sequences_vectors, - lengths, - padded_target_vectors, - ), padded_target_vectors - - -def bpemb_data_padding_teacher_forcing(batch: List[Tuple]) -> Tuple: - """ - Function that add padding to the sequences and to the decomposition lengths so all can have the same length as - the longest one, using teacher forcing training (i.e. we also provide the target during training). - - Args: - batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings - and the second is the word decomposition lengths. - - Returns: - A tuple ((``x``, ``y``, ``z``, ``w``), ``w``). The element ``x`` is a tensor of padded word vectors, - ``y`` is the padded decomposition lengths, ``z`` is the original lengths of the sequences before padding, and - ``w`` is a tensor of padded target idx. We use teacher forcing so we also need to pass the target during - training (``w``). 
- """ - - ( - sequences_vectors, - decomp_len, - target_vectors, - lengths, - ) = _convert_bpemb_sequence_to_tensor(batch) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - padded_target_vectors = pad_sequence(target_vectors, batch_first=True, padding_value=padding_value) - - # pad decomposition length - max_sequence_length = lengths.max().item() - for decomposition_length in decomp_len: - if len(decomposition_length) < max_sequence_length: - decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length))) - - return ( - padded_sequences_vectors, - list(decomp_len), - lengths, - padded_target_vectors, - ), padded_target_vectors - - -def fasttext_data_padding_with_target(batch: List) -> Tuple: - """ - Function that adds padding to the sequences so all can have the same length as the longest one. - - Args: - batch (List): The vectorized batch data - - Returns: - A tuple ((``x``, ``y``), ``z``). The element ``x`` is a tensor of padded word vectors, ``y`` is their - respective lengths of the sequences and ``z`` is a tensor of padded target idx. - """ - - sequences_vectors, target_vectors, lengths = _convert_sequence_to_tensor(batch) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - padded_target_vectors = pad_sequence(target_vectors, batch_first=True, padding_value=padding_value) - - return (padded_sequences_vectors, lengths), padded_target_vectors - - -def bpemb_data_padding_with_target(batch: List[Tuple]) -> Tuple: - """ - Function that add padding to the sequences and to the decomposition lengths so all can have the same length as - the longest one. - - Args: - batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings - and the second is the word decomposition lengths. 
- - Returns: - A tuple ((``x``, ``y`` , ``z``), ``w``). The element ``x`` is a tensor of padded word vectors, - ``y`` is the padded decomposition lengths, ``z`` is the original lengths of the sequences before padding, and - ``w`` is a tensor of padded target idx. - """ - - ( - sequences_vectors, - decomp_len, - target_vectors, - lengths, - ) = _convert_bpemb_sequence_to_tensor(batch) - - lengths = torch.tensor(lengths) - - padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value) - padded_target_vectors = pad_sequence(target_vectors, batch_first=True, padding_value=padding_value) - - # pad decomposition length - max_sequence_length = lengths.max().item() - for decomposition_length in decomp_len: - if len(decomposition_length) < max_sequence_length: - decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length))) - - return (padded_sequences_vectors, list(decomp_len), lengths), padded_target_vectors - - -def _convert_sequence_to_tensor(batch: List) -> Iterable: - """ - Sort and convert sequence into a tensor with target element - """ - sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True) - - # We convert into np.array before as per PyTorch optimization recommendation - return zip( - *[ - ( - torch.FloatTensor(np.array(seq_vectors)), - torch.tensor(target_vector), - len(seq_vectors), - ) - for seq_vectors, target_vector in sorted_batch - ] - ) - - -def _convert_bpemb_sequence_to_tensor(batch: List[Tuple]) -> Iterable: - """ - Sort and convert a BPEmb sequence into a tensor with target element - """ - sorted_batch = sorted(batch, key=lambda x: len(x[0][1]), reverse=True) - - # We convert into np.array before as per PyTorch optimization recommendation - return zip( - *[ - ( - torch.tensor(np.array(vectors)), - word_decomposition_len, - torch.tensor(target_vectors), - len(vectors), - ) - for (vectors, word_decomposition_len), target_vectors in sorted_batch - ] - ) diff --git 
a/deepparse/converter/data_processor.py b/deepparse/converter/data_processor.py new file mode 100644 index 00000000..b6ca84ea --- /dev/null +++ b/deepparse/converter/data_processor.py @@ -0,0 +1,86 @@ +from typing import Callable, List, Tuple, Union + +import torch + +from . import TagsConverter +from ..vectorizer import Vectorizer + + +class DataProcessor: + """ + Class that processes addresses into padded batches ready for training or inference + Args: + vectorizer (:class:`~Vectorizer`): a callable vectorizer capable of vectorizing a list of addresses + sequences_padding_callback (Callable): a callback to pad a sequence of vectorized addresses to the + longuest, while returning the original unpadded lengths, see :class:`~deepparse.converter.Datapadder` + batch_padding_callback (Callable): a callback to pad a a sequence of vectorized addresses and their labels + to the longuest, while returning the original unpadded lengths, + see :class:`~deepparse.converter.Datapadder` + tags_converter (:class:`~TagsConverter`): a callable converter to transform address labels into + indices for training + + """ + + def __init__( + self, + vectorizer: Vectorizer, + sequences_padding_callback: Callable, + batch_padding_callback: Callable, + tags_converter: TagsConverter, + ) -> None: + self.vectorizer = vectorizer + self.sequences_padding_callback = sequences_padding_callback + self.batch_padding_callback = batch_padding_callback + self.tags_converter = tags_converter + + def process_for_inference( + self, addresses: List[str] + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, List, torch.Tensor]]: + """ + Method to vectorize addresses for inference. 
+ Args: + addresses (List[str]): a list of addresses + Return: + Either a tuple of vectorized addresses and their respective original lengths before padding + or a tuple of vectorized addresses their subword decomposition lengths and their respective + original lengths before padding, depending on the vectorizing and padding methods. + """ + return self.sequences_padding_callback(self.vectorizer(addresses)) + + def process_for_training( + self, addresses_and_targets: List[Tuple[str, List[str]]], teacher_forcing: bool = False + ) -> Union[ + Union[ + Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], + Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor], + ], + Union[ + Tuple[Tuple[torch.Tensor, List, torch.Tensor], torch.Tensor], + Tuple[Tuple[torch.Tensor, List, torch.Tensor, torch.Tensor], torch.Tensor], + ], + ]: + """ + Method to vectorize addresses and tags for training. + Args: + addresses_and_targets (List[Tuple[str, List[str]]]): a list of tuples where the first element is an + address and the second is a list of tags. + teacher_forcing (bool): if True, the padded target vectors are returned twice, + once with the sequences and their lengths, and once on their own. This enables + the use of teacher forcing during the training of sequence to sequence models. + Return: + A padded batch. Check out :meth:`~deepparse.converter.DataPadder.pad_word_embeddings_batch` + and :meth:`~DataPadder.pad_subword_embeddings_batch` for more details. 
+ """ + input_sequence = [] + target_sequence = [] + + addresses, targets = zip(*addresses_and_targets) + + input_sequence.extend(self.vectorizer(list(addresses))) + + for target_list in targets: + target_tmp = [self.tags_converter(target) for target in target_list] + target_tmp.append(self.tags_converter("EOS")) # to append the End Of Sequence token + target_sequence.append(target_tmp) + + return self.batch_padding_callback(list(zip(input_sequence, target_sequence)), teacher_forcing) diff --git a/deepparse/converter/data_processor_factory.py b/deepparse/converter/data_processor_factory.py new file mode 100644 index 00000000..12210de5 --- /dev/null +++ b/deepparse/converter/data_processor_factory.py @@ -0,0 +1,38 @@ +from . import DataPadder, TagsConverter, DataProcessor +from ..vectorizer import Vectorizer, BPEmbVectorizer, FastTextVectorizer, MagnitudeVectorizer + + +class DataProcessorFactory: + """ + A factory for data processors + """ + + def create(self, vectorizer: Vectorizer, padder: DataPadder, tags_converter: TagsConverter): + """ + A factory method to create a data processor + Args: + vectorizer (:class:`~Vectorizer`): a callable vectorizer capable of vectorizing a list of addresses + padder (:class:`~DataPadder`): a data padder with methods to pad address sequences and batches + tags_converter (:class:`~TagsConverter`): a callable converter to transform address + labels into indices for training + Return: + A :class:`~DataProcessor` + """ + if isinstance(vectorizer, BPEmbVectorizer): + processor = DataProcessor( + vectorizer, padder.pad_subword_embeddings_sequences, padder.pad_subword_embeddings_batch, tags_converter + ) + + elif isinstance(vectorizer, (FastTextVectorizer, MagnitudeVectorizer)): + processor = DataProcessor( + vectorizer, padder.pad_word_embeddings_sequences, padder.pad_word_embeddings_batch, tags_converter + ) + else: + raise NotImplementedError( + """ + There's no data processor corresponding to the provided vectorizer. 
+ Supported vectorizers are BPEmbVectorizer, FastTextVectorizerand MagnitudeVectorizer + """ + ) + + return processor diff --git a/deepparse/converter/data_transform.py b/deepparse/converter/data_transform.py deleted file mode 100644 index 2dd1aaea..00000000 --- a/deepparse/converter/data_transform.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Tuple - -from . import ( - fasttext_data_padding_teacher_forcing, - bpemb_data_padding_teacher_forcing, - bpemb_data_padding_with_target, - fasttext_data_padding_with_target, -) -from ..vectorizer import TrainVectorizer - - -class DataTransform: - """ - Data transformer to vectorize the data and prepare it for training. - - Args: - vectorizer (~deepparse.deepparse.train_vectorizer.TrainVectorizer): Vectorizer to vectorize the data - (i.e. transform into word embedding and tag idx). - model_type (str): See AddressParser for model type. Only ``fasttext-light`` is not supported due to - ``pymagnitude-light`` incompatibility. - - Note: - Since Windows uses ``spawn`` instead of ``fork`` during multiprocess (for the data loading pre-processing - ``num_worker`` > 0) we use the Gensim model, which takes more RAM (~10 GO) than the Fasttext one (~8 GO). - It also takes a longer time to load. See here the - `issue `_. - """ - - def __init__(self, vectorizer: TrainVectorizer, model_type: str) -> None: - self.vectorizer = vectorizer - if "fasttext" in model_type and "light" not in model_type: - self.teacher_forcing_data_padding_fn = fasttext_data_padding_teacher_forcing - self.output_transform_data_padding_fn = fasttext_data_padding_with_target - elif "bpemb" in model_type: - self.teacher_forcing_data_padding_fn = bpemb_data_padding_teacher_forcing - self.output_transform_data_padding_fn = bpemb_data_padding_with_target - else: - # Note that we don't have lightest here since lightest is fasttext-light (magnitude) and we cannot train - # with that model type (see doc note). 
- raise NotImplementedError( - f"There is no {model_type} network implemented. Value should be: " - f"fasttext, bpemb or their attention variant." - ) - - def teacher_forcing_transform(self, batch_pairs: Tuple) -> Tuple: - """ - Apply a teacher forcing transform (into tensor) to a batch of pairs (address, target). - """ - vectorize_batch_pairs = self.vectorizer(batch_pairs) - - return self.teacher_forcing_data_padding_fn(vectorize_batch_pairs) - - def output_transform(self, batch_pairs: Tuple) -> Tuple: - """ - Apply a transform (into tensor) to a batch of pairs (address, target). - """ - vectorize_batch_pairs = self.vectorizer(batch_pairs) - - return self.output_transform_data_padding_fn(vectorize_batch_pairs) diff --git a/deepparse/embeddings_models/__init__.py b/deepparse/embeddings_models/__init__.py index a83687bc..2b1a02e2 100644 --- a/deepparse/embeddings_models/__init__.py +++ b/deepparse/embeddings_models/__init__.py @@ -3,3 +3,4 @@ from .embeddings_model import * from .fasttext_embeddings_model import * from .magnitude_embeddings_model import * +from .embeddings_model_factory import * diff --git a/deepparse/embeddings_models/bpemb_embeddings_model.py b/deepparse/embeddings_models/bpemb_embeddings_model.py index 7d689d10..414bb78d 100644 --- a/deepparse/embeddings_models/bpemb_embeddings_model.py +++ b/deepparse/embeddings_models/bpemb_embeddings_model.py @@ -19,7 +19,7 @@ class BPEmbEmbeddingsModel(EmbeddingsModel): Params: cache_dir (str): Path to the cache directory to the embeddings' bin vector and the model. - verbose (bool): Either or not to make the loading of the embeddings verbose. + verbose (bool): Wether or not to make the loading of the embeddings verbose. 
""" def __init__(self, cache_dir: str, verbose: bool = True) -> None: diff --git a/deepparse/embeddings_models/embeddings_model_factory.py b/deepparse/embeddings_models/embeddings_model_factory.py new file mode 100644 index 00000000..6360f752 --- /dev/null +++ b/deepparse/embeddings_models/embeddings_model_factory.py @@ -0,0 +1,43 @@ +from . import BPEmbEmbeddingsModel, FastTextEmbeddingsModel, MagnitudeEmbeddingsModel, EmbeddingsModel +from .. import download_fasttext_embeddings, download_fasttext_magnitude_embeddings + + +class EmbeddingsModelFactory: + """ + A factory for the creation of embeddings models. + """ + + def create(self, embedding_model_type: str, cache_dir: str, verbose: bool = True) -> EmbeddingsModel: + """ + Embeddings model creation method. + Args: + embeddings_model_type (str): the type of the embeddings model to create. Valid options: + - bpemb + - fasttext + - fasttext_magnitude + cache_dir (str): Path to the cache directory where the embeddings model exists or is to be downloaded. + verbose (bool): Wether or not to make the loading of the embeddings verbose. + Return: + An :class:`~EmbeddingsModel` + """ + if "bpemb" in embedding_model_type: + embeddings_model = BPEmbEmbeddingsModel(verbose=verbose, cache_dir=cache_dir) + + elif "fasttext" in embedding_model_type: + if "fasttext-light" in embedding_model_type: + file_name = download_fasttext_magnitude_embeddings(cache_dir=cache_dir, verbose=verbose) + + embeddings_model = MagnitudeEmbeddingsModel(file_name, verbose=verbose) + else: + file_name = download_fasttext_embeddings(cache_dir=cache_dir, verbose=verbose) + + embeddings_model = FastTextEmbeddingsModel(file_name, verbose=verbose) + + else: + raise NotImplementedError( + f""" + The {embedding_model_type} embeddings model does not exist. 
+ Existing embeddings models are: bpemb, fasttext and fasttext_magnitude""" + ) + + return embeddings_model diff --git a/deepparse/network/__init__.py b/deepparse/network/__init__.py index 52de5477..a00cb501 100644 --- a/deepparse/network/__init__.py +++ b/deepparse/network/__init__.py @@ -5,3 +5,4 @@ from .bpemb_seq2seq import * from .fasttext_seq2seq import * from .seq2seq import * +from .model_factory import * diff --git a/deepparse/network/model_factory.py b/deepparse/network/model_factory.py new file mode 100644 index 00000000..4893a9c7 --- /dev/null +++ b/deepparse/network/model_factory.py @@ -0,0 +1,76 @@ +# pylint: disable=too-many-arguments +from typing import Dict, Union + +import torch + +from . import FastTextSeq2SeqModel, BPEmbSeq2SeqModel, Seq2SeqModel + + +class ModelFactory: + """ + A factory for the creation of neural network models that predict the tags from addresses + """ + + def create( + self, + model_type: str, + cache_dir: str, + device: torch.device, + output_size: int = 9, + attention_mechanism: bool = False, + path_to_retrained_model: Union[str, None] = None, + offline: bool = False, + verbose: bool = True, + **seq2seq_kwargs: Dict, + ) -> Seq2SeqModel: + """ + Model creation method. + + Args: + model_type (str): the type of the model to create. Valid options: + - fasttext + - bpemb + cache_dir (str): The path to the cached directory to use for downloading (and loading) the + model weights. + device (~torch.device): The device tu use for the prediction. + output_size (int): The size of the prediction layers (i.e. the number of tag to predict). + attention_mechanism (bool): Either or not to use attention mechanism. The default value is False. + path_to_retrained_model (Union[str, None]): The path to the retrained model to use for the seq2seq. + offline (bool): Wether or not the model is an offline or an online. + verbose (bool): Turn on/off the verbosity of the model. The default value is True. + + Return: + A :class:`~Seq2SeqModel`. 
+ """ + if "fasttext" in model_type or "fasttext-light" in model_type: + model = FastTextSeq2SeqModel( + cache_dir=cache_dir, + device=device, + output_size=output_size, + verbose=verbose, + path_to_retrained_model=path_to_retrained_model, + attention_mechanism=attention_mechanism, + offline=offline, + **seq2seq_kwargs, + ) + + elif "bpemb" in model_type: + model = BPEmbSeq2SeqModel( + cache_dir=cache_dir, + device=device, + output_size=output_size, + verbose=verbose, + path_to_retrained_model=path_to_retrained_model, + attention_mechanism=attention_mechanism, + offline=offline, + **seq2seq_kwargs, + ) + + else: + raise NotImplementedError( + f""" + There is no {model_type} network implemented. model_type should be either fasttext or bpemb + """ + ) + + return model diff --git a/deepparse/network/seq2seq.py b/deepparse/network/seq2seq.py index aa3ee1c8..31d8c302 100644 --- a/deepparse/network/seq2seq.py +++ b/deepparse/network/seq2seq.py @@ -93,7 +93,7 @@ def _load_pre_trained_weights(self, model_type: str, cache_dir: str, offline: bo model_type (str): The network pretrained weights to load. cache_dir (str): The path to the cached directory to use for downloading (and loading) the model weights. - offline (bool): Either or not the model is an offline or an online. + offline (bool): Whether or not the model is an offline or an online. """ model_path = os.path.join(cache_dir, f"{model_type}.ckpt") diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index a89a23b7..b46c3206 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -5,6 +5,7 @@ # pylint: disable=inconsistent-return-statements import contextlib +from functools import partial import os import platform import re @@ -32,18 +33,16 @@ validate_if_new_seq2seq_params, ) from .. 
import validate_data_to_parse -from ..converter import DataTransform, TagsConverter, bpemb_data_padding, fasttext_data_padding +from ..converter import TagsConverter, DataProcessorFactory, DataPadder from ..dataset_container import DatasetContainer -from ..embeddings_models import BPEmbEmbeddingsModel, FastTextEmbeddingsModel, MagnitudeEmbeddingsModel -from ..errors.model_error import FastTextModelError -from ..fasttext_tools import download_fasttext_embeddings, download_fasttext_magnitude_embeddings -from ..metrics import accuracy, nll_loss -from ..network.bpemb_seq2seq import BPEmbSeq2SeqModel -from ..network.fasttext_seq2seq import FastTextSeq2SeqModel +from ..embeddings_models import EmbeddingsModelFactory +from ..metrics import nll_loss, accuracy +from ..network import ModelFactory from ..preprocessing import AddressCleaner from ..tools import CACHE_PATH, valid_poutyne_version -from ..vectorizer import BPEmbVectorizer, FastTextVectorizer, TrainVectorizer -from ..vectorizer.magnitude_vectorizer import MagnitudeVectorizer +from ..vectorizer import VectorizerFactory +from ..errors import FastTextModelError + _pre_trained_tags_to_idx = { "StreetNumber": 0, @@ -109,7 +108,7 @@ class AddressParser: be set to True. cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. - offline (bool): Either or not the model is an offline one, meaning you have already downloaded the pre-trained + offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained weights and embeddings weights in either the default Deepparse cache directory (~./cache/deepparse) or the ``cache_dir`` directory. When offline, we will not verify if the model is the latest. You can use our ``download_models`` CLI function to download all the requirements for a model. 
The default value is False @@ -246,7 +245,7 @@ def __init__( self.named_parser = named_parser self.model_type, self._model_type_formatted = handle_model_name(model_type, attention_mechanism) - self._model_factory( + self._setup_model( verbose=self.verbose, path_to_retrained_model=path_to_retrained_model, prediction_layer_len=self.tags_converter.dim, @@ -678,7 +677,7 @@ def retrain( model_factory_dict.update({"seq2seq_kwargs": seq2seq_params}) # We set verbose to false since model is reloaded - self._model_factory(verbose=False, path_to_retrained_model=None, **model_factory_dict) + self._setup_model(verbose=False, path_to_retrained_model=None, **model_factory_dict) callbacks = [] if callbacks is None else callbacks train_generator, valid_generator = self._create_training_data_generator( @@ -861,11 +860,10 @@ def test( raise ValueError("The dataset container is not a train container.") callbacks = [] if callbacks is None else callbacks - data_transform = self._set_data_transformer() test_generator = DataLoader( test_dataset_container, - collate_fn=data_transform.output_transform, + collate_fn=partial(self.processor.process_for_training, teacher_forcing=False), batch_size=batch_size, num_workers=num_workers, ) @@ -975,13 +973,6 @@ def _process_device(self, device: Union[int, str, torch.device]) -> None: warnings.warn("No CUDA device detected, device will be set to 'CPU'.") self.device = torch.device("cpu") - def _set_data_transformer(self) -> DataTransform: - train_vectorizer = TrainVectorizer(self.vectorizer, self.tags_converter) # Vectorize to provide also the target - data_transform = DataTransform( - train_vectorizer, self.model_type - ) # Use for transforming the data prior to training - return data_transform - def _create_training_data_generator( self, train_dataset_container: DatasetContainer, @@ -992,7 +983,6 @@ def _create_training_data_generator( seed: int, ) -> Tuple: # pylint: disable=too-many-arguments - data_transform = self._set_data_transformer() if 
val_dataset_container is None: train_indices, valid_indices = indices_splitting( @@ -1008,7 +998,7 @@ def _create_training_data_generator( train_generator = DataLoader( train_dataset, - collate_fn=data_transform.teacher_forcing_transform, + collate_fn=partial(self.processor.process_for_training, teacher_forcing=True), batch_size=batch_size, num_workers=num_workers, shuffle=True, @@ -1016,14 +1006,14 @@ def _create_training_data_generator( valid_generator = DataLoader( valid_dataset, - collate_fn=data_transform.output_transform, + collate_fn=partial(self.processor.process_for_training, teacher_forcing=False), batch_size=batch_size, num_workers=num_workers, ) return train_generator, valid_generator - def _model_factory( + def _setup_model( self, verbose: bool, path_to_retrained_model: Union[str, None] = None, @@ -1044,61 +1034,32 @@ def _model_factory( # Set to default cache_path value cache_dir = CACHE_PATH - if "fasttext" in self.model_type: - if "fasttext-light" in self.model_type: - file_name = download_fasttext_magnitude_embeddings( - cache_dir=cache_dir, verbose=verbose, offline=offline - ) + self.model = ModelFactory().create( + model_type=self.model_type, + cache_dir=cache_dir, + device=self.device, + output_size=prediction_layer_len, + attention_mechanism=attention_mechanism, + path_to_retrained_model=path_to_retrained_model, + offline=offline, + verbose=verbose, + **seq2seq_kwargs, + ) - embeddings_model = MagnitudeEmbeddingsModel(file_name, verbose=verbose) - self.vectorizer = MagnitudeVectorizer(embeddings_model=embeddings_model) - else: - file_name = download_fasttext_embeddings(cache_dir=cache_dir, verbose=verbose, offline=offline) - - embeddings_model = FastTextEmbeddingsModel(file_name, verbose=verbose) - self.vectorizer = FastTextVectorizer(embeddings_model=embeddings_model) - - self.data_converter = fasttext_data_padding - - self.model = FastTextSeq2SeqModel( - cache_dir=cache_dir, - device=self.device, - output_size=prediction_layer_len, - 
verbose=verbose, - path_to_retrained_model=path_to_retrained_model, - attention_mechanism=attention_mechanism, - offline=offline, - **seq2seq_kwargs, - ) + embeddings_model = EmbeddingsModelFactory().create( + embedding_model_type=self.model_type, cache_dir=cache_dir, verbose=verbose + ) + vectorizer = VectorizerFactory().create(embeddings_model) - elif "bpemb" in self.model_type: - embeddings_model = BPEmbEmbeddingsModel(verbose=verbose, cache_dir=cache_dir) - self.vectorizer = BPEmbVectorizer(embeddings_model=embeddings_model) - - self.data_converter = bpemb_data_padding - - self.model = BPEmbSeq2SeqModel( - cache_dir=cache_dir, - device=self.device, - output_size=prediction_layer_len, - verbose=verbose, - path_to_retrained_model=path_to_retrained_model, - attention_mechanism=attention_mechanism, - offline=offline, - **seq2seq_kwargs, - ) - else: - raise NotImplementedError( - f"There is no {self.model_type} network implemented. Value should be: " - f"fasttext, bpemb, lightest (fasttext-light), fastest (fasttext) " - f"or best (bpemb)." - ) + padder = DataPadder() + + self.processor = DataProcessorFactory().create(vectorizer, padder, self.tags_converter) def _predict_pipeline(self, data: List) -> Tuple: """ Pipeline to process data in a data loader for prediction. 
""" - return self.data_converter(self.vectorizer(data)) + return self.processor.process_for_inference(data) @staticmethod def _retrain( diff --git a/deepparse/vectorizer/__init__.py b/deepparse/vectorizer/__init__.py index 42973404..2c0ce9e2 100644 --- a/deepparse/vectorizer/__init__.py +++ b/deepparse/vectorizer/__init__.py @@ -2,5 +2,5 @@ from .bpemb_vectorizer import * from .fasttext_vectorizer import * from .magnitude_vectorizer import * -from .train_vectorizer import * from .vectorizer import * +from .vectorizer_factory import * diff --git a/deepparse/vectorizer/train_vectorizer.py b/deepparse/vectorizer/train_vectorizer.py deleted file mode 100644 index 81f4d8bf..00000000 --- a/deepparse/vectorizer/train_vectorizer.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import List, Iterable - -from ..converter import TagsConverter -from ..vectorizer import Vectorizer - - -class TrainVectorizer: - def __init__(self, embedding_vectorizer: Vectorizer, tags_converter: TagsConverter) -> None: - """ - Vectorizer use during training to convert an address into word embeddings and to provide the target. - """ - self.embedding_vectorizer = embedding_vectorizer - self.tags_converter = tags_converter - - def __call__(self, addresses: List[str]) -> Iterable: - """ - Method to vectorizer addresses for training. - - Args: - addresses (list[str]): The addresses to vectorize. - - Return: - A tuple compose of embeddings word addresses' and the target idxs. 
- """ - input_sequence = [] - target_sequence = [] - - input_sequence.extend( - self.embedding_vectorizer([address[0] for address in addresses]) - ) # Need to be pass in batch - - # Otherwise, the padding for byte-pair encoding will be broken - for address in addresses: - target_tmp = [self.tags_converter(target) for target in address[1]] - target_tmp.append(self.tags_converter("EOS")) # to append the End Of Sequence token - target_sequence.append(target_tmp) - return zip(input_sequence, target_sequence) diff --git a/deepparse/vectorizer/vectorizer_factory.py b/deepparse/vectorizer/vectorizer_factory.py new file mode 100644 index 00000000..fded7140 --- /dev/null +++ b/deepparse/vectorizer/vectorizer_factory.py @@ -0,0 +1,40 @@ +from ..embeddings_models import ( + BPEmbEmbeddingsModel, + FastTextEmbeddingsModel, + MagnitudeEmbeddingsModel, + EmbeddingsModel, +) +from . import BPEmbVectorizer, FastTextVectorizer, MagnitudeVectorizer, Vectorizer + + +class VectorizerFactory: + """ + A factory for the creation of vectorizers associated with specific embeddings models. + """ + + def create(self, embeddings_model: EmbeddingsModel) -> Vectorizer: + """ + Vectorizer creation method. + Args: + embeddings_model (:class:`~EmbeddingsModel`): The embeddings model for which a vectorizer is to be created + Return: + A :class:`~Vectorizer` + """ + if isinstance(embeddings_model, BPEmbEmbeddingsModel): + vectorizer = BPEmbVectorizer(embeddings_model) + + elif isinstance(embeddings_model, FastTextEmbeddingsModel): + vectorizer = FastTextVectorizer(embeddings_model) + + elif isinstance(embeddings_model, MagnitudeEmbeddingsModel): + vectorizer = MagnitudeVectorizer(embeddings_model) + + else: + raise NotImplementedError( + """ + There's no vectorizer corresponding to the embeddings model type provided. + Supported embedding models are: BPEmbEmbeddingsModel, FastTextEmbeddingsModel and MagnitudeEmbeddingsModel. 
+ """ + ) + + return vectorizer diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 314a4c3a..c4ae4e3a 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -98,7 +98,7 @@ One can use the command ``parse --help`` to output the same description in your - ``--csv_column_names``: The column names to extract address in the CSV. Need to be specified if the provided dataset_path leads to a CSV file. Column names have to be separated by whitespace. For example, ``--csv_column_names column1 column2``. - ``--csv_column_separator``: The column separator for the dataset container will only be used if the dataset is a CSV one. By default, ``'\t'``. - ``--cache_dir``: To change the default cache directory (default to ``None``, e.g. default path). - - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` is a path leading to a JSON file of the new tags in a key-value style. For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}`` + - ``prediction_tags``: To change the prediction tags. The ``prediction_tags`` is a path leading to a JSON file of the new tags in a key-value style. For example, the path can be ``"a_path/file.json"`` and the content can be ``{"new_tag": 0, "other_tag": 1, "EOS": 2}``. .. 
autofunction:: deepparse.cli.retrain.main diff --git a/examples/single_country_retrain.ipynb b/examples/single_country_retrain.ipynb index 252dd639..c5f65bf3 100644 --- a/examples/single_country_retrain.ipynb +++ b/examples/single_country_retrain.ipynb @@ -150,6 +150,7 @@ "# The script functions with minor modification to handle argument\n", "# instead or CLI parsed argument\n", "\n", + "\n", "# Function to handle the files paths\n", "def absolute_file_paths(directory):\n", " \"\"\"\n", diff --git a/pyproject.toml b/pyproject.toml index 68a8e6a1..1f9703f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.black] line-length = 120 skip-string-normalization = true -required-version = "22.12.0" +required-version = "23.1.0" extend-exclude = "/(slides)/" [tool.pytest.ini_options] diff --git a/styling_requirements.txt b/styling_requirements.txt index e5849910..2a53ccae 100644 --- a/styling_requirements.txt +++ b/styling_requirements.txt @@ -1,3 +1,3 @@ -black==22.12.0 -pylint==2.15.9 +black==23.1.0 +pylint==2.15.10 pylint-django[with_django] diff --git a/tests/converter/test_data_padder.py b/tests/converter/test_data_padder.py new file mode 100644 index 00000000..adfa528e --- /dev/null +++ b/tests/converter/test_data_padder.py @@ -0,0 +1,319 @@ +# pylint: disable=line-too-long, too-many-public-methods +import unittest +from unittest import TestCase + +import torch + +from deepparse.converter import DataPadder + + +class DataPadderTest(TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.a_padding_value = -100 + + def setUp(self): + self.a_non_padded_word_embedding_batch_length_list = torch.tensor([3, 2, 1]) + self.a_non_padded_word_embedding_sequences_batch = [ + [[1, 1], [1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1]], + ] + self.a_padded_word_embedding_sequences_batch = torch.FloatTensor( + [ + [[1, 1], [1, 1], [1, 1]], + [[1, 1], [1, 1], [-100, -100]], + [[1, 1], [-100, -100], [-100, -100]], + ] + ) + + 
self.a_training_non_padded_word_embedding_batch = [ + ([[1, 1], [1, 1], [1, 1]], [0, 3, 5]), + ([[1, 1], [1, 1]], [4, 7]), + ([[1, 1]], [8]), + ] + + self.a_non_padded_subword_embedding_batch_length_list = torch.tensor([3, 2, 1]) + self.a_non_padded_subword_embedding_sequences_batch = [ + ( + [ + [[1, 1], [1, 1], [-1, -1]], + [[1, 1], [1, 1], [1, 1]], + [[1, 1], [-1, -1], [-1, -1]], + ], + [2, 3, 1], + ), + ([[[1, 1], [1, 1], [-1, -1]], [[1, 1], [1, 1], [-1, -1]]], [2, 2]), + ([[[1, 1], [1, 1], [1, 1]]], [3]), + ] + + self.a_padded_subword_embedding_sequences_batch = torch.tensor( + [ + [ + [[1, 1], [1, 1], [-1, -1]], + [[1, 1], [1, 1], [1, 1]], + [[1, 1], [-1, -1], [-1, -1]], + ], + [ + [[1, 1], [1, 1], [-1, -1]], + [[1, 1], [1, 1], [-1, -1]], + [[-100, -100], [-100, -100], [-100, -100]], + ], + [ + [[1, 1], [1, 1], [1, 1]], + [[-100, -100], [-100, -100], [-100, -100]], + [[-100, -100], [-100, -100], [-100, -100]], + ], + ] + ) + self.a_non_padded_subword_embedding_batch_decomposition_length_list = [ + [2, 3, 1], + [2, 2, 1], + [3, 1, 1], + ] + + self.a_training_non_padded_subword_embedding_batch = [ + ( + ( + [ + [[1, 1], [1, 1], [-1, -1]], + [[1, 1], [1, 1], [1, 1]], + [[1, 1], [-1, -1], [-1, -1]], + ], + [2, 3, 1], + ), + [0, 3, 5], + ), + ( + ([[[1, 1], [1, 1], [-1, -1]], [[1, 1], [1, 1], [-1, -1]]], [2, 2]), + [4, 7], + ), + (([[[1, 1], [1, 1], [1, 1]]], [3]), [8]), + ] + + self.a_padded_target_tensor = torch.tensor([[0, 3, 5], [4, 7, -100], [8, -100, -100]]) + + self.padder = DataPadder(self.a_padding_value) + + def test_givenASequencesBatch_whenPaddingWordEmbeddings_thenShouldReturnCorrectLengths(self): + _, lengths = self.padder.pad_word_embeddings_sequences(self.a_non_padded_word_embedding_sequences_batch) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) + + def test_givenASequencesBatch_whenPaddingWordEmbeddings_thenShouldReturnBatchAsTensor(self): + padded_sequences, _ = 
self.padder.pad_word_embeddings_sequences( + self.a_non_padded_word_embedding_sequences_batch + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenASequencesBatch_whenPaddingWordEmbeddings_thenShouldPerformCorrectPadding(self): + padded_sequences, _ = self.padder.pad_word_embeddings_sequences( + self.a_non_padded_word_embedding_sequences_batch + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_word_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddings_thenShouldReturnCorrectLengths(self): + (_, lengths), _ = self.padder.pad_word_embeddings_batch(self.a_training_non_padded_word_embedding_batch) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddings_thenShouldReturnBatchAsTensor(self): + (padded_sequences, _), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingWordEmbeddings_thenShouldPerformCorrectPadding(self): + (padded_sequences, _), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_word_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddings_thenShouldReturnTargetAsTensor(self): + (_, _), padded_target = self.padder.pad_word_embeddings_batch(self.a_training_non_padded_word_embedding_batch) + + self.assertIsInstance(padded_target, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingWordEmbeddings_thenShouldPerformCorrectPaddingOnTarget(self): + (_, _), padded_target = self.padder.pad_word_embeddings_batch(self.a_training_non_padded_word_embedding_batch) + + self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + def 
test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldReturnCorrectLengths(self): + (_, lengths, _), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldReturnBatchAsTensor(self): + (padded_sequences, _, _), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldPerformCorrectPadding(self): + (padded_sequences, _, _), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_word_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldReturnTargetAsTensor(self): + (_, _, _), padded_target = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + self.assertIsInstance(padded_target, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldPerformCorrectPaddingOnTarget( + self, + ): + (_, _, _), padded_target = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + def test_givenATrainingBatch_whenPaddingWordEmbeddingsWithTeacherForcing_thenShouldReturnTargetWithSequencesAndLengths( + self, + ): + (_, _, padded_target), _ = self.padder.pad_word_embeddings_batch( + self.a_training_non_padded_word_embedding_batch, teacher_forcing=True + ) + + 
self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + def test_givenASequencesBatch_whenPaddingSubwordEmbeddings_thenShouldReturnCorrectLengths(self): + _, _, lengths = self.padder.pad_subword_embeddings_sequences( + self.a_non_padded_subword_embedding_sequences_batch + ) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_length_list))) + + def test_givenASequencesBatch_whenPaddingSubwordEmbeddings_thenShouldReturnCorrectDecompositionLengths(self): + _, decomposition_lengths, _ = self.padder.pad_subword_embeddings_sequences( + self.a_non_padded_subword_embedding_sequences_batch + ) + + self.assertEqual(decomposition_lengths, self.a_non_padded_subword_embedding_batch_decomposition_length_list) + + def test_givenASequencesBatch_whenPaddingSubwordEmbeddings_thenShouldReturnBatchAsTensor(self): + padded_sequences, _, _ = self.padder.pad_subword_embeddings_sequences( + self.a_non_padded_subword_embedding_sequences_batch + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenASequencesBatch_whenPaddingSubwordEmbeddings_thenShouldPerformCorrectPadding(self): + padded_sequences, _, _ = self.padder.pad_subword_embeddings_sequences( + self.a_non_padded_subword_embedding_sequences_batch + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_subword_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddings_thenShouldReturnCorrectLengths(self): + (_, _, lengths), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_length_list))) + + def test_givenATrainingsBatch_whenPaddingSubwordEmbeddings_thenShouldReturnCorrectDecompositionLengths(self): + (_, decomposition_lengths, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertEqual(decomposition_lengths, 
self.a_non_padded_subword_embedding_batch_decomposition_length_list) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddings_thenShouldReturnBatchAsTensor(self): + (padded_sequences, _, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddings_thenShouldPerformCorrectPadding(self): + (padded_sequences, _, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_subword_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddings_thenShouldReturnTargetAsTensor(self): + (_, _, _), padded_target = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertIsInstance(padded_target, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddings_thenShouldPerformCorrectPaddingOnTarget(self): + (_, _, _), padded_target = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch + ) + + self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldReturnCorrectLengths(self): + (_, _, lengths, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_length_list))) + + def test_givenATrainingsBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldReturnCorrectDecompositionLengths( + self, + ): + (_, decomposition_lengths, _, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertEqual(decomposition_lengths, 
self.a_non_padded_subword_embedding_batch_decomposition_length_list) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldReturnBatchAsTensor(self): + (padded_sequences, _, _, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertIsInstance(padded_sequences, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldPerformCorrectPadding(self): + (padded_sequences, _, _, _), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(padded_sequences.eq(self.a_padded_subword_embedding_sequences_batch))) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldReturnTargetAsTensor(self): + (_, _, _, _), padded_target = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertIsInstance(padded_target, torch.Tensor) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldPerformCorrectPaddingOnTarget( + self, + ): + (_, _, _, _), padded_target = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + def test_givenATrainingBatch_whenPaddingSubwordEmbeddingsWithTeacherForcing_thenShouldReturnTargetWithSequencesAndLengths( + self, + ): + (_, _, _, padded_target), _ = self.padder.pad_subword_embeddings_batch( + self.a_training_non_padded_subword_embedding_batch, teacher_forcing=True + ) + + self.assertTrue(torch.all(padded_target.eq(self.a_padded_target_tensor))) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/converter/test_data_padding.py b/tests/converter/test_data_padding.py deleted file mode 
100644 index 3fda7409..00000000 --- a/tests/converter/test_data_padding.py +++ /dev/null @@ -1,341 +0,0 @@ -# Bug with PyTorch source code makes torch.tensor as not callable for pylint. -# pylint: disable=not-callable, too-many-public-methods - -import unittest -from unittest import TestCase - -import torch - -from deepparse.converter import ( - fasttext_data_padding, - bpemb_data_padding, - fasttext_data_padding_with_target, - bpemb_data_padding_with_target, - fasttext_data_padding_teacher_forcing, - bpemb_data_padding_teacher_forcing, -) - - -class DataPaddingTest(TestCase): - @classmethod - def setUpClass(cls): - cls.a_number_of_sequences = 5 - cls.a_max_length = 10 - - cls.a_non_padded_word_embedding_batch_length_list = [] - cls.a_non_padded_word_embedding_batch = [] - cls.a_fasttext_padded_batch = [] - - cls.a_non_padded_subword_embedding_batch_lenght_list = [] - cls.a_non_padded_subword_embedding_batch_decomposition_lenght_list = [] - cls.a_non_padded_subword_embedding_batch = [] - cls.a_bpemb_padded_batch = [] - - cls.a_training_non_padded_word_embedding_batch = [] - cls.a_training_non_padded_subword_embedding_batch = [] - cls.a_padded_target_tensor = [] - - def setUp(self): - self.a_non_padded_word_embedding_batch_length_list = torch.tensor([3, 2, 1]) - self.a_non_padded_word_embedding_batch = [ - [[1, 1], [1, 1], [1, 1]], - [[1, 1], [1, 1]], - [[1, 1]], - ] - self.a_fasttext_padded_batch = torch.FloatTensor( - [ - [[1, 1], [1, 1], [1, 1]], - [[1, 1], [1, 1], [-100, -100]], - [[1, 1], [-100, -100], [-100, -100]], - ] - ) - - self.a_non_padded_subword_embedding_batch_lenght_list = torch.tensor([3, 2, 1]) - self.a_non_padded_subword_embedding_batch = [ - ( - [ - [[1, 1], [1, 1], [-1, -1]], - [[1, 1], [1, 1], [1, 1]], - [[1, 1], [-1, -1], [-1, -1]], - ], - [2, 3, 1], - ), - ([[[1, 1], [1, 1], [-1, -1]], [[1, 1], [1, 1], [-1, -1]]], [2, 2]), - ([[[1, 1], [1, 1], [1, 1]]], [3]), - ] - self.a_bpemb_padded_batch = torch.tensor( - [ - [ - [[1, 1], [1, 1], [-1, 
-1]], - [[1, 1], [1, 1], [1, 1]], - [[1, 1], [-1, -1], [-1, -1]], - ], - [ - [[1, 1], [1, 1], [-1, -1]], - [[1, 1], [1, 1], [-1, -1]], - [[-100, -100], [-100, -100], [-100, -100]], - ], - [ - [[1, 1], [1, 1], [1, 1]], - [[-100, -100], [-100, -100], [-100, -100]], - [[-100, -100], [-100, -100], [-100, -100]], - ], - ] - ) - self.a_non_padded_subword_embedding_batch_decomposition_lenght_list = [ - [2, 3, 1], - [2, 2, 1], - [3, 1, 1], - ] - - self.a_training_non_padded_word_embedding_batch = [ - ([[1, 1], [1, 1], [1, 1]], [0, 3, 5]), - ([[1, 1], [1, 1]], [4, 7]), - ([[1, 1]], [8]), - ] - - self.a_training_non_padded_subword_embedding_batch = [ - ( - ( - [ - [[1, 1], [1, 1], [-1, -1]], - [[1, 1], [1, 1], [1, 1]], - [[1, 1], [-1, -1], [-1, -1]], - ], - [2, 3, 1], - ), - [0, 3, 5], - ), - ( - ([[[1, 1], [1, 1], [-1, -1]], [[1, 1], [1, 1], [-1, -1]]], [2, 2]), - [4, 7], - ), - (([[[1, 1], [1, 1], [1, 1]]], [3]), [8]), - ] - - self.a_padded_target_tensor = torch.tensor([[0, 3, 5], [4, 7, -100], [8, -100, -100]]) - - self.fasttext_data_padding = fasttext_data_padding - self.bpemb_data_padding = bpemb_data_padding - self.fasttext_data_padding_with_target = fasttext_data_padding_with_target - self.bpemb_data_padding_with_target = bpemb_data_padding_with_target - self.fasttext_data_padding_teacher_Forcing = fasttext_data_padding_teacher_forcing - self.bpemb_data_padding_teacher_forcing = bpemb_data_padding_teacher_forcing - - def test_givenbatch_whenfasttextpadding_thenshouldreturnrightlengths(self): - _, lengths = self.fasttext_data_padding(self.a_non_padded_word_embedding_batch) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) - - def test_whenfasttextpadding_thenshouldreturnbatchastensor(self): - padded_batch, _ = self.fasttext_data_padding(self.a_non_padded_word_embedding_batch) - - self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenbatch_whenfasttextpadding_thenshoulddorightpadding(self): - padded_batch, _ = 
self.fasttext_data_padding(self.a_non_padded_word_embedding_batch) - - self.assertTrue(torch.all(padded_batch.eq(self.a_fasttext_padded_batch))) - - def test_givenbatch_whenbpembpadding_thenshouldreturnrightlengths(self): - _, _, lengths = self.bpemb_data_padding(self.a_non_padded_subword_embedding_batch) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_lenght_list))) - - def test_givenbatch_whenbpembpadding_thenshouldreturnrightdecomposition_lengths( - self, - ): - _, decomposition_lengths, _ = self.bpemb_data_padding(self.a_non_padded_subword_embedding_batch) - - self.assertEqual( - decomposition_lengths, - self.a_non_padded_subword_embedding_batch_decomposition_lenght_list, - ) - - def test_whenbpembpadding_thenshouldreturnbatchastensor(self): - padded_batch, _, _ = self.bpemb_data_padding(self.a_non_padded_subword_embedding_batch) - - self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenbatch_whenbpembpadding_thenshoulddorightpadding(self): - padded_batch, _, _ = self.bpemb_data_padding(self.a_non_padded_subword_embedding_batch) - - self.assertTrue(torch.all(padded_batch.eq(self.a_bpemb_padded_batch))) - - def test_givenBatch_whenFasttextPaddingWithTarget_thenShouldReturnRightLengths( - self, - ): - (_, lengths), _ = self.fasttext_data_padding_with_target(self.a_training_non_padded_word_embedding_batch) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) - - def test_givenBatch_whenFasttextPaddingWithTarget_thenShouldReturnBatchAsTensor( - self, - ): - (padded_batch, _), _ = self.fasttext_data_padding_with_target(self.a_training_non_padded_word_embedding_batch) - - self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenBatch_whenFasttextPaddingWithTarget_thenShouldPerformRightPadding( - self, - ): - (padded_batch, _), _ = self.fasttext_data_padding_with_target(self.a_training_non_padded_word_embedding_batch) - - 
self.assertTrue(torch.all(padded_batch.eq(self.a_fasttext_padded_batch))) - - def test_givenBatch_whenFasttextPaddingWithTarget_thenShouldReturnPaddedTarget( - self, - ): - (_, _), target_tensor = self.fasttext_data_padding_with_target(self.a_training_non_padded_word_embedding_batch) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - def test_givenBatch_whenBpembDataPaddingWithTarget_thenShouldReturnRightLengths( - self, - ): - (_, _, lengths), _ = self.bpemb_data_padding_with_target(self.a_training_non_padded_subword_embedding_batch) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_lenght_list))) - - def test_givenBatch_whenBpembDataPaddingWithTarget_thenShouldReturnBatchAsTensor( - self, - ): - (padded_batch, _, _), _ = self.bpemb_data_padding_with_target( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenBatch_whenBpembDataPaddingWithTarget_thenShouldPerformRightPadding( - self, - ): - (padded_batch, _, _), _ = self.bpemb_data_padding_with_target( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(padded_batch.eq(self.a_bpemb_padded_batch))) - - def test_givenBatch_whenBpembDataPaddingWithTarget_thenShouldReturnRightDecompositionLengths( - self, - ): - (_, decomposition_lengths, _), _ = self.bpemb_data_padding_with_target( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertEqual( - decomposition_lengths, - self.a_non_padded_subword_embedding_batch_decomposition_lenght_list, - ) - - def test_givenBatch_whenBpembDataPaddingWithTarget_thenShouldReturnPaddedTarget( - self, - ): - (_, _, _), target_tensor = self.bpemb_data_padding_with_target( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - def test_givenBatch_whenFasttextPaddingTeacherForcing_thenShouldReturnRightLengths( - self, 
- ): - (_, lengths, _), _ = self.fasttext_data_padding_teacher_Forcing(self.a_training_non_padded_word_embedding_batch) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_word_embedding_batch_length_list))) - - def test_givenBatch_whenFasttextPaddingTeacherForcing_thenShouldReturnBatchAsTensor( - self, - ): - (padded_batch, _, _), _ = self.fasttext_data_padding_teacher_Forcing( - self.a_training_non_padded_word_embedding_batch - ) - - self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenBatch_whenFasttextPaddingTeacherForcing_thenShouldPerformRightPadding( - self, - ): - (padded_batch, _, _), _ = self.fasttext_data_padding_teacher_Forcing( - self.a_training_non_padded_word_embedding_batch - ) - - self.assertTrue(torch.all(padded_batch.eq(self.a_fasttext_padded_batch))) - - def test_givenBatch_whenFasttextPaddingTeacherForcing_thenShouldReturnPaddedTarget( - self, - ): - (_, _, _), target_tensor = self.fasttext_data_padding_teacher_Forcing( - self.a_training_non_padded_word_embedding_batch - ) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - def test_givenBatch_whenFasttextPaddingTeacherForcing_thenShouldReturnPaddedTargetInBatch( - self, - ): - (_, _, target_tensor), _ = self.fasttext_data_padding_teacher_Forcing( - self.a_training_non_padded_word_embedding_batch - ) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldReturnRightLengths( - self, - ): - (_, _, lengths, _), _ = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(lengths.eq(self.a_non_padded_subword_embedding_batch_lenght_list))) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldReturnBatchAsTensor( - self, - ): - (padded_batch, _, _, _), _ = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - 
self.assertIsInstance(padded_batch, torch.Tensor) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldPerformRightPadding( - self, - ): - (padded_batch, _, _, _), _ = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(padded_batch.eq(self.a_bpemb_padded_batch))) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldReturnRightDecompositionLengths( - self, - ): - (_, decomposition_lengths, _, _), _ = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertEqual( - decomposition_lengths, - self.a_non_padded_subword_embedding_batch_decomposition_lenght_list, - ) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldReturnPaddedTarget( - self, - ): - (_, _, _, _), target_tensor = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - def test_givenBatch_whenBpembDataPaddingTeacherForcing_thenShouldReturnTargetTensorInBatch( - self, - ): - (_, _, _, target_tensor), _ = self.bpemb_data_padding_teacher_forcing( - self.a_training_non_padded_subword_embedding_batch - ) - - self.assertTrue(torch.all(target_tensor.eq(self.a_padded_target_tensor))) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/converter/test_data_processor.py b/tests/converter/test_data_processor.py new file mode 100644 index 00000000..4686caed --- /dev/null +++ b/tests/converter/test_data_processor.py @@ -0,0 +1,320 @@ +# pylint: disable=line-too-long +import unittest +from unittest import TestCase +from unittest.mock import ANY, MagicMock, Mock, call + +import torch + +from deepparse.converter.data_processor import DataProcessor + + +class DataProcessorTest(TestCase): + @classmethod + def setUpClass(cls): + cls.an_address_list = ["45 old road", "quebec g1v0a1"] + + cls.a_tag_list = 
[["StreetNumber", "StreetName", "StreetName"], ["Municipality", "PostalCode"]] + cls.a_address_and_tags_list = list(zip(cls.an_address_list, cls.a_tag_list)) + + cls.a_word_embedding_sequence = [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]] + cls.a_padded_word_embedding_sequence = torch.tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [-100, -100]]]) + cls.a_sequence_lengths_list = torch.tensor([3, 2]) + + cls.a_subword_embedding_sequence = [ + [[[1, 2], [3, 4], [-1, -1]], [[5, 6], [7, 8], [9, 10]], [[11, 12], [-1, -1], [-1, -1]]], + [[[13, 14], [15, 16], [17, 18]], [[19, 20], [-1, -1], [-1, -1]]], + ] + cls.a_padded_subword_embedding_sequence = torch.tensor( + [ + [[[1, 2], [3, 4], [-1, -1]], [[5, 6], [7, 8], [9, 10]], [[11, 12], [-1, -1], [-1, -1]]], + [ + [[13, 14], [15, 16], [17, 18]], + [[19, 20], [-1, -1], [-1, -1]], + [[-100, -100], [-100, -100], [-100, -100]], + ], + ] + ) + cls.a_word_decomposition_lengths_list = [[2, 3, 1], [3, 1]] + + cls.a_subword_vectorized_sequence = list( + zip(cls.a_subword_embedding_sequence, cls.a_word_decomposition_lengths_list) + ) + + cls.a_tag_to_idx = {"StreetNumber": 0, "StreetName": 1, "Municipality": 2, "PostalCode": 3, "EOS": 4} + + cls.a_tag_targets_list = [[0, 1, 1, 4], [2, 3, 4]] + cls.a_padded_tag_targets = torch.tensor([[0, 1, 1, 4], [2, 3, 4, -100]]) + + def setUp(self): + self.fasttext_vectorizer_mock = MagicMock(return_value=self.a_word_embedding_sequence) + self.bpemb_vectorizer_mock = MagicMock(return_value=self.a_subword_vectorized_sequence) + + self.fasttext_sequences_padding_callback_mock = Mock( + return_value=( + self.a_padded_word_embedding_sequence, + self.a_sequence_lengths_list, + ) + ) + self.bpemb_sequences_padding_callback_mock = Mock( + return_value=( + self.a_padded_subword_embedding_sequence, + self.a_word_decomposition_lengths_list, + self.a_sequence_lengths_list, + ) + ) + + self.fasttext_batch_padding_callback_mock = Mock() + self.fasttext_batch_padding_callback_mock.side_effect = ( + lambda 
*params: ( + ( + self.a_padded_word_embedding_sequence, + self.a_sequence_lengths_list, + ), + self.a_padded_tag_targets, + ) + if params[1] is False + else ( + (self.a_padded_word_embedding_sequence, self.a_sequence_lengths_list, self.a_padded_tag_targets), + self.a_padded_tag_targets, + ) + ) + + self.bpemb_batch_padding_callback_mock = Mock( + return_value=( + ( + self.a_padded_subword_embedding_sequence, + self.a_word_decomposition_lengths_list, + self.a_sequence_lengths_list, + ), + self.a_padded_tag_targets, + ) + ) + self.bpemb_batch_padding_callback_mock.side_effect = ( + lambda *params: ( + ( + self.a_padded_subword_embedding_sequence, + self.a_word_decomposition_lengths_list, + self.a_sequence_lengths_list, + ), + self.a_padded_tag_targets, + ) + if params[1] is False + else ( + ( + self.a_padded_subword_embedding_sequence, + self.a_word_decomposition_lengths_list, + self.a_sequence_lengths_list, + self.a_padded_tag_targets, + ), + self.a_padded_tag_targets, + ) + ) + + self.tags_converter_mock = Mock() + self.tags_converter_mock.side_effect = lambda tag: self.a_tag_to_idx[tag] + + def test_whenProcessingForInference_thenShouldCallVectorizerWithAddresses(self): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_inference(self.an_address_list) + + self.fasttext_vectorizer_mock.assert_called_once_with(self.an_address_list) + + def test_givenAFasttextEmbeddingContext_whenProcessingForInference_thenShouldCallSequencesPaddingCallbackWithCorrectEmbeddings( + self, + ): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_inference(self.an_address_list) + + 
self.fasttext_sequences_padding_callback_mock.assert_called_once_with(self.a_word_embedding_sequence) + + def test_givenAFasttextEmbeddingContext_whenProcessingForInference_thenShouldReturnCorrectPaddedEmbeddingSequences( + self, + ): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + sequences, lengths = processor.process_for_inference(self.an_address_list) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_word_embedding_sequence))) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + + def test_givenABpembEmbeddingContext_whenProcessingForInference_thenShouldCallSequencesPaddingCallbackWithCorrectEmbeddings( + self, + ): + processor = DataProcessor( + self.bpemb_vectorizer_mock, + self.bpemb_sequences_padding_callback_mock, + self.bpemb_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_inference(self.an_address_list) + + self.bpemb_sequences_padding_callback_mock.assert_called_once_with(self.a_subword_vectorized_sequence) + + def test_givenABpembEmbeddingContext_whenProcessingForInference_thenShouldReturnCorrectPaddedEmbeddingSequences( + self, + ): + processor = DataProcessor( + self.bpemb_vectorizer_mock, + self.bpemb_sequences_padding_callback_mock, + self.bpemb_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + sequences, word_decomposition_lengths_list, lengths = processor.process_for_inference(self.an_address_list) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_subword_embedding_sequence))) + self.assertEqual(word_decomposition_lengths_list, self.a_word_decomposition_lengths_list) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + + def test_whenProcessingForTraining_thenShouldCallVectorizerWithAddresses(self): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + 
self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_training(self.a_address_and_tags_list) + + self.fasttext_vectorizer_mock.assert_called_once_with(self.an_address_list) + + def test_givenAFasttextEmbeddingContext_whenProcessingForTraining_thenShouldCallBatchPaddingCallbackWithCorrectEmbeddings( + self, + ): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_training(self.a_address_and_tags_list) + + self.fasttext_batch_padding_callback_mock.assert_called_once_with( + list(zip(self.a_word_embedding_sequence, self.a_tag_targets_list)), ANY + ) + + def test_givenAFasttextEmbeddingContext_whenProcessingForTraining_thenShouldReturnCorrectPaddedEmbeddingSequencesAndTargets( + self, + ): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + (sequences, lengths), targets = processor.process_for_training(self.a_address_and_tags_list) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_word_embedding_sequence))) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + self.assertTrue(torch.all(targets.eq(self.a_padded_tag_targets))) + + def test_givenABpembEmbeddingContext_whenProcessingForTraining_thenShouldCallBatchPaddingCallbackWithCorrectEmbeddings( + self, + ): + processor = DataProcessor( + self.bpemb_vectorizer_mock, + self.bpemb_sequences_padding_callback_mock, + self.bpemb_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_training(self.a_address_and_tags_list) + + self.bpemb_batch_padding_callback_mock.assert_called_once_with( + list(zip(self.a_subword_vectorized_sequence, self.a_tag_targets_list)), ANY + ) + + def 
test_givenABpembEmbeddingContext_whenProcessingForTraining_thenShouldReturnCorrectPaddedEmbeddingSequencesAndTargets( + self, + ): + processor = DataProcessor( + self.bpemb_vectorizer_mock, + self.bpemb_sequences_padding_callback_mock, + self.bpemb_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + (sequences, word_decomposition_lengths, lengths), targets = processor.process_for_training( + self.a_address_and_tags_list + ) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_subword_embedding_sequence))) + self.assertEqual(word_decomposition_lengths, self.a_word_decomposition_lengths_list) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + self.assertTrue(torch.all(targets.eq(self.a_padded_tag_targets))) + + def test_whenProcessingForTraining_thenShouldCallTagsConverterToConvertTags(self): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + processor.process_for_training(self.a_address_and_tags_list) + + tags_converter_calls = [call(tag) for tags in self.a_tag_list for tag in tags + ["EOS"]] + self.tags_converter_mock.assert_has_calls(tags_converter_calls) + + def test_givenAFasttextEmbeddingContext_whenProcessingForTrainingWithTeacherForcing_thenShouldReturnCorrectPaddedEmbeddingSequencesAndTargets( + self, + ): + processor = DataProcessor( + self.fasttext_vectorizer_mock, + self.fasttext_sequences_padding_callback_mock, + self.fasttext_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + (sequences, lengths, targets_input), targets = processor.process_for_training( + self.a_address_and_tags_list, teacher_forcing=True + ) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_word_embedding_sequence))) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + self.assertTrue(torch.all(targets_input.eq(self.a_padded_tag_targets))) + 
self.assertTrue(torch.all(targets.eq(self.a_padded_tag_targets))) + + def test_givenABpembEmbeddingContext_whenProcessingForTrainingWithTeacherForcing_thenShouldReturnCorrectPaddedEmbeddingSequencesAndTargets( + self, + ): + processor = DataProcessor( + self.bpemb_vectorizer_mock, + self.bpemb_sequences_padding_callback_mock, + self.bpemb_batch_padding_callback_mock, + self.tags_converter_mock, + ) + + (sequences, word_decomposition_lengths, lengths, targets_input), targets = processor.process_for_training( + self.a_address_and_tags_list, teacher_forcing=True + ) + + self.assertTrue(torch.all(sequences.eq(self.a_padded_subword_embedding_sequence))) + self.assertEqual(word_decomposition_lengths, self.a_word_decomposition_lengths_list) + self.assertTrue(torch.all(lengths.eq(self.a_sequence_lengths_list))) + self.assertTrue(torch.all(targets_input.eq(self.a_padded_tag_targets))) + self.assertTrue(torch.all(targets.eq(self.a_padded_tag_targets))) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/converter/test_data_processor_factory.py b/tests/converter/test_data_processor_factory.py new file mode 100644 index 00000000..628315c1 --- /dev/null +++ b/tests/converter/test_data_processor_factory.py @@ -0,0 +1,58 @@ +import unittest +from unittest import TestCase +from unittest.mock import Mock + +from deepparse.converter import DataProcessorFactory, DataPadder +from deepparse.vectorizer import BPEmbVectorizer, FastTextVectorizer + + +class DataProcessorFactoryTest(TestCase): + def setUp(self): + self.bpemb_vectorizer = BPEmbVectorizer(Mock()) + + self.fasttext_vectorizer = FastTextVectorizer(Mock()) + + self.padder = DataPadder() + + self.tags_converter_mock = Mock() + + self.processor_factory = DataProcessorFactory() + + self.invalid_vectorizer = "invalid vectorizer" + + def test_givenABpembVectorizer_whenCreatingProcessor_thenShouldAssignCorrectSequencesPaddingCallbacks(self): + processor = self.processor_factory.create(self.bpemb_vectorizer, 
self.padder, self.tags_converter_mock) + + self.assertTrue( + processor.sequences_padding_callback.__qualname__ + == DataPadder.pad_subword_embeddings_sequences.__qualname__ + ) + + def test_givenABpembVectorizer_whenCreatingProcessor_thenShouldAssignCorrectBatchPaddingCallbacks(self): + processor = self.processor_factory.create(self.bpemb_vectorizer, self.padder, self.tags_converter_mock) + + self.assertTrue( + processor.batch_padding_callback.__qualname__ == DataPadder.pad_subword_embeddings_batch.__qualname__ + ) + + def test_givenANonBpembVectorizer_whenCreatingProcessor_thenShouldAssignCorrectSequencesPaddingCallbacks(self): + processor = self.processor_factory.create(self.fasttext_vectorizer, self.padder, self.tags_converter_mock) + + self.assertTrue( + processor.sequences_padding_callback.__qualname__ == DataPadder.pad_word_embeddings_sequences.__qualname__ + ) + + def test_givenANonBpembVectorizer_whenCreatingProcessor_thenShouldAssignCorrectBatchPaddingCallbacks(self): + processor = self.processor_factory.create(self.fasttext_vectorizer, self.padder, self.tags_converter_mock) + + self.assertTrue( + processor.batch_padding_callback.__qualname__ == DataPadder.pad_word_embeddings_batch.__qualname__ + ) + + def test_givenAnInvalidVectorizer_whenCreatingProcessor_thenShouldRaiseError(self): + with self.assertRaises(NotImplementedError): + self.processor_factory.create(self.invalid_vectorizer, self.padder, self.tags_converter_mock) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/converter/test_data_transform.py b/tests/converter/test_data_transform.py deleted file mode 100644 index 5bc5665e..00000000 --- a/tests/converter/test_data_transform.py +++ /dev/null @@ -1,120 +0,0 @@ -import unittest -from unittest import TestCase -from unittest.mock import MagicMock, patch, call - -from deepparse.converter import ( - DataTransform, - fasttext_data_padding_teacher_forcing, - fasttext_data_padding_with_target, - bpemb_data_padding_teacher_forcing, - 
bpemb_data_padding_with_target, -) - - -class DataTransformTest(TestCase): - def setUp(self) -> None: - self.train_vectorizer_mock = MagicMock() - self.a_fasttext_model_type = "fasttext" - self.a_bpemb_model_type = "bpemb" - self.a_fasttext_light_model_type = "fasttext-light" - self.an_attention_model = "fasttextAttention" - - def test_whenInstantiateAFastTextDataTransform_thenParametersAreOk(self): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_fasttext_model_type) - - # teacher forcing padding test - expected = fasttext_data_padding_teacher_forcing - self.assertIs(expected, data_transform.teacher_forcing_data_padding_fn) - - # output transform padding test - expected = fasttext_data_padding_with_target - self.assertIs(expected, data_transform.output_transform_data_padding_fn) - - def test_whenInstantiateABPEmbDataTransform_thenParametersAreOk(self): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_bpemb_model_type) - - # teacher forcing padding test - expected = bpemb_data_padding_teacher_forcing - self.assertIs(expected, data_transform.teacher_forcing_data_padding_fn) - - # output transform padding test - expected = bpemb_data_padding_with_target - self.assertIs(expected, data_transform.output_transform_data_padding_fn) - - def test_whenInstantiateAFasttextLightDataTransform_thenRaiseError(self): - with self.assertRaises(NotImplementedError): - _ = DataTransform(self.train_vectorizer_mock, self.a_fasttext_light_model_type) - - @patch("deepparse.converter.data_transform.fasttext_data_padding_teacher_forcing") - def test_givenAFasttextDataTransform_whenTeacherForcingTransform_thenComponentsAreCall(self, teacher_forcing_mock): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_fasttext_model_type) - - batch_pairs_mock = MagicMock() - - data_transform.teacher_forcing_transform(batch_pairs_mock) - - train_vectorizer_call = [call(batch_pairs_mock)] - 
self.train_vectorizer_mock.assert_has_calls(train_vectorizer_call) - - train_vectorizer_call = [call(self.train_vectorizer_mock())] - teacher_forcing_mock.assert_has_calls(train_vectorizer_call) - - @patch("deepparse.converter.data_transform.bpemb_data_padding_teacher_forcing") - def test_givenABPEmbDataTransform_whenTeacherForcingTransform_thenComponentsAreCall(self, teacher_forcing_mock): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_bpemb_model_type) - - batch_pairs_mock = MagicMock() - - data_transform.teacher_forcing_transform(batch_pairs_mock) - - train_vectorizer_call = [call(batch_pairs_mock)] - self.train_vectorizer_mock.assert_has_calls(train_vectorizer_call) - - train_vectorizer_call = [call(self.train_vectorizer_mock())] - teacher_forcing_mock.assert_has_calls(train_vectorizer_call) - - @patch("deepparse.converter.data_transform.fasttext_data_padding_with_target") - def test_givenAFasttextDataTransform_whenOutputTransform_thenComponentsAreCall(self, output_transform_mock): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_fasttext_model_type) - - batch_pairs_mock = MagicMock() - - data_transform.output_transform(batch_pairs_mock) - - train_vectorizer_call = [call(batch_pairs_mock)] - self.train_vectorizer_mock.assert_has_calls(train_vectorizer_call) - - train_vectorizer_call = [call(self.train_vectorizer_mock())] - output_transform_mock.assert_has_calls(train_vectorizer_call) - - @patch("deepparse.converter.data_transform.fasttext_data_padding_with_target") - def test_givenAFasttextAttDataTransform_whenOutputTransform_thenComponentsAreCall(self, output_transform_mock): - data_transform = DataTransform(self.train_vectorizer_mock, self.an_attention_model) - - batch_pairs_mock = MagicMock() - - data_transform.output_transform(batch_pairs_mock) - - train_vectorizer_call = [call(batch_pairs_mock)] - self.train_vectorizer_mock.assert_has_calls(train_vectorizer_call) - - train_vectorizer_call = 
[call(self.train_vectorizer_mock())] - output_transform_mock.assert_has_calls(train_vectorizer_call) - - @patch("deepparse.converter.data_transform.bpemb_data_padding_with_target") - def test_givenABPEmbDataTransform_whenOutputTransform_thenComponentsAreCall(self, output_transform_mock): - data_transform = DataTransform(self.train_vectorizer_mock, self.a_bpemb_model_type) - - batch_pairs_mock = MagicMock() - - data_transform.output_transform(batch_pairs_mock) - - train_vectorizer_call = [call(batch_pairs_mock)] - self.train_vectorizer_mock.assert_has_calls(train_vectorizer_call) - - train_vectorizer_call = [call(self.train_vectorizer_mock())] - output_transform_mock.assert_has_calls(train_vectorizer_call) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/embeddings_models/test_embeddings_model_factory.py b/tests/embeddings_models/test_embeddings_model_factory.py new file mode 100644 index 00000000..b9e9cc07 --- /dev/null +++ b/tests/embeddings_models/test_embeddings_model_factory.py @@ -0,0 +1,63 @@ +# pylint: disable=line-too-long, unused-argument +import unittest +from unittest import TestCase +from unittest.mock import patch + +from deepparse.embeddings_models import ( + EmbeddingsModelFactory, + FastTextEmbeddingsModel, + BPEmbEmbeddingsModel, + MagnitudeEmbeddingsModel, +) + + +class EmbeddingsModelFactoryTest(TestCase): + @classmethod + def setUpClass(cls): + cls.a_bpemb_embeddings_model_type = "bpemb" + cls.a_fasttext_embeddings_model_type = "fasttext" + cls.a_fasttext_magnitude_embeddings_model_type = "fasttext-light" + + cls.an_invalid_embeddings_model_type = "invalid" + + cls.a_cache_dir = "~/.cache/deepparse" + + def setUp(self): + self.embeddings_model_factory = EmbeddingsModelFactory() + + @patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmb") + def test_givenABpembEmbeddingsModelType_whenCreatingEmbeddingsModel_thenShouldReturnCorrectEmbeddingsModel( + self, bpemb_mock + ): + embeddings_model = 
self.embeddings_model_factory.create(self.a_bpemb_embeddings_model_type, self.a_cache_dir) + + self.assertIsInstance(embeddings_model, BPEmbEmbeddingsModel) + + @patch("deepparse.embeddings_models.embeddings_model_factory.download_fasttext_embeddings") + @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_fasttext_embeddings") + @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_facebook_vectors") + def test_givenAFasttextEmbeddingsModelType_whenCreatingEmbeddingsModel_thenShouldReturnCorrectEmbeddingsModel( + self, facebook_vectors_load_mock, fasttext_load_mock, download_mock + ): + embeddings_model = self.embeddings_model_factory.create(self.a_fasttext_embeddings_model_type, self.a_cache_dir) + + self.assertIsInstance(embeddings_model, FastTextEmbeddingsModel) + + @patch("deepparse.embeddings_models.embeddings_model_factory.download_fasttext_magnitude_embeddings") + @patch("deepparse.embeddings_models.magnitude_embeddings_model.Magnitude") + def test_givenAFasttextMagnitudeEmbeddingsModelType_whenCreatingEmbeddingsModel_thenShouldReturnCorrectEmbeddingsModel( + self, download_mock, load_mock + ): + embeddings_model = self.embeddings_model_factory.create( + self.a_fasttext_magnitude_embeddings_model_type, self.a_cache_dir + ) + + self.assertIsInstance(embeddings_model, MagnitudeEmbeddingsModel) + + def test_givenAnInvalidEmbeddingsModelType_whenCreatingEmbeddingsModel_thenShouldRaiseError(self): + with self.assertRaises(NotImplementedError): + self.embeddings_model_factory.create(self.an_invalid_embeddings_model_type, self.a_cache_dir) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/network/integration/test_integration_seq2seq_model_cpu.py b/tests/network/integration/test_integration_seq2seq_model_cpu.py index b7b39730..5f697085 100644 --- a/tests/network/integration/test_integration_seq2seq_model_cpu.py +++ b/tests/network/integration/test_integration_seq2seq_model_cpu.py @@ -41,7 +41,11 @@ def setUp(self) -> 
None: def test_whenEncoderStep_thenEncoderStepIsOk(self): # encoding for two address: "["15 major st london ontario n5z1e1", "15 major st london ontario n5z1e1"]" - (decoder_input, decoder_hidden, encoder_outputs,) = self.pre_trained_seq2seq_model._encoder_step( + ( + decoder_input, + decoder_hidden, + encoder_outputs, + ) = self.pre_trained_seq2seq_model._encoder_step( self.to_predict_tensor, self.a_lengths_tensor, self.a_batch_size ) diff --git a/tests/network/integration/test_integration_seq2seq_model_gpu.py b/tests/network/integration/test_integration_seq2seq_model_gpu.py index 8e91a758..004663d5 100644 --- a/tests/network/integration/test_integration_seq2seq_model_gpu.py +++ b/tests/network/integration/test_integration_seq2seq_model_gpu.py @@ -38,7 +38,11 @@ def setUp(self) -> None: def test_whenEncoderStep_thenEncoderStepIsOk(self): # encoding for two address: "["15 major st london ontario n5z1e1", "15 major st london ontario n5z1e1"]" - (decoder_input, decoder_hidden, encoder_outputs,) = self.pre_trained_seq2seq_model._encoder_step( + ( + decoder_input, + decoder_hidden, + encoder_outputs, + ) = self.pre_trained_seq2seq_model._encoder_step( self.to_predict_tensor, self.a_lengths_tensor, self.a_batch_size ) diff --git a/tests/network/test_model_factory.py b/tests/network/test_model_factory.py new file mode 100644 index 00000000..4126d29a --- /dev/null +++ b/tests/network/test_model_factory.py @@ -0,0 +1,36 @@ +import unittest +from unittest import TestCase + +from deepparse.network import ModelFactory, FastTextSeq2SeqModel, BPEmbSeq2SeqModel + + +class ModelFactoryTest(TestCase): + @classmethod + def setUpClass(cls): + cls.a_fasttext_model_type = "fasttext" + cls.a_bpemb_model_type = "bpemb" + cls.an_invalid_model_type = "invalid_model" + + cls.a_cache_dir = "~/.cache/deepparse" + cls.a_device = "cpu" + + def setUp(self): + self.factory = ModelFactory() + + def test_givenAFasttextModelType_whenCreatingModel_thenShouldReturnFasttextSeq2Seq(self): + model = 
self.factory.create(self.a_fasttext_model_type, self.a_cache_dir, self.a_device) + + self.assertIsInstance(model, FastTextSeq2SeqModel) + + def test_givenABpembModelType_whenCreatingModel_thenShouldReturnBpembSeq2Seq(self): + model = self.factory.create(self.a_bpemb_model_type, self.a_cache_dir, self.a_device) + + self.assertIsInstance(model, BPEmbSeq2SeqModel) + + def test_givenAnInvalidModelType_whenCreatingModel_thenShouldRaiseException(self): + with self.assertRaises(NotImplementedError): + self.factory.create(self.an_invalid_model_type, self.a_cache_dir, self.a_device) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/parser/base.py b/tests/parser/base.py index 23258d5f..f9c84d3c 100644 --- a/tests/parser/base.py +++ b/tests/parser/base.py @@ -149,7 +149,7 @@ def setUp(self): def mock_predictions_vectors(self, model): returned_prediction_vectors = self.a_prediction_vector_for_a_complete_address returned_value = returned_prediction_vectors - model.__call__().return_value = returned_value + model.return_value = returned_value def mock_multiple_predictions_vectors(self, model): returned_prediction_vectors = torch.cat( @@ -160,7 +160,7 @@ def mock_multiple_predictions_vectors(self, model): 1, ) returned_value = returned_prediction_vectors - model.__call__().return_value = returned_value + model.return_value = returned_value def setup_retrain_new_tags_model(self, address_components, model_type): data_dict = { diff --git a/tests/parser/test_address_parser.py b/tests/parser/test_address_parser.py index f725a7ae..5cf395ad 100644 --- a/tests/parser/test_address_parser.py +++ b/tests/parser/test_address_parser.py @@ -1,5 +1,5 @@ # Since we use a patch as model mock we skip the unused argument error -# pylint: disable=unused-argument, too-many-public-methods, too-many-lines, too-many-arguments +# pylint: disable=unused-argument, too-many-public-methods, too-many-lines, too-many-arguments, line-too-long # Pylint error for TemporaryDirectory ask for with 
statement # pylint: disable=consider-using-with @@ -85,15 +85,15 @@ def setUp(self): self.BPEmb_mock = MagicMock() self.fasttext_mock = MagicMock() + self.model_mock = MagicMock() + self.embeddings_model_mock = MagicMock() def assert_equal_not_ordered(self, actual, expected_elements): for expected in expected_elements: self.assertIn(expected, actual) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - def test_givenAModel_whenInit_thenProperFieldsSet(self, embeddings_model_mock, model_mock): + def test_givenAModel_whenInit_thenProperFieldsSet(self): address_parser = AddressParser(model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=True) expected_fields = self.expected_fields @@ -104,9 +104,7 @@ def test_givenAModel_whenInit_thenProperFieldsSet(self, embeddings_model_mock, m self.assert_equal_not_ordered(actual_fields, expected_fields) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - def test_givenACPUDeviceSetup_whenInstantiatingParser_thenDeviceIsCPU(self, embeddings_model_mock, model_mock): + def test_givenACPUDeviceSetup_whenInstantiatingParser_thenDeviceIsCPU(self): address_parser = AddressParser( model_type=self.a_best_model_type.capitalize(), # we use BPEmb for simplicity @@ -118,11 +116,7 @@ def test_givenACPUDeviceSetup_whenInstantiatingParser_thenDeviceIsCPU(self, embe # We use BPEmb but could use FastText also @patch("deepparse.parser.address_parser.torch.cuda") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - def test_givenAGPUDeviceSetup_whenInstantiatingParserWithoutGPU_thenRaiseWarningAndCPU( - self, embeddings_model_mock, model_mock, cuda_mock - ): + def test_givenAGPUDeviceSetup_whenInstantiatingParserWithoutGPU_thenRaiseWarningAndCPU(self, cuda_mock): cuda_mock.is_available.return_value = False 
with self.assertWarns(UserWarning): address_parser = AddressParser( @@ -134,10 +128,8 @@ def test_givenAGPUDeviceSetup_whenInstantiatingParserWithoutGPU_thenRaiseWarning expected = self.a_cpu_torch_device self.assertEqual(actual, expected) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") @skipIf(not torch.cuda.is_available(), "no gpu available") - def test_givenAGPUDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self, embeddings_model_mock, model_mock): + def test_givenAGPUDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self): address_parser = AddressParser( model_type=self.a_best_model_type.capitalize(), # we use BPEmb for simplicity @@ -147,12 +139,8 @@ def test_givenAGPUDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self, embe expected = self.a_gpu_torch_device self.assertEqual(actual, expected) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") @skipIf(not torch.cuda.is_available(), "no gpu available") - def test_givenAGPUDeviceSetupSTRFormat_whenInstantiatingParser_thenDeviceIsGPU( - self, embeddings_model_mock, model_mock - ): + def test_givenAGPUDeviceSetupSTRFormat_whenInstantiatingParser_thenDeviceIsGPU(self): address_parser = AddressParser( model_type=self.a_best_model_type.capitalize(), # we use BPEmb for simplicity @@ -162,12 +150,8 @@ def test_givenAGPUDeviceSetupSTRFormat_whenInstantiatingParser_thenDeviceIsGPU( expected = self.a_gpu_torch_device self.assertEqual(actual, expected) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") @skipIf(not torch.cuda.is_available(), "no gpu available") - def test_givenAGPUDeviceSetupINTFormat_whenInstantiatingParser_thenDeviceIsGPU( - self, embeddings_model_mock, model_mock - ): + def test_givenAGPUDeviceSetupINTFormat_whenInstantiatingParser_thenDeviceIsGPU(self): 
address_parser = AddressParser( model_type=self.a_best_model_type.capitalize(), # we use BPEmb for simplicity @@ -177,10 +161,8 @@ def test_givenAGPUDeviceSetupINTFormat_whenInstantiatingParser_thenDeviceIsGPU( expected = self.a_gpu_torch_device self.assertEqual(actual, expected) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") @skipIf(not torch.cuda.is_available(), "no gpu available") - def test_givenAGPUTorchDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self, embeddings_model_mock, model_mock): + def test_givenAGPUTorchDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self): address_parser = AddressParser( model_type=self.a_best_model_type.capitalize(), # we use BPEmb for simplicity @@ -190,144 +172,119 @@ def test_givenAGPUTorchDeviceSetup_whenInstantiatingParser_thenDeviceIsGPU(self, expected = self.a_gpu_torch_device self.assertEqual(actual, expected) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - def test_givenACapitalizeBPEmbModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, model_mock + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenACapitalizeBPEmbModelType_whenInstantiatingParser_thenCallEmbeddingsModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") as embeddings_model_mock: + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_best_model_type.capitalize(), device=self.a_cpu_device, verbose=self.verbose, ) - embeddings_model_mock.assert_called_with(verbose=self.verbose, cache_dir=self.cache_dir) - - with patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") as embeddings_model_mock: - AddressParser( - 
model_type=self.a_bpemb_model_type.capitalize(), - device=self.a_cpu_device, - verbose=self.verbose, + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_bpemb_model_type, verbose=self.verbose, cache_dir=self.cache_dir ) - embeddings_model_mock.assert_called_with(verbose=self.verbose, cache_dir=self.cache_dir) - - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenACapitalizeFastTextModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, embeddings_model_mock, model_mock + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenACapitalizeFastTextModelType_whenInstantiatingParser_thenCallEmbeddingsModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock ): - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as downloader_mock: + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_fastest_model_type.capitalize(), device=self.a_cpu_device, verbose=self.verbose, ) - downloader_mock.assert_called_with( - cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False - ) - - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as downloader_mock: - AddressParser( - model_type=self.a_fasttext_model_type.capitalize(), - device=self.a_cpu_device, - verbose=self.verbose, - ) - - downloader_mock.assert_called_with( - cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False - ) - - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFastTextAttModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, 
embeddings_model_mock, model_mock - ): - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as downloader_mock: - AddressParser( - model_type=self.a_fastest_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - attention_mechanism=True, - ) - - downloader_mock.assert_called_with( - cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_fasttext_model_type, verbose=self.verbose, cache_dir=self.cache_dir ) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - def test_givenABestModelType_whenInstantiatingParser_thenInstantiateBPEmbEmbeddingsModelWithCorrectParameters( - self, model_mock + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenABestModelType_whenInstantiatingParser_thenCallEmbeddingsModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") as embeddings_model_mock: + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_best_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - embeddings_model_mock.assert_called_with(verbose=self.verbose, cache_dir=self.cache_dir) + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_bpemb_model_type, verbose=self.verbose, cache_dir=self.cache_dir + ) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - def test_givenABPEmbModelType_whenInstantiatingParser_thenInstantiateBPEmbEmbeddingsModelWithCorrectParameters( - self, model_mock + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def 
test_givenABPEmbModelType_whenInstantiatingParser_thenCallEmbeddingsModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") as embeddings_model_mock: + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - embeddings_model_mock.assert_called_with(verbose=self.verbose, cache_dir=self.cache_dir) + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_bpemb_model_type, verbose=self.verbose, cache_dir=self.cache_dir + ) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - def test_givenABestModelType_whenInstantiatingParser_thenInstantiateBPEmbVectorizerWithCorrectParameters( - self, model_mock + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenABestModelType_whenInstantiatingParser_thenCallVectorizerFactoryWithCorrectParameters( + self, data_processor_factory_mock ): with patch( - "deepparse.parser.address_parser.BPEmbEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.BPEmbVectorizer") as vectorizer_mock: + "deepparse.parser.address_parser.EmbeddingsModelFactory", + ) as embeddings_factory_mock: + embeddings_factory_mock().create.return_value = self.embeddings_model_mock + with patch("deepparse.parser.address_parser.VectorizerFactory") as vectorizer_factory_mock: AddressParser( model_type=self.a_best_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) + vectorizer_factory_mock().create.assert_called_with(self.embeddings_model_mock) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - def 
test_givenABPEmbModelType_whenInstantiatingParser_thenInstantiateBPEmbVectorizerWithCorrectParameters( - self, model_mock + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenABPEmbModelType_whenInstantiatingParser_thenCallVectorizerFactoryWithCorrectParameters( + self, data_processor_factory_mock ): with patch( - "deepparse.parser.address_parser.BPEmbEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.BPEmbVectorizer") as vectorizer_mock: + "deepparse.parser.address_parser.EmbeddingsModelFactory", + ) as embeddings_factory_mock: + embeddings_factory_mock().create.return_value = self.embeddings_model_mock + with patch("deepparse.parser.address_parser.VectorizerFactory") as vectorizer_factory_mock: AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) + vectorizer_factory_mock().create.assert_called_with(self.embeddings_model_mock) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - def test_givenABestModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, embeddings_model_mock + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenABestModelType_whenInstantiatingParser_thenCallModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: AddressParser( model_type=self.a_best_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + 
model_type=self.a_bpemb_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -337,11 +294,13 @@ def test_givenABestModelType_whenInstantiatingParser_thenInstantiateModelWithCor offline=False, ) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABPEmbModelType_whenInstantiatingParserWithUserComponent_thenCorrectNumberOfOutputDim( - self, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: self.setup_retrain_new_tags_model(self.correct_address_components, self.a_bpemb_model_type) AddressParser( model_type=self.a_bpemb_model_type, @@ -350,7 +309,8 @@ def test_givenABPEmbModelType_whenInstantiatingParserWithUserComponent_thenCorre path_to_retrained_model=self.a_model_path, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_bpemb_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=len(self.correct_address_components), @@ -360,11 +320,13 @@ def test_givenABPEmbModelType_whenInstantiatingParserWithUserComponent_thenCorre offline=False, ) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABPEmbModelType_whenInstantiatingParserWithUserSeq2seqParams_thenCorrectSettings( - self, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, 
embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: self.setup_retrain_new_params_model(self.new_seq2seq_params, self.a_bpemb_model_type) AddressParser( model_type=self.a_bpemb_model_type, @@ -373,7 +335,8 @@ def test_givenABPEmbModelType_whenInstantiatingParserWithUserSeq2seqParams_thenC path_to_retrained_model=self.a_model_path, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_bpemb_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -385,12 +348,13 @@ def test_givenABPEmbModelType_whenInstantiatingParserWithUserSeq2seqParams_thenC offline=False, ) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenCorrectNumberOfOutputDim( - self, download_weights_mock, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: self.setup_retrain_new_tags_model(self.incorrect_address_components, self.a_fasttext_model_type) AddressParser( model_type=self.a_fasttext_model_type, @@ -399,7 +363,8 @@ def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenCo path_to_retrained_model=self.a_model_path, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_fasttext_model_type, 
cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=len(self.incorrect_address_components), @@ -409,12 +374,13 @@ def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenCo offline=False, ) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextModelType_whenInstantiatingParserWithUserSeq2seqParams_thenCorrectSettings( - self, download_weights_mock, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: self.setup_retrain_new_params_model(self.new_seq2seq_params, self.a_fasttext_model_type) AddressParser( model_type=self.a_fasttext_model_type, @@ -423,7 +389,8 @@ def test_givenAFasttextModelType_whenInstantiatingParserWithUserSeq2seqParams_th path_to_retrained_model=self.a_model_path, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_fasttext_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -435,18 +402,21 @@ def test_givenAFasttextModelType_whenInstantiatingParserWithUserSeq2seqParams_th offline=False, ) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABPEmbModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, embeddings_model_mock + self, 
data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_bpemb_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -456,180 +426,74 @@ def test_givenABPEmbModelType_whenInstantiatingParser_thenInstantiateModelWithCo offline=False, ) - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFastestModelType_whenInstantiatingParser_thenDownloadFasttextModelWithCorrectPath( - self, embeddings_model_mock, model_mock - ): - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as downloader: - AddressParser( - model_type=self.a_fastest_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - downloader.assert_called_with(cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False) - - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFasttextModelType_whenInstantiatingParser_thenDownloadFasttextModelWithCorrectPath( - self, embeddings_model_mock, model_mock - ): - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as downloader: - AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - downloader.assert_called_with(cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False) - - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - 
@patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFasttextLightModelType_whenInstantiatingParser_thenDownloadFasttextMagnitudeModelWithCorrectPath( - self, embeddings_model_mock, model_mock - ): - with patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") as downloader: - AddressParser( - model_type=self.a_fasttext_light_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - downloader.assert_called_with(cache_dir=self.fasttext_download_path, verbose=self.verbose, offline=False) - - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFastestModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectPath(self, model_mock): - with patch( - "deepparse.parser.address_parser.download_fasttext_embeddings", - return_value=self.a_embeddings_path, - ): - with patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") as embeddings_model_mock: - AddressParser( - model_type=self.a_fastest_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - embeddings_model_mock.assert_called_with(self.a_embeddings_path, verbose=self.verbose) - - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFasttextModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectPath(self, model_mock): - with patch( - "deepparse.parser.address_parser.download_fasttext_embeddings", - return_value=self.a_embeddings_path, - ): - with patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") as embeddings_model_mock: - AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - embeddings_model_mock.assert_called_with(self.a_embeddings_path, verbose=self.verbose) - - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - def test_givenAFasttextLightModelType_whenInstanciatingParser_thenInstanciateModelWithCorrectPath(self, model_mock): - with patch( - 
"deepparse.parser.address_parser.download_fasttext_magnitude_embeddings", - return_value=self.a_embeddings_path, - ): - with patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") as embeddings_model_mock: - AddressParser( - model_type=self.a_fasttext_light_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - embeddings_model_mock.assert_called_with(self.a_embeddings_path, verbose=self.verbose) - - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - def test_givenAFastestModelType_whenInstantiatingParser_thenInstantiateFasttextVectorizerWithCorrectParameters( - self, model_mock, downloader_mock + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenAFastestModelType_whenInstantiatingParser_thenVectorizerFactoryWithCorrectParameters( + self, data_processor_factory_mock ): with patch( - "deepparse.parser.address_parser.FastTextEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.FastTextVectorizer") as vectorizer_mock: + "deepparse.parser.address_parser.EmbeddingsModelFactory", + ) as embeddings_factory_mock: + embeddings_factory_mock().create.return_value = self.embeddings_model_mock + with patch("deepparse.parser.address_parser.VectorizerFactory") as vectorizer_factory_mock: AddressParser( model_type=self.a_fastest_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) + vectorizer_factory_mock().create.assert_called_with(self.embeddings_model_mock) - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - def test_givenAFasttextModelType_whenInstantiatingParser_thenInstantiateFasttextVectorizerWithCorrectParameters( - self, model_mock, downloader_mock - ): - with patch( - 
"deepparse.parser.address_parser.FastTextEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.FastTextVectorizer") as vectorizer_mock: - AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) - - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") # pylint: disable=C0301 - def test_givenAFasttextLightModelType_whenInstanciatingParser_thenInstanciateMagnitudeVectorizerWithCorrectParameters( - self, model_mock, downloader_mock + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenAFasttextLightModelType_whenInstanciatingParser_thenCallVectorizerFactoryWithCorrectParameters( + self, data_processor_factory_mock ): with patch( - "deepparse.parser.address_parser.MagnitudeEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.MagnitudeVectorizer") as vectorizer_mock: + "deepparse.parser.address_parser.EmbeddingsModelFactory", + ) as embeddings_factory_mock: + embeddings_factory_mock().create.return_value = self.embeddings_model_mock + with patch("deepparse.parser.address_parser.VectorizerFactory") as vectorizer_factory_mock: AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) + vectorizer_factory_mock().create.assert_called_with(self.embeddings_model_mock) - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") # pylint: disable=C0301 - def test_givenALightestModelType_whenInstanciatingParser_thenInstanciateMagnitudeVectorizerWithCorrectParameters( - self, model_mock, 
downloader_mock + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenALightestModelType_whenInstanciatingParser_thenCallVectorizerFactoryWithCorrectParameters( + self, data_processor_factory_mock ): with patch( - "deepparse.parser.address_parser.MagnitudeEmbeddingsModel", - return_value=self.embeddings_model_mock, - ): - with patch("deepparse.parser.address_parser.MagnitudeVectorizer") as vectorizer_mock: + "deepparse.parser.address_parser.EmbeddingsModelFactory", + ) as embeddings_factory_mock: + embeddings_factory_mock().create.return_value = self.embeddings_model_mock + with patch("deepparse.parser.address_parser.VectorizerFactory") as vectorizer_factory_mock: AddressParser( model_type=self.a_fasttext_lightest_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - vectorizer_mock.assert_called_with(embeddings_model=self.embeddings_model_mock) + vectorizer_factory_mock().create.assert_called_with(self.embeddings_model_mock) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - def test_givenAFastestModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, download_weights_mock, embeddings_model_mock + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenAFastestModelType_whenInstantiatingParser_thenCallModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: AddressParser( model_type=self.a_fastest_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - model_mock.assert_called_with( + 
model_factory_mock().create.assert_called_with( + model_type=self.a_fasttext_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -639,19 +503,21 @@ def test_givenAFastestModelType_whenInstantiatingParser_thenInstantiateModelWith offline=False, ) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - def test_givenAFasttextModelType_whenInstantiatingParser_thenInstantiateModelWithCorrectParameters( - self, download_weights_mock, embeddings_model_mock + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenAFasttextModelType_whenInstantiatingParser_thenCallModelFactoryWithCorrectParameters( + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, verbose=self.verbose, ) - model_mock.assert_called_with( + model_factory_mock().create.assert_called_with( + model_type=self.a_fasttext_model_type, cache_dir=self.cache_dir, device=self.a_cpu_torch_device, output_size=self.number_tags, @@ -661,19 +527,17 @@ def test_givenAFasttextModelType_whenInstantiatingParser_thenInstantiateModelWit offline=False, ) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + 
@patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenAddressParsingAString_thenParseAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -685,19 +549,17 @@ def test_givenAFasttextModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextAttModel_whenAddressParsingAString_thenParseAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + 
with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -710,19 +572,17 @@ def test_givenAFasttextAttModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -736,19 +596,17 @@ def test_givenAFasttextModel_whenAddressParsingAListOfAddress_thenParseAllAddres self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, 
self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextAttModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -763,19 +621,17 @@ def test_givenAFasttextAttModel_whenAddressParsingAListOfAddress_thenParseAllAdd self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + 
@patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -792,19 +648,17 @@ def test_givenAFasttextModel_whenAddressParsingAnAddress_thenParseAddressCorrect self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextAttModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with 
patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -822,19 +676,18 @@ def test_givenAFasttextAttModel_whenAddressParsingAnAddress_thenParseAddressCorr self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeModel_whenAddressParsingAString_thenParseAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + + self.mock_predictions_vectors(self.model_mock) address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -846,19 +699,17 @@ def test_givenAMagnitudeModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) 
self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeAttModel_whenAddressParsingAString_thenParseAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -871,19 +722,17 @@ def test_givenAMagnitudeAttModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + 
@patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -897,19 +746,17 @@ def test_givenAMagnitudeModel_whenAddressParsingAListOfAddress_thenParseAllAddre self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeAttModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with 
patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -924,19 +771,17 @@ def test_givenAMagnitudeAttModel_whenAddressParsingAListOfAddress_thenParseAllAd self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -953,19 +798,17 @@ def 
test_givenAMagnitudeModel_whenAddressParsingAnAddress_thenParseAddressCorrec self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAMagnitudeAttModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_light_model_type, device=self.a_cpu_device, @@ -983,14 +826,17 @@ def test_givenAMagnitudeAttModel_whenAddressParsingAnAddress_thenParseAddressCor self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + 
@patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenAddressParsingAString_thenParseAddress( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1002,14 +848,17 @@ def test_givenABPEmbModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbAttModel_whenAddressParsingAString_thenParseAddress( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + 
self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1022,14 +871,17 @@ def test_givenABPEmbAttModel_whenAddressParsingAString_thenParseAddress( self.assertIsInstance(parse_address, FormattedParsedAddress) self.assertEqual(parse_address.raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1043,14 +895,17 @@ def test_givenABPEmbModel_whenAddressParsingAListOfAddress_thenParseAllAddress( self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") 
+ @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbAttModel_whenAddressParsingAListOfAddress_thenParseAllAddress( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_multiple_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_multiple_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1065,14 +920,17 @@ def test_givenABPEmbAttModel_whenAddressParsingAListOfAddress_thenParseAllAddres self.assertEqual(parse_address[0].raw_address, self.a_complete_address) self.assertEqual(parse_address[1].raw_address, self.a_complete_address) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - 
self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1089,14 +947,17 @@ def test_givenABPEmbModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbAttModel_whenAddressParsingAnAddress_thenParseAddressCorrectly( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1114,43 +975,49 @@ def test_givenABPEmbAttModel_whenAddressParsingAnAddress_thenParseAddressCorrect self.assertEqual(parse_address.StreetName, self.a_street_name) self.assertEqual(parse_address.StreetNumber, self.a_street_number) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - 
@patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenAddressParsingAnAddressVerbose_thenVerbose( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: with patch( "deepparse.parser.address_parser.PREDICTION_TIME_PERFORMANCE_THRESHOLD", 0, ): - self.mock_predictions_vectors(model_mock) + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=True, ) + self._capture_output() + address_parser(self.a_complete_address) actual = self.test_out.getvalue().strip() expect = "Vectorizing the address" self.assertEqual(actual, expect) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbAttModel_whenAddressParsingAnAddressVerbose_thenVerbose( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, 
vectorizer_factory_mock, embeddings_model_factory_mock ): - self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: with patch( "deepparse.parser.address_parser.PREDICTION_TIME_PERFORMANCE_THRESHOLD", 0, ): - self.mock_predictions_vectors(model_mock) + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1158,21 +1025,25 @@ def test_givenABPEmbAttModel_whenAddressParsingAnAddressVerbose_thenVerbose( attention_mechanism=True, ) + self._capture_output() + address_parser(self.a_complete_address) actual = self.test_out.getvalue().strip() expect = "Vectorizing the address" self.assertEqual(actual, expect) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnBPEmbAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1182,16 +1053,18 @@ 
def test_givenAnBPEmbAddressParser_whenStrAddressParser_thenStringIsModelTypeAdd self.assertEqual(self.a_BPEmb_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnBPEmbAttAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1202,16 +1075,18 @@ def test_givenAnBPEmbAttAddressParser_whenStrAddressParser_thenStringIsModelType self.assertEqual(self.a_BPEmb_att_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnBPEmbAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, 
vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_best_model_type, device=self.a_cpu_device, @@ -1221,16 +1096,18 @@ def test_givenAnBPEmbAddressParser_whenReprAddressParser_thenStringIsModelTypeAd self.assertEqual(self.a_BPEmb_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnBPEmbAttAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_best_model_type, device=self.a_cpu_device, @@ -1241,21 +1118,18 @@ def test_givenAnBPEmbAttAddressParser_whenReprAddressParser_thenStringIsModelTyp self.assertEqual(self.a_BPEmb_att_name, self.test_out.getvalue().strip()) - 
@patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -1265,21 +1139,18 @@ def test_givenAnFasttextAddressParser_whenStrAddressParser_thenStringIsModelType self.assertEqual(self.a_fasttext_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextAttAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - 
vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -1290,21 +1161,18 @@ def test_givenAnFasttextAttAddressParser_whenStrAddressParser_thenStringIsModelT self.assertEqual(self.a_fasttext_att_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -1314,21 +1182,18 @@ def 
test_givenAnFasttextAddressParser_whenReprAddressParser_thenStringIsModelTyp self.assertEqual(self.a_fasttext_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextAttAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_cpu_device, @@ -1339,21 +1204,18 @@ def test_givenAnFasttextAttAddressParser_whenReprAddressParser_thenStringIsModel self.assertEqual(self.a_fasttext_att_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + 
@patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextLightAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_lightest_model_type, device=self.a_cpu_device, @@ -1363,21 +1225,18 @@ def test_givenAnFasttextLightAddressParser_whenStrAddressParser_thenStringIsMode self.assertEqual(self.a_fasttext_light_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextLightAttAddressParser_whenStrAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as 
model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_lightest_model_type, device=self.a_cpu_device, @@ -1388,21 +1247,18 @@ def test_givenAnFasttextLightAttAddressParser_whenStrAddressParser_thenStringIsM self.assertEqual(self.a_fasttext_att_light_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextLightAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_lightest_model_type, device=self.a_cpu_device, @@ -1412,21 +1268,18 @@ def test_givenAnFasttextLightAddressParser_whenReprAddressParser_thenStringIsMod self.assertEqual(self.a_fasttext_light_name, self.test_out.getvalue().strip()) - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - 
@patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.fasttext_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAnFasttextLightAttAddressParser_whenReprAddressParser_thenStringIsModelTypeAddressParse( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): self._capture_output() - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_fasttext_lightest_model_type, device=self.a_cpu_device, @@ -1437,39 +1290,87 @@ def test_givenAnFasttextLightAttAddressParser_whenReprAddressParser_thenStringIs self.assertEqual(self.a_fasttext_att_light_name, self.test_out.getvalue().strip()) + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenABPEmbModelType_whenRetrainWithIncorrectPredictionTags_thenRaiseValueError( + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock + ): + address_parser = AddressParser( + model_type=self.a_bpemb_model_type, + device=self.a_cpu_device, + verbose=self.verbose, + ) + with self.assertRaises(ValueError): + address_parser.retrain( + MagicMock(), + train_ratio=0.8, + batch_size=1, + epochs=1, + prediction_tags=self.incorrect_address_components, + ) + + 
@patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenRaiseValueError( + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock + ): + address_parser = AddressParser( + model_type=self.a_fasttext_model_type, + device=self.a_cpu_device, + verbose=self.verbose, + ) + with self.assertRaises(ValueError): + address_parser.retrain( + MagicMock(), + train_ratio=0.8, + batch_size=1, + epochs=1, + prediction_tags=self.incorrect_address_components, + num_workers=0, + ) + # we do BPEmb but can be fasttext or fasttext-light - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAModel_whenAddressParsingAnAddressVerbose_thenVerbose( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - self._capture_output() - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: with patch( "deepparse.parser.address_parser.PREDICTION_TIME_PERFORMANCE_THRESHOLD", 0, ): - self.mock_predictions_vectors(model_mock) + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, 
verbose=True, ) + self._capture_output() + address_parser(self.a_complete_address) actual = self.test_out.getvalue().strip() expect = "Vectorizing the address" self.assertEqual(actual, expect) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAModel_whenAddressParsingAnAddressWithProb_thenIncludeProb( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_padder_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, @@ -1480,83 +1381,83 @@ def test_givenAModel_whenAddressParsingAnAddressWithProb_thenIncludeProb( self.assertIsInstance(output.address_parsed_components[0][1], tuple) # tuple of prob self.assertIsInstance(output.address_parsed_components[1][1], tuple) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextModel_whenGetFormattedModelType_thenReturnFastText( - self, download_weights_mock, 
embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel"): - address_parser = AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) - actual = address_parser.get_formatted_model_name() - expected = "FastText" - self.assertEqual(expected, actual) + address_parser = AddressParser( + model_type=self.a_fasttext_model_type, + device=self.a_cpu_device, + verbose=self.verbose, + ) + actual = address_parser.get_formatted_model_name() + expected = "FastText" + self.assertEqual(expected, actual) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextAttModel_whenGetFormattedModelType_thenReturnFastText( - self, download_weights_mock, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel"): - address_parser = AddressParser( - model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - attention_mechanism=True, - ) - actual = address_parser.get_formatted_model_name() - expected = "FastTextAttention" - self.assertEqual(expected, actual) + address_parser = AddressParser( + model_type=self.a_fasttext_model_type, + device=self.a_cpu_device, + verbose=self.verbose, + attention_mechanism=True, + ) + actual = address_parser.get_formatted_model_name() + expected = "FastTextAttention" + self.assertEqual(expected, actual) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - 
@patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABpembModel_whenGetFormattedModelType_thenReturnBPEmb( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel"): - address_parser = AddressParser( - model_type=self.a_bpemb_model_type, - device=self.a_cpu_device, - verbose=True, - ) + address_parser = AddressParser( + model_type=self.a_bpemb_model_type, + device=self.a_cpu_device, + verbose=True, + ) - actual = address_parser.get_formatted_model_name() - expected = "BPEmb" - self.assertEqual(expected, actual) + actual = address_parser.get_formatted_model_name() + expected = "BPEmb" + self.assertEqual(expected, actual) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABpembAttModel_whenGetFormattedModelType_thenReturnBPEmbAtt( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) - address_parser = AddressParser( - model_type=self.a_bpemb_model_type, - device=self.a_cpu_device, - verbose=True, - attention_mechanism=True, - ) + address_parser = AddressParser( + model_type=self.a_bpemb_model_type, + 
device=self.a_cpu_device, + verbose=True, + attention_mechanism=True, + ) - actual = address_parser.get_formatted_model_name() - expected = "BPEmbAttention" - self.assertEqual(expected, actual) + actual = address_parser.get_formatted_model_name() + expected = "BPEmbAttention" + self.assertEqual(expected, actual) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenEmptyData_whenParse_raiseDataError( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): empty_data = ["an address", ""] another_empty_address = "" - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser(model_type=self.a_bpemb_model_type, device=self.a_cpu_device) with self.assertRaises(DataError): address_parser(empty_data) @@ -1564,16 +1465,19 @@ def test_givenEmptyData_whenParse_raiseDataError( with self.assertRaises(DataError): address_parser(another_empty_address) - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def 
test_givenWhiteSpaceOnlyData_whenParse_raiseDataError( - self, embeddings_model_mock, vectorizer_model_mock, data_padding_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): whitespace_data = ["an address", " "] another_whitespace_address = " " - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") as model_mock: - self.mock_predictions_vectors(model_mock) + + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + self.mock_predictions_vectors(self.model_mock) + model_factory_mock().create.return_value = self.model_mock + address_parser = AddressParser(model_type=self.a_bpemb_model_type, device=self.a_cpu_device) with self.assertRaises(DataError): address_parser(whitespace_data) @@ -1581,55 +1485,59 @@ def test_givenWhiteSpaceOnlyData_whenParse_raiseDataError( with self.assertRaises(DataError): address_parser(another_whitespace_address) - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - def test_givenANewCacheDirBPEmb_thenInitWeightsInNewCacheDir(self, vectorizer_model_mock, data_padding_mock): - with patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") as embeddings_model_mock: + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenANewCacheDirBPEmb_thenInitWeightsInNewCacheDir( + self, data_processor_factory_mock, vectorizer_factory_mock + ): + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_bpemb_model_type, device=self.a_cpu_device, verbose=self.verbose, cache_dir=self.a_cache_dir, ) - embeddings_model_mock.assert_called_with(verbose=self.verbose, cache_dir=self.a_cache_dir) + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_bpemb_model_type, verbose=self.verbose, 
cache_dir=self.a_cache_dir + ) - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - def test_givenANewCacheDirFastText_thenInitWeightsInNewCacheDir(self, embeddings_model_mock): - with patch("deepparse.parser.address_parser.download_fasttext_embeddings") as download_weights_mock: + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + def test_givenANewCacheDirFastText_thenInitWeightsInNewCacheDir( + self, data_processor_factory_mock, vectorizer_factory_mock + ): + with patch("deepparse.parser.address_parser.EmbeddingsModelFactory") as embeddings_model_factory_mock: AddressParser( model_type=self.a_fastest_model_type, device=self.a_cpu_device, verbose=self.verbose, cache_dir=self.a_cache_dir, ) - download_weights_mock.assert_called_with(verbose=self.verbose, cache_dir=self.a_cache_dir, offline=False) + embeddings_model_factory_mock().create.assert_called_with( + embedding_model_type=self.a_fasttext_model_type, verbose=self.verbose, cache_dir=self.a_cache_dir + ) + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") @patch("deepparse.parser.address_parser.torch.save") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") def test_givenAModelToExportDict_thenCallTorchSaveWithProperArgs( - self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - torch_save_mock, + self, torch_save_mock, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - address_parser = AddressParser( - 
model_type=self.a_fasttext_model_type, - device=self.a_cpu_device, - verbose=self.verbose, - ) + with patch("deepparse.parser.address_parser.ModelFactory") as model_factory_mock: + model_factory_mock().create.return_value = self.model_mock + + address_parser = AddressParser( + model_type=self.a_fasttext_model_type, + device=self.a_cpu_device, + verbose=self.verbose, + ) - a_file_path = os.path.join(self.a_saving_dir_path, "exported_model.p") - address_parser.save_model_weights(file_path=a_file_path) + a_file_path = os.path.join(self.a_saving_dir_path, "exported_model.p") + address_parser.save_model_weights(file_path=a_file_path) - torch_save_mock.assert_called() - torch_save_mock.assert_called_with(model_mock().state_dict(), a_file_path) + torch_save_mock.assert_called() + torch_save_mock.assert_called_with(self.model_mock.state_dict(), a_file_path) if __name__ == "__main__": diff --git a/tests/parser/test_address_parser_retrain_api.py b/tests/parser/test_address_parser_retrain_api.py index 6c0882ef..fccfe321 100644 --- a/tests/parser/test_address_parser_retrain_api.py +++ b/tests/parser/test_address_parser_retrain_api.py @@ -54,6 +54,8 @@ def setUp(self): os.makedirs(self.a_logging_path) self.saving_template_path = os.path.join(self.a_logging_path, "retrained_{}_address_parser.ckpt") + self.model_mock = MagicMock() + def populate_directory(self): create_file( os.path.join(self.a_logging_path, "retrained_fasttext_address_parser.ckpt"), @@ -110,7 +112,7 @@ def address_parser_retrain_call( def assert_experiment_retrain(self, experiment_mock, model_mock, optimizer_mock, device): experiment_mock.assert_called_with( self.a_logging_path, - model_mock(), + model_mock, device=device, optimizer=optimizer_mock(), # For a reason I don"t understand if I use self.nll_loss and set it in the @@ -137,24 +139,24 @@ def assert_experiment_train_method_is_call(self, data_loader_mock, experiment_mo @patch("deepparse.parser.address_parser.torch.save") 
@patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrain_thenInstantiateOptimizer( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock + self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, @@ -162,26 +164,24 @@ def test_givenAFasttextModel_whenRetrain_thenInstantiateOptimizer( ) self.address_parser_retrain_call() - optimizer_mock.assert_called_with(model_mock().parameters(), self.a_learning_rate) + optimizer_mock.assert_called_with(self.model_mock.parameters(), self.a_learning_rate) @patch("deepparse.tools.poutyne") @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - 
@patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAModel_whenRetrainWithPoutyneBefore18_thenPrintMessage( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, @@ -208,20 +208,18 @@ def test_givenAModel_whenRetrainWithPoutyneBefore18_thenPrintMessage( @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAModel_whenRetrainWithPoutyneAfter17_thenDoNotPrintMessage( self, - 
download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, @@ -255,20 +253,18 @@ def test_givenAModel_whenRetrainWithPoutyneAfter17_thenDoNotPrintMessage( **{"return_value.train.side_effect": RuntimeError()}, ) @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModelDirectoryWithOtherRetrainModel_whenRetrain_thenRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, @@ -297,20 +293,18 @@ def test_givenAFasttextModelDirectoryWithOtherRetrainModel_whenRetrain_thenRaise **{"return_value.train.side_effect": RuntimeError()}, ) @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - 
@patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModelDirectoryWithOtherFastTextRetrainModel_whenRetrain_thenRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, @@ -342,20 +336,18 @@ def test_givenABPEmbModelDirectoryWithOtherFastTextRetrainModel_whenRetrain_then ) @patch("deepparse.parser.address_parser.os.listdir", return_value=[]) @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def 
test_givenAModelDirectoryWithoutOtherRetrainModel_whenRetrainRaisesRuntimeError_thenReRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, torch_save_mock, @@ -378,18 +370,18 @@ def test_givenAModelDirectoryWithoutOtherRetrainModel_whenRetrainRaisesRuntimeEr self.assertEqual(actual_error_message.args[0], expect_error_message) @patch("deepparse.parser.address_parser.Experiment") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.MagnitudeVectorizer") - @patch("deepparse.parser.address_parser.MagnitudeEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_magnitude_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextMagnitudeModel_whenRetrain_thenRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - mock_model, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, experiment_mock, ): self.address_parser = AddressParser( @@ -402,20 +394,18 @@ def test_givenAFasttextMagnitudeModel_whenRetrain_thenRaiseError( self.address_parser_retrain_call() @patch("deepparse.parser.address_parser.platform") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - 
@patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFastTextLikeModelOnWindowsOS_whenRetrainWithNumWorkersGT0_thenReRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, platform_mock, ): # OS equal Windows @@ -435,20 +425,18 @@ def test_givenAFastTextLikeModelOnWindowsOS_whenRetrainWithNumWorkersGT0_thenReR @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrain_thenSaveModelProperly( self, - 
download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -479,25 +467,24 @@ def test_givenAFasttextModel_whenRetrain_thenSaveModelProperly( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenFastTextModel_whenRetrainCPU_thenInstantiateExperimentProperly( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, @@ -505,32 +492,32 @@ def test_givenFastTextModel_whenRetrainCPU_thenInstantiateExperimentProperly( ) self.address_parser_retrain_call() - 
self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_device) + self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenFastTextModel_whenRetrainGPU_thenInstantiateExperimentProperly( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock + self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_torch_device, @@ -538,26 +525,24 @@ def test_givenFastTextModel_whenRetrainGPU_thenInstantiateExperimentProperly( ) self.address_parser_retrain_call() - self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_torch_device) + 
self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_torch_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrainWithUserTags_thenSaveTagsDict( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -591,20 +576,18 @@ def test_givenAFasttextModel_whenRetrainWithUserTags_thenSaveTagsDict( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - 
@patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -616,7 +599,7 @@ def test_givenAFasttextModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( verbose=self.verbose, ) model_factory_mock = MagicMock() - self.address_parser._model_factory = model_factory_mock + self.address_parser._setup_model = model_factory_mock self.address_parser_retrain_call(seq2seq_params=self.seq2seq_params) model_factory_mock.assert_called() @@ -625,20 +608,18 @@ def test_givenAFasttextModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + 
@patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrainWithNewParams_thenSaveNewParamsDict( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -672,20 +653,18 @@ def test_givenAFasttextModel_whenRetrainWithNewParams_thenSaveNewParamsDict( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrainWithNewParamsAndNewTags_thenSaveNewParamsDictAndParams( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -720,20 +699,18 @@ def test_givenAFasttextModel_whenRetrainWithNewParamsAndNewTags_thenSaveNewParam 
@patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenRetrainWithNewParamsAndNewTagsAndFreezeLayers_thenSaveNewParamsDictAndParams( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -770,23 +747,24 @@ def test_givenAFasttextModel_whenRetrainWithNewParamsAndNewTagsAndFreezeLayers_t @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + 
@patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainCPU_thenInstantiateExperimentProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_device, @@ -794,30 +772,33 @@ def test_givenABPEmbModel_whenRetrainCPU_thenInstantiateExperimentProperly( ) self.address_parser_retrain_call() - self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_device) + self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") @skipIf(not torch.cuda.is_available(), "no gpu available") def 
test_givenABPEmbModel_whenRetrainGPU_thenInstantiateExperimentProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock + model_factory_mock().create.return_value = self.model_mock + self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_torch_device, @@ -825,29 +806,31 @@ def test_givenABPEmbModel_whenRetrainGPU_thenInstantiateExperimentProperly( ) self.address_parser_retrain_call() - self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_torch_device) + self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_torch_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainCPU_thenInstantiateDataLoaderAndTrainProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + 
data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = self.model_mock + self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_device, @@ -855,30 +838,32 @@ def test_givenABPEmbModel_whenRetrainCPU_thenInstantiateDataLoaderAndTrainProper ) self.address_parser_retrain_call() - self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_device) + self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenABPEmbModel_whenRetrainGPU_thenInstantiateDataLoaderAndTrainProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, ): + model_factory_mock().create.return_value = 
self.model_mock + self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_torch_device, @@ -886,24 +871,24 @@ def test_givenABPEmbModel_whenRetrainGPU_thenInstantiateDataLoaderAndTrainProper ) self.address_parser_retrain_call() - self.assert_experiment_retrain(experiment_mock, model_mock, optimizer_mock, device=self.a_torch_device) + self.assert_experiment_retrain(experiment_mock, self.model_mock, optimizer_mock, device=self.a_torch_device) @patch("deepparse.parser.address_parser.torch.save") @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrain_thenSaveModelProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -934,18 +919,18 @@ def test_givenABPEmbModel_whenRetrain_thenSaveModelProperly( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - 
@patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainWithUserTags_thenSaveTagsDict( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -980,18 +965,18 @@ def test_givenABPEmbModel_whenRetrainWithUserTags_thenSaveTagsDict( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + 
data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1003,7 +988,7 @@ def test_givenABPEmbModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( verbose=self.verbose, ) model_factory_mock = MagicMock() - self.address_parser._model_factory = model_factory_mock + self.address_parser._setup_model = model_factory_mock self.address_parser_retrain_call(seq2seq_params=self.seq2seq_params) model_factory_mock.assert_called() @@ -1012,18 +997,18 @@ def test_givenABPEmbModel_whenRetrainWithNewParams_thenModelFactoryIsCalled( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainWithNewParams_thenSaveNewParamsDict( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1057,18 +1042,18 @@ def test_givenABPEmbModel_whenRetrainWithNewParams_thenSaveNewParamsDict( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") 
@patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenRetrainWithNewParamsAndNewTags_thenSaveNewParamsDictAndParams( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1103,18 +1088,18 @@ def test_givenABPEmbModel_whenRetrainWithNewParamsAndNewTags_thenSaveNewParamsDi @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def 
test_givenABPEmbModel_whenRetrainWithNewParamsAndNewTagsAndFreezeLayers_thenSaveNewParamsDictAndParams( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1152,18 +1137,18 @@ def test_givenABPEmbModel_whenRetrainWithNewParamsAndNewTagsAndFreezeLayers_then @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNewPredictionTagsNewDimSize_thenHandleNewOutputDimProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_patch, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1171,9 +1156,8 @@ def test_givenNewPredictionTagsNewDimSize_thenHandleNewOutputDimProperly( tags_converter_patch, ): # we test with BPEmb but fasttext would give same results - model_mock = MagicMock() - model_mock.same_output_dim.return_value = False - model_patch.return_value = model_mock + self.model_mock.same_output_dim.return_value = False + 
model_factory_mock().create.return_value = self.model_mock tags_converter_mock = MagicMock(spec=TagsConverter) tags_converter_patch.return_value = tags_converter_mock @@ -1186,18 +1170,20 @@ def test_givenNewPredictionTagsNewDimSize_thenHandleNewOutputDimProperly( self.address_parser_retrain_call(prediction_tags=self.address_components) new_dim_call = [call.handle_new_output_dim(tags_converter_mock.dim)] - model_mock.assert_has_calls(new_dim_call) + self.model_mock.assert_has_calls(new_dim_call) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotTrainingDataContainer_thenRaiseValueError( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_patch, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, ): self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, @@ -1225,20 +1211,18 @@ def test_givenNotTrainingDataContainer_thenRaiseValueError( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - 
@patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotFreezeLayers_thenFreezeLayerMethodNotCalled( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1259,20 +1243,18 @@ def test_givenNotFreezeLayers_thenFreezeLayerMethodNotCalled( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenFreezeLayersEncoder_thenFreezeLayerMethodCalledWithEncoder( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + 
embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1294,20 +1276,18 @@ def test_givenFreezeLayersEncoder_thenFreezeLayerMethodCalledWithEncoder( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenFreezeLayersDecoder_thenFreezeLayerMethodCalledWithDecoder( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1329,20 +1309,18 @@ def test_givenFreezeLayersDecoder_thenFreezeLayerMethodCalledWithDecoder( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - 
@patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenFreezeLayersPredictionLayer_thenFreezeLayerMethodCalledWithPredictionLayer( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1364,20 +1342,18 @@ def test_givenFreezeLayersPredictionLayer_thenFreezeLayerMethodCalledWithPredict @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenFreezeLayersSeq2Seq_thenFreezeLayerMethodCalledWithSeq2Seq( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - 
data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1399,20 +1375,18 @@ def test_givenFreezeLayersSeq2Seq_thenFreezeLayerMethodCalledWithSeq2Seq( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenWrongFreezeLayersName_thenRaiseValueError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1431,20 +1405,18 @@ def test_givenWrongFreezeLayersName_thenRaiseValueError( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - 
@patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenRetrainSettings_whenFormattedNameParserName_thenReturnProperNaming( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1502,20 +1474,18 @@ def test_givenRetrainSettings_whenFormattedNameParserName_thenReturnProperNaming @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNoneNewNamedModelName_thenSavingPathIsDefaultPathWithExtension( self, - download_weights_mock, - embeddings_model_mock, - 
vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1540,20 +1510,18 @@ def test_givenNoneNewNamedModelName_thenSavingPathIsDefaultPathWithExtension( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNewNamedModelName_thenSavingPathIsModified( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1579,20 +1547,18 @@ def test_givenNewNamedModelName_thenSavingPathIsModified( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - 
@patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenWrongNewNamedModelName_thenRaiseValueError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1613,27 +1579,24 @@ def test_givenWrongNewNamedModelName_thenRaiseValueError( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotADatasetContainer_whenRetrainCall_thenRaiseValueError( self, - download_weights_mock, - 
embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, torch_save_mock, os_path_join_mock, ): - self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, @@ -1662,20 +1625,18 @@ def test_givenNotADatasetContainer_whenRetrainCall_thenRaiseValueError( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotADatasetContainer_whenRetrainCallWithValDataset_thenDontUseTrainRatio( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -1693,11 +1654,13 @@ def test_givenNotADatasetContainer_whenRetrainCallWithValDataset_thenDontUseTrai train_ratio_mock.assert_not_called() - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + 
@patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenABPEmbModelType_whenRetrainWithIncorrectPredictionTags_thenRaiseValueError( - self, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel"): + with patch("deepparse.parser.address_parser.ModelFactory"): address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_device, @@ -1713,12 +1676,13 @@ def test_givenABPEmbModelType_whenRetrainWithIncorrectPredictionTags_thenRaiseVa num_workers=0, ) - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenRaiseValueError( - self, download_weights_mock, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel"): + with patch("deepparse.parser.address_parser.ModelFactory"): address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, @@ -1735,13 +1699,14 @@ def test_givenAFasttextModelType_whenInstantiatingParserWithUserComponent_thenRa ) @skipIf(platform.system() != "Windows", "Integration test on Windows env.") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + 
@patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") def test_givenAFasttextModelTypeOnWindows_whenInstantiatingParserWithNumWorkerGT0_thenRaiseError( - self, download_weights_mock, embeddings_model_mock + self, data_processor_factory_mock, vectorizer_factory_mock, embeddings_model_factory_mock ): num_workers_gt_0 = 1 - with patch("deepparse.parser.address_parser.FastTextSeq2SeqModel"): + with patch("deepparse.parser.address_parser.ModelFactory"): address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, diff --git a/tests/parser/test_address_parser_test_api.py b/tests/parser/test_address_parser_test_api.py index 0c849b15..e7c650cc 100644 --- a/tests/parser/test_address_parser_test_api.py +++ b/tests/parser/test_address_parser_test_api.py @@ -3,7 +3,7 @@ import platform import unittest from unittest import skipIf -from unittest.mock import patch, call +from unittest.mock import MagicMock, patch, call import torch @@ -37,6 +37,9 @@ def setUpClass(cls): cls.verbose = False + def setUp(self): + self.model_mock = MagicMock() + def address_parser_test_call(self, dataset_container=None, num_workers=None): if dataset_container is None: dataset_container = self.mocked_data_container @@ -56,7 +59,7 @@ def address_parser_test_call(self, dataset_container=None, num_workers=None): def assert_experiment_test(self, experiment_mock, model_mock, device): experiment_mock.assert_called_with( "./checkpoint", # We always use this as default logging dir. 
- model_mock(), + model_mock, device=device, # For a reason I don't understand if I use self.nll_loss and set it in the # class setup, it return a bound method for the nll_loss but it work for @@ -73,24 +76,23 @@ def assert_experiment_test_method_is_call(self, data_loader_mock, experiment_moc @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenTestCPU_thenInstantiateExperimentProperly( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_device, @@ -98,30 +100,29 @@ def test_givenAFasttextModel_whenTestCPU_thenInstantiateExperimentProperly( ) self.address_parser_test_call() - self.assert_experiment_test(experiment_mock, model_mock, device=self.a_device) + self.assert_experiment_test(experiment_mock, self.model_mock, 
device=self.a_device) @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenAFasttextModel_whenTestGPU_thenInstantiateExperimentProperly( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_fasttext_model_type, device=self.a_torch_device, @@ -129,25 +130,23 @@ def test_givenAFasttextModel_whenTestGPU_thenInstantiateExperimentProperly( ) self.address_parser_test_call() - self.assert_experiment_test(experiment_mock, model_mock, device=self.a_torch_device) + self.assert_experiment_test(experiment_mock, self.model_mock, device=self.a_torch_device) @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - 
@patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenTest_thenInstantiateDataLoaderAndTestProperly( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -164,20 +163,18 @@ def test_givenAFasttextModel_whenTest_thenInstantiateDataLoaderAndTestProperly( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + 
@patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModel_whenTestVerbose_thenInstantiateWithVerbose( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -194,20 +191,18 @@ def test_givenAFasttextModel_whenTestVerbose_thenInstantiateWithVerbose( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.FastTextSeq2SeqModel") - @patch("deepparse.parser.address_parser.fasttext_data_padding") - @patch("deepparse.parser.address_parser.FastTextVectorizer") - @patch("deepparse.parser.address_parser.FastTextEmbeddingsModel") - @patch("deepparse.parser.address_parser.download_fasttext_embeddings") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenAFasttextModelOnWindows_whenTestVerboseWithNumWorkerGT0_thenRaiseError( self, - download_weights_mock, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -223,22 +218,23 @@ def test_givenAFasttextModelOnWindows_whenTestVerboseWithNumWorkerGT0_thenRaiseE @patch("deepparse.parser.address_parser.DataLoader") 
@patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenTestCPU_thenInstantiateExperimentProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_device, @@ -246,28 +242,29 @@ def test_givenABPEmbModel_whenTestCPU_thenInstantiateExperimentProperly( ) self.address_parser_test_call() - self.assert_experiment_test(experiment_mock, model_mock, device=self.a_device) + self.assert_experiment_test(experiment_mock, self.model_mock, device=self.a_device) @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") 
+ @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenABPEmbModel_whenTestGPU_thenInstantiateExperimentProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, ): + model_factory_mock().create.return_value = self.model_mock self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, device=self.a_torch_device, @@ -275,23 +272,23 @@ def test_givenABPEmbModel_whenTestGPU_thenInstantiateExperimentProperly( ) self.address_parser_test_call() - self.assert_experiment_test(experiment_mock, model_mock, device=self.a_torch_device) + self.assert_experiment_test(experiment_mock, self.model_mock, device=self.a_torch_device) @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def
test_givenABPEmbModel_whenTest_thenInstantiateDataLoaderAndTestProperly( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -308,18 +305,18 @@ def test_givenABPEmbModel_whenTest_thenInstantiateDataLoaderAndTestProperly( @patch("deepparse.parser.address_parser.DataLoader") @patch("deepparse.parser.address_parser.Experiment") @patch("deepparse.parser.address_parser.SGD") - @patch("deepparse.parser.address_parser.DataTransform") - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenABPEmbModel_whenTestVerboseTrue_thenInstantiateWithVerbose( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_mock, - data_transform_mock, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, optimizer_mock, experiment_mock, data_loader_mock, @@ -330,16 +327,18 @@ def test_givenABPEmbModel_whenTestVerboseTrue_thenInstantiateWithVerbose( self.assert_experiment_test_method_is_call(data_loader_mock, experiment_mock, verbose=verbose) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - 
@patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotTrainingDataContainer_thenRaiseValueError( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_patch, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, ): self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, @@ -350,16 +349,18 @@ def test_givenNotTrainingDataContainer_thenRaiseValueError( with self.assertRaises(ValueError): self.address_parser_test_call(dataset_container=mocked_data_container) - @patch("deepparse.parser.address_parser.BPEmbSeq2SeqModel") - @patch("deepparse.parser.address_parser.bpemb_data_padding") - @patch("deepparse.parser.address_parser.BPEmbVectorizer") - @patch("deepparse.parser.address_parser.BPEmbEmbeddingsModel") + @patch("deepparse.parser.address_parser.ModelFactory") + @patch("deepparse.parser.address_parser.EmbeddingsModelFactory") + @patch("deepparse.parser.address_parser.VectorizerFactory") + @patch("deepparse.parser.address_parser.DataProcessorFactory") + @patch("deepparse.parser.address_parser.DataPadder") def test_givenNotADataContainer_thenRaiseValueError( self, - embeddings_model_mock, - vectorizer_model_mock, - data_padding_mock, - model_patch, + data_padder_mock, + data_processor_factory_mock, + vectorizer_factory_mock, + embeddings_model_factory_mock, + model_factory_mock, ): self.address_parser = AddressParser( model_type=self.a_bpemb_model_type, diff --git a/tests/requirements.txt b/tests/requirements.txt index c68cf3af..7d14b297 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,3 +2,4 @@ 
pycountry pandas pytest pytest_cov +tensorboard \ No newline at end of file diff --git a/tests/vectorizer/test_train_vectorizer.py b/tests/vectorizer/test_train_vectorizer.py deleted file mode 100644 index fbf92a27..00000000 --- a/tests/vectorizer/test_train_vectorizer.py +++ /dev/null @@ -1,31 +0,0 @@ -from unittest import TestCase -from unittest.mock import MagicMock - -from deepparse.errors.data_error import DataError -from deepparse.embeddings_models import EmbeddingsModel -from deepparse.vectorizer import TrainVectorizer, BPEmbVectorizer - - -class TrainVectorizerTest(TestCase): - def test_givenAEmbeddingVectorizer_whenCallVectorizer_thenProcess(self): - train_vectorizer = TrainVectorizer(MagicMock(), MagicMock()) - - output = train_vectorizer(["A list"]) - - self.assertIsInstance(output, zip) - - def test_givenAVectorizer_whenCallAnAddress_thenProcess(self): - train_vectorizer = TrainVectorizer(MagicMock(side_effect=[[0]]), MagicMock(side_effect=[0, 0])) - - output = train_vectorizer(["A list of"]) - self.assertEqual(list(output), [(0, [0, 0])]) - - def test_givenAVectorizer_whenCallWithAnWhiteSpaceOnlyAddress_thenRaiseError(self): - embedding_network = MagicMock(spec=EmbeddingsModel) - embedding_network.dim = 2 - bpemb_vectorizer = BPEmbVectorizer(embedding_network) - - train_vectorizer = TrainVectorizer(bpemb_vectorizer, MagicMock()) - a_whitespace_only_address = " " - with self.assertRaises(DataError): - train_vectorizer([a_whitespace_only_address]) diff --git a/tests/vectorizer/test_vectorizer_factory.py b/tests/vectorizer/test_vectorizer_factory.py new file mode 100644 index 00000000..4839ce41 --- /dev/null +++ b/tests/vectorizer/test_vectorizer_factory.py @@ -0,0 +1,55 @@ +# pylint: disable=unused-argument, arguments-differ +import unittest +from unittest import TestCase +from unittest.mock import patch + +from deepparse.vectorizer import VectorizerFactory, BPEmbVectorizer, MagnitudeVectorizer, FastTextVectorizer +from deepparse.embeddings_models import 
( + BPEmbEmbeddingsModel, + FastTextEmbeddingsModel, + MagnitudeEmbeddingsModel, +) + + +class VectorizerFactoryTest(TestCase): + @classmethod + @patch("deepparse.embeddings_models.bpemb_embeddings_model.BPEmb") + @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_fasttext_embeddings") + @patch("deepparse.embeddings_models.fasttext_embeddings_model.load_facebook_vectors") + @patch("deepparse.embeddings_models.magnitude_embeddings_model.Magnitude") + def setUpClass(cls, magnitude_mock, facebook_vectors_load_mock, fasttext_load_mock, bpemb_mock): + a_cache_dir = "~/.cache/deepparse" + cls.a_bpemb_embeddings_model = BPEmbEmbeddingsModel(a_cache_dir) + + a_embeddings_path = "path" + cls.a_fasttext_embeddings_model = FastTextEmbeddingsModel(a_embeddings_path) + + cls.a_magnitude_embeddings_model = MagnitudeEmbeddingsModel(a_embeddings_path) + + cls.an_unsupported_embeddings_model = "unsupported" + + def setUp(self): + self.vectorizer_factory = VectorizerFactory() + + def test_givenABpembEmbeddingsModel_whenCreatingVectorizer_thenShouldReturnProperVectorizer(self): + vectorizer = self.vectorizer_factory.create(self.a_bpemb_embeddings_model) + + self.assertIsInstance(vectorizer, BPEmbVectorizer) + + def test_givenAFasttextEmbeddingsModel_whenCreatingVectorizer_thenShouldReturnProperVectorizer(self): + vectorizer = self.vectorizer_factory.create(self.a_fasttext_embeddings_model) + + self.assertIsInstance(vectorizer, FastTextVectorizer) + + def test_givenAMagnitudeEmbeddingsModel_whenCreatingVectorizer_thenShouldReturnProperVectorizer(self): + vectorizer = self.vectorizer_factory.create(self.a_magnitude_embeddings_model) + + self.assertIsInstance(vectorizer, MagnitudeVectorizer) + + def test_givenAUnsupportedEmbeddingsModel_whenCreatingVectorizer_thenShouldRaiseError(self): + with self.assertRaises(NotImplementedError): + self.vectorizer_factory.create(self.an_unsupported_embeddings_model) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/version.txt b/version.txt index 965065db..a602fc9e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.9.3 +0.9.4