From e2aef1ee9bee1b85e3ffa7a4d52724113140095d Mon Sep 17 00:00:00 2001 From: davebulaval Date: Thu, 27 Apr 2023 07:21:25 -0400 Subject: [PATCH 01/13] add first version of uri implementation using s3path --- deepparse/parser/address_parser.py | 30 ++++++++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index 18718ae5..b5464e0c 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -44,6 +44,11 @@ from ..tools import CACHE_PATH, valid_poutyne_version from ..vectorizer import VectorizerFactory +try: + from s3path import PureS3Path +except ImportError: + PureS3Path = None + _pre_trained_tags_to_idx = { "StreetNumber": 0, "StreetName": 1, @@ -105,7 +110,8 @@ class AddressParser: path_to_retrained_model (Union[str, None]): The path to the retrained model to use for prediction. We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to - be set to True. + be set to True. The path_to_retrain_model can also be an AWS S3 bucket URI + (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). The default value is None. cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained @@ -193,6 +199,14 @@ class AddressParser: offline=True) parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") + Using a retrained model in an AWS S3 bucket. + + .. 
code-block:: python + + address_parser = AddressParser(model_type="fasttext", + path_to_retrained_model="s3://path/to/bucket.ckpt") + parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") + """ def __init__( @@ -222,7 +236,19 @@ def __init__( seq2seq_kwargs = {} # Empty for default settings if path_to_retrained_model is not None: - checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") + if "s3://" in path_to_retrained_model: + if PureS3Path is None: + raise ImportError("s3path needs to be installed to use a AWS S3 URI as path_to_retrained_model.") + path_to_retrained_model = PureS3Path.from_uri(path_to_retrained_model) + try: + checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") + except FileNotFoundError as e: + if "s3" in path_to_retrained_model or "//" in path_to_retrained_model or ":" in path_to_retrained_model: + raise FileNotFoundError( + f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." + ) + else: + raise e if checkpoint_weights.get("model_type") is None: # Validate if we have the proper metadata, it has at least the parser model type # if no other thing have been modified. 
diff --git a/setup.py b/setup.py index 109e5327..7dfd5b03 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def main(): description="A library for parsing multinational street addresses using deep learning.", long_description=readme, long_description_content_type="text/markdown", - extras_require={"colorama": "colorama>=0.4.3"}, + extras_require={"colorama": "colorama>=0.4.3", "s3path": "s3path>=0.4.1"}, ) From 4a1cbf93342599b7c4a5f3e51681faa0ae16a2ec Mon Sep 17 00:00:00 2001 From: davebulaval Date: Thu, 27 Apr 2023 08:52:42 -0400 Subject: [PATCH 02/13] changed to cloudpath since support more s3 provider --- deepparse/parser/address_parser.py | 47 +++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index 42e12ed7..dafcd2d2 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -45,9 +45,9 @@ from ..vectorizer import VectorizerFactory try: - from s3path import PureS3Path + from cloudpathlib import CloudPath except ImportError: - PureS3Path = None + CloudPath = None _pre_trained_tags_to_idx = { "StreetNumber": 0, @@ -110,8 +110,9 @@ class AddressParser: path_to_retrained_model (Union[str, None]): The path to the retrained model to use for prediction. We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to - be set to True. The path_to_retrain_model can also be an AWS S3 bucket URI - (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). The default value is None. + be set to True. The path_to_retrain_model can also be a S3-like (Azure, AWS, Google) bucket URI (e.g. + ``"s3://path/to/aws/s3/bucket.ckpt"``). See `cloudpathlib ` + for detail on supported S3 buckets provider. The default value is None. 
cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained @@ -199,7 +200,7 @@ class AddressParser: offline=True) parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") - Using a retrained model in an AWS S3 bucket. + Using a retrained model in an S3-like bucket. .. code-block:: python @@ -237,25 +238,37 @@ def __init__( if path_to_retrained_model is not None: if "s3://" in path_to_retrained_model: - if PureS3Path is None: - raise ImportError("s3path needs to be installed to use a AWS S3 URI as path_to_retrained_model.") - path_to_retrained_model = PureS3Path.from_uri(path_to_retrained_model) - try: - checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") - except FileNotFoundError as e: - if "s3" in path_to_retrained_model or "//" in path_to_retrained_model or ":" in path_to_retrained_model: - raise FileNotFoundError( - f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." + if CloudPath is None: + raise ImportError( + "cloudpathlib needs to be installed to use a S3-like " "URI as path_to_retrained_model." ) - else: - raise e + path_to_retrained_model = CloudPath(path_to_retrained_model) + try: + with path_to_retrained_model.open("rb") as file: + checkpoint_weights = torch.load(file, map_location="cpu") + except FileNotFoundError as e: + raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") + else: + try: + checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") + except FileNotFoundError as e: + if ( + "s3" in path_to_retrained_model + or "//" in path_to_retrained_model + or ":" in path_to_retrained_model + ): + raise FileNotFoundError( + f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." 
+ ) + else: + raise e if checkpoint_weights.get("model_type") is None: # Validate if we have the proper metadata, it has at least the parser model type # if no other thing have been modified. raise RuntimeError( "You are not using the proper retrained checkpoint. " "When we retrain an AddressParser, by default, we create a " - "checkpoint name 'retrained_modeltype_address_parser.ckpt'. Be sure to use that" + "checkpoint name 'retrained_modeltype_address_parser.ckpt'. Be sure to use that " "checkpoint since it includes some metadata for the reloading." "See AddressParser.retrain for more details." ) From 3d430a453cf806e9239c84d4252bf59036096e29 Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sun, 7 May 2023 07:15:32 -0400 Subject: [PATCH 03/13] improve cloudpathlib handling and doc --- deepparse/parser/address_parser.py | 53 +++++++++++++++++++++++++----- setup.py | 2 +- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index dafcd2d2..e85cb287 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -45,7 +45,7 @@ from ..vectorizer import VectorizerFactory try: - from cloudpathlib import CloudPath + from cloudpathlib import CloudPath, S3Path except ImportError: CloudPath = None @@ -107,12 +107,13 @@ class AddressParser: The default value is GPU with the index ``0`` if it exists. Otherwise, the value is ``CPU``. rounding (int): The rounding to use when asking the probability of the tags. The default value is four digits. verbose (bool): Turn on/off the verbosity of the model weights download and loading. The default value is True. - path_to_retrained_model (Union[str, None]): The path to the retrained model to use for prediction. We will - infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our + path_to_retrained_model (Union[S3Path, str, None]): The path to the retrained model to use for prediction. 
+ We will infer the ``model_type`` of the retrained model. The default value is ``None``, meaning we use our pretrained model. If the retrained model uses an attention mechanism, ``attention_mechanism`` needs to - be set to True. The path_to_retrain_model can also be a S3-like (Azure, AWS, Google) bucket URI (e.g. - ``"s3://path/to/aws/s3/bucket.ckpt"``). See `cloudpathlib ` - for detail on supported S3 buckets provider. The default value is None. + be set to True. The path_to_retrain_model can also be a S3-like (Azure, AWS, Google) bucket URI string path + (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). Or it can be a ``S3Path`` S3-like URI using `cloudpathlib` + to handle S3-like bucket. See `cloudpathlib ` + for detail on supported S3 buckets provider and URI condition. The default value is None. cache_dir (Union[str, None]): The path to the cached directory to use for downloading (and loading) the embeddings model and the model pretrained weights. offline (bool): Whether or not the model is an offline one, meaning you have already downloaded the pre-trained @@ -208,6 +209,13 @@ class AddressParser: path_to_retrained_model="s3://path/to/bucket.ckpt") parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") + Using a retrained model in an S3-like bucket using CloudPathLib. + + .. 
code-block:: python + + address_parser = AddressParser(model_type="fasttext", + path_to_retrained_model=CloudPath("s3://path/to/bucket.ckpt")) + parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") """ def __init__( @@ -217,7 +225,7 @@ def __init__( device: Union[int, str, torch.device] = 0, rounding: int = 4, verbose: bool = True, - path_to_retrained_model: Union[str, None] = None, + path_to_retrained_model: Union[S3Path, str, None] = None, cache_dir: Union[str, None] = None, offline: bool = False, ) -> None: @@ -237,7 +245,15 @@ def __init__( seq2seq_kwargs = {} # Empty for default settings if path_to_retrained_model is not None: - if "s3://" in path_to_retrained_model: + if isinstance(path_to_retrained_model, S3Path): + # To handle CloudPath path_to_retrained_model + try: + with path_to_retrained_model.open("rb") as file: + checkpoint_weights = torch.load(file, map_location="cpu") + except FileNotFoundError as e: + raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") + elif "s3://" in path_to_retrained_model: + # To handle str S3-like URI. if CloudPath is None: raise ImportError( "cloudpathlib needs to be installed to use a S3-like " "URI as path_to_retrained_model." @@ -836,6 +852,7 @@ def retrain( else f"retrained_{self.model_type}_address_parser.ckpt" ) file_path = os.path.join(logging_path, file_name) + torch_save = { "address_tagger_model": exp.model.network.state_dict(), "model_type": self.model_type, @@ -856,7 +873,25 @@ def retrain( } ) - torch.save(torch_save, file_path) + if "s3://" in file_path: + if CloudPath is None: + raise ImportError("cloudpathlib needs to be installed to use a S3-like URI as export path.") + path_to_retrained_model = CloudPath(file_path) + try: + with path_to_retrained_model.open("rb") as file: + torch.save(torch_save, file) + except FileNotFoundError as e: + raise FileNotFoundError(f"The file in the S3 bucket was not found. 
Original error: {e}.") + else: + try: + torch.save(torch_save, file_path) + except FileNotFoundError as e: + if "s3" in file_path or "//" in file_path or ":" in file_path: + raise FileNotFoundError( + f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." + ) + else: + raise e return train_res def test( diff --git a/setup.py b/setup.py index 7dfd5b03..3655eb91 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def main(): description="A library for parsing multinational street addresses using deep learning.", long_description=readme, long_description_content_type="text/markdown", - extras_require={"colorama": "colorama>=0.4.3", "s3path": "s3path>=0.4.1"}, + extras_require={"colorama": "colorama>=0.4.3", "cloudpathlib": "cloudpathlib>=0.13.0"}, ) From 3f0ac080983b2aa60f52c98bd1bd54f328882ffd Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sat, 20 May 2023 13:33:39 -0400 Subject: [PATCH 04/13] add fn to handle weights download over URI, add new dependencies, add tests and fix setup.py --- deepparse/parser/address_parser.py | 42 +++--------------------- deepparse/parser/tools.py | 35 +++++++++++++++++++- requirements.txt | 3 +- setup.py | 2 +- tests/parser/test_tools.py | 52 ++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 40 deletions(-) diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index bce8fca5..5b144ae9 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -31,6 +31,7 @@ pretrained_parser_in_directory, validate_if_new_prediction_tags, validate_if_new_seq2seq_params, + handle_weights_upload, ) from .. 
import validate_data_to_parse from ..converter import TagsConverter, DataProcessorFactory, DataPadder @@ -44,10 +45,8 @@ from ..tools import CACHE_PATH, valid_poutyne_version from ..vectorizer import VectorizerFactory -try: - from cloudpathlib import CloudPath, S3Path -except ImportError: - CloudPath = None +from cloudpathlib import CloudPath, S3Path + _pre_trained_tags_to_idx = { "StreetNumber": 0, @@ -245,39 +244,7 @@ def __init__( seq2seq_kwargs = {} # Empty for default settings if path_to_retrained_model is not None: - if isinstance(path_to_retrained_model, S3Path): - # To handle CloudPath path_to_retrained_model - try: - with path_to_retrained_model.open("rb") as file: - checkpoint_weights = torch.load(file, map_location="cpu") - except FileNotFoundError as e: - raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") - elif "s3://" in path_to_retrained_model: - # To handle str S3-like URI. - if CloudPath is None: - raise ImportError( - "cloudpathlib needs to be installed to use a S3-like " "URI as path_to_retrained_model." - ) - path_to_retrained_model = CloudPath(path_to_retrained_model) - try: - with path_to_retrained_model.open("rb") as file: - checkpoint_weights = torch.load(file, map_location="cpu") - except FileNotFoundError as e: - raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") - else: - try: - checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") - except FileNotFoundError as e: - if ( - "s3" in path_to_retrained_model - or "//" in path_to_retrained_model - or ":" in path_to_retrained_model - ): - raise FileNotFoundError( - f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." 
- ) - else: - raise e + checkpoint_weights = handle_weights_upload(path_to_retrained_model=path_to_retrained_model) if checkpoint_weights.get("model_type") is None: # Validate if we have the proper metadata, it has at least the parser model type # if no other thing have been modified. @@ -292,6 +259,7 @@ def __init__( "See AddressParser.retrain for more details." ) raise RuntimeError(error_text) + if validate_if_new_seq2seq_params(checkpoint_weights): seq2seq_kwargs = checkpoint_weights.get("seq2seq_params") if validate_if_new_prediction_tags(checkpoint_weights): diff --git a/deepparse/parser/tools.py b/deepparse/parser/tools.py index 8e77afe9..97f222c1 100644 --- a/deepparse/parser/tools.py +++ b/deepparse/parser/tools.py @@ -1,9 +1,10 @@ import math import os -from typing import List, OrderedDict, Tuple +from typing import List, OrderedDict, Tuple, Union import numpy as np import torch +from cloudpathlib import CloudPath, S3Path def validate_if_new_prediction_tags(checkpoint_weights: dict) -> bool: @@ -138,3 +139,35 @@ def infer_model_type(checkpoint_weights: OrderedDict, attention_mechanism: bool) attention_mechanism = True return model_type, attention_mechanism + + +def handle_weights_upload(path_to_retrained_model: Union[str, S3Path]) -> OrderedDict: + if isinstance(path_to_retrained_model, S3Path): + # To handle CloudPath path_to_retrained_model + try: + with path_to_retrained_model.open("rb") as file: + checkpoint_weights = torch.load(file, map_location="cpu") + except FileNotFoundError as e: + raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") + elif "s3://" in path_to_retrained_model: + # To handle str S3-like URI. 
+ if CloudPath is None: + raise ImportError("cloudpathlib needs to be installed to use a S3-like URI as path_to_retrained_model.") + path_to_retrained_model = CloudPath(path_to_retrained_model) + try: + with path_to_retrained_model.open("rb") as file: + checkpoint_weights = torch.load(file, map_location="cpu") + except FileNotFoundError as e: + raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") + else: + # Path is a local one (or a wrongly written S3 URI). + try: + checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") + except FileNotFoundError as e: + if "s3" in path_to_retrained_model or "//" in path_to_retrained_model or ":" in path_to_retrained_model: + raise FileNotFoundError( + f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." + ) + else: + raise e + return checkpoint_weights diff --git a/requirements.txt b/requirements.txt index bb532601..da5997f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ poutyne gensim>=4.2.0 fasttext pandas -urllib3 \ No newline at end of file +urllib3 +cloudpathlib[s3, gs, azure] \ No newline at end of file diff --git a/setup.py b/setup.py index 3655eb91..109e5327 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def main(): description="A library for parsing multinational street addresses using deep learning.", long_description=readme, long_description_content_type="text/markdown", - extras_require={"colorama": "colorama>=0.4.3", "cloudpathlib": "cloudpathlib>=0.13.0"}, + extras_require={"colorama": "colorama>=0.4.3"}, ) diff --git a/tests/parser/test_tools.py b/tests/parser/test_tools.py index 970e4f4b..dc2915dd 100644 --- a/tests/parser/test_tools.py +++ b/tests/parser/test_tools.py @@ -8,8 +8,10 @@ import unittest from tempfile import TemporaryDirectory from unittest import skipIf +from unittest.mock import MagicMock, patch, call import torch +from cloudpathlib import S3Path from deepparse.parser.tools import 
( indices_splitting, @@ -21,6 +23,7 @@ pretrained_parser_in_directory, handle_model_name, infer_model_type, + handle_weights_upload, ) from tests.parser.base import PretrainedWeightsBase from tests.tools import create_file @@ -513,6 +516,55 @@ def test_givenAModelTypeToInfer_whenRealRetrainBPEmb_thenReturnBPEmb(self): self.assertEqual(expected_inferred_model_type, actual_inferred_model_type) + @patch("deepparse.parser.tools.torch") + def test_givenAS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock): + s3_path = MagicMock(spec=S3Path) + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_retrained_model=s3_path) + + torch_mock.has_calls([call.load()]) + + @patch("deepparse.parser.tools.CloudPath") + @patch("deepparse.parser.tools.torch") + def test_givenAStringS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): + s3_path = "s3://a_path" + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_retrained_model=s3_path) + + torch_mock.has_calls([call.load()]) + cloud_path_mock.assert_called() + + @patch("deepparse.parser.tools.CloudPath") + @patch("deepparse.parser.tools.torch") + def test_givenAStringPath_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): + s3_path = "a_normal_path.ckpt" + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_retrained_model=s3_path) + + torch_mock.has_calls([call.load()]) + + cloud_path_mock.assert_not_called() + + def test_givenAWrongfullyStringS3Path_whenHandleWeights_upload_thenRaiseError(self): + s3_path = "s3/model.ckpt" + + with self.assertRaises(FileNotFoundError): + handle_weights_upload(path_to_retrained_model=s3_path) + + s3_path = "s3//model.ckpt" + + with self.assertRaises(FileNotFoundError): + handle_weights_upload(path_to_retrained_model=s3_path) + if 
__name__ == "__main__": unittest.main() From a96232e6b1ce45d098e7bd424bf97154c60d28e1 Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sat, 20 May 2023 14:15:35 -0400 Subject: [PATCH 05/13] add code to upload wieghts properly, add some tests, add model version release --- .gitignore | 2 + .release/model_version_release.md | 4 ++ deepparse/__init__.py | 2 +- deepparse/cli/parse.py | 4 +- deepparse/cli/parser_arguments_adder.py | 2 +- deepparse/cli/test.py | 2 +- deepparse/network/decoder.py | 2 +- deepparse/network/encoder.py | 2 +- deepparse/network/seq2seq.py | 19 +++---- deepparse/parser/address_parser.py | 52 ++++++++++--------- deepparse/parser/tools.py | 35 +------------ deepparse/weights_init.py | 21 -------- deepparse/weights_tools.py | 55 ++++++++++++++++++++ tests/cli/test_parse.py | 4 +- tests/cli/test_testing.py | 2 +- tests/parser/test_tools.py | 52 ------------------- tests/test_weights_tools.py | 68 +++++++++++++++++++++++++ 17 files changed, 178 insertions(+), 150 deletions(-) create mode 100644 .release/model_version_release.md delete mode 100644 deepparse/weights_init.py create mode 100644 deepparse/weights_tools.py create mode 100644 tests/test_weights_tools.py diff --git a/.gitignore b/.gitignore index 25785c8e..b7bbdfbc 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,5 @@ deepparse/version.py *.ckpt *mlruns/ + +*model/ \ No newline at end of file diff --git a/.release/model_version_release.md b/.release/model_version_release.md new file mode 100644 index 00000000..1e1dceb3 --- /dev/null +++ b/.release/model_version_release.md @@ -0,0 +1,4 @@ +# How to Create a New Model's Version + +1. `md5sum > model.version` +2. 
Remove the model.ckpt text in `model.version` file diff --git a/deepparse/__init__.py b/deepparse/__init__.py index 462804e9..dd64adac 100644 --- a/deepparse/__init__.py +++ b/deepparse/__init__.py @@ -2,4 +2,4 @@ from .fasttext_tools import * from .tools import * from .version import __version__ -from .weights_init import * +from .weights_tools import * diff --git a/deepparse/cli/parse.py b/deepparse/cli/parse.py index 39fc54f8..9d2c9d7e 100644 --- a/deepparse/cli/parse.py +++ b/deepparse/cli/parse.py @@ -50,7 +50,7 @@ def main(args=None) -> None: .. code-block:: sh - parse fasttext ./dataset.csv parsed_address.pckl --path_to_retrained_model ./path + parse fasttext ./dataset.csv parsed_address.pckl --path_to_model_weights ./path """ if args is None: # pragma: no cover @@ -91,7 +91,7 @@ def main(args=None) -> None: parser_args.update(**parser_args_update_args) if path_to_retrained_model is not None: - parser_args.update({"path_to_retrained_model": path_to_retrained_model}) + parser_args.update({"path_to_model_weights": path_to_retrained_model}) address_parser = AddressParser(**parser_args) diff --git a/deepparse/cli/parser_arguments_adder.py b/deepparse/cli/parser_arguments_adder.py index c50426f0..38e07d17 100644 --- a/deepparse/cli/parser_arguments_adder.py +++ b/deepparse/cli/parser_arguments_adder.py @@ -107,7 +107,7 @@ def add_batch_size_arg(parser: ArgumentParser) -> None: def add_path_to_retrained_model_arg(parser: ArgumentParser) -> None: parser.add_argument( - "--path_to_retrained_model", + "--path_to_model_weights", help=wrap("A path to a retrained model to use for testing."), type=str, default=None, diff --git a/deepparse/cli/test.py b/deepparse/cli/test.py index 648e3dc7..d7da56f9 100644 --- a/deepparse/cli/test.py +++ b/deepparse/cli/test.py @@ -69,7 +69,7 @@ def main(args=None) -> None: path_to_retrained_model = parsed_args.path_to_retrained_model if path_to_retrained_model is not None: - parser_args.update({"path_to_retrained_model":
path_to_retrained_model}) + parser_args.update({"path_to_model_weights": path_to_retrained_model}) base_parsing_model = parsed_args.base_parsing_model parser_args_update_args = attention_model_type_handling(base_parsing_model) diff --git a/deepparse/network/decoder.py b/deepparse/network/decoder.py index 8d578069..f0c347ab 100644 --- a/deepparse/network/decoder.py +++ b/deepparse/network/decoder.py @@ -6,7 +6,7 @@ import torch from torch import nn -from ..weights_init import weights_init +from .. import weights_init class Decoder(nn.Module): diff --git a/deepparse/network/encoder.py b/deepparse/network/encoder.py index 27d911f6..5fafb917 100644 --- a/deepparse/network/encoder.py +++ b/deepparse/network/encoder.py @@ -7,7 +7,7 @@ from torch import nn from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -from ..weights_init import weights_init +from .. import weights_init class Encoder(nn.Module): diff --git a/deepparse/network/seq2seq.py b/deepparse/network/seq2seq.py index 38fc5b5a..5e05625b 100644 --- a/deepparse/network/seq2seq.py +++ b/deepparse/network/seq2seq.py @@ -4,7 +4,6 @@ import random import warnings from abc import ABC -from collections import OrderedDict from typing import Tuple, Union, List import torch @@ -12,6 +11,7 @@ from .decoder import Decoder from .encoder import Encoder +from .. import handle_weights_upload from ..tools import download_weights, latest_version @@ -113,20 +113,21 @@ def _load_pre_trained_weights(self, model_type: str, cache_dir: str, offline: bo ) download_weights(model_type, cache_dir, verbose=self.verbose) - all_layers_params = torch.load(model_path, map_location=self.device) - self.load_state_dict(all_layers_params) + self._load_weights(path_to_model_torch_archive=model_path) - def _load_weights(self, path_to_retrained_model: str) -> None: + def _load_weights(self, path_to_model_torch_archive: str) -> None: """ Method to load (into the network) the weights. 
Args: - path_to_retrained_model (str): The path to the fine-tuned model. + path_to_model_torch_archive (str): The path to the fine-tuned model Torch archive. """ - all_layers_params = torch.load(path_to_retrained_model, map_location=self.device) - if isinstance(all_layers_params, dict) and not isinstance(all_layers_params, OrderedDict): - # Case where we have a retrained model with a different tagging space - all_layers_params = all_layers_params.get("address_tagger_model") + all_layers_params = handle_weights_upload( + path_to_model_to_upload=path_to_model_torch_archive, device=self.device + ) + + # All the time, our torch archive include meta-data along with the model weights + all_layers_params = all_layers_params.get("address_tagger_model") self.load_state_dict(all_layers_params) def _encoder_step(self, to_predict: torch.Tensor, lengths: List, batch_size: int) -> Tuple: diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index 5b144ae9..8fc48ce8 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -14,6 +14,7 @@ from typing import Dict, List, Tuple, Union, Callable import torch +from cloudpathlib import CloudPath, S3Path from poutyne.framework import Experiment from torch.optim import SGD from torch.utils.data import DataLoader, Subset @@ -31,7 +32,6 @@ pretrained_parser_in_directory, validate_if_new_prediction_tags, validate_if_new_seq2seq_params, - handle_weights_upload, ) from .. 
import validate_data_to_parse from ..converter import TagsConverter, DataProcessorFactory, DataPadder @@ -44,9 +44,7 @@ from ..pre_processing import trailing_whitespace_cleaning, double_whitespaces_cleaning from ..tools import CACHE_PATH, valid_poutyne_version from ..vectorizer import VectorizerFactory - -from cloudpathlib import CloudPath, S3Path - +from ..weights_tools import handle_weights_upload _pre_trained_tags_to_idx = { "StreetNumber": 0, @@ -90,7 +88,7 @@ class AddressParser: - ``"lightest"`` (the one using the less RAM and GPU usage) (equivalent to ``"fasttext-light"``), - ``"best"`` (the best accuracy performance) (equivalent to ``"bpemb"``). - The default value is ``"best"`` for the most accurate model. Ignored if ``path_to_retrained_model`` is not + The default value is ``"best"`` for the most accurate model. Ignored if ``path_to_model_weights`` is not ``None``. To further improve performance, consider using the models (fasttext or BPEmb) with their counterparts using an attention mechanism with the ``attention_mechanism`` flag. attention_mechanism (bool): Whether to use the model with an attention mechanism. The model will use an @@ -171,15 +169,15 @@ class AddressParser: .. code-block:: python address_parser = AddressParser(model_type="fasttext", - path_to_retrained_model="/path_to_a_retrain_fasttext_model.ckpt") + path_to_model_weights="/path_to_a_retrain_fasttext_model.ckpt") parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") Using a retrained model trained on different tags .. 
code-block:: python - # We don't give the model_type since it's ignored when using path_to_retrained_model - address_parser = AddressParser(path_to_retrained_model="/path_to_a_retrain_fasttext_model.ckpt") + # We don't give the model_type since it's ignored when using path_to_model_weights + address_parser = AddressParser(path_to_model_weights="/path_to_a_retrain_fasttext_model.ckpt") parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") Using a retrained model with attention @@ -187,7 +185,7 @@ class AddressParser: .. code-block:: python address_parser = AddressParser(model_type="fasttext", - path_to_retrained_model="/path_to_a_retrain_fasttext_attention_model.ckpt", + path_to_model_weights="/path_to_a_retrain_fasttext_attention_model.ckpt", attention_mechanism=True) parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") @@ -205,7 +203,7 @@ class AddressParser: .. code-block:: python address_parser = AddressParser(model_type="fasttext", - path_to_retrained_model="s3://path/to/bucket.ckpt") + path_to_model_weights="s3://path/to/bucket.ckpt") parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") Using a retrained model in an S3-like bucket using CloudPathLib. @@ -213,7 +211,7 @@ class AddressParser: .. 
code-block:: python address_parser = AddressParser(model_type="fasttext", - path_to_retrained_model=CloudPath("s3://path/to/bucket.ckpt")) + path_to_model_weights=CloudPath("s3://path/to/bucket.ckpt")) parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6") """ @@ -244,7 +242,7 @@ def __init__( seq2seq_kwargs = {} # Empty for default settings if path_to_retrained_model is not None: - checkpoint_weights = handle_weights_upload(path_to_retrained_model=path_to_retrained_model) + checkpoint_weights = handle_weights_upload(path_to_model_to_upload=path_to_retrained_model) if checkpoint_weights.get("model_type") is None: # Validate if we have the proper metadata, it has at least the parser model type # if no other thing have been modified. @@ -525,6 +523,7 @@ def retrain( state if any checkpoints are there. Thus, an error will be raised if you change the model type. For example, you retrain a FastText model and then retrain a BPEmb in the same logging path directory. By default, the path is ``./checkpoints``. + # TODO: add can be S3Path disable_tensorboard (bool): To disable Poutyne automatic Tensorboard monitoring. By default, we disable them (true). prediction_tags (Union[dict, None]): A dictionary where the keys are the address components @@ -845,25 +844,30 @@ def retrain( } ) - if "s3://" in file_path: - if CloudPath is None: - raise ImportError("cloudpathlib needs to be installed to use a S3-like URI as export path.") - path_to_retrained_model = CloudPath(file_path) + # TODO: validate if work and tests + if isinstance(file_path, S3Path): + # To handle CloudPath path_to_model_weights try: - with path_to_retrained_model.open("rb") as file: + with file_path.open("wb") as file: torch.save(torch_save, file) - except FileNotFoundError as e: - raise FileNotFoundError(f"The file in the S3 bucket was not found. 
Original error: {e}.") + except FileNotFoundError as error: + raise FileNotFoundError("The file in the S3 bucket was not found.") from error + + elif "s3://" in file_path: + file_path = CloudPath(file_path) + try: + with file_path.open("wb") as file: + torch.save(torch_save, file) + except FileNotFoundError as error: + raise FileNotFoundError("The file in the S3 bucket was not found.") from error else: try: torch.save(torch_save, file_path) - except FileNotFoundError as e: + except FileNotFoundError as error: if "s3" in file_path or "//" in file_path or ":" in file_path: raise FileNotFoundError( - f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." - ) - else: - raise e + "Are You trying to use a AWS S3 URI? If so path need to start with s3://." + ) from error return train_res def test( diff --git a/deepparse/parser/tools.py b/deepparse/parser/tools.py index 97f222c1..8e77afe9 100644 --- a/deepparse/parser/tools.py +++ b/deepparse/parser/tools.py @@ -1,10 +1,9 @@ import math import os -from typing import List, OrderedDict, Tuple, Union +from typing import List, OrderedDict, Tuple import numpy as np import torch -from cloudpathlib import CloudPath, S3Path def validate_if_new_prediction_tags(checkpoint_weights: dict) -> bool: @@ -139,35 +138,3 @@ def infer_model_type(checkpoint_weights: OrderedDict, attention_mechanism: bool) attention_mechanism = True return model_type, attention_mechanism - - -def handle_weights_upload(path_to_retrained_model: Union[str, S3Path]) -> OrderedDict: - if isinstance(path_to_retrained_model, S3Path): - # To handle CloudPath path_to_retrained_model - try: - with path_to_retrained_model.open("rb") as file: - checkpoint_weights = torch.load(file, map_location="cpu") - except FileNotFoundError as e: - raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") - elif "s3://" in path_to_retrained_model: - # To handle str S3-like URI. 
- if CloudPath is None: - raise ImportError("cloudpathlib needs to be installed to use a S3-like URI as path_to_retrained_model.") - path_to_retrained_model = CloudPath(path_to_retrained_model) - try: - with path_to_retrained_model.open("rb") as file: - checkpoint_weights = torch.load(file, map_location="cpu") - except FileNotFoundError as e: - raise FileNotFoundError(f"The file in the S3 bucket was not found. Original error: {e}.") - else: - # Path is a local one (or a wrongly written S3 URI). - try: - checkpoint_weights = torch.load(path_to_retrained_model, map_location="cpu") - except FileNotFoundError as e: - if "s3" in path_to_retrained_model or "//" in path_to_retrained_model or ":" in path_to_retrained_model: - raise FileNotFoundError( - f"{e}. Are You trying to use a AWS S3 URI? If so path need to start with" f"s3://." - ) - else: - raise e - return checkpoint_weights diff --git a/deepparse/weights_init.py b/deepparse/weights_init.py deleted file mode 100644 index 5e6b13b2..00000000 --- a/deepparse/weights_init.py +++ /dev/null @@ -1,21 +0,0 @@ -from torch import nn -from torch.nn import init - - -def weights_init(m: nn.Module) -> None: - """ - Function to initialize the weights of a model layers. 
- - Usage: - network = Model() - network.apply(weight_init) - """ - if isinstance(m, nn.Linear): - init.xavier_normal_(m.weight.data) - init.normal_(m.bias.data) - elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)): - for param in m.parameters(): - if len(param.shape) >= 2: - init.orthogonal_(param.data) - else: - init.normal_(param.data) diff --git a/deepparse/weights_tools.py b/deepparse/weights_tools.py new file mode 100644 index 00000000..0273d1fb --- /dev/null +++ b/deepparse/weights_tools.py @@ -0,0 +1,55 @@ +from typing import OrderedDict, Union + +import torch +from cloudpathlib import CloudPath, S3Path +from torch import nn +from torch.nn import init + + +def weights_init(m: nn.Module) -> None: + """ + Function to initialize the weights of a model layers. + + Usage: + network = Model() + network.apply(weight_init) + """ + if isinstance(m, nn.Linear): + init.xavier_normal_(m.weight.data) + init.normal_(m.bias.data) + elif isinstance(m, (nn.LSTM, nn.LSTMCell, nn.GRU, nn.GRUCell)): + for param in m.parameters(): + if len(param.shape) >= 2: + init.orthogonal_(param.data) + else: + init.normal_(param.data) + + +def handle_weights_upload( + path_to_model_to_upload: Union[str, S3Path], device: Union[str, torch.device] = "cpu" +) -> OrderedDict: + if isinstance(path_to_model_to_upload, S3Path): + # To handle CloudPath path_to_model_weights + try: + with path_to_model_to_upload.open("rb") as file: + checkpoint_weights = torch.load(file, map_location=device) + except FileNotFoundError as error: + raise FileNotFoundError("The file in the S3 bucket was not found.") from error + elif "s3://" in path_to_model_to_upload: + # To handle str S3-like URI. 
+ path_to_model_to_upload = CloudPath(path_to_model_to_upload) + try: + with path_to_model_to_upload.open("rb") as file: + checkpoint_weights = torch.load(file, map_location=device) + except FileNotFoundError as error: + raise FileNotFoundError("The file in the S3 bucket was not found.") from error + else: + # Path is a local one (or a wrongly written S3 URI). + try: + checkpoint_weights = torch.load(path_to_model_to_upload, map_location=device) + except FileNotFoundError as error: + if "s3" in path_to_model_to_upload or "//" in path_to_model_to_upload or ":" in path_to_model_to_upload: + raise FileNotFoundError( + "Are You trying to use a AWS S3 URI? If so path need to start with s3://." + ) from error + return checkpoint_weights diff --git a/tests/cli/test_parse.py b/tests/cli/test_parse.py index 6f489d1d..d341ca79 100644 --- a/tests/cli/test_parse.py +++ b/tests/cli/test_parse.py @@ -258,7 +258,7 @@ def test_ifPathToFakeRetrainModel_thenUseFakeRetrainModel(self): self.pickle_p_export_filename, "--device", self.cpu_device, - "--path_to_retrained_model", + "--path_to_model_weights", self.path_to_retrain_fasttext, ] ) @@ -281,7 +281,7 @@ def test_ifPathToFastTextRetrainModel_thenUseFastTextRetrainModel(self): self.pickle_p_export_filename, "--device", self.cpu_device, - "--path_to_retrained_model", + "--path_to_model_weights", path_to_retrained_model, ] ) diff --git a/tests/cli/test_testing.py b/tests/cli/test_testing.py index f2d266b6..281c4e1a 100644 --- a/tests/cli/test_testing.py +++ b/tests/cli/test_testing.py @@ -196,7 +196,7 @@ def test_ifPathToFakeRetrainModel_thenUseFakeRetrainModel(self): parser_params = [ self.a_fasttext_model_type, self.a_train_pickle_dataset_path, - "--path_to_retrained_model", + "--path_to_model_weights", self.path_to_retrain_fasttext, "--device", self.cpu_device, diff --git a/tests/parser/test_tools.py b/tests/parser/test_tools.py index dc2915dd..970e4f4b 100644 --- a/tests/parser/test_tools.py +++ b/tests/parser/test_tools.py @@ 
-8,10 +8,8 @@ import unittest from tempfile import TemporaryDirectory from unittest import skipIf -from unittest.mock import MagicMock, patch, call import torch -from cloudpathlib import S3Path from deepparse.parser.tools import ( indices_splitting, @@ -23,7 +21,6 @@ pretrained_parser_in_directory, handle_model_name, infer_model_type, - handle_weights_upload, ) from tests.parser.base import PretrainedWeightsBase from tests.tools import create_file @@ -516,55 +513,6 @@ def test_givenAModelTypeToInfer_whenRealRetrainBPEmb_thenReturnBPEmb(self): self.assertEqual(expected_inferred_model_type, actual_inferred_model_type) - @patch("deepparse.parser.tools.torch") - def test_givenAS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock): - s3_path = MagicMock(spec=S3Path) - - weights_mock = MagicMock() - torch_mock.load().return_value = weights_mock - - handle_weights_upload(path_to_retrained_model=s3_path) - - torch_mock.has_calls([call.load()]) - - @patch("deepparse.parser.tools.CloudPath") - @patch("deepparse.parser.tools.torch") - def test_givenAStringS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): - s3_path = "s3://a_path" - - weights_mock = MagicMock() - torch_mock.load().return_value = weights_mock - - handle_weights_upload(path_to_retrained_model=s3_path) - - torch_mock.has_calls([call.load()]) - cloud_path_mock.assert_called() - - @patch("deepparse.parser.tools.CloudPath") - @patch("deepparse.parser.tools.torch") - def test_givenAStringPath_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): - s3_path = "a_normal_path.ckpt" - - weights_mock = MagicMock() - torch_mock.load().return_value = weights_mock - - handle_weights_upload(path_to_retrained_model=s3_path) - - torch_mock.has_calls([call.load()]) - - cloud_path_mock.assert_not_called() - - def test_givenAWrongfullyStringS3Path_whenHandleWeights_upload_thenRaiseError(self): - s3_path = "s3/model.ckpt" - - with 
self.assertRaises(FileNotFoundError): - handle_weights_upload(path_to_retrained_model=s3_path) - - s3_path = "s3//model.ckpt" - - with self.assertRaises(FileNotFoundError): - handle_weights_upload(path_to_retrained_model=s3_path) - if __name__ == "__main__": unittest.main() diff --git a/tests/test_weights_tools.py b/tests/test_weights_tools.py new file mode 100644 index 00000000..90c412f7 --- /dev/null +++ b/tests/test_weights_tools.py @@ -0,0 +1,68 @@ +# pylint: disable=too-many-public-methods + +# Pylint error for TemporaryDirectory ask for with statement +# pylint: disable=consider-using-with + + +import unittest +from unittest import TestCase +from unittest.mock import MagicMock, patch, call + +from cloudpathlib import S3Path + +from deepparse import handle_weights_upload + + +class WeightsToolsTests(TestCase): + @patch("deepparse.weights_tools.torch") + def test_givenAS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock): + s3_path = MagicMock(spec=S3Path) + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_model_to_upload=s3_path) + + torch_mock.has_calls([call.load()]) + + @patch("deepparse.weights_tools.CloudPath") + @patch("deepparse.weights_tools.torch") + def test_givenAStringS3Path_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): + s3_path = "s3://a_path" + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_model_to_upload=s3_path) + + torch_mock.has_calls([call.load()]) + cloud_path_mock.assert_called() + + @patch("deepparse.weights_tools.CloudPath") + @patch("deepparse.weights_tools.torch") + def test_givenAStringPath_whenHandleWeights_upload_thenReturnProperWeights(self, torch_mock, cloud_path_mock): + s3_path = "a_normal_path.ckpt" + + weights_mock = MagicMock() + torch_mock.load().return_value = weights_mock + + handle_weights_upload(path_to_model_to_upload=s3_path) + + 
torch_mock.has_calls([call.load()]) + + cloud_path_mock.assert_not_called() + + def test_givenAWrongfullyStringS3Path_whenHandleWeights_upload_thenRaiseError(self): + s3_path = "s3/model.ckpt" + + with self.assertRaises(FileNotFoundError): + handle_weights_upload(path_to_model_to_upload=s3_path) + + s3_path = "s3//model.ckpt" + + with self.assertRaises(FileNotFoundError): + handle_weights_upload(path_to_model_to_upload=s3_path) + + +if __name__ == "__main__": + unittest.main() From 17b08b00701b24c35c5aeecd01d07fc582ec802d Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sat, 20 May 2023 14:25:52 -0400 Subject: [PATCH 06/13] add release version --- .release/bpemb.version | 1 + .release/bpemb_attention.version | 1 + .release/fasttext.version | 1 + .release/fasttext_attention.version | 1 + 4 files changed, 4 insertions(+) create mode 100644 .release/bpemb.version create mode 100644 .release/bpemb_attention.version create mode 100644 .release/fasttext.version create mode 100644 .release/fasttext_attention.version diff --git a/.release/bpemb.version b/.release/bpemb.version new file mode 100644 index 00000000..b31b8547 --- /dev/null +++ b/.release/bpemb.version @@ -0,0 +1 @@ +aa32fa918494b461202157c57734c374 diff --git a/.release/bpemb_attention.version b/.release/bpemb_attention.version new file mode 100644 index 00000000..bcc9ea1f --- /dev/null +++ b/.release/bpemb_attention.version @@ -0,0 +1 @@ +cfb190902476376573591c0ec6f91ece diff --git a/.release/fasttext.version b/.release/fasttext.version new file mode 100644 index 00000000..b19d26d5 --- /dev/null +++ b/.release/fasttext.version @@ -0,0 +1 @@ +f67a0517c70a314bdde0b8440f21139d diff --git a/.release/fasttext_attention.version b/.release/fasttext_attention.version new file mode 100644 index 00000000..12db9cc1 --- /dev/null +++ b/.release/fasttext_attention.version @@ -0,0 +1 @@ +a2b688bdfa2aa7c009bb7d980e352978 From 05f6d77c2dcc9eb19e49b52f41c0e75d2a0f3d62 Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sat, 20 
May 2023 14:53:11 -0400 Subject: [PATCH 07/13] fixed tests, add changelog and start docs --- .release/model_version_release.md | 1 + CHANGELOG.md | 6 ++++++ deepparse/parser/address_parser.py | 8 ++++++-- docs/source/index.rst | 2 ++ tests/network/test_bpemb_seq2seq_model_cpu.py | 5 ++--- tests/network/test_bpemb_seq2seq_model_gpu.py | 7 +++---- .../test_fasttext_seq2seq_model_cpu.py | 5 ++--- .../test_fasttext_seq2seq_model_gpu.py | 7 +++---- tests/network/test_seq2seq.py | 20 +++++++++---------- tests/test_tools.py | 4 ++-- 10 files changed, 37 insertions(+), 28 deletions(-) diff --git a/.release/model_version_release.md b/.release/model_version_release.md index 1e1dceb3..46e9616e 100644 --- a/.release/model_version_release.md +++ b/.release/model_version_release.md @@ -2,3 +2,4 @@ 1. `md5sum > model.version` 2. Remove the model.cpkt text in `model.version` file +3. Update latests BPEMB and FastText hash in `tests/test_tools.py` \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 885630e8..964c8be5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -314,3 +314,9 @@ increases the performance by about 1/100. ## dev + +- New models release with more meta-data +- Add feature to use an AddressParser from a URI +- Add feature to upload trained model to a URI +- Add example on how to use URI for parsing from and to upload to +- Improve error handling of `path_to_retrain_model` diff --git a/deepparse/parser/address_parser.py b/deepparse/parser/address_parser.py index 8fc48ce8..d0ebced8 100644 --- a/deepparse/parser/address_parser.py +++ b/deepparse/parser/address_parser.py @@ -522,8 +522,13 @@ def retrain( logging_path (str): The logging path for the checkpoints. Poutyne will use the best one and reload the state if any checkpoints are there. Thus, an error will be raised if you change the model type. For example, you retrain a FastText model and then retrain a BPEmb in the same logging path directory. 
+ The logging_path can also be a S3-like (Azure, AWS, Google) bucket URI string path + (e.g. ``"s3://path/to/aws/s3/bucket.ckpt"``). Or it can be a ``S3Path`` S3-like URI using `cloudpathlib` + to handle S3-like bucket. See `cloudpathlib ` + for detail on supported S3 buckets provider and URI condition. + If the logging_path is a S3 bucket, we will only save the best checkpoint to the S3 Bucket at the end + of training. By default, the path is ``./checkpoints``. - # TODO: add can be S3Path disable_tensorboard (bool): To disable Poutyne automatic Tensorboard monitoring. By default, we disable them (true). prediction_tags (Union[dict, None]): A dictionary where the keys are the address components @@ -844,7 +849,6 @@ def retrain( } ) - # TODO: validate if work and tests if isinstance(file_path, S3Path): # To handle CloudPath path_to_model_weights try: diff --git a/docs/source/index.rst b/docs/source/index.rst index b1993d06..fe320485 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -776,9 +776,11 @@ API Reference :caption: Examples examples/parse_addresses + examples/parse_addresses_uri examples/parse_addresses_with_cli examples/retrained_model_parsing examples/fine_tuning + examples/fine_tuning_uri examples/fine_tuning_with_csv_dataset examples/retrain_attention_model examples/retrain_with_new_prediction_tags diff --git a/tests/network/test_bpemb_seq2seq_model_cpu.py b/tests/network/test_bpemb_seq2seq_model_cpu.py index b6bbd174..f174cd58 100644 --- a/tests/network/test_bpemb_seq2seq_model_cpu.py +++ b/tests/network/test_bpemb_seq2seq_model_cpu.py @@ -77,7 +77,7 @@ def test_givenLocalWeights_whenInstantiatingABPEmbSeq2SeqModel_thenShouldntDownl BPEmbSeq2SeqModel(self.cache_dir, self.a_cpu_device, output_size=self.output_size, verbose=self.verbose) download_weights_mock.assert_not_called() - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.Seq2SeqModel.load_state_dict") def 
test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShouldUseRetrainedWeights( self, load_state_dict_mock, torch_mock @@ -95,8 +95,7 @@ def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShould torch_load_call = [call.load(self.a_path_to_retrained_model, map_location=self.a_cpu_device)] torch_mock.assert_has_calls(torch_load_call) - load_state_dict_call = [call(all_layers_params)] - load_state_dict_mock.assert_has_calls(load_state_dict_call) + load_state_dict_mock.assert_called() @patch("deepparse.network.seq2seq.Encoder") @patch("deepparse.network.seq2seq.download_weights") diff --git a/tests/network/test_bpemb_seq2seq_model_gpu.py b/tests/network/test_bpemb_seq2seq_model_gpu.py index 19c63ad0..5b538ce0 100644 --- a/tests/network/test_bpemb_seq2seq_model_gpu.py +++ b/tests/network/test_bpemb_seq2seq_model_gpu.py @@ -78,7 +78,7 @@ def test_givenLocalWeights_whenInstantiatingABPEmbSeq2SeqModel_thenShouldntDownl BPEmbSeq2SeqModel(self.cache_dir, self.a_torch_device, output_size=self.output_size, verbose=self.verbose) download_weights_mock.assert_not_called() - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.Seq2SeqModel.load_state_dict") def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShouldUseRetrainedWeights( self, load_state_dict_mock, torch_mock @@ -87,7 +87,7 @@ def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShould torch_mock.load.return_value = all_layers_params BPEmbSeq2SeqModel( self.cache_dir, - self.a_torch_device, + self.a_cpu_device, output_size=self.output_size, verbose=self.verbose, path_to_retrained_model=self.a_path_to_retrained_model, @@ -96,8 +96,7 @@ def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShould torch_load_call = [call.load(self.a_path_to_retrained_model, map_location=self.a_torch_device)] torch_mock.assert_has_calls(torch_load_call) - 
load_state_dict_call = [call(all_layers_params)] - load_state_dict_mock.assert_has_calls(load_state_dict_call) + load_state_dict_mock.assert_called() @patch("deepparse.network.seq2seq.Encoder") @patch("deepparse.network.seq2seq.download_weights") diff --git a/tests/network/test_fasttext_seq2seq_model_cpu.py b/tests/network/test_fasttext_seq2seq_model_cpu.py index e21663cb..e5d5e443 100644 --- a/tests/network/test_fasttext_seq2seq_model_cpu.py +++ b/tests/network/test_fasttext_seq2seq_model_cpu.py @@ -49,7 +49,7 @@ def test_givenLocalWeightsNotLastVersion_whenInstantiatingAFastTextSeq2SeqModel_ FastTextSeq2SeqModel(self.cache_dir, self.a_cpu_device, output_size=self.output_size, verbose=self.verbose) download_weights_mock.assert_called_with(self.model_type, self.a_root_path, verbose=self.verbose) - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.Seq2SeqModel.load_state_dict") def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShouldUseRetrainedWeights( self, load_state_dict_mock, torch_mock @@ -67,8 +67,7 @@ def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShould torch_load_call = [call.load(self.a_path_to_retrained_model, map_location=self.a_cpu_device)] torch_mock.assert_has_calls(torch_load_call) - load_state_dict_call = [call(all_layers_params)] - load_state_dict_mock.assert_has_calls(load_state_dict_call) + load_state_dict_mock.assert_called() @patch("deepparse.network.seq2seq.Encoder") @patch("deepparse.network.seq2seq.download_weights") diff --git a/tests/network/test_fasttext_seq2seq_model_gpu.py b/tests/network/test_fasttext_seq2seq_model_gpu.py index 2d66aebe..2b022171 100644 --- a/tests/network/test_fasttext_seq2seq_model_gpu.py +++ b/tests/network/test_fasttext_seq2seq_model_gpu.py @@ -54,7 +54,7 @@ def test_givenLocalWeightsNotLastVersion_whenInstantiatingAFastTextSeq2SeqModel_ ) download_weights_mock.assert_called_with(self.model_type, 
self.a_root_path, verbose=self.verbose) - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.Seq2SeqModel.load_state_dict") def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShouldUseRetrainedWeights( self, load_state_dict_mock, torch_mock @@ -69,11 +69,10 @@ def test_givenRetrainedWeights_whenInstantiatingAFastTextSeq2SeqModel_thenShould path_to_retrained_model=self.a_path_to_retrained_model, ) - torch_load_call = [call.load(self.a_path_to_retrained_model, map_location=self.a_torch_device)] + torch_load_call = [call.load(self.a_path_to_retrained_model, map_location=self.a_cpu_device)] torch_mock.assert_has_calls(torch_load_call) - load_state_dict_call = [call(all_layers_params)] - load_state_dict_mock.assert_has_calls(load_state_dict_call) + load_state_dict_mock.assert_called() @patch("deepparse.network.seq2seq.Encoder") @patch("deepparse.network.seq2seq.download_weights") diff --git a/tests/network/test_seq2seq.py b/tests/network/test_seq2seq.py index ee3570a9..804c3806 100644 --- a/tests/network/test_seq2seq.py +++ b/tests/network/test_seq2seq.py @@ -130,7 +130,7 @@ def test_whenHandleNewOutputDim_thenProperlyHandleNewDim(self): self.assertEqual(expected, actual) @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2seqModel_whenNoPretrainedWeights_thenDownloadIt( self, @@ -156,7 +156,7 @@ def test_givenSeq2seqModel_whenNoPretrainedWeights_thenDownloadIt( download_weights_mock.assert_called_with(self.a_model_type, self.cache_dir, verbose=False) @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2seqModelVerbose_whenNoPretrainedWeights_thenWarns( self, @@ -181,7 +181,7 @@ def 
test_givenSeq2seqModelVerbose_whenNoPretrainedWeights_thenWarns( @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotRecentVersion_thenDownloadIt( @@ -206,7 +206,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotRecentVersion_thenDownloa @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") @skipIf(not torch.cuda.is_available(), "no gpu available") def test_givenSeq2seqModel_whenLoadPreTrainedWeightsVerboseGPU_thenWarningsRaised( @@ -230,7 +230,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsVerboseGPU_thenWarningsRaise @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseGPU_thenWarningsNotRaised( self, torch_nn_mock, torch_mock, isfile_mock, last_version_mock @@ -254,7 +254,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseGPU_thenWarningsNo @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2seqModel_whenLoadPreTrainedWeightsVerboseCPU_thenWarningsRaised( self, torch_nn_mock, torch_mock, isfile_mock, last_version_mock @@ -277,7 +277,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsVerboseCPU_thenWarningsRaise 
@patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseCPU_thenWarningsNotRaised( self, torch_nn_mock, torch_mock, isfile_mock, last_version_mock @@ -299,7 +299,7 @@ def test_givenSeq2seqModel_whenLoadPreTrainedWeightsNotVerboseCPU_thenWarningsNo seq2seq_model._load_pre_trained_weights(self.a_model_type, cache_dir=self.cache_dir, offline=False) self.assertEqual(0, len(record)) - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2SeqModelRetrained_whenLoadRetrainedWeights_thenLoadProperly(self, torch_nn_mock, torch_mock): # pylint: disable=unnecessary-dunder-call @@ -324,7 +324,7 @@ def test_givenSeq2SeqModelRetrained_whenLoadRetrainedWeights_thenLoadProperly(se torch_nn_mock.assert_called() torch_nn_mock.asser_has_calls([call(all_layers_params_mock)]) - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenSeq2SeqModelRetrained_whenLoadRetrainedWeightsNewTagModel_thenLoadProperDict( self, torch_nn_mock, torch_mock @@ -351,7 +351,7 @@ def test_givenSeq2SeqModelRetrained_whenLoadRetrainedWeightsNewTagModel_thenLoad @patch("deepparse.network.seq2seq.latest_version") @patch("os.path.isfile") - @patch("deepparse.network.seq2seq.torch") + @patch("deepparse.weights_tools.torch") @patch("deepparse.network.seq2seq.torch.nn.Module.load_state_dict") def test_givenAnOfflineSeq2SeqModel_whenInit_thenDontCallOnlineFunctions( self, torch_nn_mock, torch_mock, isfile_mock, last_version_mock diff --git a/tests/test_tools.py b/tests/test_tools.py index 4e512752..f7d47532 100644 --- a/tests/test_tools.py +++ 
b/tests/test_tools.py @@ -33,8 +33,8 @@ def setUp(self) -> None: self.temp_dir_obj = TemporaryDirectory() self.fake_cache_path = self.temp_dir_obj.name self.a_file_extension = "version" - self.latest_fasttext_version = "b4f098bb8909b1c8a8d24eea07df3435" - self.latest_bpemb_version = "ac0dc019748b6853dca412add7234203" + self.latest_fasttext_version = "f67a0517c70a314bdde0b8440f21139d" + self.latest_bpemb_version = "aa32fa918494b461202157c57734c374" self.a_seed = 42 self.verbose = False From 9c8e006d04dd6182e047be0e482ca832abe84aba Mon Sep 17 00:00:00 2001 From: davebulaval Date: Sat, 20 May 2023 14:56:04 -0400 Subject: [PATCH 08/13] add examples --- docs/source/examples/fine_tuning_uri.rst | 63 ++++++++++++++++++++ docs/source/examples/parse_addresses_uri.rst | 43 +++++++++++++ examples/fine_tuning_uri.py | 42 +++++++++++++ examples/parse_addresses_uri.py | 23 +++++++ 4 files changed, 171 insertions(+) create mode 100644 docs/source/examples/fine_tuning_uri.rst create mode 100644 docs/source/examples/parse_addresses_uri.rst create mode 100644 examples/fine_tuning_uri.py create mode 100644 examples/parse_addresses_uri.py diff --git a/docs/source/examples/fine_tuning_uri.rst b/docs/source/examples/fine_tuning_uri.rst new file mode 100644 index 00000000..c734f7ab --- /dev/null +++ b/docs/source/examples/fine_tuning_uri.rst @@ -0,0 +1,63 @@ +.. role:: hidden + :class: hidden-section + +Retrain a Pretrained Model +************************** + +.. code-block:: python + + import poutyne + + from deepparse import download_from_public_repository + from deepparse.dataset_container import PickleDatasetContainer + from deepparse.parser import AddressParser + + +First, let's download the train and test data from the public repository. + +.. 
code-block:: python + + saving_dir = "./data" + file_extension = "p" + training_dataset_name = "sample_incomplete_data" + test_dataset_name = "test_sample_data" + download_from_public_repository(training_dataset_name, saving_dir, file_extension=file_extension) + download_from_public_repository(test_dataset_name, saving_dir, file_extension=file_extension) + +Now let's create a training and a test container. + +.. code-block:: python + + training_container = PickleDatasetContainer(os.path.join(saving_dir, + training_dataset_name + "." + file_extension)) + test_container = PickleDatasetContainer(os.path.join(saving_dir, + test_dataset_name + "." + file_extension)) + +We will retrain the ``FastText`` version of our pretrained model. + +.. code-block:: python + + path_to_your_uri = "s3://path/to/your/bucket/fasttext.ckpt" + address_parser = AddressParser(model_type="fasttext", device=0, path_to_retrained_model=path_to_your_uri) + + +Now, let's retrain for ``5`` epochs using a batch size of ``8`` since the data is really small for the example. +Let's start with the default learning rate of ``0.01`` and use a learning rate scheduler to lower the learning rate as we progress. + +.. code-block:: python + + # Reduce LR by a factor of 10 each epoch + lr_scheduler = poutyne.StepLR(step_size=1, gamma=0.1) + +The retrained model's best checkpoint (ckpt) will be saved in the S3 Bucket