From 18849cbf50a48bac904599b5173fe17b8e827db5 Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 28 Jun 2021 13:05:15 -0600 Subject: [PATCH 1/3] make vocab_file configurable for megatron in nmt Signed-off-by: ericharper --- .../nlp/models/machine_translation/mt_enc_dec_model.py | 4 +++- nemo/collections/nlp/modules/common/tokenizer_utils.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py index acae5309c22b..b462d4569ebf 100644 --- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py +++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py @@ -80,6 +80,7 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None): if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None else 0.0, encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None, + encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get('vocab_file', None), decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'), decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model, decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0) @@ -379,6 +380,7 @@ def setup_enc_dec_tokenizers( encoder_tokenizer_model=None, encoder_bpe_dropout=0.0, encoder_model_name=None, + encoder_tokenizer_vocab_file=None, decoder_tokenizer_library=None, decoder_tokenizer_model=None, decoder_bpe_dropout=0.0, @@ -397,7 +399,7 @@ def setup_enc_dec_tokenizers( tokenizer_model=self.register_artifact("encoder_tokenizer.tokenizer_model", encoder_tokenizer_model), bpe_dropout=encoder_bpe_dropout, model_name=encoder_model_name, - vocab_file=None, + vocab_file=self.register_artifact("encoder_tokenizer.vocab_file", encoder_tokenizer_vocab_file), special_tokens=None, use_fast=False, ) diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py 
b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 8eaa2b3f7707..404b386486e8 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -95,6 +95,9 @@ def get_tokenizer( elif tokenizer_name == 'char': return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict) + logging.info( + f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}" + ) return AutoTokenizer( pretrained_model_name=tokenizer_name, vocab_file=vocab_file, **special_tokens_dict, use_fast=use_fast ) @@ -140,7 +143,7 @@ def get_nmt_tokenizer( ) elif library == 'megatron': logging.info( - f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}' + f'Getting Megatron tokenizer for pretrained model name: {model_name} and custom vocab file: {vocab_file}' ) return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file) else: From f9c1e2b4c58357d0b48958258fc5cbfb17253a5a Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 28 Jun 2021 13:17:24 -0600 Subject: [PATCH 2/3] update docs Signed-off-by: ericharper --- docs/source/nlp/machine_translation.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/source/nlp/machine_translation.rst b/docs/source/nlp/machine_translation.rst index 3635fad5174f..c469b1e8645e 100644 --- a/docs/source/nlp/machine_translation.rst +++ b/docs/source/nlp/machine_translation.rst @@ -561,6 +561,15 @@ To train a Megatron 345M BERT, we would use model.encoder.num_layers=24 \ model.encoder.max_position_embeddings=512 \ +If the pretrained megatron model used a custom vocab file, then set: + +.. 
code:: + + model.encoder_tokenizer.vocab_file=/path/to/your/megatron/vocab_file.txt + +Use ``model.encoder.model_name=megatron_bert_uncased`` for uncased models with custom vocabularies and +use ``model.encoder.model_name=megatron_bert_cased`` for cased models with custom vocabularies. + References ---------- From c3c566ccfc6b15193b594a854233bf10e164d5a7 Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 28 Jun 2021 15:54:02 -0600 Subject: [PATCH 3/3] update docs Signed-off-by: ericharper --- docs/source/nlp/machine_translation.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/nlp/machine_translation.rst b/docs/source/nlp/machine_translation.rst index c469b1e8645e..3672eb4bbf36 100644 --- a/docs/source/nlp/machine_translation.rst +++ b/docs/source/nlp/machine_translation.rst @@ -566,6 +566,8 @@ If the pretrained megatron model used a custom vocab file, then set: .. code:: model.encoder_tokenizer.vocab_file=/path/to/your/megatron/vocab_file.txt + model.encoder.vocab_file=/path/to/your/megatron/vocab_file.txt + Use ``model.encoder.model_name=megatron_bert_uncased`` for uncased models with custom vocabularies and use ``model.encoder.model_name=megatron_bert_cased`` for cased models with custom vocabularies.