diff --git a/docs/source/nlp/machine_translation.rst b/docs/source/nlp/machine_translation.rst
index 3635fad5174f..3672eb4bbf36 100644
--- a/docs/source/nlp/machine_translation.rst
+++ b/docs/source/nlp/machine_translation.rst
@@ -561,6 +561,17 @@ To train a Megatron 345M BERT, we would use
     model.encoder.num_layers=24 \
     model.encoder.max_position_embeddings=512 \
 
+If the pretrained Megatron model used a custom vocab file, then set:
+
+.. code::
+
+    model.encoder_tokenizer.vocab_file=/path/to/your/megatron/vocab_file.txt
+    model.encoder.vocab_file=/path/to/your/megatron/vocab_file.txt
+
+
+Use ``encoder.model_name=megatron_bert_uncased`` for uncased models with custom vocabularies and
+``encoder.model_name=megatron_bert_cased`` for cased models with custom vocabularies.
+
 References
 ----------
 
diff --git a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
index acae5309c22b..b462d4569ebf 100644
--- a/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
+++ b/nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
@@ -80,6 +80,7 @@ def __init__(self, cfg: MTEncDecModelConfig, trainer: Trainer = None):
             if cfg.encoder_tokenizer.get('bpe_dropout', 0.0) is not None
             else 0.0,
             encoder_model_name=cfg.encoder.get('model_name') if hasattr(cfg.encoder, 'model_name') else None,
+            encoder_tokenizer_vocab_file=cfg.encoder_tokenizer.get('vocab_file', None),
             decoder_tokenizer_library=cfg.decoder_tokenizer.get('library', 'yttm'),
             decoder_tokenizer_model=cfg.decoder_tokenizer.tokenizer_model,
             decoder_bpe_dropout=cfg.decoder_tokenizer.get('bpe_dropout', 0.0)
@@ -379,6 +380,7 @@ def setup_enc_dec_tokenizers(
         encoder_tokenizer_model=None,
         encoder_bpe_dropout=0.0,
         encoder_model_name=None,
+        encoder_tokenizer_vocab_file=None,
         decoder_tokenizer_library=None,
         decoder_tokenizer_model=None,
         decoder_bpe_dropout=0.0,
@@ -397,7 +399,7 @@ def setup_enc_dec_tokenizers(
             tokenizer_model=self.register_artifact("encoder_tokenizer.tokenizer_model", encoder_tokenizer_model),
             bpe_dropout=encoder_bpe_dropout,
             model_name=encoder_model_name,
-            vocab_file=None,
+            vocab_file=self.register_artifact("encoder_tokenizer.vocab_file", encoder_tokenizer_vocab_file),
             special_tokens=None,
             use_fast=False,
         )
diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py
index 8eaa2b3f7707..404b386486e8 100644
--- a/nemo/collections/nlp/modules/common/tokenizer_utils.py
+++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -95,6 +95,9 @@ def get_tokenizer(
     elif tokenizer_name == 'char':
         return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)
 
+    logging.info(
+        f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
+    )
     return AutoTokenizer(
         pretrained_model_name=tokenizer_name, vocab_file=vocab_file, **special_tokens_dict, use_fast=use_fast
     )
@@ -140,7 +143,7 @@ def get_nmt_tokenizer(
         )
     elif library == 'megatron':
         logging.info(
-            f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}'
+            f'Getting Megatron tokenizer for pretrained model name: {model_name} and custom vocab file: {vocab_file}'
         )
         return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file)
     else:
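
Note (not part of the patch): a minimal sketch of how the new ``vocab_file`` plumbing can be exercised directly through ``get_nmt_tokenizer``, which is the path ``MTEncDecModel`` takes for the encoder tokenizer. It assumes ``get_nmt_tokenizer`` accepts ``library``, ``model_name``, and ``vocab_file`` keyword arguments (only part of its signature is visible in this diff); the model name and vocab path below are placeholders.

.. code:: python

    from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

    # Hypothetical values for illustration; use megatron_bert_cased for cased vocabularies.
    encoder_tokenizer = get_nmt_tokenizer(
        library='megatron',                                   # takes the 'megatron' branch changed above
        model_name='megatron_bert_uncased',
        vocab_file='/path/to/your/megatron/vocab_file.txt',   # custom vocab instead of the default
    )

    # Sanity check that the tokenizer was constructed; exact attributes depend on the tokenizer class.
    print(type(encoder_tokenizer))

During training, the same path is supplied via ``model.encoder_tokenizer.vocab_file`` and, per the ``mt_enc_dec_model.py`` hunk above, is registered as the ``encoder_tokenizer.vocab_file`` artifact so it travels with the saved ``.nemo`` checkpoint.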