update tokenizer logic
suiyoubi committed Aug 3, 2024
1 parent c6efeb0 commit 6a43d30
Showing 1 changed file with 8 additions and 10 deletions.
scripts/checkpoint_converters/convert_nemotron_nemo_to_hf.py
18 changes: 8 additions & 10 deletions
@@ -19,15 +19,15 @@
 from collections import OrderedDict
 
 import torch
+from pytorch_lightning import Trainer
+from transformers import AutoTokenizer as HFAutoTokenizer
+from transformers import LlamaTokenizer, PreTrainedTokenizerFast
+from transformers.convert_slow_tokenizer import LlamaConverter
 
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
 from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
 from nemo.utils import logging
-from pytorch_lightning import Trainer
-
-from transformers import LlamaTokenizer, PreTrainedTokenizerFast
-from transformers.convert_slow_tokenizer import LlamaConverter
 
 
 """
 Script to convert a nemotron checkpoint in nemo (mcore path) into a HuggingFace checkpoint.
@@ -298,7 +298,8 @@ def param_to_weights(param):
 def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tokenizer):
     tokenizer_cfg = model_config.tokenizer
     if tokenizer_cfg.library == "sentencepiece":
-        # For sentencepiece tokenizer, we are wrapping with HF's LlamaTokenizer and convert it to a PreTrainedTokenizerFast
+        # For a sentencepiece tokenizer, we wrap it with HF's LlamaTokenizer
+        # and convert it to a PreTrainedTokenizerFast
         tokenizer_fn = tokenizer_cfg.model[5:]
         output_tokenizer = f"{output_hf_path}/tokenizer.model"
         if nemo_file.endswith(".nemo"):
@@ -312,15 +313,12 @@ def extract_nemotron_tokenizer(nemo_file, model_config, output_hf_path, nemo_tokenizer):
         elif os.path.isdir(nemo_file):
             shutil.copy(f"{nemo_file}/{tokenizer_fn}", output_tokenizer)
         # We use LlamaTokenizer for sentencepiece-based tokenizers
-        tokenizer = LlamaTokenizer.from_pretrained(output_hf_path)
+        tokenizer = LlamaTokenizer.from_pretrained(output_hf_path, legacy=False)
         # Convert the LlamaTokenizer to a PreTrainedTokenizerFast instance
         tokenizer = PreTrainedTokenizerFast(
             tokenizer_object=LlamaConverter(tokenizer).converted(), model_input_names=["input_ids", "token_type_ids"]
         )
         tokenizer.save_pretrained(output_hf_path)
+        # Make sure not to use legacy mode
+        tokenizer = HFAutoTokenizer.from_pretrained(output_hf_path, from_slow=False, legacy=False)
+        tokenizer.save_pretrained(output_hf_path)
         logging.info(f"Sentencepiece tokenizer has been saved to {output_tokenizer}")
     elif isinstance(nemo_tokenizer, AutoTokenizer):
         nemo_tokenizer.tokenizer.save_pretrained(output_hf_path)
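For context, the sentencepiece branch after this commit boils down to the slow-to-fast conversion below. This is a minimal, self-contained sketch of the same flow, not the script itself: the output_hf_path value is illustrative, and it assumes a sentencepiece tokenizer.model file has already been copied into that directory (which is what the extract_nemotron_tokenizer code above does first).

from transformers import AutoTokenizer as HFAutoTokenizer
from transformers import LlamaTokenizer, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import LlamaConverter

# Illustrative path; assumed to already contain a sentencepiece tokenizer.model.
output_hf_path = "./hf_out"

# Wrap the sentencepiece model with the slow LlamaTokenizer.
# legacy=False opts out of the legacy special-token handling.
tokenizer = LlamaTokenizer.from_pretrained(output_hf_path, legacy=False)

# Convert the slow tokenizer into a fast PreTrainedTokenizerFast.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=LlamaConverter(tokenizer).converted(),
    model_input_names=["input_ids", "token_type_ids"],
)
tokenizer.save_pretrained(output_hf_path)

# Reload through AutoTokenizer with from_slow=False and legacy=False, then save
# again so the files on disk resolve to the fast, non-legacy tokenizer.
tokenizer = HFAutoTokenizer.from_pretrained(output_hf_path, from_slow=False, legacy=False)
tokenizer.save_pretrained(output_hf_path)

The save/reload/save round trip mirrors the lines this commit adds: reloading the saved files through AutoTokenizer and saving again pins the exported tokenizer to the fast path, which matches the added comment's stated goal of avoiding legacy mode in downstream from_pretrained calls.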
