From 2336bfebf3565bb525ae42c371fcbaf6f6048ee8 Mon Sep 17 00:00:00 2001 From: shibing624 Date: Sat, 27 Apr 2024 12:40:24 +0800 Subject: [PATCH] update bos token. --- supervised_finetuning.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/supervised_finetuning.py b/supervised_finetuning.py index 12e234a..b88b5e0 100644 --- a/supervised_finetuning.py +++ b/supervised_finetuning.py @@ -523,16 +523,21 @@ def main(): tokenizer = tokenizer_class.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs) prompt_template = get_conv_template(script_args.template_name) if tokenizer.eos_token_id is None: - tokenizer.eos_token = prompt_template.stop_str # eos token is required for SFT - logger.info("Add eos token: {}".format(tokenizer.eos_token)) + tokenizer.eos_token = prompt_template.stop_str # eos token is required + tokenizer.add_special_tokens({"eos_token": tokenizer.eos_token}) + logger.info(f"Add eos_token: {tokenizer.eos_token}, eos_token_id: {tokenizer.eos_token_id}") + if tokenizer.bos_token_id is None: + tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token}) + tokenizer.bos_token_id = tokenizer.eos_token_id + logger.info(f"Add bos_token: {tokenizer.bos_token}, bos_token_id: {tokenizer.bos_token_id}") if tokenizer.pad_token_id is None: if tokenizer.unk_token_id is not None: tokenizer.pad_token = tokenizer.unk_token else: tokenizer.pad_token = tokenizer.eos_token - logger.info("Add pad token: {}".format(tokenizer.pad_token)) - + logger.info(f"Add pad_token: {tokenizer.pad_token}, pad_token_id: {tokenizer.pad_token_id}") logger.debug(f"Tokenizer: {tokenizer}") + IGNORE_INDEX = LabelSmoother.ignore_index if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id # Get datasets