diff --git a/.gitignore b/.gitignore
index 0ccf864d..41e0d4be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ tensorboard*
 datasets
 qqp
 glm_large_qqp_pytorch
+wandb
 examples/AltCLIP/clip_benchmark_datasets
 examples/glm_pretrain/data.lazy
 examples/glm_pretrain/examples/glm_pretrain/data.lazy
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..31c9d211
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,53 @@
+#Change to your base image, such as pytorch1.11+py38
+#https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_21-02.html#rel_21-02
+FROM nvcr.io/nvidia/pytorch:21.06-py3
+#You can set available pypi sources
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+
+ENV STAGE_DIR=/tmp
+RUN mkdir -p ${STAGE_DIR}
+#Ubuntu
+RUN apt-get update && apt-get install -y openssh-server && apt-get install -y git
+ARG SSH_PORT=6001
+#Client Liveness & Uncomment Port 22 for SSH Daemon
+RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+RUN mkdir -p /var/run/sshd && cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
+    sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config && \
+    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config
+EXPOSE ${SSH_PORT}
+
+#Set SSH KEY
+RUN mkdir /root/.ssh
+RUN printf "#StrictHostKeyChecking no\n#UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    ssh-keygen -t rsa -f /root/.ssh/id_rsa -N "" && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
+    chmod og-wx /root/.ssh/authorized_keys
+
+RUN echo $'Host 127.0.0.1 \n\
+    Hostname 127.0.0.1 \n\
+    Port 6001 \n\
+    StrictHostKeyChecking no \n\
+    User root' > /root/.ssh/config
+RUN echo $'Host localhost \n\
+    Hostname localhost \n\
+    Port 6001 \n\
+    StrictHostKeyChecking no \n\
+    User root' >> /root/.ssh/config
+
+RUN echo "service ssh start" >> /root/.bashrc
+
+#Main deps
+RUN pip install tensorboard
+RUN pip install sentencepiece
+RUN pip install boto3
+RUN pip install jieba
+RUN pip install ftfy
+RUN pip install deepspeed==0.7.7
+RUN pip install bmtrain
+
+RUN pip install flagai
+#For development usage, you can change as follows
+#RUN git clone https://github.com/FlagAI-Open/FlagAI.git && cd FlagAI && python setup.py install
+
+CMD service ssh start && tail -f /dev/null
diff --git a/README_zh.md b/README_zh.md
index 133cfb42..2ab9eff9 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -80,6 +80,7 @@ git clone https://github.com/OpenBMB/BMTrain
 cd BMTrain
 python setup.py install
 ```
+- [可选] 镜像构建,请参照 [Dockerfile](https://github.com/FlagAI-Open/FlagAI/blob/master/Dockerfile)
 - [提示] 单节点docker环境下,运行多卡数据并行需要设置host。 例如,docker节点 root@127.0.0.1,其端口 7110。
 ```
 >>> vim ~/.ssh/config
diff --git a/examples/AltDiffusion/README.md b/examples/AltDiffusion/README.md
index 4c5803f3..10db1002 100644
--- a/examples/AltDiffusion/README.md
+++ b/examples/AltDiffusion/README.md
@@ -66,9 +66,10 @@ prompt = "Anime portrait of natalie portman as an anime girl by stanley artgerm
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-loader = AutoLoader(task_name="text2img", #contrastive learning
+loader = AutoLoader(task_name="text2img",
                     model_name="AltDiffusion-m9",
-                    model_dir="./checkpoints")
+                    model_dir="./checkpoints",
+                    use_fp16=False) # Fp16 mode
 model = loader.get_model()
 model.eval()
 
@@ -97,9
+98,9 @@ More parameters of predict_generate_images for you to adjust for `predict_genera | C | int | 图片的channel数; Numeber of channels of generated images | | seed | int | 随机种子; Random seed number | -注意:模型推理要求一张至少10G以上的GPU。 +注意:模型推理要求一张至少14G以上的GPU, FP16模式下则至少11G。 -Note that the model inference requires a GPU of at least 10G above. +Note that the model inference requires a GPU of at least 14G, and at least 11G for FP16 mode. # 更多生成结果/More Results diff --git a/examples/galactica/generate_galactica_1.3b.py b/examples/galactica/generate_galactica_1.3b.py new file mode 100644 index 00000000..fe404336 --- /dev/null +++ b/examples/galactica/generate_galactica_1.3b.py @@ -0,0 +1,27 @@ +from flagai.model.predictor.predictor import Predictor +from flagai.auto_model.auto_loader import AutoLoader +import torch +device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu") + +loader = AutoLoader(task_name="lm", + model_name="galactica-1.3b-en", + model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/") + +model = loader.get_model() +model.to(device) +model.eval() + +tokenizer = loader.get_tokenizer() + +predictor = Predictor(model, tokenizer) + +text = "Please write a abstract about the computer vision. \n" +out = predictor.predict_generate_randomsample(text, + out_max_length=700, + top_k=50, + repetition_penalty=1.2, + temperature=0.7 + ) +print(out) + + diff --git a/examples/glm_custom_pvp/README.md b/examples/glm_custom_pvp/README.md new file mode 100644 index 00000000..3091cb72 --- /dev/null +++ b/examples/glm_custom_pvp/README.md @@ -0,0 +1,59 @@ +# Custom prompt-verbalizer pair(PVP) + +## 1. Define your own prompt-verbalizer patterns +We provide api for users to create their own function to construct prompt-verbalizer patterns. Here is an example below: +```python +class RtePVP(PVP): + # Verbalizer convert original labels to more meaningful ones + VERBALIZER = {"not_entailment": [" No"], "entailment": [" Yes"]} + + @staticmethod + def available_patterns(): + return [0, 1, 2] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample): + """ + Construct patterns with input texts and mask, "None" here stands for places to insert continuous prompt tokens + """ + text_a = example.text_a + text_b = example.text_b.rstrip(string.punctuation) + if self.pattern_id == 0: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], ',', None, ' "', + self.shortenable(text_a), '"' + ] + elif self.pattern_id == 1: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], ',', None, + self.shortenable(" " + text_a) + ] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, + self.shortenable(text_a), None, ' question:', + self.shortenable(" " + text_b), ' True or False?', None, + ' answer:', [self.mask] + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [' false'] + return RtePVP.VERBALIZER[label] +``` + +## 2. 
Pass the user-defined class to the collate function +```python +collate_fn = ConstructSuperglueStrategy(cl_args, + tokenizer, + task_name=task_name, + custom_pvp=RtePVP) +``` diff --git a/examples/glm_custom_pvp/train_custom_pvp.py b/examples/glm_custom_pvp/train_custom_pvp.py new file mode 100644 index 00000000..812b987a --- /dev/null +++ b/examples/glm_custom_pvp/train_custom_pvp.py @@ -0,0 +1,119 @@ +# Copyright © 2022 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +from flagai.trainer import Trainer +from flagai.model.glm_model import GLMForSequenceClassification +from flagai.model.glm_model import GLMForSingleTokenCloze +from flagai.data.tokenizer import Tokenizer + +from flagai.data.dataset import SuperGlueDataset +from flagai.test_utils import CollateArguments +from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS +from flagai.data.dataset import ConstructSuperglueStrategy +from flagai.data.dataset.superglue.pvp import PVP +from flagai.data.dataset.data_utils import build_input_from_ids, build_sample, InputExample +from flagai.data.dataset.data_utils import build_decoder_input, build_decoder_sample, num_special_tokens_to_add +from typing import Tuple, List, Union, Dict +import string + +class RtePVP(PVP): + VERBALIZER = {"not_entailment": [" No"], "entailment": [" Yes"]} + + @staticmethod + def available_patterns(): + return [0, 1, 2, 3, 4] + + @property + def spell_length(self): + return self.num_prompt_tokens + self.prefix_prompt + + def get_parts(self, example: InputExample): + # switch text_a and text_b to get the correct order + text_a = example.text_a + text_b = example.text_b.rstrip(string.punctuation) + if self.pattern_id == 0: + parts_a, parts_b = [None, '"', + self.shortenable(text_b), '" ?'], [ + None, [self.mask], ',', None, ' "', + self.shortenable(text_a), '"' + ] + elif self.pattern_id == 1: + parts_a, parts_b = [None, self.shortenable(text_b), '?'], [ + None, [self.mask], ',', None, + self.shortenable(" " + text_a) + ] + elif self.pattern_id == 2: + parts_a, parts_b = [ + None, + self.shortenable(text_a), None, ' question:', + self.shortenable(" " + text_b), ' True or False?', None, + ' answer:', [self.mask] + ], [] + else: + raise NotImplementedError(self.pattern_id) + parts_a, parts_b = self.replace_prompt_tokens(parts_a, parts_b) + return parts_a, parts_b + + def verbalize(self, label) -> List[str]: + if self.pattern_id == 4: + return [' true'] if label == 'entailment' else [' false'] + return RtePVP.VERBALIZER[label] + + +# task_name options: ['boolq', 'cb', 'copa', 'multirc', 'rte', 'wic', 'wsc', 'afqmc', 'tnews'] +task_name = "rte" + +trainer = Trainer(env_type='pytorch', + epochs=10, + batch_size=4, + eval_interval=100, + log_interval=50, + experiment_name='glm_large', + pytorch_device='cuda', + load_dir=None, + lr=1e-4) +print("downloading...") + +cl_args = CollateArguments() +cl_args.cloze_eval = True +cl_args.multi_token = task_name in MULTI_TOKEN_TASKS + +cl_args.continuous_prompt = True +cl_args.prefix_prompt = 2 +cl_args.num_prompt_tokens = 5 + +if task_name in CH_TASKS: + model_name = 'GLM-large-ch' + add_block_symbols=True, +else: + model_name = 'GLM-large-en' +tokenizer = Tokenizer.from_pretrained(model_name) + +# model = GLMForSequenceClassification.from_pretrain(model_name=model_name, spell_length=2, +# class_num=3, tune_prefix_layers=1) + +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", + model_name=model_name, spell_length=2, + 
class_num=3, tune_prefix_layers=1) +train_dataset = SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='train', + tokenizer=tokenizer) + +collate_fn = ConstructSuperglueStrategy(cl_args, + tokenizer, + task_name=task_name, + custom_pvp=RtePVP) + +valid_dataset = SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='dev', + tokenizer=tokenizer) + +metric_methods = DEFAULT_METRICS[task_name] +trainer.train(model, + collate_fn=collate_fn, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + metric_methods=metric_methods) + diff --git a/examples/glm_ptuning/README.md b/examples/glm_ptuning/README.md new file mode 100644 index 00000000..c575c0fe --- /dev/null +++ b/examples/glm_ptuning/README.md @@ -0,0 +1,23 @@ +# P-tuning + +Here is an example to train a model with continuous prompt (P-tuning). + +## 1. Change the parameters in config +```python +cl_args.continuous_prompt = True # Enable continuous prompt +cl_args.prefix_prompt = 2 # Number of continuous prompt at the beginning +cl_args.num_prompt_tokens = 5 # Number of continuous prompt in the content +``` + + +## 2. Change model parameters + +```python +# spell_length is the final number of continuous prompt tokens in an instance, it is usually determined by the PVP structure +# tune_prefix_layers is the number of transformer layers to tune, where the rest layers are frozen +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", + model_name=model_name, spell_length=8, + tune_prefix_layers=1) +``` + +In such way, p-tuning can be enabled in training. \ No newline at end of file diff --git a/examples/glm_ptuning/deepspeed.json b/examples/glm_ptuning/deepspeed.json new file mode 100644 index 00000000..d5db861e --- /dev/null +++ b/examples/glm_ptuning/deepspeed.json @@ -0,0 +1,48 @@ +{ + "train_micro_batch_size_per_gpu": 456, + "gradient_accumulation_steps": 100, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e7, + "allgather_bucket_size": 5e7, + "cpu_offload": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-5, + "warmup_num_steps": 2000 + } + }, + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "weight_decay": 0.1, + "betas": [ + 0.9, + 0.98 + ], + "eps": 1e-6 + } + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false + } diff --git a/examples/glm_ptuning/train_ptuning.py b/examples/glm_ptuning/train_ptuning.py new file mode 100644 index 00000000..47c86cda --- /dev/null +++ b/examples/glm_ptuning/train_ptuning.py @@ -0,0 +1,85 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +from flagai.trainer import Trainer +from flagai.model.glm_model import GLMForSequenceClassification +from flagai.model.glm_model import GLMForSingleTokenCloze +from flagai.data.tokenizer import Tokenizer +from flagai.data.dataset import SuperGlueDataset +from flagai.test_utils import CollateArguments +from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS +from flagai.data.dataset import ConstructSuperglueStrategy + + +# task_name options: ['boolq', 'cb', 'copa', 'multirc', 'rte', 'wic', 'wsc', 'afqmc', 'tnews'] +task_name = "cb" + +cl_args = CollateArguments() +cl_args.multi_token = task_name in MULTI_TOKEN_TASKS +cl_args.continuous_prompt = True +cl_args.prefix_prompt = 2 +cl_args.num_prompt_tokens = 5 +if task_name in CH_TASKS: + model_name = 'GLM-large-ch' + add_block_symbols=True, +else: + model_name = 'GLM-large-en' +tokenizer = Tokenizer.from_pretrained(model_name) + +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", + model_name=model_name, spell_length=8, + tune_prefix_layers=1) +# model_save_path = "/home/yanzhaodong/anhforth/test/FlagAI/examples/glm_superglue/checkpoints/20000_save/pytorch_model.bin" +# model.load_state_dict(torch.load(model_save_path, map_location="cuda")["module"]) +train_dataset = SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='train', + tokenizer=tokenizer) + +collate_fn = ConstructSuperglueStrategy(cl_args, + tokenizer, + task_name=task_name) + +valid_dataset = SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='dev', + tokenizer=tokenizer) + +metric_methods = DEFAULT_METRICS[task_name] + +# Deepspeed parallel trainer +trainer = Trainer(env_type='deepspeed', + epochs=10000000, + batch_size=16, + gradient_accumulation_steps=5, + checkpoint_activations=True, + eval_interval=False, + log_interval=100, + fp16=True, + save_interval=10000, + experiment_name='glm_large', + load_dir=None, + num_nodes=1, + num_gpus=2, + hostfile='./hostfile', + deepspeed_config='./deepspeed.json', + lr=1e-4, + training_script=__file__) +# Single-GPU trainer +# trainer = Trainer(env_type='pytorch', +# epochs=100, +# batch_size=1, +# eval_interval=100, +# log_interval=50, +# experiment_name='glm_large', +# pytorch_device='cuda', +# load_dir=None, +# lr=1e-4) + +trainer.train(model, + collate_fn=collate_fn, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + metric_methods=metric_methods) + diff --git a/examples/glm_seq2seq/deepspeed.json b/examples/glm_seq2seq/deepspeed.json new file mode 100644 index 00000000..d5db861e --- /dev/null +++ b/examples/glm_seq2seq/deepspeed.json @@ -0,0 +1,48 @@ +{ + "train_micro_batch_size_per_gpu": 456, + "gradient_accumulation_steps": 100, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e7, + "allgather_bucket_size": 5e7, + "cpu_offload": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-5, + "warmup_num_steps": 2000 + } + }, + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "weight_decay": 0.1, + "betas": [ + 0.9, + 0.98 + ], + "eps": 1e-6 + } + }, + 
"activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false + } diff --git a/examples/glm_seq2seq/train.py b/examples/glm_seq2seq/generate.py similarity index 59% rename from examples/glm_seq2seq/train.py rename to examples/glm_seq2seq/generate.py index 81e5201f..85eea9b1 100644 --- a/examples/glm_seq2seq/train.py +++ b/examples/glm_seq2seq/generate.py @@ -1,6 +1,7 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import torch from flagai.trainer import Trainer from flagai.model.glm_model import GLMForSeq2Seq from flagai.data.tokenizer import Tokenizer @@ -8,22 +9,14 @@ from flagai.test_utils import Seq2SeqCollateArguments from flagai.data.dataset.superglue.control import DEFAULT_METRICS, CH_TASKS from flagai.data.dataset import ConstructSeq2seqStrategy - +from flagai.metrics import accuracy_metric, exact_match_score,bleu_metric, rouge_metric # Compared with original seq2seq, seq2seq dataset is used # task_name :['cmrc',xxxx] -task_name = "cmrc" +task_name = "cnn_dm" cl_args = Seq2SeqCollateArguments() -trainer = Trainer(env_type='pytorch', - epochs=1, - batch_size=4, - eval_interval=5, - log_interval=50, - experiment_name='glm_large', - pytorch_device='cuda', - load_dir=None, - lr=1e-4) + print("downloading...") if task_name in CH_TASKS: @@ -34,23 +27,36 @@ tokenizer = Tokenizer.from_pretrained(model_name) train_dataset = Seq2SeqDataset(task_name=task_name, - data_dir='./datasets/', - dataset_type='train', + data_dir='../../datasets/', + dataset_type='test', tokenizer=tokenizer) valid_dataset = Seq2SeqDataset(task_name=task_name, - data_dir='./datasets/', - dataset_type='dev', - tokenizer=tokenizer) + data_dir='../../datasets/', + dataset_type='test', + tokenizer=tokenizer) collate_fn = ConstructSeq2seqStrategy(cl_args, tokenizer, task_name=task_name) -train_dataset.example_list = train_dataset.example_list[:20] -valid_dataset.example_list = valid_dataset.example_list[:20] - +train_dataset = train_dataset[:1] +valid_dataset = valid_dataset[:1] model = GLMForSeq2Seq.from_pretrain(model_name=model_name) +model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/310000/pytorch_model.bin")["module"]) +# model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints_lang/135000/pytorch_model.bin")["module"]) +print("model loaded") + +trainer = Trainer(env_type='pytorch', + epochs=0, + batch_size=1, + eval_interval=1, + log_interval=50, + experiment_name='glm_large', + pytorch_device='cuda', + load_dir=None, + tokenizer=tokenizer, + lr=1e-4) trainer.train(model, collate_fn=collate_fn, train_dataset=train_dataset, valid_dataset=valid_dataset, - metric_methods=[]) + metric_methods=[["blue_metric", bleu_metric],["rouge_metric", rouge_metric]]) diff --git a/examples/glm_seq2seq/train_deepspeed.py b/examples/glm_seq2seq/train_deepspeed.py new file mode 100644 index 00000000..d70ecc13 --- /dev/null +++ b/examples/glm_seq2seq/train_deepspeed.py @@ -0,0 +1,70 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +from flagai.trainer import Trainer +from flagai.model.glm_model import GLMForSeq2Seq +from flagai.data.tokenizer import Tokenizer +from flagai.data.dataset import Seq2SeqDataset +from flagai.test_utils import Seq2SeqCollateArguments +from flagai.data.dataset.superglue.control import DEFAULT_METRICS, CH_TASKS +from flagai.data.dataset import ConstructSeq2seqStrategy +from flagai.metrics import accuracy_metric, exact_match_score,bleu_metric, rouge_metric + +# Compared with original seq2seq, seq2seq dataset is used +# task_name :['cmrc',xxxx] +task_name = "cnn_dm" + +cl_args = Seq2SeqCollateArguments() + +print("downloading...") + +if task_name in CH_TASKS: + model_name = 'GLM-large-ch' +else: + model_name = 'GLM-large-en' + +tokenizer = Tokenizer.from_pretrained(model_name) + +train_dataset = Seq2SeqDataset(task_name=task_name, + data_dir='../../datasets/', + dataset_type='test', + tokenizer=tokenizer) +valid_dataset = Seq2SeqDataset(task_name=task_name, + data_dir='../../datasets/', + dataset_type='test', + tokenizer=tokenizer) +collate_fn = ConstructSeq2seqStrategy(cl_args, + tokenizer, + task_name=task_name) + +model = GLMForSeq2Seq.from_pretrain(model_name=model_name) +model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/140000/pytorch_model.bin")["module"]) +trainer = Trainer(env_type='deepspeed', + epochs=10000000, + batch_size=16, + gradient_accumulation_steps=5, + checkpoint_activations=True, + eval_interval=False, + log_interval=100, + fp16=True, + save_interval=10000, + experiment_name='glm_large', + load_dir=None, + num_nodes=1, + num_gpus=2, + tokenizer=tokenizer, + hostfile='./hostfile', + deepspeed_config='./deepspeed.json', + lr=1e-4, + training_script=__file__) +# optimizer = Adam(param_groups, +# lr=1e-3, +# weight_decay=0, +# betas=(0.9, 0.999), +# eps=1e-8) +trainer.train(model, + collate_fn=collate_fn, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + metric_methods=[]) diff --git a/examples/glm_superglue/train_prefix.py b/examples/glm_superglue/evaluate.py similarity index 64% rename from examples/glm_superglue/train_prefix.py rename to examples/glm_superglue/evaluate.py index 99ac3a3a..66b1ed14 100644 --- a/examples/glm_superglue/train_prefix.py +++ b/examples/glm_superglue/evaluate.py @@ -1,8 +1,7 @@ -# Copyright © 2022 BAAI. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License") +import torch from flagai.trainer import Trainer from flagai.model.glm_model import GLMForSequenceClassification +from flagai.model.glm_model import GLMForSingleTokenCloze from flagai.data.tokenizer import Tokenizer from flagai.data.dataset import SuperGlueDataset @@ -11,24 +10,11 @@ from flagai.data.dataset import ConstructSuperglueStrategy -# task_name options: ['boolq', 'cb', 'copa', 'multirc', 'rte', 'wic', 'wsc', 'afqmc', 'tnews'] -task_name = "cb" - -trainer = Trainer(env_type='pytorch', - epochs=10, - batch_size=4, - eval_interval=100, - log_interval=50, - experiment_name='glm_large', - pytorch_device='cuda', - load_dir=None, - lr=1e-4) +task_name = "qqp" +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("downloading...") cl_args = CollateArguments() -cl_args.cloze_eval = False -cl_args.multi_token = task_name in MULTI_TOKEN_TASKS - if task_name in CH_TASKS: model_name = 'GLM-large-ch' add_block_symbols=True, @@ -36,9 +22,13 @@ model_name = 'GLM-large-en' tokenizer = Tokenizer.from_pretrained(model_name) -model = GLMForSequenceClassification.from_pretrain(model_name=model_name, spell_length=2, - class_num=3, tune_prefix_layers=1) - +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", + model_name=model_name) + +# Load +model_save_path = "./checkpoints/90000/pytorch_model.bin" +model.load_state_dict( + torch.load(model_save_path, map_location=device)["module"]) train_dataset = SuperGlueDataset(task_name=task_name, data_dir='./datasets/', dataset_type='train', @@ -54,9 +44,19 @@ tokenizer=tokenizer) metric_methods = DEFAULT_METRICS[task_name] + +trainer = Trainer(env_type='pytorch', + epochs=0, + batch_size=4, + eval_interval=1, + log_interval=50, + experiment_name='glm_large', + pytorch_device='cuda', + load_dir=None, + lr=1e-4) + trainer.train(model, collate_fn=collate_fn, train_dataset=train_dataset, valid_dataset=valid_dataset, metric_methods=metric_methods) - diff --git a/examples/glm_superglue/hostfile b/examples/glm_superglue/hostfile deleted file mode 100644 index 8020028b..00000000 --- a/examples/glm_superglue/hostfile +++ /dev/null @@ -1 +0,0 @@ -127.0.0.1 slots=4 diff --git a/examples/glm_superglue/train_10b_superglue.py b/examples/glm_superglue/train_10b_superglue.py index 4fa0207c..da3ab540 100644 --- a/examples/glm_superglue/train_10b_superglue.py +++ b/examples/glm_superglue/train_10b_superglue.py @@ -9,29 +9,31 @@ from flagai.test_utils import CollateArguments -task_name = 'qqp' +task_name = 'cb' trainer = Trainer(env_type='pytorch', pytorch_device="cuda", - epochs=2, - batch_size=1, - eval_interval=1000, + epochs=1000000000, + batch_size=32, + eval_interval=100000000, checkpoint_activations=False, fp16=True, - log_interval=1, - save_dir="./glm_superglue_en") - # master_ip='127.0.0.1', - # master_port=17755, - # num_nodes=1, - # num_gpus=2, - # hostfile='./hostfile', - # model_parallel_size=2, - # deepspeed_config='./deepspeed.json', - # training_script=__file__) + save_interval=10000, + log_interval=50, + save_dir="./glm_superglue_en", + master_ip='127.0.0.1', + master_port=17755, + num_nodes=1, + num_gpus=2, + hostfile='./hostfile', + model_parallel_size=2, + deepspeed_config='./deepspeed.json', + lr=1e-4, + training_script=__file__) model_name = "GLM-large-en" -model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models", +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", model_name=model_name) - + 
tokenizer = Tokenizer.from_pretrained(model_name) train_dataset = SuperGlueDataset(task_name=task_name, data_dir='./datasets/', @@ -45,7 +47,6 @@ cloze_eval=True) cl_args = CollateArguments() -cl_args.cloze_eval = True if task_name in ['copa', 'wsc', 'record']: cl_args.multi_token = True diff --git a/examples/glm_superglue/train_qqp_deepspeed.py b/examples/glm_superglue/train_qqp_deepspeed.py index a8629789..cdc204fc 100644 --- a/examples/glm_superglue/train_qqp_deepspeed.py +++ b/examples/glm_superglue/train_qqp_deepspeed.py @@ -1,72 +1,73 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") +import torch from flagai.trainer import Trainer -from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze +from flagai.model.glm_model import GLMForSequenceClassification +from flagai.model.glm_model import GLMForSingleTokenCloze from flagai.data.tokenizer import Tokenizer -from flagai.metrics import accuracy_metric from flagai.data.dataset import SuperGlueDataset from flagai.test_utils import CollateArguments from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS +from flagai.data.dataset import ConstructSuperglueStrategy -task_name = 'boolq' -trainer = Trainer(env_type='deepspeed', - epochs=1000, - batch_size=512, - eval_interval=100, - log_interval=10, - save_interval=1e5, - gradient_accumulation_steps=5, - checkpoint_activations=True, - fp16=True, - warm_up=0.1, - weight_decay=0.1, - save_dir="./qqp", - master_ip='127.0.0.1', - master_port=17810, - num_nodes=1, - num_gpus=2, - hostfile='./hostfile', - deepspeed_config='./deepspeed.json', - training_script=__file__) -model_name = "GLM-large-en" -tokenizer = Tokenizer.from_pretrained(model_name) +# task_name options: ['boolq', 'cb', 'copa', 'multirc', 'rte', 'wic', 'wsc', 'afqmc', 'tnews'] +task_name = "qqp" -if task_name in MULTI_TOKEN_TASKS: - model = GLMForMultiTokenCloze.from_pretrain( - download_path="/mnt/test_10b_models", model_name=model_name) +cl_args = CollateArguments() +cl_args.multi_token = task_name in MULTI_TOKEN_TASKS +if task_name in CH_TASKS: + model_name = 'GLM-large-ch' + add_block_symbols=True else: - model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models", - model_name=model_name) - -# model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models", -# model_name="GLM-large-en") -train_dataset = SuperGlueDataset(task_name=task_name, - data_dir='./datasets/', - dataset_type='train', - tokenizer=tokenizer, - cloze_eval=True) -valid_dataset = SuperGlueDataset(task_name=task_name, - data_dir='./datasets/', - dataset_type='dev', - tokenizer=tokenizer, - cloze_eval=True) + model_name = 'GLM-large-en' +tokenizer = Tokenizer.from_pretrained(model_name) +model = GLMForSingleTokenCloze.from_pretrain(download_path="./checkpoints", + model_name=model_name) -cl_args = CollateArguments() -cl_args.cloze_eval = True -if task_name in ['copa', 'wsc', 'record']: - cl_args.multi_token = True +# Continue training from saved checkpoints +# model_save_path = "./checkpoints/20000/pytorch_model.bin" +# model.load_state_dict(torch.load(model_save_path, map_location="cuda")["module"]) -from flagai.data.dataset import ConstructSuperglueStrategy +train_dataset = SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='train', + tokenizer=tokenizer) collate_fn = ConstructSuperglueStrategy(cl_args, tokenizer, task_name=task_name) +valid_dataset = 
SuperGlueDataset(task_name=task_name, + data_dir='./datasets/', + dataset_type='dev', + tokenizer=tokenizer) + +metric_methods = DEFAULT_METRICS[task_name] + +trainer = Trainer(env_type='deepspeed', + epochs=1000, + batch_size=64, + gradient_accumulation_steps=5, + checkpoint_activations=True, + eval_interval=1000, + log_interval=100, + fp16=True, + save_interval=10000, + experiment_name='glm_large', + load_dir=None, + num_nodes=1, + num_gpus=2, + hostfile='./hostfile', + deepspeed_config='./deepspeed.json', + lr=1e-4, + training_script=__file__) + trainer.train(model, - train_dataset=train_dataset, - valid_dataset=valid_dataset, - collate_fn=collate_fn, - metric_methods=[["acc", accuracy_metric]]) + collate_fn=collate_fn, + train_dataset=train_dataset, + valid_dataset=valid_dataset, + metric_methods=metric_methods) + diff --git a/examples/glm_superglue/train_qqp_pytorch.py b/examples/glm_superglue/train_qqp_pytorch.py index f4ae40d1..fb649112 100644 --- a/examples/glm_superglue/train_qqp_pytorch.py +++ b/examples/glm_superglue/train_qqp_pytorch.py @@ -1,23 +1,22 @@ # Copyright © 2022 BAAI. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License") - from flagai.trainer import Trainer from flagai.model.glm_model import GLMForSingleTokenCloze from flagai.data.tokenizer import Tokenizer from flagai.metrics import accuracy_metric from flagai.data.dataset import SuperGlueDataset from flagai.test_utils import CollateArguments - +from flagai.data.dataset import ConstructSuperglueStrategy task_name = 'qqp' trainer = Trainer(env_type='pytorch', pytorch_device='cuda', - epochs=2, + epochs=10, batch_size=128+256, eval_interval=500, - log_interval=10, + log_interval=100, save_interval=1e5, lr=1e-5, weight_decay=0.1, @@ -45,12 +44,9 @@ tokenizer=tokenizer, cloze_eval=True) cl_args = CollateArguments() -cl_args.cloze_eval = True if task_name in ['copa', 'wsc', 'record']: cl_args.multi_token = True -from flagai.data.dataset import ConstructSuperglueStrategy - collate_fn = ConstructSuperglueStrategy(cl_args, tokenizer, task_name=task_name) diff --git a/examples/glm_title_generation/hostfile b/examples/glm_title_generation/hostfile deleted file mode 100644 index 4c93a71d..00000000 --- a/examples/glm_title_generation/hostfile +++ /dev/null @@ -1 +0,0 @@ -127.0.0.1 slots=4 \ No newline at end of file diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index d57b9a2e..7fea3564 100644 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -55,6 +55,7 @@ def __getattr__(self, name): "glm_title-generation": ["flagai.model.glm_model", "GLMForSeq2Seq"], "opt_seq2seq": ("flagai.model.opt_model", "OPTModel"), "opt_lm": ("flagai.model.opt_model", "OPTModel"), + "galactica_lm": ("flagai.model.galactica_model", "GalacticaModel"), "vit_classification": ("flagai.model.vision.vit", "VisionTransformer"), "clip_txt_img_matching": ("flagai.model.mm.clip_model", "CLIP"), "swinv1_classification": ("flagai.model.vision.swinv1", "SwinTransformer"), @@ -90,6 +91,10 @@ def __getattr__(self, name): "glm-10b-ch": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"], "cpm3": ["flagai.model.cpm3_model", "CPM3", "cpm3", "nlp"], "cpm3-train": ["flagai.model.cpm3_train_model", "CPM3", "cpm3", "nlp"], + "galactica-1.3b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], + "galactica-6.7b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", 
"nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], + "galactica-30b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], + "galactica-120b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"], "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"], "vit-base-p16-384": @@ -131,6 +136,7 @@ def __getattr__(self, name): "altclip-bert-b": ["flagai.models.mm.AltCLIP", "AltCLIP", "altclip", "mm", "flagai.model.mm.AltCLIP", "AltCLIPProcessBert"], "eva-clip": ["flagai.model.mm.eva_clip_model", "EVA_CLIP", "evaclip", "mm"], + } @@ -170,7 +176,6 @@ def __init__(self, """ raw_model_name = copy.deepcopy(model_name) - model_name = model_name.lower() if model_name not in MODEL_DICT: @@ -206,10 +211,14 @@ def __init__(self, self.model.half() if model_type == "nlp": - tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), - "Tokenizer") - self.tokenizer = tokenizer_class.from_pretrained( - model_name, cache_dir=download_path) + if brief_model_name in ["galactica", ]: + self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5])(download_path) + else : + tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), + "Tokenizer") + self.tokenizer = tokenizer_class.from_pretrained( + model_name, cache_dir=download_path) elif model_type == "mm": if model_name.startswith("altdiffusion"): diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index f162d1f2..73b2f8e5 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -10,7 +10,7 @@ from torch.utils.data.dataloader import default_collate from scipy.stats import poisson from flagai.data.dataset.data_utils import build_sample -from flagai.data.dataset.superglue.control import PVPS, SuperGlueProcessor +from flagai.data.dataset.superglue.control import PVPS, SuperGlueProcessor, PROCESSOR_DICT def rindex(lst, val, start=None): @@ -81,22 +81,26 @@ def pad_choice_dim(data, choice_num): class ConstructSuperglueStrategy: - def __init__(self, args, tokenizer, task_name): + def __init__(self, args, tokenizer, task_name, custom_pvp=None): # pattern_id, seq_length, num_prompt_tokens, multi_token, segment_length, fast_decode, dataset_type, cloze_val=True self.tokenizer = tokenizer self.cloze_eval = args.cloze_eval + # self.processor = PROCESSOR_DICT[task_name] self.processor = SuperGlueProcessor().get_processor(None, task_name)(False) + pvp_func = custom_pvp if custom_pvp else PVPS[task_name] + self.pvp = pvp_func(args, + tokenizer, + self.processor.get_labels(), + args.seq_length, + pattern_id=args.pattern_id, + num_prompt_tokens=args.num_prompt_tokens, + is_multi_token=args.multi_token, + max_segment_length=args.segment_length, + fast_decode=args.fast_decode) + + - self.pvp = PVPS[task_name](args, - tokenizer, - self.processor.get_labels(), - args.seq_length, - pattern_id=args.pattern_id, - num_prompt_tokens=args.num_prompt_tokens, - is_multi_token=args.multi_token, - max_segment_length=args.segment_length, - fast_decode=args.fast_decode) self.args = args def __call__(self, examples): diff --git a/flagai/data/dataset/seq2seq/dataset.py b/flagai/data/dataset/seq2seq/dataset.py index 1313141c..adc28149 100644 --- a/flagai/data/dataset/seq2seq/dataset.py 
+++ b/flagai/data/dataset/seq2seq/dataset.py @@ -141,11 +141,12 @@ def create_examples(self, split): text_a=source_text, text_b=target_text, meta=meta) - if idx < 10: + if idx < 1: print_rank_0( (source_text.encode('utf-8'), target_text.encode('utf-8'), meta["ref"].encode('utf-8'))) example_list.append(example) + return example_list @@ -393,7 +394,6 @@ def __init__(self, self.dataset_type = dataset_type self.tokenizer = tokenizer self.dataset_name = dataset_type - if not os.path.exists(data_dir + '/' + task_name): SuperGlueProcessor()._download_data(data_dir, task_name) diff --git a/flagai/data/dataset/superglue/control.py b/flagai/data/dataset/superglue/control.py index 2f63899e..f9ca7987 100644 --- a/flagai/data/dataset/superglue/control.py +++ b/flagai/data/dataset/superglue/control.py @@ -131,7 +131,7 @@ MULTI_TOKEN_TASKS = ['copa', 'record', 'cmrc', 'wsc'] -CH_TASKS = ['afqmc', 'tnews', 'cmrc', 'wanke'] +CH_TASKS = ['afqmc', 'tnews', 'cmrc', 'wanke', 'lang8_hsk'] class SuperGlueProcessor: diff --git a/flagai/data/dataset/superglue/processor.py b/flagai/data/dataset/superglue/processor.py index a700ef2d..6cc27ed9 100644 --- a/flagai/data/dataset/superglue/processor.py +++ b/flagai/data/dataset/superglue/processor.py @@ -105,7 +105,6 @@ def encode(self, example: InputExample, tokenizer, seq_length, args): label = example.label label = self.get_labels().index(label) if args.pretrained_bert: - sample = build_sample(ids, label=label, types=types, @@ -125,7 +124,7 @@ def encode(self, example: InputExample, tokenizer, seq_length, args): class SuperGLUEProcessor(DataProcessor): def __init__(self, few_superglue): - super(SuperGLUEProcessor, self).__init__(few_superglue) + super(SuperGLUEProcessor, self).__init__(few_superglue=False) self.few_superglue = few_superglue def get_train_examples(self, data_dir): diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index d6e6cce6..d4d07b39 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -140,40 +140,43 @@ def available_patterns(): return [0] def replace_prompt_tokens(self, parts_a, parts_b): - if not self.continuous_prompt: + """ + In the patterns of parts a and b, there are None(s) in between, which stands for + prefix-tuning tokens + """ + if not self.continuous_prompt: # Remove None in parts_a and parts_b if no p-tuning parts_a = [part for part in parts_a if part is not None] parts_b = [part for part in parts_b if part is not None] return parts_a, parts_b num_prompt_tokens = self.num_prompt_tokens - num_pos = 0 + num_pos = 0 # The number of None in parts a and b for parts in (parts_a, parts_b): for part in parts: if part is None: num_pos += 1 + # Average p-tuning tokens to be inserted in each position avg_prompt_tokens = math.ceil(num_prompt_tokens / num_pos) - new_parts_a, new_parts_b = [], [] - for part in parts_a: - if part is None: - if num_prompt_tokens > 0: - if num_prompt_tokens >= avg_prompt_tokens: - new_parts_a.append(avg_prompt_tokens) - num_prompt_tokens -= avg_prompt_tokens - else: - new_parts_a.append(num_prompt_tokens) - num_prompt_tokens = 0 - else: - new_parts_a.append(part) - for part in parts_b: - if part is None: - if num_prompt_tokens > 0: - if num_prompt_tokens >= avg_prompt_tokens: - new_parts_b.append(avg_prompt_tokens) - num_prompt_tokens -= avg_prompt_tokens - else: - new_parts_b.append(num_prompt_tokens) - num_prompt_tokens = 0 - else: - new_parts_b.append(part) + def insert_tokens(parts, num_prompt_tokens, avg_prompt_tokens): + """ + 
replace None with the number of tokens need to be added + """ + new_parts = [] + for part in parts: + if part is None: + if num_prompt_tokens > 0: + if num_prompt_tokens >= avg_prompt_tokens: + new_parts.append(avg_prompt_tokens) + num_prompt_tokens -= avg_prompt_tokens + else: + new_parts.append(num_prompt_tokens) + num_prompt_tokens = 0 + else: + new_parts.append(part) + return new_parts + + new_parts_a = insert_tokens(parts_a, num_prompt_tokens, avg_prompt_tokens) + new_parts_b = insert_tokens(parts_b, num_prompt_tokens, avg_prompt_tokens) + return new_parts_a, new_parts_b def encode(self, @@ -199,7 +202,6 @@ def encode(self, x if isinstance(x, tuple) else (x, False) for x in raw_parts_a ] prompt_id = tokenizer.num_tokens - def encode_input(raw_parts): parts = [] for x, s in raw_parts: @@ -216,14 +218,13 @@ def encode_input(raw_parts): raw_parts_a) # Encode part a from text to token ids if self.prefix_prompt > 0: parts_a = [([prompt_id] * self.prefix_prompt, False)] + parts_a - parts_b = None if raw_parts_b: raw_parts_b = [ x if isinstance(x, tuple) else (x, False) for x in raw_parts_b ] parts_b = encode_input(raw_parts_b) - if self.is_multi_token: # note: meta is not added for multi_token yet! + if self.is_multi_token: # Note: meta is not added for multi_token yet! answers = self.get_answers(example) # All text answer choices if example.label is not None: @@ -854,8 +855,6 @@ def encode_input(raw_parts): meta=example.meta) # sample['input_ids'] = np.stack((sample['input_ids'],sample['input_ids'])) # sample['input_ids'] = np.stack((sample['input_ids'], sample['input_ids'])) - # print(sample['input_ids'].shape, sample['target_ids'].shape, sample['attention_mask'].shape, sample['logit_mask'].shape) - return sample def verbalize(self, label) -> List[str]: @@ -1698,35 +1697,3 @@ def get_verbalization_ids(word: str, tokenizer, return ids -# PVPS = { -# 'agnews': AgnewsPVP, -# 'mnli': MnliPVP, -# 'yelp-polarity': YelpPolarityPVP, -# 'yelp-full': YelpFullPVP, -# 'yahoo': YahooPVP, -# 'xstance': XStancePVP, -# 'xstance-de': XStancePVP, -# 'xstance-fr': XStancePVP, -# 'rte': RtePVP, -# 'wic': WicPVP, -# 'cb': CbPVP, -# 'wsc': WscPVP, -# 'boolq': BoolQPVP, -# 'copa': CopaPVP, -# 'multirc': MultiRcPVP, -# 'record': RecordPVP, -# 'ax-b': RtePVP, -# 'ax-g': RtePVP, -# 'sst2': Sst2PVP, -# 'cola': ColaPVP, -# 'mrpc': MrpcPVP, -# 'qqp': QqpPVP, -# 'qnli': QnliPVP, -# 'squad': SquadPVP, -# 'race': RacePVP, -# "afqmc": AFQMCPVP, -# 'tnews': TNewsPVP, -# 'cluewsc': CLUEWSCPVP, -# 'wanke': WankePVP, -# # 'cmrc': CMRCPVP -# } diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py new file mode 100644 index 00000000..87a28412 --- /dev/null +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -0,0 +1,84 @@ +from transformers import PreTrainedTokenizerFast + + +from ..tokenizer import CommandToken, Tokenizer + +class GalacticaTokenizer(Tokenizer): + def __init__(self, download_dir) -> None: + pass + self.text_tokenizer = PreTrainedTokenizerFast.from_pretrained(download_dir) + # parse tokens and vocabs from tokenizer + self._tokens = list(self.text_tokenizer.get_vocab().keys()) + self._vocab = {k: v for k, v in self.text_tokenizer.get_vocab().items()} + self.num_tokens = len(self._tokens) + + self._command_tokens = [ + CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), + CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), + CommandToken('MASK', '[MASK]', + 
self.get_specialid_from_text_tokenizer('mask')), + CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), + CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), + CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), + ] + + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = { + tok.token: tok + for tok in self._command_tokens + } + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + def get_specialid_from_text_tokenizer(self, token): + if token in ["eos", "sep"]: + return self._vocab.get('') + elif token == "cls": + return self._vocab.get('') + elif token == "unk": + return self._vocab.get('') + elif token == "pad": + return self._vocab.get('') + elif token == "mask": + return self._vocab.get('') + else: + raise NameError("token not exists") + + def encode_plus(self, text, max_length=512): + return self.text_tokenizer.encode_plus(text, truncation=True, max_length=max_length) + + def decode(self, ids): + return self.text_tokenizer.decode(ids) + + def get_vocab(self): + return self.text_tokenizer.get_vocab() + + def get_command_id(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name].Id + + def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def encode_plus(self, + text, + second_text=None, + truncation=False, + max_length=None,): + + return self.text_tokenizer.encode_plus(text, + text_pair=second_text, + truncation=truncation, + max_length=max_length, + add_special_tokens=True) + + def tokenize(self, **kwargs): + return self.text_tokenizer.tokenize(**kwargs) + + def __len__(self): + return len(self.text_tokenizer) + +if __name__ == "__main__": + pass + + diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index cc4aac03..b27bb24d 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -148,15 +148,15 @@ def __init__(self, if add_block_symbols: self.add_command_token('sop', '<|startofpiece|>') self.add_command_token('eop', '<|endofpiece|>',) - # if add_task_mask: - # self.add_command_token('gMASK', '[gMASK]') - # self.add_command_token('sMASK', '[sMASK]') - # if add_decoder_mask: - # self.add_command_token('dBLOCK', '[dBLOCK]') - # if add_sentinel_token > 0: - # for i in range(1, add_sentinel_token): - # self.add_command_token(f'MASK{i}', f'[MASK{i}]') - # self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') + if add_task_mask: + self.add_command_token('gMASK', '[gMASK]') + self.add_command_token('sMASK', '[sMASK]') + if add_decoder_mask: + self.add_command_token('dBLOCK', '[dBLOCK]') + if add_sentinel_token > 0: + for i in range(1, add_sentinel_token): + self.add_command_token(f'MASK{i}', f'[MASK{i}]') + self.add_command_token(f'sop{i}', f'<|startofpiece{i}|>') elif self.tokenizer_class == "bpe": if self.tokenizer_model_name.lower().startswith('roberta'): self.num_command_tokens = 6 @@ -348,7 +348,7 @@ def __init__(self, } self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) - print([(k,v.Id) for k,v in self.command_name_map.items()]) + logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): return self.text_tokenizer.get_vocab() diff --git a/flagai/metrics.py 
b/flagai/metrics.py index d3c6ad4a..ef629256 100644 --- a/flagai/metrics.py +++ b/flagai/metrics.py @@ -9,7 +9,8 @@ import functools import string import math - +import sacrebleu +from rouge_score import rouge_scorer def sigmoid(x): sig = 1 / (1 + math.exp(-x)) @@ -17,14 +18,19 @@ def sigmoid(x): def accuracy_metric(predictions, labels, meta=None): + ''' + predictions: torch.size(n, class_num) + labels: torch.size(n) + ''' count = 0 assert len(predictions) == len(labels) - if predictions.size() != labels.size(): + if predictions.size() != labels.size(): predictions = torch.argmax(predictions, dim=-1) for prediction, label in zip(predictions, labels): count += prediction == label else: prediction, label = predictions[0], labels[0] + if sigmoid(prediction) >= 0.5: count += label == 1 else: @@ -32,6 +38,42 @@ def accuracy_metric(predictions, labels, meta=None): return 100.0 * count / len(labels) +def bleu_metric(predictions, labels, meta=None, tokenizer=None): + ref_list = [] + for i in labels: + i = i.tolist() + ref = tokenizer.DecodeIds(i) + ref_list.append(ref) + pred_list = [] + + for prediction in predictions: + buf = [] + prediction = prediction.tolist() + prediction = tokenizer.DecodeIds(prediction) + pred_list.append(prediction) + bleu_results = sacrebleu.corpus_bleu(pred_list, [ref_list]) + bleu_score = bleu_results.score + return bleu_score + +def rouge_metric(predictions, labels, meta=None, tokenizer=None, metric="rouge-1"): + metric_dict = {"rouge-1": "rouge1", "rouge-2": "rouge2", "rouge-l": "rougeLsum"} + ref_list = [] + for i in labels: + i = i.tolist() + ref = tokenizer.DecodeIds(i) + ref_list.append(ref) + pred_list = [] + for prediction in predictions: + buf = [] + prediction = prediction.tolist() + prediction = tokenizer.DecodeIds(prediction) + pred_list.append(prediction) + scorer = rouge_scorer.RougeScorer([metric_dict[metric]], use_stemmer=True) + scores = [scorer.score(pred, ref) for pred, ref in zip(pred_list, ref_list)] + scores = [score[metric_dict[metric]].fmeasure * 100 for score in scores] + scores = sum(scores) / len(scores) + return scores + def f1_metric(predictions, labels, meta=None): pred = torch.argmax(predictions, dim=-1).cpu() labels = labels.cpu() @@ -91,7 +133,7 @@ def lower(text): return white_space_fix(remove_articles(remove_punc(lower(s)))) -def exact_match_score(prediction, ground_truth): +def exact_match_score(prediction, ground_truth, meta=None): return normalize_answer(prediction) == normalize_answer(ground_truth) diff --git a/flagai/model/galactica_model.py b/flagai/model/galactica_model.py new file mode 100644 index 00000000..98e1f9cb --- /dev/null +++ b/flagai/model/galactica_model.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
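The `bleu_metric` and `rouge_metric` helpers added to `flagai/metrics.py` decode the predicted and reference token ids with the tokenizer and then score them with `sacrebleu` and `rouge_score`. The snippet below is a rough, standalone sketch of those two scoring calls on plain strings; the example sentences are invented and no FlagAI tokenizer is involved.

```python
import sacrebleu
from rouge_score import rouge_scorer

predictions = ["the cat sat on the mat"]        # hypothetical model outputs
references = ["a cat was sitting on the mat"]   # hypothetical ground-truth texts

# Corpus BLEU: sacrebleu expects a list of hypotheses and a list of reference lists.
bleu = sacrebleu.corpus_bleu(predictions, [references]).score

# ROUGE-1 F-measure averaged over the corpus, mirroring the rouge_metric helper.
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
rouge1 = sum(scorer.score(ref, pred)["rouge1"].fmeasure
             for pred, ref in zip(predictions, references)) * 100 / len(predictions)

print(f"BLEU: {bleu:.2f}, ROUGE-1: {rouge1:.2f}")
```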
+""" PyTorch Galactica model.""" +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from flagai.model.layers.activations import ACT2FN +from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config + + +class OPTLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + attention_mask = attention_mask.long() + + # create positions depending on attention_mask + positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 + # cut positions if `past_key_values_length` is > 0 + positions = positions[:, past_key_values_length:] + + return super().forward(positions + self.offset) + +class OPTStack(GPT2Stack): + def __init__(self, config: GPT2Config): + super(OPTStack, self).__init__(config) + self.wpe = OPTLearnedPositionalEmbedding(config.n_positions, config.hidden_size) + self.ln_f = None + if config.do_layer_norm_before: + self.ln_f = nn.LayerNorm(config.hidden_size) + + if config.n_embd != config.hidden_size: + self.project_out = nn.Linear(config.hidden_size, config.n_embd, bias=False) + else: + self.project_out = None + + if config.n_embd != config.hidden_size: + self.project_in = nn.Linear(config.n_embd, config.hidden_size, bias=False) + else: + self.project_in = None + + def get_position_embeddings(self, **kwargs): + pass + padding_mask = kwargs["padding_mask"] + past_length = kwargs["past_length"] + position_embeds = self.wpe(padding_mask, past_length) + return position_embeds + +def trans_opt_to_gpt_config(opt_config_json): + trans_config_json = {} + trans_key = { + "ffn_dim": "n_inner", + "hidden_size": "hidden_size", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + "vocab_size": "vocab_size", + "activation_function": "activation_function", + "checkpoint_activations": "checkpoint_activations", + "word_embed_proj_dim": "n_embd", + "do_layer_norm_before": "do_layer_norm_before", + } + for k, v in opt_config_json.items(): + if k in trans_key: + trans_config_json[trans_key[k]] = v + + return trans_config_json + +class GalacticaModel(GPT2Model): + + def __init__(self, config, **kwargs): + config = trans_opt_to_gpt_config(config) + super(GalacticaModel, self).__init__(config, **kwargs) + self.transformer = OPTStack(self.config_gpt) + + def load_weights(self, checkpoint_path): + checkpoint = torch.load(checkpoint_path, + map_location=torch.device("cpu")) + if "module" in checkpoint: + # ddp + checkpoint = checkpoint["module"] + + checkpoint_ = {} + for k, v in checkpoint.items(): + if k[:6] == "model.": + checkpoint_[k[6:]] = v + else : + checkpoint_[k] = v + + checkpoint = self.transpose_weight(checkpoint_) + self.load_state_dict(checkpoint, strict=False) + self.lm_head.weight.data = nn.Parameter(self.transformer.wte.weight.data) + + return checkpoint + + def transpose_weight(self, checkpoints): + + checkponts_ = { + 
"transformer.wte.weight": checkpoints["decoder.embed_tokens.weight"], + "transformer.wpe.weight": checkpoints["decoder.embed_positions.weight"], + } + + if "decoder.project_in.weight" in checkpoints: + checkponts_["transformer.project_in.weight"] = checkpoints["decoder.project_in.weight"] + checkponts_["transformer.project_out.weight"] = checkpoints["decoder.project_out.weight"] + + if "decoder.final_layer_norm.weight" in checkpoints: + checkponts_["transformer.ln_f.weight"] = checkpoints["decoder.final_layer_norm.weight"] + checkponts_["transformer.ln_f.bias"] = checkpoints["decoder.final_layer_norm.bias"] + + q_weight = None + k_weight = None + v_weight = None + q_bias = None + k_bias = None + v_bias = None + for k, v in checkpoints.items(): + # first ln + if "decoder.layers" in k and "self_attn_layer_norm" in k: + layer_id = k.split(".")[2] + weight_or_bias = k.split(".")[-1] + checkponts_[f"transformer.h.{layer_id}.ln_1.{weight_or_bias}"] = v + continue + + # qkv + if "self_attn.k_proj.weight" in k: + k_weight = v + continue + if "self_attn.k_proj.bias" in k: + k_bias = v + continue + + if "self_attn.v_proj.weight" in k: + v_weight = v + continue + if "self_attn.v_proj.bias" in k: + v_bias = v + continue + + if "self_attn.q_proj.weight" in k: + q_weight = v + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + layer_id = k.split(".")[2] + checkponts_[f"transformer.h.{layer_id}.attn.c_attn.weight"] = qkv_weight + continue + + if "self_attn.q_proj.bias" in k: + q_bias = v + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0) + layer_id = k.split(".")[2] + checkponts_[f"transformer.h.{layer_id}.attn.c_attn.bias"] = qkv_bias + continue + + # att out + if "decoder.layers" in k and "self_attn.out_proj" in k: + layer_id = k.split(".")[2] + weight_or_bias = k.split(".")[-1] + checkponts_[f"transformer.h.{layer_id}.attn.c_proj.{weight_or_bias}"] = v + continue + + # fc1 + if "decoder.layers" in k and "fc1" in k: + layer_id = k.split(".")[2] + weight_or_bias = k.split(".")[-1] + checkponts_[f"transformer.h.{layer_id}.mlp.c_fc.{weight_or_bias}"] = v + continue + + # fc2 + if "decoder.layers" in k and "fc2" in k: + layer_id = k.split(".")[2] + weight_or_bias = k.split(".")[-1] + checkponts_[f"transformer.h.{layer_id}.mlp.c_proj.{weight_or_bias}"] = v + continue + + # second ln + if "decoder.layers" in k and "final_layer_norm" in k: + layer_id = k.split(".")[2] + weight_or_bias = k.split(".")[-1] + checkponts_[f"transformer.h.{layer_id}.ln_2.{weight_or_bias}"] = v + continue + + return checkponts_ \ No newline at end of file diff --git a/flagai/model/glm_model.py b/flagai/model/glm_model.py index dd1a374b..cfd55e17 100644 --- a/flagai/model/glm_model.py +++ b/flagai/model/glm_model.py @@ -341,6 +341,7 @@ class GLMModel(BaseModel): def __init__(self, config, **kwargs): super(GLMModel, self).__init__(config, **kwargs) + print(config) self.config = config num_layers = config["num_layers"] vocab_size = config["vocab_size"] @@ -362,10 +363,12 @@ def __init__(self, config, **kwargs): attention_scale = config["attention_scale"] tune_prefix_layers = config.get("tune_prefix_layers", None) + self.parallel_output = parallel_output self.output_predict = output_predict self.hidden_size = hidden_size - + self.spell_length = spell_length + self.spell_func = spell_func init_method = normal_init_method(std=0.02) self.word_embeddings = VocabParallelEmbedding(vocab_size, @@ -387,10 +390,8 @@ def __init__(self, config, **kwargs): attention_scale=attention_scale, relative_encoding=relative_encoding, 
             block_position_encoding=block_position_encoding)
-        if spell_length is not None:
-            self.prompt_spell = PromptSpell(spell_length, self.hidden_size,
-                                            spell_func)
+        self.prompt_spell = PromptSpell(spell_length, self.hidden_size, spell_func)
         if tune_prefix_layers != None:
             log_dist("the model is freezed!")
             self.freeze_transformer(tune_prefix_layers=tune_prefix_layers)
@@ -400,7 +401,7 @@ def freeze_transformer(self, tune_prefix_layers=None):
         self.word_embeddings.requires_grad_(False)
         self.transformer.requires_grad_(False)
         if tune_prefix_layers is not None:
-            log_str += f" tune {tune_prefix_layers} prefix layers"
+            log_str += f" and tune {tune_prefix_layers} prefix layers"
             for i in range(tune_prefix_layers):
                 self.transformer.layers[i].requires_grad_(True)
         print_rank_0(log_str)
@@ -425,16 +426,17 @@ def forward(self,
             attention_mask: 2 x 3
         '''
         # Embeddings.
-        batch_size = input_ids.size(0)
         words_embeddings = self.word_embeddings(input_ids)
         embeddings = words_embeddings
+        device = input_ids.device
         if prompt_pos is not None:
             embeddings = embeddings.clone()
-            prompt_embeds = self.prompt_spell()
-            batch_index = torch.arange(batch_size,
-                                       device=input_ids.device).unsqueeze(1)
-            embeddings[batch_index, prompt_pos] = prompt_embeds
+            prompt_embeds = self.prompt_spell().to(device)
+            # batch_index = torch.arange(batch_size,
+            #                            device=device).unsqueeze(1)
+            for batch_index in range(input_ids.size(0)):
+                embeddings[batch_index, prompt_pos] = prompt_embeds.to(dtype=embeddings.dtype)
 
         # Transformer.
         transformer_output = self.transformer(embeddings, position_ids,
@@ -974,6 +976,5 @@ def forward(self,
         loss_mask = loss_mask.view(-1).float()
         Loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
         logits = outputs.view(-1, vocab_size)
-
         loss = (Loss(logits, target_ids) * loss_mask).sum() / loss_mask.sum()
         return {"loss": loss, "hidden_states": mems, "logits": logits}
diff --git a/flagai/model/layers/embeddings.py b/flagai/model/layers/embeddings.py
index 38cab4b1..e315f299 100644
--- a/flagai/model/layers/embeddings.py
+++ b/flagai/model/layers/embeddings.py
@@ -501,7 +501,7 @@ def _relative_position_bucket(self,
     def _segment_relative_position_bucket(self, query_segment, key_segment):
         """
-        segment1看segment2的所有位置编码相同,看segment3的位置编码也相同(是另一个值)
+        All positional encodings of segment2 are the same in the view of segment1.
         Same for segment3 and segment1(but is a different value)
         """
         return query_segment * self.num_segments + key_segment
diff --git a/flagai/model/predictor/predictor.py b/flagai/model/predictor/predictor.py
index 937e3bc5..03cf7ff4 100644
--- a/flagai/model/predictor/predictor.py
+++ b/flagai/model/predictor/predictor.py
@@ -322,7 +322,8 @@ def predict_generate_randomsample(self,
                                               device)
         elif "gpt" in self.class_name.lower(
-        ) or "opt" in self.class_name.lower():
+        ) or "opt" in self.class_name.lower() \
+                or "galactica" in self.class_name.lower():
             return gpt_random_sample_use_cache(self.model, self.tokenizer,
                                                text, input_max_length,
                                                out_max_length, top_k, top_p,
diff --git a/flagai/model/prompt.py b/flagai/model/prompt.py
index 87e70cff..8606d6e8 100644
--- a/flagai/model/prompt.py
+++ b/flagai/model/prompt.py
@@ -11,8 +11,7 @@ def __init__(self, spell_length, hidden_size, spell_func):
         super(PromptSpell, self).__init__()
         self.spell_length = spell_length
         self.hidden_size = hidden_size
-        self.spell_embeddings = torch.nn.Embedding(self.spell_length,
-                                                   self.hidden_size)
+        self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size)
         self.spell_func = spell_func
         if self.spell_func == "lstm":
             self.lstm_head = torch.nn.LSTM(
diff --git a/flagai/trainer.py b/flagai/trainer.py
index de551ca9..709cd7c6 100644
--- a/flagai/trainer.py
+++ b/flagai/trainer.py
@@ -136,6 +136,7 @@ def __init__(
         fp16=False,
         clip_grad=1.0,
         checkpoint_activations=False,
+        tokenizer=None,
         # model checkpointing
         save_dir='checkpoints',  # 'Output directory to save checkpoints to.')
@@ -185,6 +186,7 @@ def __init__(
         self.log_interval = log_interval
         self.eval_interval = eval_interval
+        self.tokenizer = tokenizer
         # model checkpointing
         self.save_dir = save_dir
@@ -547,11 +549,7 @@ def train(self,
             best_score = -best_score
 
         for epoch in range(self.epochs):
-            # log_dist('working on epoch {} ...'.format(epoch), [0])
-            # Set the data loader epoch to shuffle the index iterator.
-            # if self.env_type == 'deepspeed+mpu':
-            #     if mpu.get_model_parallel_rank() == 0:
-            #         train_dataloader.sampler.set_epoch(epoch + self.world_size)
+            print("epoch " + str(epoch))
             if self.env_type != 'pytorch':
                 train_dataloader.sampler.set_epoch(epoch + self.world_size)
@@ -671,7 +669,6 @@ def train(self,
                         save_dir=self.save_dir,
                         save_rng=self.save_rng)
                 self.iteration += 1
-
         # Checkpointing at the end of each epoch.
         # Evaluation #todo add train_args
@@ -765,6 +762,7 @@ def train_step_pytorchDDP(self,
         # accumulate gradients
         lm_loss = step_output['loss']
-        lm_loss = lm_loss / self.gradient_accumulation_steps
+        lm_loss /= self.gradient_accumulation_steps
         # reduce sum of losses
         reduced_loss = lm_loss.detach().clone().view(1)
@@ -784,7 +782,7 @@ def train_step_pytorchDDP(self,
         else:
             lm_loss.backward()
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.module.parameters(),
+            torch.nn.utils.clip_grad_norm_(model.module.parameters(),
                                            self.clip_grad)
         self.timers('backward').stop()
@@ -893,6 +891,7 @@ def train_step_bmtrain(self,
         # Calculate gradients, reduce across processes, and clip.
         self.timers('backward').start()
         optim_manager.backward(loss)
+        grad_norm = optim_manager.clip_grad_norm(optim_manager.optimizers[0].param_groups, max_norm=1.0)
         self.timers('backward').stop()
@@ -945,7 +944,7 @@ def backward_step(self, optimizer, model, lm_loss):
         if 'deepspeed' in self.env_type:
             model.backward(loss)
         else:
-            # optimizer.zero_grad()
+            optimizer.zero_grad()
             if hasattr(optimizer, 'backward'):
                 optimizer.backward(loss, update_master_grads=False)
             else:
@@ -1014,7 +1013,6 @@ def evaluate(self,
                  forward_step_func=None,
                  verbose=False):
         """Evaluation."""
-
         # Turn off checkpoint_activations
         tmp_checkpoint_activations = None
         tmp_model = model
@@ -1030,7 +1028,6 @@ def evaluate(self,
         mems = None
         metrics = [0. for _ in range(len(self.metric_methods))]
-
         with torch.no_grad():
             assert data_loader is not None, "val loader is not None."
             all_logits = []
@@ -1065,11 +1062,12 @@ def evaluate(self,
                     deepspeed.checkpointing.reset()
                 logits = step_output['logits']
                 lm_loss = step_output['loss']
-
+
                 if 'labels' in data_iterator:
                     labels = data_iterator['labels']
                 else:
                     labels = data_iterator['target_ids']
+                    loss_mask = data_iterator['loss_mask']
                 if len(self.metric_methods) != 0:
                     if {metric_tuple[0] for metric_tuple in self.metric_methods} & {"rouge", "bleu"}:
                         batch_preds = torch.argmax(logits.detach(), dim=-1).cpu()
@@ -1077,10 +1075,16 @@ def evaluate(self,
                         all_logits.extend(batch_preds)
                         all_labels.extend(batch_labels)
                     else:
+                        if logits.size(0) != 1:
+                            logits = torch.argmax(logits, dim=1).unsqueeze(0)
                         all_logits.append(logits)
                         all_labels.append(labels)
                 all_losses.append(lm_loss.view(1))
-
+                # size of all_logits: (1, n)
+                if len(self.metric_methods) != 0 and all_logits[0].size() != all_logits[-1].size():
+                    pd = (all_logits[0].size(0) - all_logits[-1].size(0), all_logits[0].size(1) - all_logits[-1].size(1))
+                    all_logits[-1] = torch.nn.functional.pad(all_logits[-1], pd, "constant", 0)
             all_losses = torch.cat(all_losses, dim=0)
             if len(self.metric_methods) != 0:
                 all_logits = torch.cat(all_logits, dim=0)
@@ -1103,11 +1107,10 @@ def evaluate(self,
 
             for i in range(len(self.metric_methods)):
                 eval_method = self.metric_methods[i][1]
-                metrics[i] += eval_method(all_logits, all_labels, meta=meta)
+                metrics[i] += eval_method(all_logits, all_labels, meta=meta, tokenizer=self.tokenizer)
 
         # Move model back to the train mode.
-        # model.train()
         tmp_model.train()
         # recover the settings for checkpoint_activations
         if hasattr(tmp_model,
@@ -1161,6 +1164,7 @@ def evaluate_and_print_results(
         verbose=False,
     ):
         """Helper function to evaluate and dump results on screen."""
+
         eval_dict = self.evaluate(forward_step_func=forward_step_func,
                                   data_loader=data_loader,
                                   model=model,
diff --git a/setup.py b/setup.py
index 970b72e9..be63eb7d 100644
--- a/setup.py
+++ b/setup.py
@@ -34,5 +34,7 @@
         'diffusers == 0.7.2',
         'pytorch-lightning == 1.6.5',
         'taming-transformers-rom1504 == 0.0.6',
+        'rouge-score == 0.1.2',
+        'sacrebleu == 2.3.1',
     ]
 )
diff --git a/tests/test_glm_seq2seq.py b/tests/test_glm_seq2seq.py
index e12fa1fe..4bcf40aa 100644
--- a/tests/test_glm_seq2seq.py
+++ b/tests/test_glm_seq2seq.py
@@ -48,8 +48,8 @@ def test_init_trainer_pytorch(self):
                                   tokenizer,
                                   task_name=task_name)
-        train_dataset.example_list = train_dataset.example_list[:1]
-        valid_dataset.example_list = valid_dataset.example_list[:1]
+        train_dataset.example_list = train_dataset.example_list[:10]
+        valid_dataset.example_list = valid_dataset.example_list[:10]
         model = GLMForSeq2Seq.from_pretrain(model_name=model_name,
                                             only_download_config=True)
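
For reference, the position-id scheme of the `OPTLearnedPositionalEmbedding` added in flagai/model/galactica_model.py above can be exercised in isolation. The snippet below is a minimal, self-contained sketch in plain PyTorch (no FlagAI checkpoint required); the example mask values are made up for illustration.

```python
import torch
from torch import nn


class OPTLearnedPositionalEmbedding(nn.Embedding):
    """Same logic as the class introduced in flagai/model/galactica_model.py."""

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # OPT reserves the first two ids, so the table is allocated with an offset of 2.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
        attention_mask = attention_mask.long()
        # Position of each token = number of non-padding tokens seen so far, minus 1;
        # padding positions land at -1 and are shifted into the reserved range by the offset.
        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
        # drop positions already covered by cached keys/values
        positions = positions[:, past_key_values_length:]
        return super().forward(positions + self.offset)


if __name__ == "__main__":
    wpe = OPTLearnedPositionalEmbedding(num_embeddings=8, embedding_dim=4)
    # batch of 2, sequence length 5, second row left-padded with two pad tokens
    mask = torch.tensor([[1, 1, 1, 1, 1],
                         [0, 0, 1, 1, 1]])
    out = wpe(mask)
    print(out.shape)  # torch.Size([2, 5, 4])
```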
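Building on the predictor routing change above (class names containing "galactica" now take the same cached-sampling path as GPT and OPT), a usage sketch in the style of the existing FlagAI examples might look as follows. This is an assumption-laden illustration, not part of the patch: the task name, model name, and checkpoint directory are placeholders, and the sampling arguments simply mirror the parameters visible in predict_generate_randomsample.

```python
import torch
from flagai.auto_model.auto_loader import AutoLoader
from flagai.model.predictor.predictor import Predictor

# Placeholder task/model names -- substitute whatever Galactica variant is registered locally.
loader = AutoLoader(task_name="lm",
                    model_name="galactica-1.3b-en",
                    model_dir="./checkpoints")
model = loader.get_model()
tokenizer = loader.get_tokenizer()
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# predict_generate_randomsample is the entry point patched above to recognize Galactica class names.
predictor = Predictor(model, tokenizer)
text = "The attention mechanism in Transformers"
out = predictor.predict_generate_randomsample(text,
                                              input_max_length=512,
                                              out_max_length=100,
                                              top_k=50,
                                              top_p=0.9)
print(out)
```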