diff --git a/doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md b/doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md index 99d2ba82..5984f2fe 100644 --- a/doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md +++ b/doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md @@ -22,25 +22,24 @@ ## 模型训练(train.py) -运行前修改训练数据路径src_dir, tgt_dir, 模型路径model_dir。在命令行运行此命令: +在命令行运行此命令: ```commandline -cd ./examples/glm_title_generation +cd FlagAI/examples/glm_title_generation python ./train.py ``` ### 1.数据加载 -样例数据在 /examples/bert_title_generation/data/ +样例数据在 /examples/glm_title_generation/data/ +需要针对数据格式定义数据加载方法,例如:定义文件读取函数,从文件中读取数据,得到src和tgt列表: 1)定义加载过程 ```python def read_file(): src = [] tgt = [] - - ## read data file to load src and tgt, for example: - ## src = ["article_1", "article_2", "article_3" ......] - ## tgt = ["title_1", "title_2", "title_3" ......] - ## no matter what data you use, you need to construct the right src and tgt. + # src = ["article_1", "article_2", "article_3" ......] + # tgt = ["title_1", "title_2", "title_3" ......] + # 如果换为其他数据,修改处理方式即可,只需要构造好src以及对应tgt列表 with open(src_dir, 'r', encoding='utf-8') as f: lines = f.readlines() for line in lines: @@ -53,12 +52,12 @@ def read_file(): return src,tgt ``` -2)定义数据集处理过程(Dataset): +2)定义数据集处理过程: ```python -class GLMSeq2seqDataset(Dataset): +class GLMTitleGenerationDataset(Dataset): def __init__(self, sents_src, sents_tgt): - super(GLMSeq2seqDataset, self).__init__() + super(GLMTitleGenerationDataset, self).__init__() self.sents_src = sents_src self.sents_tgt = sents_tgt @@ -74,7 +73,7 @@ class GLMSeq2seqDataset(Dataset): 3)定义数据迭代器(DataLoader)中的批处理函数(collate_fn),用于将一批(batch)数据填充(padding)成统一大小 ```python -class GLMSeq2seqDynamicCollateFN(): +class GLMTitleGenerationCollateFN(): def __init__(self, pad_id): self.pad_id = pad_id @@ -119,8 +118,8 @@ class GLMSeq2seqDynamicCollateFN(): ```python train_src, train_tgt = read_file() print('-----------train data length:', len(train_src)) -my_collate_fn = GLMSeq2seqDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id) -train_dataset = GLMSeq2seqDataset(train_src, +my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id) +train_dataset = GLMTitleGenerationDataset(train_src, train_tgt) ``` ### 2.加载模型和分词器 @@ -128,12 +127,12 @@ train_dataset = GLMSeq2seqDataset(train_src, ```python from flagai.auto_model.auto_loader import AutoLoader -# the model dir, which contains the 1.config.json, 2.pytorch_model.bin, 3.vocab.txt, -# or we will download these files from the model hub to this dir. +# model_dir: 包含 1.config.json, 2.pytorch_model.bin, 3.vocab.txt, +# 如果本地没有,则会在modelhub上进行查找并下载 +# Autoloader 能够自动构建模型与切词器 +# 'title-generation' 是task_name model_dir = "./state_dict/glm/" -# Autoloader can build the model and tokenizer automatically. -# 'seq2seq' is the task_name. 
-AutoLoader("seq2seq",model_name="GLM-large-ch",model_dir=model_dir) +AutoLoader("title-generation",model_name="GLM-large-ch",model_dir=model_dir) model = auto_loader.get_model() tokenizer = auto_loader.get_tokenizer() ``` @@ -149,7 +148,7 @@ from flagai.trainer import Trainer device = torch.device("cuda" if torch.cuda.is_available() else "cpu") trainer = Trainer( env_type="pytorch", - experiment_name="roberta_seq2seq", + experiment_name="glm-title-generation", batch_size=1, gradient_accumulation_steps=1, lr=2e-4, diff --git a/doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md b/doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md index 8d9c7fa4..6b67798a 100644 --- a/doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md +++ b/doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md @@ -1,15 +1,23 @@ # GLM 例子:古诗生成 ## 古诗背景介绍 -中国古代近体诗有两种体裁:绝句和律诗。 +中国古代近体诗有两种体裁:绝句和律诗。绝句规定全诗四句。律诗规定全诗八句。每句话含五个或七个汉字,称为五言或七言。总共有四种类型,见下表。 -绝句规定全诗四句。其中五言绝句(五绝)为每句五个字,共二十个字;七言绝句(七绝)为每句七个字,共二十八个字。 +| | 绝句 | 律诗 | +| ---- | ---- | ---- | +| 五言 | 五言绝句 | 五言律诗 | +| 七言 | 七言绝句 | 七言律诗 | -律诗规定全诗八句。其中五言律诗(五律)为每句五个字,共四十个字;七言律诗(七律)为每句七个字,共五十六个字。 +一个五言绝句的示例: -绝句在格律上虽讲究押韵,但对平仄和对仗要求不严。律诗在格律上要求严谨,押韵严格、讲究平仄和要求对仗。 +**静夜思** 李白 -## 结果展示 +床前明月光,疑是地上霜。 + +举头望明月,低头思故乡。 + + +## 生成结果展示 #### 输入古诗标题与体裁 ``` "桃花:七言绝句" @@ -20,31 +28,47 @@ ``` ## 模型训练(train.py) -运行前修改训练数据路径src_dir, tgt_dir, 模型路径model_dir。在命令行运行此命令: +在命令行运行此命令: ```commandline -cd ./examples/glm_poetry_generation +cd FlagAI/examples/glm_poetry_generation python ./train.py ``` 这里使用`GLM-large-ch`作为样例,如果想要使用`GLM-10b-ch`请点[这里](https://model.baai.ac.cn/model-detail/100001)。 ### 1.准备训练数据 -1)定义文件读取函数,从文件中读取数据,得到src和tgt列表: +1)从文件中读取数据 + +样例数据在 FlagAI/examples/glm_poetry_generation/data/ + +需要在 train.py 中定义数据加载过程,得到src和tgt列表: ```python def read_file(): src = [] tgt = [] - - ##TODO read data file to load src and tgt, for example: - ## src = ["春晓:五言绝句", "标题:五言律诗",......] - ## tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......] - ## no matter what data you use, you need to construct the right src and tgt. - - return src,tgt + # src = ["春晓:五言绝句", "标题:五言律诗",......] + # tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......] + # 如果换为其他数据,修改处理方式即可,只需要构造好src以及对应tgt列表 + with open(src_dir, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if ":" in line: + l = line.split(":") #line eg:"初夏:五言绝句" + #if there are more than one ':', get title before the first ':' + title, style = l[0], l[-1] + if len(title) > 20: + title = title[:20] #cut the longer title + line = ":".join([title, style]) + src.append(line) + with open(tgt_dir, 'r', encoding='utf-8') as f: + for line in f: + tgt.append(line.strip()) + assert len(src) == len(tgt), 'lines not equal!' 
 2)定义数据迭代器(DataLoader):
 ```python
-class BertSeq2seqDataset(Dataset):
+class GLMPoetryDataset(Dataset):
     def __init__(self, sents_src, sents_tgt):
-        super(BertSeq2seqDataset, self).__init__()
+        super(GLMPoetryDataset, self).__init__()
         self.sents_src = sents_src
         self.sents_tgt = sents_tgt
@@ -58,9 +82,9 @@ class BertSeq2seqDataset(Dataset):
     def __len__(self):
         return len(self.sents_src)
 ```
-其中tokenizer.encode_plus()方法将源、目标字符串转换为GLM模型的输入token id等数据
+其中tokenizer.encode_plus()方法将源、目标字符串转换为GLM模型的输入token索引、位置编码等数据
-3)定义数据迭代器(DataLoader)中的批处理函数(collate_fn),用于将一批(batch)数据填充(padding)成统一大小
+3)定义数据迭代器(DataLoader)中的collate_fn,用于将一批(batch)数据填充(padding)成统一大小
 ```python
 class GLMPoetryDynamicCollateFN():
     def __init__(self, pad_id):
@@ -108,7 +132,7 @@ class GLMPoetryDynamicCollateFN():
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
 my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
-train_dataset = BertSeq2seqDataset(train_src,
+train_dataset = GLMPoetryDataset(train_src,
                                    train_tgt)
 ```
 ### 2.加载模型和分词器
@@ -116,12 +140,12 @@ train_dataset = BertSeq2seqDataset(train_src,
 ```python
 from flagai.auto_model.auto_loader import AutoLoader
 
-# the model dir, which contains the 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
-# or we will download these files from the model hub to this dir.
-model_dir = "./state_dict/glm/"
-# Autoloader can build the model and tokenizer automatically.
-# 'seq2seq' is the task_name.
-AutoLoader("seq2seq",model_name="GLM-large-ch",model_dir=model_dir)
+# model_dir: 包含 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
+# 如果本地没有,则会在modelhub上进行查找并下载
+# Autoloader 能够自动构建模型与切词器
+# 'poetry' 是task_name
+model_dir = "./state_dict/"
+auto_loader = AutoLoader("poetry",model_name="GLM-large-ch",model_dir=model_dir)
 model = auto_loader.get_model()
 tokenizer = auto_loader.get_tokenizer()
 ```
@@ -132,30 +156,22 @@ tokenizer = auto_loader.get_tokenizer()
 ```python
 from flagai.trainer import Trainer
 trainer = Trainer(
-    env_type="pytorch",#pytorch or deepspeed
-    experiment_name="glm_seq2seq",
-    batch_size=64,#96
+    env_type="pytorch",
+    experiment_name="glm_poetry",
+    batch_size=4,
     gradient_accumulation_steps=1,
-    lr=2e-4,#2e-4
-    weight_decay=2e-8,#1e-3
+    lr=2e-4,
+    weight_decay=2e-8,
     epochs=100,
     log_interval=10,
     tensorboard_dir="tbsummary",
     eval_interval=2000000,
-    load_dir="",
+    load_dir=None,
     save_dir="checkpoints_poetry",
     save_epoch=1,
-    num_checkpoints=1,
-    master_ip='127.0.0.1',
-    master_port=17750,
-    num_nodes=1,
-    num_gpus=2,
-    hostfile='./hostfile',
-    deepspeed_config='./deepspeed.json',
-    training_script=__file__,
 )
 ```
-将模型、数据、批处理函数输入训练器开始训练:
+将模型、数据、collate_fn输入训练器开始训练:
 ```python
 trainer.train(model,
               train_dataset=train_dataset,
@@ -166,9 +182,9 @@ trainer.train(model,
 
 ## 生成(generate.py)
-运行前修改模型配置路径model_dir,训练好的模型路径model_save_path。在命令行运行此命令:
+在命令行运行此命令:
 ```commandline
-cd ./examples/glm_poetry_generation
+cd FlagAI/examples/glm_poetry_generation
 python ./generate.py
 ```
 可选择基于概率筛选的随机抽样(random sample)或集束搜索(beamsearch)两种生成方式:
diff --git a/doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md b/doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md
index fc4a1c2f..755614b5 100644
--- a/doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md
+++ b/doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md
@@ -1,4 +1,40 @@
 # 多机训练模型搭建环境
+- [多机训练模型搭建环境](#多机训练模型搭建环境)
+- [一. 
Docker](#一--docker) + - [1.安装docker](#1安装docker) + - [2.Docker 换源](#2docker-换源) + - [3.安装显卡驱动(如已装可跳过)](#3安装显卡驱动如已装可跳过) + - [4.配置nvidia-docker源:](#4配置nvidia-docker源) + - [5.制作dockerfile](#5制作dockerfile) + - [a.拉取nvidia 基础镜像, 创建临时文件夹(容器内,镜像创建完成后,删除)](#a拉取nvidia-基础镜像-创建临时文件夹容器内镜像创建完成后删除) + - [b.配置apt 安装源,并安装一些linux 系统常用基础包](#b配置apt-安装源并安装一些linux-系统常用基础包) + - [c. 安装最新版git(创建镜像clone 安装包)](#c--安装最新版git创建镜像clone-安装包) + - [d. 安装 Mellanox OFED, 由于网络问题,推荐安装包下到本地后,再执行dockerfile](#d-安装--mellanox-ofed-由于网络问题推荐安装包下到本地后再执行dockerfile) + - [e. 安装 nv_peer_mem](#e-安装-nv_peer_mem) + - [f. 安装openmpi, 需先安装libevent 依赖包](#f-安装openmpi-需先安装libevent-依赖包) + - [g.安装 python](#g安装-python) + - [h.安装 magma-cuda](#h安装-magma-cuda) + - [i.配置路径](#i配置路径) + - [j.安装一些pip 包](#j安装一些pip-包) + - [k.安装mpi4py (需下载到本地安装,pip 安装可能因为版本兼容问题报错)](#k安装mpi4py-需下载到本地安装pip-安装可能因为版本兼容问题报错) + - [l.安装pytorch, 版本可替换, 需先下载项目到本地,国内安装容易因为网速原因,造成终止, pytorch git clone 过程中可能有些子包下载过程中会终止。可以多 git clone 几次](#l安装pytorch-版本可替换-需先下载项目到本地国内安装容易因为网速原因造成终止-pytorch-git-clone-过程中可能有些子包下载过程中会终止可以多-git-clone-几次) + - [m.安装apex](#m安装apex) + - [n.安装deepspeed](#n安装deepspeed) + - [o.安装NCCL(可选,pytorch 已自带)](#o安装nccl可选pytorch-已自带) + - [p.配置网络端口、公钥和ssh](#p配置网络端口公钥和ssh) + - [6.构建docker 镜像](#6构建docker-镜像) + - [a.方式一. pull 镜像](#a方式一--pull-镜像) + - [b.方式二. 构建镜像](#b方式二--构建镜像) +- [二. 在每个机器节点构建容器](#二-在每个机器节点构建容器) +- [三. 互信机制设置](#三-互信机制设置) + - [1. 公钥生成默认docker 镜像创建时已生成,如不存在,则在shell 端输入](#1-公钥生成默认docker-镜像创建时已生成如不存在则在shell-端输入) + - [2.将各节点容器生成的公钥文件](#2将各节点容器生成的公钥文件) + - [3.免密登陆](#3免密登陆) + - [4.测试](#4测试) +- [四. 分布式训练测试](#四--分布式训练测试) + - [a.配置hostfile(hostfile 中的V100-1 与~/.ssh/config 对应):](#a配置hostfilehostfile-中的v100-1-与sshconfig-对应) + - [b. 配置glm 文件,各节点配置code 和数据,要求路径相同(也可共同访问云端共享文件)](#b-配置glm-文件各节点配置code-和数据要求路径相同也可共同访问云端共享文件) + - [c. cmd](#c-cmd) # 一. Docker @@ -29,7 +65,7 @@ apt-get install -y docker-ce ## 2.Docker 换源 -### (https://xxxx.mirror.aliyuncs.com) 为自己的docker源仓库 +(https://xxxx.mirror.aliyuncs.com) 为自己的docker源仓库 ```shell mkdir -p /etc/docker @@ -80,7 +116,7 @@ apt-get update apt-get install -y nvidia-docker2 ```` -### 修改/etc/docker/daemon.json,添加相关信息 +修改/etc/docker/daemon.json,添加相关信息 ```text "runtimes": { @@ -90,7 +126,7 @@ apt-get install -y nvidia-docker2 } } ``` -### /etc/docker/daemon.json最终内容 +/etc/docker/daemon.json最终内容 ```json { @@ -104,7 +140,7 @@ apt-get install -y nvidia-docker2 } ``` -### 重启docker服务 +重启docker服务 ```shell systemctl daemon-reload diff --git a/docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md b/docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md index 782933da..a16d8c90 100644 --- a/docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md +++ b/docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md @@ -22,14 +22,14 @@ The title generation task needs to input a piece of text, and the model outputs ## Model Train(train.py) -Modify the training data path src_dir, tgt_dir, and model path model_dir before running. Run this command at the command line: +Run this command at the command line: ```commandline -cd ./examples/glm_title_generation +cd FlagAI/examples/glm_title_generation python ./train.py ``` ### 1.Load data -Sample data is at /examples/bert_title_generation/data/ +Sample data is at /examples/glm_title_generation/data/ 1)Define the load function ```python @@ -37,10 +37,10 @@ def read_file(): src = [] tgt = [] - ## read data file to load src and tgt, for example: - ## src = ["article_1", "article_2", "article_3" ......] - ## tgt = ["title_1", "title_2", "title_3" ......] - ## no matter what data you use, you need to construct the right src and tgt. 
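+    # Note: src_dir and tgt_dir are the data file paths used below; they are
+    # defined elsewhere in train.py, so point them at your own files if needed.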
+    # read data file to load src and tgt, for example:
+    # src = ["article_1", "article_2", "article_3" ......]
+    # tgt = ["title_1", "title_2", "title_3" ......]
+    # no matter what data you use, you need to construct the right src and tgt.
     with open(src_dir, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         for line in lines:
@@ -55,10 +55,10 @@ def read_file():
 
 2)Define the data loading process
 ```python
-class GLMSeq2seqDataset(Dataset):
+class GLMTitleGenerationDataset(Dataset):
     def __init__(self, sents_src, sents_tgt):
-        super(GLMSeq2seqDataset, self).__init__()
+        super(GLMTitleGenerationDataset, self).__init__()
         self.sents_src = sents_src
         self.sents_tgt = sents_tgt
@@ -74,7 +74,7 @@ class GLMSeq2seqDataset(Dataset):
 
 3) Define the batch function (collate_fn) in the data iterator (DataLoader) to pad a batch of data into a uniform size
 ```python
-class GLMSeq2seqDynamicCollateFN():
+class GLMTitleGenerationCollateFN():
     def __init__(self, pad_id):
         self.pad_id = pad_id
@@ -119,8 +119,8 @@ class GLMSeq2seqDynamicCollateFN():
 ```python
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
-my_collate_fn = GLMSeq2seqDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
-train_dataset = GLMSeq2seqDataset(train_src,
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+train_dataset = GLMTitleGenerationDataset(train_src,
                                    train_tgt)
 ```
 ### 2.Load model and tokenizer
@@ -133,7 +133,7 @@ from flagai.auto_model.auto_loader import AutoLoader
 model_dir = "./state_dict/glm/"
 # Autoloader can build the model and tokenizer automatically.
-# 'seq2seq' is the task_name.
-AutoLoader("seq2seq",model_name="GLM-large-ch",model_dir=model_dir)
+# 'title-generation' is the task_name.
+auto_loader = AutoLoader("title-generation",model_name="GLM-large-ch",model_dir=model_dir)
 model = auto_loader.get_model()
 tokenizer = auto_loader.get_tokenizer()
 ```
@@ -149,7 +149,7 @@ from flagai.trainer import Trainer
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 trainer = Trainer(
     env_type="pytorch",
-    experiment_name="roberta_seq2seq",
+    experiment_name="glm-title-generation",
     batch_size=1,
     gradient_accumulation_steps=1,
     lr=2e-4,
@@ -176,7 +176,7 @@ trainer.train(model,
 Modify the model configuration path model_dir and the trained model path model_save_path before running. Run this command at the command line:
 ```commandline
-cd ./examples/glm_title_generation
+cd FlagAI/examples/glm_title_generation
 python ./generate.py
 ```
-You can choose between random sampling based on probability screening (random sample) or beam search (beamsearch) two generation methods:
+You can choose between two generation methods: random sampling based on probability filtering (random sample), or beam search (beamsearch).
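+
+A minimal sketch of both methods, based on the Predictor API used in the FlagAI examples (the method names and parameters below are assumptions; check generate.py for the exact calls):
+
+```python
+from flagai.model.predictor.predictor import Predictor
+
+predictor = Predictor(model, tokenizer)
+text = "..."  # the article to be summarized into a title
+
+# random sampling: truncate the distribution with top-k/top-p, then sample
+print(predictor.predict_generate_randomsample(
+    text, out_max_length=50, top_k=30, top_p=0.9, repetition_penalty=1.2))
+
+# beam search: keep the beam_size highest-scoring candidates at each step
+print(predictor.predict_generate_beamsearch(
+    text, out_max_length=50, beam_size=3))
+```
+
+Beam search tends to produce safer, more literal titles, while sampling gives more varied ones; tune top_k/top_p or beam_size accordingly.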
diff --git a/docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md b/docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
index 93caef47..c55f40cb 100644
--- a/docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
+++ b/docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -1,13 +1,12 @@
 # GLM example: Classical Chinese Poetry Generation
 ## Introduction to the Background of Classical Chinese Poetry
-There are two types of classical Chinese poetry: Jueju(绝句), Lvshi(律诗).
-
-The Jueju contains four lines in the whole poem. The five-character Jueju (wujue) has five characters per sentence, with a total of twenty characters; the seven-character Jueju (qijue) has seven characters per sentence, with a total of twenty-eight characters.
-
-There are eight lines in the Lvshi. The five-character Lvshi (wulv) has five characters per sentence, with a total of forty characters; the seven-character Lvshi (qilv) has seven characters per sentence, with a total of fifty-six characters.
-
+There are two types of classical Chinese poetry: Jueju (绝句) and Lvshi (律诗). A Jueju contains four lines, while a Lvshi contains eight; each line has five or seven Chinese characters. This gives four types in total, as shown in the table below.
+
+| | Jueju | Lvshi |
+| ---- | ---- | ---- |
+| five characters | wujue | wulv |
+| seven characters | qijue | qilv |
 
 ## Result show
 #### Input ancient poem title and type:
 ```
 "桃花:七言绝句"
@@ -20,30 +19,47 @@
 ```
 ## Model training(train.py)
-Modify the training data path (**src_dir**, **tgt_dir**) and **model_dir** before running the file. Input the code in commandline to train:
+Run this command at the command line to train:
 ```commandline
-cd ./examples/glm_poetry_generation
+cd FlagAI/examples/glm_poetry_generation
 python ./train.py
 ```
 ### 1.Prepare the training data
 1)Define the file reading function:
+
+The sample data is in FlagAI/examples/glm_poetry_generation/data/
+
+You need to define the data loading process in train.py to get the **src** and **tgt** lists:
 ```python
 def read_file():
     src = []
     tgt = []
-
-    ##TODO read data file to load src and tgt, for example:
-    ## src = ["春晓:五言绝句", "标题:五言律诗",......]
-    ## tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......]
-    ## no matter what data you use, you need to construct the right src and tgt.
-
-    return src,tgt
+    # src = ["春晓:五言绝句", "标题:五言律诗",......]
+    # tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......]
+    # no matter what data you use, you need to construct the right src and tgt.
+    with open(src_dir, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if ":" in line:
+                l = line.split(":")  # line e.g. "初夏:五言绝句"
+                # if there is more than one ':', keep the title before the first ':'
+                title, style = l[0], l[-1]
+                if len(title) > 20:
+                    title = title[:20]  # truncate overlong titles
+                line = ":".join([title, style])
+            src.append(line)
+    with open(tgt_dir, 'r', encoding='utf-8') as f:
+        for line in f:
+            tgt.append(line.strip())
+    assert len(src) == len(tgt), 'lines not equal!'
+    return src, tgt
 ```
 2)Define the DataLoader:
 ```python
-class BertSeq2seqDataset(Dataset):
+class GLMPoetryDataset(Dataset):
     def __init__(self, sents_src, sents_tgt):
-        super(BertSeq2seqDataset, self).__init__()
+        super(GLMPoetryDataset, self).__init__()
         self.sents_src = sents_src
         self.sents_tgt = sents_tgt
@@ -107,7 +123,7 @@ class GLMPoetryDynamicCollateFN():
 train_src, train_tgt = read_file()
 print('-----------train data length:', len(train_src))
 my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
-train_dataset = BertSeq2seqDataset(train_src,
+train_dataset = GLMPoetryDataset(train_src,
                                    train_tgt)
 ```
 ### 2.Load model and tokenizer
@@ -131,27 +147,19 @@ Instantiate the Trainer, and set the training parameters.
```python from flagai.trainer import Trainer trainer = Trainer( - env_type="pytorch",#pytorch or deepspeed - experiment_name="glm_seq2seq", - batch_size=64,#96 + env_type="pytorch", + experiment_name="glm_poetry", + batch_size=4, gradient_accumulation_steps=1, - lr=2e-4,#2e-4 - weight_decay=2e-8,#1e-3 + lr=2e-4, + weight_decay=2e-8, epochs=100, log_interval=10, tensorboard_dir="tbsummary", eval_interval=2000000, - load_dir="", + load_dir=None, save_dir="checkpoints_poetry", save_epoch=1, - num_checkpoints=1, - master_ip='127.0.0.1', - master_port=17750, - num_nodes=1, - num_gpus=2, - hostfile='./hostfile', - deepspeed_config='./deepspeed.json', - training_script=__file__, ) ``` Pass the model, data, and collate_fn into the trainer to start training: @@ -167,7 +175,7 @@ trainer.train(model, ## Generate(generate.py) Modify the model configuration path **model_dir** and the saved model path **model_save_path** before running. Run the command at the command line: ```commandline -cd ./examples/glm_poetry_generation +cd FlagAI/examples/glm_poetry_generation python ./generate.py ``` You can choose between two generation methods: random sampling based on probability screening or beam search diff --git a/docs/TUTORIAL_8_ENVIRONMENT_SETUP.md b/docs/TUTORIAL_8_ENVIRONMENT_SETUP.md index 49f02aa7..d6ef4585 100644 --- a/docs/TUTORIAL_8_ENVIRONMENT_SETUP.md +++ b/docs/TUTORIAL_8_ENVIRONMENT_SETUP.md @@ -1,4 +1,40 @@ # Pre-training distributed environment setup +- [Pre-training distributed environment setup](#pre-training-distributed-environment-setup) +- [一. Docker](#一--docker) + - [1.install docker](#1install-docker) + - [2.Docker source change](#2docker-source-change) + - [3.Install the graphics card driver (skip if installed)](#3install-the-graphics-card-driver-skip-if-installed) + - [4.Configure NVIDIA-docker source:](#4configure-nvidia-docker-source) + - [5.Make dockerfile](#5make-dockerfile) + - [a. Pull the NVIDIA basic image and create a temporary folder (in the container, delete the image after it is created)](#a-pull-the-nvidia-basic-image-and-create-a-temporary-folder-in-the-container-delete-the-image-after-it-is-created) + - [b.Configure apt installation source and install some common basic packages of Linux](#bconfigure-apt-installation-source-and-install-some-common-basic-packages-of-linux) + - [c. Install the latest version of GIT (create an image clone installation package)](#c-install-the-latest-version-of-git-create-an-image-clone-installation-package) + - [d. Install Mellanox OFED. Due to network problems, it is recommended to download the installation package locally and then execute dockerfile](#d-install-mellanox-ofed-due-to-network-problems-it-is-recommended-to-download-the-installation-package-locally-and-then-execute-dockerfile) + - [e. Install nv_peer_mem](#e-install-nv_peer_mem) + - [f. Install openmpi, You need to install the libevent dependency package first](#f-install-openmpi-you-need-to-install-the-libevent-dependency-package-first) + - [g.Install python](#ginstall-python) + - [h.Install magma-cuda](#hinstall-magma-cuda) + - [i.Configuration path](#iconfiguration-path) + - [j.Install some packages](#jinstall-some-packages) + - [k.Install mpi4py (need to download to local installation, pip installation may report an error due to version compatibility)](#kinstall-mpi4py-need-to-download-to-local-installation-pip-installation-may-report-an-error-due-to-version-compatibility) + - [l.Install pytorch, the version can be replaced, need to download locally first. 
Installation is easy to be terminated due to network speed. Some sub packages may be terminated during the download process of pytorch git clone. You can try few more times.](#linstall-pytorch-the-version-can-be-replaced-need-to-download-locally-first-installation-is-easy-to-be-terminated-due-to-network-speed-some-sub-packages-may-be-terminated-during-the-download-process-of-pytorch-git-clone-you-can-try-few-more-times)
+  - [m.Install apex](#minstall-apex)
+  - [n.Install deepspeed](#ninstall-deepspeed)
+  - [o.Install NCCL(optional)](#oinstall-nccloptional)
+  - [p.Configure network port, public key and SSH](#pconfigure-network-port-public-key-and-ssh)
+  - [6.Build docker image](#6build-docker-image)
+    - [a.Method 1. Pull image](#amethod-1-pull-image)
+    - [b.Method 2. Build image](#bmethod-2-build-image)
+- [二. Build containers at each machine node](#二-build-containers-at-each-machine-node)
+- [三. Mutual trust mechanism setting](#三-mutual-trust-mechanism-setting)
+  - [1. The default docker image for public key has been generated when it is created. If it does not exist, enter flow on the shell](#1-the-default-docker-image-for-public-key-has-been-generated-when-it-is-created-if-it-does-not-exist-enter-flow-on-the-shell)
+  - [2.The public key file generated by each node container](#2the-public-key-file-generated-by-each-node-container)
+  - [3.login without password](#3login-without-password)
+  - [4.test](#4test)
+- [四. Distributed training test](#四--distributed-training-test)
+  - [a.Configure hostfile (~/SSH/config and V100-1 in the hostfile correspondence):](#aconfigure-hostfile-sshconfig-and-v100-1-in-the-hostfile-correspondence)
+  - [b. Configure glm files. Each node is configured with code and data. The path is required to be the same (you can also access cloud shared files together)](#b-configure-glm-files-each-node-is-configured-with-code-and-data-the-path-is-required-to-be-the-same-you-can-also-access-cloud-shared-files-together)
+  - [c. cmd](#c-cmd)
 
 # 一. Docker
 
@@ -30,7 +66,7 @@ apt-get install -y docker-ce
 
 ## 2.Docker source change
 
-### (https://xxxx.mirror.aliyuncs.com) is your own docker source
+Replace (https://xxxx.mirror.aliyuncs.com) with your own docker registry mirror
 
 ```shell
 mkdir -p /etc/docker
@@ -81,7 +117,7 @@ apt-get update
 apt-get install -y nvidia-docker2
 ````
 
-### Modify /etc/docker/daemon.json,Add relevant information
+Modify /etc/docker/daemon.json and add the relevant information
 
 ```text
 "runtimes": {
@@ -91,7 +127,7 @@ apt-get install -y nvidia-docker2
 }
 }
 ```
-### final content of /etc/docker/daemon.json
+Final content of /etc/docker/daemon.json
 
 ```json
 {
@@ -105,7 +141,7 @@ apt-get install -y nvidia-docker2
 }
 ```
 
-### reboot docker service
+Restart the docker service
 
 ```shell
 systemctl daemon-reload