edit tutorial 8 12 13 #29

Closed · wants to merge 2 commits
39 changes: 19 additions & 20 deletions doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -22,25 +22,24 @@

## 模型训练(train.py)

运行前修改训练数据路径src_dir, tgt_dir, 模型路径model_dir。在命令行运行此命令:
在命令行运行此命令:
```commandline
cd ./examples/glm_title_generation
cd FlagAI/examples/glm_title_generation
python ./train.py
```

### 1.数据加载
样例数据在 /examples/bert_title_generation/data/
样例数据在 /examples/glm_title_generation/data/

需要针对数据格式定义数据加载方法,例如:定义文件读取函数,从文件中读取数据,得到src和tgt列表:
1)定义加载过程
```python
def read_file():
src = []
tgt = []

## read data file to load src and tgt, for example:
## src = ["article_1", "article_2", "article_3" ......]
## tgt = ["title_1", "title_2", "title_3" ......]
## no matter what data you use, you need to construct the right src and tgt.
# src = ["article_1", "article_2", "article_3" ......]
# tgt = ["title_1", "title_2", "title_3" ......]
# 如果换为其他数据,修改处理方式即可,只需要构造好src以及对应tgt列表
with open(src_dir, 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
@@ -53,12 +52,12 @@ def read_file():
return src,tgt
```

2)定义数据集处理过程(Dataset)
2)定义数据集处理过程:
```python
class GLMSeq2seqDataset(Dataset):
class GLMTitleGenerationDataset(Dataset):

def __init__(self, sents_src, sents_tgt):
super(GLMSeq2seqDataset, self).__init__()
super(GLMTitleGenerationDataset, self).__init__()
self.sents_src = sents_src
self.sents_tgt = sents_tgt

@@ -74,7 +73,7 @@ class GLMSeq2seqDataset(Dataset):

3)定义数据迭代器(DataLoader)中的批处理函数(collate_fn),用于将一批(batch)数据填充(padding)成统一大小
```python
class GLMSeq2seqDynamicCollateFN():
class GLMTitleGenerationCollateFN():
def __init__(self, pad_id):
self.pad_id = pad_id

@@ -119,21 +118,21 @@ class GLMSeq2seqDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMSeq2seqDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
train_dataset = GLMSeq2seqDataset(train_src,
my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
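下面补充一个示意片段(非教程原文):用 PyTorch 的 DataLoader 单独验证 collate_fn 是否把一个 batch 内的样本填充成统一长度。实际训练时 collate_fn 由训练器使用,这里仅作检查,并假设 my_collate_fn 返回由张量组成的字典,键名以上文实际实现为准:
```python
# 仅为示意(非教程原文):检查 collate_fn 的填充效果
# 假设 my_collate_fn 返回 {"input_ids": Tensor, ...} 形式的字典,键名以实际实现为准
from torch.utils.data import DataLoader

check_loader = DataLoader(train_dataset,
                          batch_size=4,
                          shuffle=False,
                          collate_fn=my_collate_fn)
batch = next(iter(check_loader))
for name, tensor in batch.items():
    print(name, tuple(tensor.shape))  # 同一 batch 内各张量的序列长度应一致
```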
### 2.加载模型和分词器

```python
from flagai.auto_model.auto_loader import AutoLoader

# the model dir, which contains the 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
# or we will download these files from the model hub to this dir.
# model_dir: 包含 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
# 如果本地没有,则会在modelhub上进行查找并下载
# Autoloader 能够自动构建模型与切词器
# 'title-generation' 是task_name
model_dir = "./state_dict/glm/"
# Autoloader can build the model and tokenizer automatically.
# 'seq2seq' is the task_name.
AutoLoader("seq2seq",model_name="GLM-large-ch",model_dir=model_dir)
AutoLoader("title-generation",model_name="GLM-large-ch",model_dir=model_dir)
model = auto_loader.get_model()
tokenizer = auto_loader.get_tokenizer()
```
@@ -149,7 +148,7 @@ from flagai.trainer import Trainer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = Trainer(
env_type="pytorch",
experiment_name="roberta_seq2seq",
experiment_name="glm-title-generation",
batch_size=1,
gradient_accumulation_steps=1,
lr=2e-4,
102 changes: 59 additions & 43 deletions doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -1,15 +1,23 @@
# GLM 例子:古诗生成

## 古诗背景介绍
中国古代近体诗有两种体裁:绝句和律诗。
中国古代近体诗有两种体裁:绝句和律诗。绝句规定全诗四句。律诗规定全诗八句。每句话含五个或七个汉字,称为五言或七言。总共有四种类型,见下表。

绝句规定全诗四句。其中五言绝句(五绝)为每句五个字,共二十个字;七言绝句(七绝)为每句七个字,共二十八个字。
| | 绝句 | 律诗 |
| ---- | ---- | ---- |
| 五言 | 五言绝句 | 五言律诗 |
| 七言 | 七言绝句 | 七言律诗 |

律诗规定全诗八句。其中五言律诗(五律)为每句五个字,共四十个字;七言律诗(七律)为每句七个字,共五十六个字。
一个五言绝句的示例:

绝句在格律上虽讲究押韵,但对平仄和对仗要求不严。律诗在格律上要求严谨,押韵严格、讲究平仄和要求对仗。
**静夜思** 李白

## 结果展示
床前明月光,疑是地上霜。

举头望明月,低头思故乡。


## 生成结果展示
#### 输入古诗标题与体裁
```
"桃花:七言绝句"
@@ -20,31 +28,47 @@
```
## 模型训练(train.py)

运行前修改训练数据路径src_dir, tgt_dir, 模型路径model_dir。在命令行运行此命令:
在命令行运行此命令:
```commandline
cd ./examples/glm_poetry_generation
cd FlagAI/examples/glm_poetry_generation
python ./train.py
```
这里使用`GLM-large-ch`作为样例,如果想要使用`GLM-10b-ch`请点[这里](https://model.baai.ac.cn/model-detail/100001)。
### 1.准备训练数据
1)定义文件读取函数,从文件中读取数据,得到src和tgt列表:
1)从文件中读取数据

样例数据在 FlagAI/examples/glm_poetry_generation/data/

需要在 train.py 中定义数据加载过程,得到src和tgt列表:
```python
def read_file():
src = []
tgt = []

##TODO read data file to load src and tgt, for example:
## src = ["春晓:五言绝句", "标题:五言律诗",......]
## tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......]
## no matter what data you use, you need to construct the right src and tgt.

return src,tgt
# src = ["春晓:五言绝句", "标题:五言律诗",......]
# tgt = ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。", "诗句...", ......]
# 如果换为其他数据,修改处理方式即可,只需要构造好src以及对应tgt列表
with open(src_dir, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if ":" in line:
l = line.split(":") #line eg:"初夏:五言绝句"
#if there are more than one ':', get title before the first ':'
title, style = l[0], l[-1]
if len(title) > 20:
title = title[:20] #cut the longer title
line = ":".join([title, style])
src.append(line)
with open(tgt_dir, 'r', encoding='utf-8') as f:
for line in f:
tgt.append(line.strip())
assert len(src) == len(tgt), 'lines not equal!'
return src, tgt
```
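可以用下面的示意片段(非教程原文)快速检查读入结果是否正确,打印的样例取自上文注释:
```python
# 仅为示意(非教程原文):检查 src 与 tgt 是否一一对应
src, tgt = read_file()
print(len(src), len(tgt))
print(src[0])  # 形如:"春晓:五言绝句"
print(tgt[0])  # 形如:"春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。"
```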
2)定义数据迭代器(DataLoader):
```python
class BertSeq2seqDataset(Dataset):
class GLMPoetryDataset(Dataset):
def __init__(self, sents_src, sents_tgt):
super(BertSeq2seqDataset, self).__init__()
super(GLMPoetryDataset, self).__init__()
self.sents_src = sents_src
self.sents_tgt = sents_tgt

@@ -58,9 +82,9 @@ class BertSeq2seqDataset(Dataset):
def __len__(self):
return len(self.sents_src)
```
其中tokenizer.encode_plus()方法将源、目标字符串转换为GLM模型的输入token id等数据
其中tokenizer.encode_plus()方法将源、目标字符串转换为GLM模型的输入token索引、位置编码等数据
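下面是一个示意片段(非教程原文),假设 __getitem__ 内部调用 encode_plus 并返回包含 input_ids、position_ids 等键的字典(字段名以实际 tokenizer 实现为准),且 tokenizer 已按下文方式加载:
```python
# 仅为示意(非教程原文):查看单条样本经 encode_plus 处理后的结果
# 假设返回值为字典,字段名以实际 tokenizer 实现为准
demo_dataset = GLMPoetryDataset(["春晓:五言绝句"],
                                ["春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。"])
print(demo_dataset[0])
```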

3)定义数据迭代器(DataLoader)中的批处理函数(collate_fn),用于将一批(batch)数据填充(padding)成统一大小
3)定义数据迭代器(DataLoader)中的collate_fn,用于将一批(batch)数据填充(padding)成统一大小
```python
class GLMPoetryDynamicCollateFN():
def __init__(self, pad_id):
@@ -108,20 +132,20 @@ class GLMPoetryDynamicCollateFN():
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
train_dataset = BertSeq2seqDataset(train_src,
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
### 2.加载模型和分词器

```python
from flagai.auto_model.auto_loader import AutoLoader

# the model dir, which contains the 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
# or we will download these files from the model hub to this dir.
model_dir = "./state_dict/glm/"
# Autoloader can build the model and tokenizer automatically.
# 'seq2seq' is the task_name.
AutoLoader("seq2seq",model_name="GLM-large-ch",model_dir=model_dir)
# model_dir: 包含 1.config.json, 2.pytorch_model.bin, 3.vocab.txt,
# 如果本地没有,则会在modelhub上进行查找并下载
# Autoloader 能够自动构建模型与切词器
# 'poetry' 是task_name
model_dir = "./state_dict/"
AutoLoader("poetry",model_name="GLM-large-ch",model_dir=model_dir)
model = auto_loader.get_model()
tokenizer = auto_loader.get_tokenizer()
```
@@ -132,30 +156,22 @@ tokenizer = auto_loader.get_tokenizer()
```python
from flagai.trainer import Trainer
trainer = Trainer(
env_type="pytorch",#pytorch or deepspeed
experiment_name="glm_seq2seq",
batch_size=64,#96
env_type="pytorch",
experiment_name="glm_poetry",
batch_size=4,
gradient_accumulation_steps=1,
lr=2e-4,#2e-4
weight_decay=2e-8,#1e-3
lr=2e-4,
weight_decay=2e-8,
epochs=100,
log_interval=10,
tensorboard_dir="tbsummary",
eval_interval=2000000,
load_dir="",
load_dir=None,
save_dir="checkpoints_poetry",
save_epoch=1,
num_checkpoints=1,
master_ip='127.0.0.1',
master_port=17750,
num_nodes=1,
num_gpus=2,
hostfile='./hostfile',
deepspeed_config='./deepspeed.json',
training_script=__file__,
)
```
将模型、数据、批处理函数输入训练器开始训练
将模型、数据、collate_fn输入训练器开始训练
```python
trainer.train(model,
train_dataset=train_dataset,
@@ -166,9 +182,9 @@ trainer.train(model,


## 生成(generate.py)
运行前修改模型配置路径model_dir,训练好的模型路径model_save_path。在命令行运行此命令:
在命令行运行此命令:
```commandline
cd ./examples/glm_poetry_generation
cd FlagAI/examples/glm_poetry_generation
python ./generate.py
```
可选择基于概率筛选的随机抽样(random sample)或集束搜索(beamsearch)两种生成方式:
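下面是一个示意性的生成片段(非教程原文),假设使用 FlagAI 的 Predictor 进行推理;导入路径、方法名与参数均以所装 FlagAI 版本的 generate.py 为准:
```python
# 仅为示意(非教程原文):两种生成方式的调用示例
# Predictor 的导入路径、方法名与参数均为假设,以实际 FlagAI 版本为准
from flagai.model.predictor.predictor import Predictor

predictor = Predictor(model, tokenizer)
text = "桃花:七言绝句"

# 随机抽样(random sample)
print(predictor.predict_generate_randomsample(text,
                                              out_max_length=66,
                                              top_k=10,
                                              top_p=0.9,
                                              repetition_penalty=1.5))
# 集束搜索(beam search)
print(predictor.predict_generate_beamsearch(text,
                                            out_max_length=66,
                                            beam_size=10))
```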
44 changes: 40 additions & 4 deletions doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md
@@ -1,4 +1,40 @@
# 多机训练模型搭建环境
- [多机训练模型搭建环境](#多机训练模型搭建环境)
- [一. Docker](#一--docker)
- [1.安装docker](#1安装docker)
- [2.Docker 换源](#2docker-换源)
- [3.安装显卡驱动(如已装可跳过)](#3安装显卡驱动如已装可跳过)
- [4.配置nvidia-docker源:](#4配置nvidia-docker源)
- [5.制作dockerfile](#5制作dockerfile)
- [a.拉取nvidia 基础镜像, 创建临时文件夹(容器内,镜像创建完成后,删除)](#a拉取nvidia-基础镜像-创建临时文件夹容器内镜像创建完成后删除)
- [b.配置apt 安装源,并安装一些linux 系统常用基础包](#b配置apt-安装源并安装一些linux-系统常用基础包)
- [c. 安装最新版git(创建镜像clone 安装包)](#c--安装最新版git创建镜像clone-安装包)
- [d. 安装 Mellanox OFED, 由于网络问题,推荐安装包下到本地后,再执行dockerfile](#d-安装--mellanox-ofed-由于网络问题推荐安装包下到本地后再执行dockerfile)
- [e. 安装 nv_peer_mem](#e-安装-nv_peer_mem)
- [f. 安装openmpi, 需先安装libevent 依赖包](#f-安装openmpi-需先安装libevent-依赖包)
- [g.安装 python](#g安装-python)
- [h.安装 magma-cuda](#h安装-magma-cuda)
- [i.配置路径](#i配置路径)
- [j.安装一些pip 包](#j安装一些pip-包)
- [k.安装mpi4py (需下载到本地安装,pip 安装可能因为版本兼容问题报错)](#k安装mpi4py-需下载到本地安装pip-安装可能因为版本兼容问题报错)
- [l.安装pytorch, 版本可替换, 需先下载项目到本地,国内安装容易因为网速原因,造成终止, pytorch git clone 过程中可能有些子包下载过程中会终止。可以多 git clone 几次](#l安装pytorch-版本可替换-需先下载项目到本地国内安装容易因为网速原因造成终止-pytorch-git-clone-过程中可能有些子包下载过程中会终止可以多-git-clone-几次)
- [m.安装apex](#m安装apex)
- [n.安装deepspeed](#n安装deepspeed)
- [o.安装NCCL(可选,pytorch 已自带)](#o安装nccl可选pytorch-已自带)
- [p.配置网络端口、公钥和ssh](#p配置网络端口公钥和ssh)
- [6.构建docker 镜像](#6构建docker-镜像)
- [a.方式一. pull 镜像](#a方式一--pull-镜像)
- [b.方式二. 构建镜像](#b方式二--构建镜像)
- [二. 在每个机器节点构建容器](#二-在每个机器节点构建容器)
- [三. 互信机制设置](#三-互信机制设置)
- [1. 公钥生成默认docker 镜像创建时已生成,如不存在,则在shell 端输入](#1-公钥生成默认docker-镜像创建时已生成如不存在则在shell-端输入)
- [2.将各节点容器生成的公钥文件](#2将各节点容器生成的公钥文件)
- [3.免密登陆](#3免密登陆)
- [4.测试](#4测试)
- [四. 分布式训练测试](#四--分布式训练测试)
- [a.配置hostfile(hostfile 中的V100-1 与~/.ssh/config 对应):](#a配置hostfilehostfile-中的v100-1-与sshconfig-对应)
- [b. 配置glm 文件,各节点配置code 和数据,要求路径相同(也可共同访问云端共享文件)](#b-配置glm-文件各节点配置code-和数据要求路径相同也可共同访问云端共享文件)
- [c. cmd](#c-cmd)

# 一. Docker

@@ -29,7 +65,7 @@ apt-get install -y docker-ce

## 2.Docker 换源

### (https://xxxx.mirror.aliyuncs.com) 为自己的docker源仓库
(https://xxxx.mirror.aliyuncs.com) 为自己的docker源仓库

```shell
mkdir -p /etc/docker
@@ -80,7 +116,7 @@ apt-get update
apt-get install -y nvidia-docker2
````

### 修改/etc/docker/daemon.json,添加相关信息
修改/etc/docker/daemon.json,添加相关信息

```text
"runtimes": {
@@ -90,7 +126,7 @@ apt-get install -y nvidia-docker2
}
}
```
### /etc/docker/daemon.json最终内容
/etc/docker/daemon.json最终内容

```json
{
@@ -104,7 +140,7 @@ apt-get install -y nvidia-docker2
}
```

### 重启docker服务
重启docker服务

```shell
systemctl daemon-reload