From 2e145eb843d0eee5bacd6b19b8a806677892c3ef Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 15:44:27 -0700
Subject: [PATCH 01/18] moving to .readthedocs.yml

---
 .readthedocs.yml                                | 17 +++++++++++++++++
 DeepSpeedExamples                               |  2 +-
 deepspeed/git_version_info.py                   |  9 +++++++++
 docs/code-docs/requirements.local.txt           |  3 ---
 requirements/requirements-dev.txt               |  3 +++
 .../requirements-readthedocs.txt                |  0
 6 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 .readthedocs.yml
 create mode 100644 deepspeed/git_version_info.py
 delete mode 100755 docs/code-docs/requirements.local.txt
 rename docs/code-docs/requirements.readthedocs.txt => requirements/requirements-readthedocs.txt (100%)

diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 000000000000..84188849f014
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,17 @@
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/code-docs/conf.py
+
+# Optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+  version: 3.7
+  install:
+    - requirements: requirements/requirements-readthedocs.txt
diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index b989b41b526d..9e2c34e31cec 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit b989b41b526db164611bedd3e73c09b8c2c5cbfc
+Subproject commit 9e2c34e31cec99f7d5785c6a1a3b0854c322f883
diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
new file mode 100644
index 000000000000..7072127e3aea
--- /dev/null
+++ b/deepspeed/git_version_info.py
@@ -0,0 +1,9 @@
+version = '0.3.0'
+git_hash = '[none]'
+git_branch = '[none]'
+installed_ops = {
+    'lamb': False,
+    'transformer': False,
+    'sparse-attn': False,
+    'cpu-adam': False
+}
diff --git a/docs/code-docs/requirements.local.txt b/docs/code-docs/requirements.local.txt
deleted file mode 100755
index fdce5922d1b3..000000000000
--- a/docs/code-docs/requirements.local.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-sphinx
-recommonmark
-sphinx-rtd-theme
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index b6a873656eac..0118f6ee760c 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -2,3 +2,6 @@ pytest
 pytest-forked
 pre-commit
 clang-format
+sphinx
+recommonmark
+sphinx-rtd-theme
diff --git a/docs/code-docs/requirements.readthedocs.txt b/requirements/requirements-readthedocs.txt
similarity index 100%
rename from docs/code-docs/requirements.readthedocs.txt
rename to requirements/requirements-readthedocs.txt

From 88fab2b4eed1764d977bc72485c7cb77ae425de1 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 15:46:02 -0700
Subject: [PATCH 02/18] catching up DSE

---
 DeepSpeedExamples | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeepSpeedExamples b/DeepSpeedExamples
index 9e2c34e31cec..b989b41b526d 160000
--- a/DeepSpeedExamples
+++ b/DeepSpeedExamples
@@ -1 +1 @@
-Subproject commit 9e2c34e31cec99f7d5785c6a1a3b0854c322f883
+Subproject commit b989b41b526db164611bedd3e73c09b8c2c5cbfc

From 975cbb7408385a9f80ffcbf54fd363341dda7518 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 15:55:39 -0700
Subject: [PATCH 03/18] conf.py location

---
 .readthedocs.yml                   |  2 +-
 docs/code-docs/source/training.rst | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 84188849f014..b6e6a8dda828 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -4,7 +4,7 @@ version: 2
 
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
-  configuration: docs/code-docs/conf.py
+  configuration: docs/code-docs/source/conf.py
 
 # Optionally build your docs in additional formats such as PDF
 formats:
diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst
index 55ce4642ff70..f72b721076b3 100644
--- a/docs/code-docs/source/training.rst
+++ b/docs/code-docs/source/training.rst
@@ -2,7 +2,7 @@ Training API
 ============
 
 :func:`deepspeed.initialize` returns a *model engine* in its first argument
-of type ``DeepSpeedLight``. This engine is used to progress training:
+of type :class:`DeepSpeedEngine`. This engine is used to progress training:
 
 .. code-block:: python
 
@@ -18,12 +18,14 @@ of type ``DeepSpeedLight``. This engine is used to progress training:
 
 Forward Propagation
 -------------------
-.. autofunction:: deepspeed.DeepSpeedLight.forward
+.. automodule:: deepspeed.DeepSpeedEngine
+
+.. autofunction:: deepspeed.DeepSpeedEngine.forward
 
 Backward Propagation
 --------------------
-.. autofunction:: deepspeed.DeepSpeedLight.backward
+.. autofunction:: deepspeed.DeepSpeedEngine.backward
 
 Optimizer Step
 --------------
-.. autofunction:: deepspeed.DeepSpeedLight.step
+.. autofunction:: deepspeed.DeepSpeedEngine.step

From 2c2e9d163cefb7ebe2918542a714f8f4fa171215 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 15:59:23 -0700
Subject: [PATCH 04/18] mock numpy import

---
 .readthedocs.yml              | 1 +
 docs/code-docs/source/conf.py | 9 +--------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index b6e6a8dda828..7375338f09fe 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,6 +5,7 @@ version: 2
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/code-docs/source/conf.py
+  fail_on_warning: true
 
 # Optionally build your docs in additional formats such as PDF
 formats:
diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py
index d49496c5ba7c..f0f111c3a879 100644
--- a/docs/code-docs/source/conf.py
+++ b/docs/code-docs/source/conf.py
@@ -78,11 +78,4 @@
 
 autoclass_content = 'both'
 
-autodoc_mock_imports = [
-    "torch",
-    "apex",
-    "mpi4py",
-    "tensorboardX",
-    "deepspeed_transformer_cuda",
-    "deepspeed_stochastic_transformer_cuda",
-]
+autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy"]

From f1dea531a86aca8faebcb60cf6722227bd4ca4ac Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:15:37 -0700
Subject: [PATCH 05/18] version info redirect

---
 .gitignore                             |  2 +-
 deepspeed/git_version_info.py          | 13 ++++---------
 deepspeed/git_version_info_template.py |  9 +++++++++
 setup.py                               |  2 +-
 4 files changed, 15 insertions(+), 11 deletions(-)
 create mode 100644 deepspeed/git_version_info_template.py

diff --git a/.gitignore b/.gitignore
index 8bf6f1d2b33d..ae2fb660b176 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,7 @@
 *~
 *.swp
 *.log
-deepspeed/git_version_info.py
+deepspeed/git_version_info_installed.py
 
 # Build + installation data
 build/
diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
index 7072127e3aea..b2773b2aef82 100644
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@@ -1,9 +1,4 @@
-version = '0.3.0'
-git_hash = '[none]'
-git_branch = '[none]'
-installed_ops = {
-    'lamb': False,
-    'transformer': False,
-    'sparse-attn': False,
-    'cpu-adam': False
-}
+try:
+    from .git_version_info_installed import *
+except ImportError:
+    from .git_version_info_sample import *
diff --git a/deepspeed/git_version_info_template.py b/deepspeed/git_version_info_template.py
new file mode 100644
index 000000000000..53040ef01547
--- /dev/null
+++ b/deepspeed/git_version_info_template.py
@@ -0,0 +1,9 @@
+version = '0.3.0+[none]'
+git_hash = '[none]'
+git_branch = '[none]'
+installed_ops = {
+    'lamb': False,
+    'transformer': False,
+    'sparse-attn': False,
+    'cpu-adam': False
+}
diff --git a/setup.py b/setup.py
index 55459395ec79..8c38826f84e9 100755
--- a/setup.py
+++ b/setup.py
@@ -297,7 +297,7 @@ def command_exists(cmd):
     git_hash = "unknown"
     git_branch = "unknown"
 print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}")
-with open('deepspeed/git_version_info.py', 'w') as fd:
+with open('deepspeed/git_version_info_installed.py', 'w') as fd:
     fd.write(f"version='{VERSION}+{git_hash}'\n")
     fd.write(f"git_hash='{git_hash}'\n")
     fd.write(f"git_branch='{git_branch}'\n")

From 19e65b988b00962ea39b31e3d8bd18fe94a6712a Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:21:48 -0700
Subject: [PATCH 06/18] import documentation

---
 deepspeed/git_version_info.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
index b2773b2aef82..5bb2735599a7 100644
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@@ -1,4 +1,6 @@
 try:
+    #  This is populated by setup.py
     from .git_version_info_installed import *
-except ImportError:
-    from .git_version_info_sample import *
+except ModuleNotFoundError:
+    # Will be missing from checkouts that haven't been installed (e.g., readthedocs)
+    from .git_version_info_template import *

From 4fb0451d56821d058136199d8d823118e3cd77dc Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:28:02 -0700
Subject: [PATCH 07/18] cleaning up

---
 docs/code-docs/source/deepspeed.pt.rst | 134 -------------------------
 docs/code-docs/source/deepspeed.rst    |  38 -------
 2 files changed, 172 deletions(-)
 delete mode 100644 docs/code-docs/source/deepspeed.pt.rst
 delete mode 100644 docs/code-docs/source/deepspeed.rst

diff --git a/docs/code-docs/source/deepspeed.pt.rst b/docs/code-docs/source/deepspeed.pt.rst
deleted file mode 100644
index 991963e32a43..000000000000
--- a/docs/code-docs/source/deepspeed.pt.rst
+++ /dev/null
@@ -1,134 +0,0 @@
-deepspeed.pt package
-====================
-
-Submodules
-----------
-
-deepspeed.pt.deepspeed\_config module
--------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_config
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_constants module
-----------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_constants
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_csr\_tensor module
-------------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_csr_tensor
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_dataloader module
------------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_dataloader
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_fused\_lamb module
-------------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_fused_lamb
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_launch module
--------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_launch
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_light module
-------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_light
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_lr\_schedules module
---------------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_lr_schedules
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_run module
-----------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_run
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_timer module
-------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_timer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_utils module
-------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.deepspeed\_zero\_optimizer module
-----------------------------------------------
-
-.. automodule:: deepspeed.pt.deepspeed_zero_optimizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.fp16\_optimizer module
------------------------------------
-
-.. automodule:: deepspeed.pt.fp16_optimizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.fp16\_unfused\_optimizer module
---------------------------------------------
-
-.. automodule:: deepspeed.pt.fp16_unfused_optimizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.pt.loss\_scaler module
---------------------------------
-
-.. automodule:: deepspeed.pt.loss_scaler
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: deepspeed.pt
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/code-docs/source/deepspeed.rst b/docs/code-docs/source/deepspeed.rst
deleted file mode 100644
index 480793bbcaba..000000000000
--- a/docs/code-docs/source/deepspeed.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-deepspeed package
-=================
-
-Subpackages
------------
-
-.. toctree::
-   :maxdepth: 4
-
-   deepspeed.pt
-
-Submodules
-----------
-
-deepspeed.git\_version\_info module
------------------------------------
-
-.. automodule:: deepspeed.git_version_info
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-deepspeed.install\_config module
---------------------------------
-
-.. automodule:: deepspeed.install_config
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: deepspeed
-   :members:
-   :undoc-members:
-   :show-inheritance:

From 6e5872ef40338c403fa6a2fc0e325c8a0badfec8 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:38:18 -0700
Subject: [PATCH 08/18] checkpoint updates

---
 docs/code-docs/source/model-checkpointing.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/code-docs/source/model-checkpointing.rst b/docs/code-docs/source/model-checkpointing.rst
index eaf349b27ccb..064f228f1e2c 100644
--- a/docs/code-docs/source/model-checkpointing.rst
+++ b/docs/code-docs/source/model-checkpointing.rst
@@ -5,8 +5,8 @@ DeepSpeed provides routines for checkpointing model state during training.
 
 Loading Training Checkpoints
 ----------------------------
-.. autofunction:: deepspeed.DeepSpeedLight.load_checkpoint
+.. autofunction:: deepspeed.DeepSpeedEngine.load_checkpoint
 
 Saving Training Checkpoints
 ---------------------------
-.. autofunction:: deepspeed.DeepSpeedLight.save_checkpoint
+.. autofunction:: deepspeed.DeepSpeedEngine.save_checkpoint

From aa58ce8f0ad88d02c6a807f86475f01b1cf97057 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:40:15 -0700
Subject: [PATCH 09/18] clean old file

---
 .readthedocs.yml                  | 2 +-
 docs/code-docs/source/modules.rst | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)
 delete mode 100644 docs/code-docs/source/modules.rst

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 7375338f09fe..a2da36620152 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,7 +5,7 @@ version: 2
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/code-docs/source/conf.py
-  fail_on_warning: true
+  fail_on_warning: false
 
 # Optionally build your docs in additional formats such as PDF
 formats:
diff --git a/docs/code-docs/source/modules.rst b/docs/code-docs/source/modules.rst
deleted file mode 100644
index ffb76bdd7102..000000000000
--- a/docs/code-docs/source/modules.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-deepspeed
-=========
-
-.. toctree::
-   :maxdepth: 4
-
-   deepspeed

From e75ccd39635e707dc003d8aa03f871c14d979d90 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:45:58 -0700
Subject: [PATCH 10/18] docstring indents

---
 deepspeed/ops/transformer/transformer.py      | 60 +++++++++----------
 .../activation_checkpointing/checkpointing.py | 15 +++--
 2 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 54ed407c6ddb..8abc04d50079 100644
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -37,52 +37,52 @@ def __init__(self,
 class DeepSpeedTransformerConfig(TransformerConfig):
     """Initialize the DeepSpeed Transformer Config.
 
-        Arguments:
-            batch_size: The maximum batch size used for running the kernel on each GPU
+    Arguments:
+        batch_size: The maximum batch size used for running the kernel on each GPU
 
-            max_seq_length: The sequence-length of the model being trained with DeepSpeed
+        max_seq_length: The sequence-length of the model being trained with DeepSpeed
 
-            hidden_size: The hidden size of the transformer layer
+        hidden_size: The hidden size of the transformer layer
 
-            heads: The number of heads in the self-attention of the transformer layer
+        heads: The number of heads in the self-attention of the transformer layer
 
-            attn_dropout_ratio: The ratio of dropout for the attention's output
+        attn_dropout_ratio: The ratio of dropout for the attention's output
 
-            hidden_dropout_ratio: The ratio of dropout for the transformer's output
+        hidden_dropout_ratio: The ratio of dropout for the transformer's output
 
-            num_hidden_layers: The number of transformer layers
+        num_hidden_layers: The number of transformer layers
 
-            initializer_range: BERT model's initializer range for initializing parameter data
+        initializer_range: BERT model's initializer range for initializing parameter data
 
-            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-                to use if the model already set the current device, otherwise need to set it
-                so that the transformer kernel can work on the right device
+        local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
+            to use if the model already set the current device, otherwise need to set it
+            so that the transformer kernel can work on the right device
 
-            seed: The random seed for the dropout layers
+        seed: The random seed for the dropout layers
 
-            fp16: Enable half-precision computation
+        fp16: Enable half-precision computation
 
-            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
+        pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
 
-            normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
-                default is False
+        normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
+            default is False
 
-            gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
-                default is False
+        gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
+            default is False
 
-            adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
-                its self-attention output and layer output, False keeps the initializer_range no change.
-                See the adjustment below:
-                    output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
+        adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
+            its self-attention output and layer output, False keeps the initializer_range no change.
+            See the adjustment below:
+                output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
 
-            attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
-                default is False
+        attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
+            default is False
 
-            stochastic_mode:  Enable for high performance, please note that this flag has some level of
-                non-determinism and can produce different results on different runs.  However, we have seen
-                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
-                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
-                to turn it off in order to be able to reproduce the same result through the regular kernel execution.
+        stochastic_mode:  Enable for high performance, please note that this flag has some level of
+            non-determinism and can produce different results on different runs.  However, we have seen
+            that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
+            a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
+            to turn it off in order to be able to reproduce the same result through the regular kernel execution.
     """
     def __init__(self,
                  batch_size=-1,
diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 3950e7eced20..d95d04e7df1e 100755
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -236,6 +236,9 @@ def model_parallel_cuda_manual_seed(seed):
                               parallel GPUs, but the same across data parallel
                               groups. This is used for example for dropout in
                               model parallel regions.
+
+    Args:
+        seed (int): The seed to use.
     """
     global mpu
     # 2718 is just for fun and any POSITIVE value will work.
@@ -314,12 +317,12 @@ def get_full_inputs(tensors, device=None):
 class CheckpointFunction(torch.autograd.Function):
     """This function is adapted from torch.utils.checkpoint with
        two main changes:
-           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
-           2) the states in the model parallel tracker are also properly
-              tracked/set/reset.
-           3) Performance activation partitioning, contiguous memory optimization
-           4) CPU Checkpointing
-           5) Profile forward and backward functions
+       1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+       2) the states in the model parallel tracker are also properly
+           tracked/set/reset.
+       3) Performance activation partitioning, contiguous memory optimization
+       4) CPU Checkpointing
+       5) Profile forward and backward functions
     """
     @staticmethod
     def forward(ctx, run_function, *args):

From 213f6105442f00510706ae5feebf061ba8951a88 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:49:46 -0700
Subject: [PATCH 11/18] docstring indents

---
 deepspeed/ops/transformer/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 8abc04d50079..acb9f7bccc7c 100644
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -55,8 +55,8 @@ class DeepSpeedTransformerConfig(TransformerConfig):
         initializer_range: BERT model's initializer range for initializing parameter data
 
         local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-            to use if the model already set the current device, otherwise need to set it
-            so that the transformer kernel can work on the right device
+        to use if the model already set the current device, otherwise need to set it
+        so that the transformer kernel can work on the right device
 
         seed: The random seed for the dropout layers
 

From 7b0f5ffc638dd69f84dcee023042c4f23368464c Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:52:32 -0700
Subject: [PATCH 12/18] ignoring warnings for now

---
 deepspeed/ops/transformer/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index acb9f7bccc7c..8abc04d50079 100644
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -55,8 +55,8 @@ class DeepSpeedTransformerConfig(TransformerConfig):
         initializer_range: BERT model's initializer range for initializing parameter data
 
         local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-        to use if the model already set the current device, otherwise need to set it
-        so that the transformer kernel can work on the right device
+            to use if the model already set the current device, otherwise need to set it
+            so that the transformer kernel can work on the right device
 
         seed: The random seed for the dropout layers
 

From dd436518f93717f058cccb8250e25a557456239b Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:54:35 -0700
Subject: [PATCH 13/18] view source

---
 docs/code-docs/source/conf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py
index f0f111c3a879..32bd66c17752 100644
--- a/docs/code-docs/source/conf.py
+++ b/docs/code-docs/source/conf.py
@@ -20,10 +20,12 @@
 author = 'Microsoft'
 
 # The full version, including alpha/beta/rc tags
-release = '0.1.0'
+release = '0.3.0'
 
 master_doc = 'index'
 
+html_show_sourcelink = True
+
 autodoc_member_order = 'bysource'
 
 # -- General configuration ---------------------------------------------------

From 12b4d55e0e47f72e2fba7ca473a3f1910b98381c Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 16:57:54 -0700
Subject: [PATCH 14/18] viewcode extension

---
 docs/code-docs/source/conf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py
index 32bd66c17752..167f6427d7b4 100644
--- a/docs/code-docs/source/conf.py
+++ b/docs/code-docs/source/conf.py
@@ -24,8 +24,6 @@
 
 master_doc = 'index'
 
-html_show_sourcelink = True
-
 autodoc_member_order = 'bysource'
 
 # -- General configuration ---------------------------------------------------
@@ -36,6 +34,7 @@
 extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
     'recommonmark',
     'sphinx_rtd_theme',
 ]

From cd460d195b7971e16b4487872d7987e8cc0a3fc5 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 17:22:52 -0700
Subject: [PATCH 15/18] version info cleanup

---
 deepspeed/git_version_info_template.py | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 deepspeed/git_version_info_template.py

diff --git a/deepspeed/git_version_info_template.py b/deepspeed/git_version_info_template.py
deleted file mode 100644
index 53040ef01547..000000000000
--- a/deepspeed/git_version_info_template.py
+++ /dev/null
@@ -1,9 +0,0 @@
-version = '0.3.0+[none]'
-git_hash = '[none]'
-git_branch = '[none]'
-installed_ops = {
-    'lamb': False,
-    'transformer': False,
-    'sparse-attn': False,
-    'cpu-adam': False
-}

From c3316407366d463ed86ea3b1e615cd4c36880c5c Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 17:24:19 -0700
Subject: [PATCH 16/18] version info cleanup

---
 deepspeed/git_version_info.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py
index 5bb2735599a7..82f60a86a6f6 100644
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@@ -3,4 +3,12 @@
     from .git_version_info_installed import *
 except ModuleNotFoundError:
     # Will be missing from checkouts that haven't been installed (e.g., readthedocs)
-    from .git_version_info_template import *
+    version = '0.3.0+[none]'
+    git_hash = '[none]'
+    git_branch = '[none]'
+    installed_ops = {
+        'lamb': False,
+        'transformer': False,
+        'sparse-attn': False,
+        'cpu-adam': False
+    }

From 3d12352b571cb595f15e5f33ed7cf1b1e3bfd859 Mon Sep 17 00:00:00 2001
From: Shaden Smith <Shaden.Smith@microsoft.com>
Date: Mon, 14 Sep 2020 17:25:12 -0700
Subject: [PATCH 17/18] reverting docstring indentation changes

---
 deepspeed/ops/transformer/transformer.py      | 60 +++++++++----------
 .../activation_checkpointing/checkpointing.py | 15 ++---
 2 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 8abc04d50079..54ed407c6ddb 100644
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -37,52 +37,52 @@ def __init__(self,
 class DeepSpeedTransformerConfig(TransformerConfig):
     """Initialize the DeepSpeed Transformer Config.
 
-    Arguments:
-        batch_size: The maximum batch size used for running the kernel on each GPU
+        Arguments:
+            batch_size: The maximum batch size used for running the kernel on each GPU
 
-        max_seq_length: The sequence-length of the model being trained with DeepSpeed
+            max_seq_length: The sequence-length of the model being trained with DeepSpeed
 
-        hidden_size: The hidden size of the transformer layer
+            hidden_size: The hidden size of the transformer layer
 
-        heads: The number of heads in the self-attention of the transformer layer
+            heads: The number of heads in the self-attention of the transformer layer
 
-        attn_dropout_ratio: The ratio of dropout for the attention's output
+            attn_dropout_ratio: The ratio of dropout for the attention's output
 
-        hidden_dropout_ratio: The ratio of dropout for the transformer's output
+            hidden_dropout_ratio: The ratio of dropout for the transformer's output
 
-        num_hidden_layers: The number of transformer layers
+            num_hidden_layers: The number of transformer layers
 
-        initializer_range: BERT model's initializer range for initializing parameter data
+            initializer_range: BERT model's initializer range for initializing parameter data
 
-        local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-            to use if the model already set the current device, otherwise need to set it
-            so that the transformer kernel can work on the right device
+            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
+                to use if the model already set the current device, otherwise need to set it
+                so that the transformer kernel can work on the right device
 
-        seed: The random seed for the dropout layers
+            seed: The random seed for the dropout layers
 
-        fp16: Enable half-precision computation
+            fp16: Enable half-precision computation
 
-        pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
+            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
 
-        normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
-            default is False
+            normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
+                default is False
 
-        gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
-            default is False
+            gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
+                default is False
 
-        adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
-            its self-attention output and layer output, False keeps the initializer_range no change.
-            See the adjustment below:
-                output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
+            adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
+                its self-attention output and layer output, False keeps the initializer_range unchanged.
+                See the adjustment below:
+                    output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
 
-        attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
-            default is False
+            attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
+                default is False
 
-        stochastic_mode:  Enable for high performance, please note that this flag has some level of
-            non-determinism and can produce different results on different runs.  However, we have seen
-            that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
-            a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
-            to turn it off in order to be able to reproduce the same result through the regular kernel execution.
+            stochastic_mode:  Enable for high performance, please note that this flag has some level of
+                non-determinism and can produce different results on different runs.  However, we have seen
+                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
+                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
+                to turn it off in order to be able to reproduce the same result through the regular kernel execution.
     """
     def __init__(self,
                  batch_size=-1,
diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index d95d04e7df1e..3950e7eced20 100755
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -236,9 +236,6 @@ def model_parallel_cuda_manual_seed(seed):
                               parallel GPUs, but the same across data parallel
                               groups. This is used for example for dropout in
                               model parallel regions.
-
-    Args:
-        seed (int): The seed to use.
     """
     global mpu
     # 2718 is just for fun and any POSITIVE value will work.
@@ -317,12 +314,12 @@ def get_full_inputs(tensors, device=None):
 class CheckpointFunction(torch.autograd.Function):
     """This function is adapted from torch.utils.checkpoint with
        two main changes:
-       1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
-       2) the states in the model parallel tracker are also properly
-           tracked/set/reset.
-       3) Performance activation partitioning, contiguous memory optimization
-       4) CPU Checkpointing
-       5) Profile forward and backward functions
+           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+           2) the states in the model parallel tracker are also properly
+              tracked/set/reset.
+           3) Performance activation partitioning, contiguous memory optimization
+           4) CPU Checkpointing
+           5) Profile forward and backward functions
     """
     @staticmethod
     def forward(ctx, run_function, *args):

From 026bea75eac40728ad933bbac1c4b1c54cf615b3 Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Wed, 16 Sep 2020 16:08:59 -0700
Subject: [PATCH 18/18] remove git_version_info_installed.py if exists

---
 install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index b8360e392659..7389a6f993a0 100755
--- a/install.sh
+++ b/install.sh
@@ -137,7 +137,7 @@ rm_if_exist() {
 
 if [ "$no_clean" == "0" ]; then
     # remove deepspeed build files
-    rm_if_exist deepspeed/git_version_info.py
+    rm_if_exist deepspeed/git_version_info_installed.py
     rm_if_exist dist
     rm_if_exist build
     rm_if_exist deepspeed.egg-info