diff --git a/.clang-format b/.clang-format index 9b547c1a18fe..9f90836e1f2a 100755 --- a/.clang-format +++ b/.clang-format @@ -20,7 +20,7 @@ AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes +AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false BraceWrapping: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000000..173a51cda5de --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,51 @@ +# This is a basic workflow to help you get started with Actions + +name: Build + +# Controls when the action will run. +on: + push: + paths-ignore: + - 'docs/**' + pull_request: + paths-ignore: + - 'docs/**' + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: self-hosted + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + # Runs a single command using the runners shell + - name: environment + run: | + nvidia-smi + which python + python --version + which nvcc + nvcc --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + # Runs a set of commands using the runners shell + - name: Install deepspeed + run: | + pip install .[dev] + ds_report + + - name: Formatting checks + run: | + pre-commit run --all-files + + # Runs a set of commands using the runners shell + - name: Unit tests + run: | + if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/ diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml new file mode 100644 index 000000000000..4005d4baf2fc --- /dev/null +++ b/.github/workflows/pre-compile-ops.yml @@ -0,0 +1,47 @@ +# This is a basic workflow to help you get started with Actions + +name: Tests-w-precompiled-ops + +# Controls when the action will run. 
+on: + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: self-hosted + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + + # Runs a single command using the runners shell + - name: environment + run: | + nvidia-smi + which python + python --version + which nvcc + nvcc --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + # Runs a set of commands using the runners shell + - name: Install deepspeed + run: | + DS_BUILD_OPS=1 pip install .[dev] + ds_report + + - name: Formatting checks + run: | + pre-commit run --all-files + + # Runs a set of commands using the runners shell + - name: Unit tests + run: | + if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi + TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/ diff --git a/.gitignore b/.gitignore index e83ac2d32f53..84340857f802 100644 --- a/.gitignore +++ b/.gitignore @@ -3,21 +3,28 @@ *~ *.swp *.log -deepspeed/git_version_info.py +deepspeed/git_version_info_installed.py # Build + installation data build/ dist/ -fused_lamb_*.so +*.so deepspeed.egg-info/ +build.txt # Website docs/_site/ docs/build +docs/code-docs/source/_build docs/code-docs/_build docs/code-docs/build .sass-cache/ .jekyll-cache/ .jekyll-metadata +# Testing data tests/unit/saved_checkpoint/ + +# Dev/IDE data +.vscode +.theia diff --git a/.gitmodules b/.gitmodules index 1257dc13e0f4..37adb6f39e5c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "third_party/apex"] - path = third_party/apex - url = https://github.com/NVIDIA/apex.git [submodule "DeepSpeedExamples"] path = DeepSpeedExamples url = https://github.com/microsoft/DeepSpeedExamples diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000000..a2da36620152 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,18 @@ + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/code-docs/source/conf.py + fail_on_warning: false + +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: requirements/requirements-readthedocs.txt diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000000..ec7993c060aa --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @arashashari @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @niumanar diff --git a/DeepSpeedExamples b/DeepSpeedExamples index fd6fb5148ccf..78d69cb2f89a 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit fd6fb5148ccf5c9ce222432006f1d93806187cd9 +Subproject commit 78d69cb2f89a27b1e9b072df8c3e47d00c024fdc diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000000..53fcc885090e --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include *.txt README.md +recursive-include requirements *.txt +recursive-include 
deepspeed *.cpp *.h *.cu *.tr *.cuh *.cc +recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc diff --git a/README.md b/README.md index edfcb2a98e6c..ee2d3e6bb676 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ -[![Build Status](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_apis/build/status/microsoft.DeepSpeed?branchName=master)](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_build/latest?definitionId=1&branchName=master) +[![Build Status](https://github.com/microsoft/deepspeed/workflows/Build/badge.svg)](https://github.com/microsoft/DeepSpeed/actions) +[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest) [![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed) [DeepSpeed](https://www.deepspeed.ai/) is a deep learning optimization library that makes distributed training easy, efficient, and effective. @@ -9,9 +11,13 @@ library that makes distributed training easy, efficient, and effective.

10x Faster Training

Minimal Code Change

-DeepSpeed can train deep learning models with over a hundred billion parameters on current -generation of GPU clusters, while achieving over 10x in system performance -compared to the state-of-art. Early adopters of DeepSpeed have already produced +DeepSpeed delivers extreme-scale model training for everyone, from data scientists training on massive supercomputers to those training on low-end clusters or even on a single GPU: +* Extreme scale: Using the current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. +* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of the art, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. +* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution compared with dense transformers. +* Extremely communication efficient: 3D parallelism improves communication efficiency, allowing users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. + +Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called [Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft), establishing a new SOTA in the LM category. @@ -25,25 +31,26 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News - -* [2020/05/19] [ZeRO-2 & DeepSpeed: Shattering Barriers of Deep Learning Speed & Scale](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/) -**[_NEW_]** -* [2020/05/19] [An Order-of-Magnitude Larger and Faster Training with ZeRO-2](https://www.deepspeed.ai/news/2020/05/18/zero-stage2.html) -**[_NEW_]** -* [2020/05/19] [The Fastest and Most Efficient BERT Training through Optimized Transformer Kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html) -**[_NEW_]** -* [2020/02/13] [Turing-NLG: A 17-billion-parameter language model by Microsoft](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/) -* [2020/02/13] [ZeRO & DeepSpeed: New system optimizations enable training models with over 100 billion parameters](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) +* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) +* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) +* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) + * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) + * [Training a trillion parameters with pipeline
parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) + * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) + * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) +* [2020/08/07] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) is now available on-demand # Table of Contents | Section | Description | | --------------------------------------- | ------------------------------------------- | | [Why DeepSpeed?](#why-deepspeed) | DeepSpeed overview | -| [Features](#features) | DeepSpeed features | -| [Further Reading](#further-reading) | DeepSpeed documentation, tutorials, etc. | -| [Contributing](#contributing) | Instructions for contributing to DeepSpeed | -| [Publications](#publications) | DeepSpeed publications | +| [Install](#installation) | Installation details | +| [Features](#features) | Feature list and overview | +| [Further Reading](#further-reading) | Documentation, tutorials, etc. | +| [Contributing](#contributing) | Instructions for contributing | +| [Publications](#publications) | Publications related to DeepSpeed | +| [Videos](#videos) | Videos related to DeepSpeed | # Why DeepSpeed? Training advanced deep learning models is challenging. Beyond model design, @@ -55,8 +62,35 @@ a large model easily runs out of memory with pure data parallelism and it is difficult to use model parallelism. DeepSpeed addresses these challenges to accelerate model development *and* training. -# Features +# Installation + +The quickest way to get started with DeepSpeed is via pip; this will install +the latest release of DeepSpeed, which is not tied to specific PyTorch or CUDA +versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer +to as our 'ops'. By default, all of these extensions/ops will be built +just-in-time (JIT) using [torch's JIT C++ extension loader that relies on +ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and +dynamically link them at runtime. + +**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ installing +DeepSpeed. +```bash +pip install deepspeed +``` + +After installation, you can validate your install and see which extensions/ops +your machine is compatible with via the DeepSpeed environment report. + +```bash +ds_report +``` + +If you would like to pre-install any of the DeepSpeed extensions/ops (instead +of JIT compiling) or install pre-compiled ops via PyPI, please see our [advanced +installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/). + +# Features Below we provide a brief feature list; see our detailed [feature overview](https://www.deepspeed.ai/features/) for descriptions and usage. @@ -66,10 +100,27 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage.
* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism) * Support for Custom Model Parallelism * Integration with Megatron-LM -* [Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#memory-and-bandwidth-optimizations) - * The Zero Redundancy Optimizer (ZeRO) - * Constant Buffer Optimization (CBO) +* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/) + * 3D Parallelism +* [The Zero Redundancy Optimizer (ZeRO)](https://www.deepspeed.ai/tutorials/zero/) + * Optimizer State and Gradient Partitioning + * Activation Partitioning + * Constant Buffer Optimization + * Contiguous Memory Optimization +* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/) + * Leverage both CPU/GPU memory for model training + * Support 10B model training on a single GPU +* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html) +* [Sparse attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html) + * Memory- and compute-efficient sparse kernels + * Support 10x long sequences than dense + * Flexible support to different sparse structures +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) + * Custom communication collective + * Up to 5x communication volume saving +* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) * Smart Gradient Accumulation + * Communication/Computation Overlap * [Training Features](https://www.deepspeed.ai/features/#training-features) * Simplified training API * Gradient Clipping @@ -79,6 +130,7 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage. * Memory bandwidth optimized FP16 Optimizer * Large Batch Training with LAMB Optimizer * Memory efficient Training with ZeRO Optimizer + * CPU-Adam * [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing) * [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search) * Learning Rate Range Test @@ -127,8 +179,23 @@ all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact -[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or -comments. +[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. # Publications -1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. [ArXiv:1910.02054](https://arxiv.org/abs/1910.02054) +1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727). +2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703). +3. Minjia Zhang, Yuxiong He. 
(2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). +4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). + +# Videos +1. DeepSpeed KDD 2020 Tutorial + 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29) + 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28) + 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27) + 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26) + 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93) + 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24) +2. Microsoft Research Webinar + * Registration is free and all videos are available on-demand. + * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html). +3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8) diff --git a/azure-pipelines-docker.yml b/azure-pipelines-docker.yml deleted file mode 100644 index dc1782f997f3..000000000000 --- a/azure-pipelines-docker.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Docker -# Build a Docker image -# https://docs.microsoft.com/azure/devops/pipelines/languages/docker - -trigger: -- master - -resources: -- repo: self - -variables: - tag: '$(Build.BuildId)' - -stages: -- stage: Build - displayName: Build image - jobs: - - job: Build - displayName: Build - pool: - vmImage: 'ubuntu-latest' - steps: - - task: Docker@2 - displayName: Login to Docker Hub - inputs: - command: login - containerRegistry: DeepSpeedDocker - - task: Docker@2 - displayName: Build and Push - inputs: - command: buildAndPush - dockerfile: '$(Build.SourcesDirectory)/Dockerfile' - repository: deepspeed/deepspeed - tags: | - $(tag) - latest diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index ba6502606ee8..000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,86 +0,0 @@ - -jobs: -- job: Default - timeoutInMinutes: 360 - pool: - name: 'GPU_testing' - - strategy: - matrix: - Python36: - python.version: '3.6' - #Python35: - # python.version: '3.5' - #Python37: - # python.version: '3.7' - #Python38: - # python.version: '3.8' - - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - addToPath: true - architecture: 'x64' - displayName: 'Use Python $(python.version)' - - - script: | - python -m pip install --upgrade pip - pip install --user -r requirements.txt - ./install.sh --pip_sudo - displayName: 'Install dependencies' - - - script: | - pre-commit run --all-files - displayName: 'Formatting checks' - - - script: | - pytest --forked --verbose 
tests/unit/ - displayName: 'Unit tests' - - - script: | - ln -s /data/Megatron-LM/data DeepSpeedExamples/Megatron-LM/ - pip install --user -r DeepSpeedExamples/Megatron-LM/requirements.txt - cd tests/model/ - pytest -s run_sanity_check.py - displayName: 'Model tests' - - #BingBertSquad logs - - task: PublishPipelineArtifact@1 - inputs: - targetPath: '$(Build.SourcesDirectory)/tests/model/BingBertSquad/test/' - artifactName: BingBertSquad_logs - displayName: 'BingBertSquad log uploads' - condition: always() - - # Megatron test logs - #- task: PublishPipelineArtifact@1 - # inputs: - # targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/test/' - # artifactName: Megatron_GPT2_logs - # displayName: 'Megatron GPT2 log uploads' - # condition: always() - - #- task: PublishPipelineArtifact@1 - # inputs: - # targetPath: '$(Build.SourcesDirectory)/tests/model/Megatron_GPT2/checkpoint_test_logs/' - # artifactName: Megatron_GPT2_checkpoint_logs - # displayName: 'Megatron GPT2 checkpoint log uploads' - # condition: always() - - - #BingBert logs - #- task: PublishPipelineArtifact@1 - # inputs: - # targetPath: '$(Build.SourcesDirectory)/tests/model/bing_bert/pretrain_test/' - # artifactName: BingBert_pretrain_logs - # displayName: 'BingBert pretrain logs' - # condition: always() - - #- task: PublishPipelineArtifact@1 - # inputs: - # targetPath: '$(Build.SourcesDirectory)/tests/model/bing_bert/checkpoint_test_logs/' - # artifactName: BingBert_checkpoint_logs - # displayName: 'BingBert checkpoint logs' - # condition: always() diff --git a/azure/README.md b/azure/README.md deleted file mode 120000 index fb962e96a1f9..000000000000 --- a/azure/README.md +++ /dev/null @@ -1 +0,0 @@ -../docs/_tutorials/azure.md \ No newline at end of file diff --git a/azure/README.md b/azure/README.md new file mode 100644 index 000000000000..1cca695bfa7e --- /dev/null +++ b/azure/README.md @@ -0,0 +1,3 @@ +# Getting Started with DeepSpeed on Azure + +Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure! 
diff --git a/basic_install_test.py b/basic_install_test.py deleted file mode 100644 index 966b124f5813..000000000000 --- a/basic_install_test.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -import importlib - -try: - import deepspeed as ds - print("deepspeed successfully imported") -except ImportError as err: - raise err - -print(f"torch version: {torch.__version__}") - -print(f"deepspeed info: {ds.__version__}, {ds.__git_hash__}, {ds.__git_branch__}") - -try: - apex_C = importlib.import_module('apex_C') - print("apex successfully installed") -except Exception as err: - raise err - -try: - fused_lamb = importlib.import_module('deepspeed_lamb_cuda') - print('deepspeed fused lamb kernels successfully installed') -except Exception as err: - raise err - -try: - from apex.optimizers import FP16_Optimizer - print("using old-style apex") -except ImportError: - print("using new-style apex") - -try: - ds_transformer = importlib.import_module('deepspeed_transformer_cuda') - print('deepspeed transformer kernels successfully installed') -except Exception as err: - raise err diff --git a/bin/ds b/bin/ds index 47efea32da34..6bb47da8ce7c 100755 --- a/bin/ds +++ b/bin/ds @@ -1,6 +1,6 @@ #!/usr/bin/env python -from deepspeed.pt.deepspeed_run import main +from deepspeed.launcher.runner import main if __name__ == '__main__': main() diff --git a/bin/ds_elastic b/bin/ds_elastic new file mode 100644 index 000000000000..ef92cbdab32d --- /dev/null +++ b/bin/ds_elastic @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +import argparse +import json + +import deepspeed +from deepspeed.elasticity import compute_elastic_config + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json") + parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size") + args = parser.parse_args() + ds_config = json.load(open(args.config, 'r')) + + ds_version = deepspeed.__version__ + + elastic_config = ds_config['elasticity'] + print('------------------------------------------') + print("Elasticity config:") + print('------------------------------------------') + print(json.dumps(elastic_config, indent=4, sort_keys=True)) + + if args.world_size > 0: + final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size) + print('------------------------------------------') + print(f"Calculated results for world size {args.world_size}:") + print('------------------------------------------') + print(f'final_batch_size .... {final_batch_size}') + print(f'valid_gpus .......... {valid_gpus}') + print(f'micro_batch_size .... {micro_batch_size}') + else: + final_batch_size, valid_gpus = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) + print('------------------------------------------') + print("Calculated results:") + print('------------------------------------------') + print(f'final_batch_size .... {final_batch_size}') + print(f'valid_gpus .......... 
{valid_gpus}') diff --git a/bin/ds_report b/bin/ds_report new file mode 100644 index 000000000000..c03a95645eae --- /dev/null +++ b/bin/ds_report @@ -0,0 +1,6 @@ +#!/usr/bin/env python + +from deepspeed.env_report import main + +if __name__ == '__main__': + main() diff --git a/csrc/adam/compat.h b/csrc/adam/compat.h new file mode 100644 index 000000000000..86f84a85065c --- /dev/null +++ b/csrc/adam/compat.h @@ -0,0 +1,14 @@ +/* Copyright 2020 The Microsoft DeepSpeed Team + Copyright NVIDIA/apex + This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp new file mode 100755 index 000000000000..e817322630b8 --- /dev/null +++ b/csrc/adam/cpu_adam.cpp @@ -0,0 +1,677 @@ +#include "cpu_adam.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" +#include "custom_cuda_layers.h" + +static std::unordered_map> s_optimizers; + +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) + +// C++ interface + +void Adam_Optimizer::Step(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + + float step_size = -1 * _alpha / _bias_correction1; + float w_decay = -1 * _alpha * _weight_decay; + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(_bias_correction2); + + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + AVX_Data weight_decay4; + if (_weight_decay > 0) + weight_decay4.data = (_adamw_mode ? 
SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); + rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH) { + AVX_Data grad_4; + grad_4.data = SIMD_LOAD(grads + i); + + AVX_Data momentum_4; + momentum_4.data = SIMD_LOAD(_exp_avg + i); + AVX_Data variance_4; + variance_4.data = SIMD_LOAD(_exp_avg_sq + i); + + AVX_Data param_4; + param_4.data = SIMD_LOAD(_params + i); + + if (_weight_decay > 0 && !_adamw_mode) { + grad_4.data = SIMD_FMA(param_4.data, weight_decay4.data, grad_4.data); + } + momentum_4.data = SIMD_MUL(momentum_4.data, betta1_4.data); + momentum_4.data = SIMD_FMA(grad_4.data, betta1_minus1_4.data, momentum_4.data); + + variance_4.data = SIMD_MUL(variance_4.data, betta2_4.data); + grad_4.data = SIMD_MUL(grad_4.data, grad_4.data); + variance_4.data = SIMD_FMA(grad_4.data, betta2_minus1_4.data, variance_4.data); + + grad_4.data = SIMD_SQRT(variance_4.data); + grad_4.data = SIMD_FMA(grad_4.data, bias2_sqrt.data, eps_4.data); + grad_4.data = SIMD_DIV(momentum_4.data, grad_4.data); + if (_weight_decay > 0 && _adamw_mode) { + param_4.data = SIMD_FMA(param_4.data, weight_decay4.data, param_4.data); + } + param_4.data = SIMD_FMA(grad_4.data, step_size_4.data, param_4.data); + + SIMD_STORE(_params + i, param_4.data); + + if (dev_params) SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4.data); + + SIMD_STORE(_exp_avg + i, momentum_4.data); + SIMD_STORE(_exp_avg_sq + i, variance_4.data); + } + if (dev_params) { + launch_param_update(_doubled_buffer[_buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); + _buf_index = !_buf_index; + } + } + +#endif + + if (_param_size > rounded_size) { +#pragma omp parallel for + for (size_t k = rounded_size; k < _param_size; k++) { + float grad = grads[k]; + float param = _params[k]; + float momentum = _exp_avg[k]; + float variance = _exp_avg_sq[k]; + if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; } + momentum = momentum * _betta1; + momentum = grad * betta1_minus1 + momentum; + + variance = variance * _betta2; + grad = grad * grad; + variance = grad * betta2_minus1 + variance; + + grad = sqrt(variance); + grad = grad * _bias_correction2 + _eps; + grad = momentum / grad; + if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } + param = grad * step_size + param; + if (dev_params) _doubled_buffer[_buf_index][k - rounded_size] = (__half)param; + + _params[k] = param; + _exp_avg[k] = momentum; + _exp_avg_sq[k] = variance; + } + if (dev_params) { + launch_param_update(_doubled_buffer[_buf_index], + dev_params + rounded_size, + (_param_size - rounded_size), + Context::Instance().GetCurrentStream()); + } + } +} + +void Adam_Optimizer::Step_4(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + AVX_Data bias2_sqrt; + bias2_sqrt.data = 
SIMD_SET(_bias_correction2); + + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + float step_size = -1 * _alpha / _bias_correction1; + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + float w_decay = -1 * _alpha * _weight_decay; + AVX_Data weight_decay4; + if (_weight_decay > 0) + weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); + rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { + AVX_Data grad_4[4]; + grad_4[0].data = SIMD_LOAD(grads + i); + grad_4[1].data = SIMD_LOAD(grads + i + SIMD_WIDTH); + grad_4[2].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 3); + + AVX_Data momentum_4[4]; + momentum_4[0].data = SIMD_LOAD(_exp_avg + i); + momentum_4[1].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH); + momentum_4[2].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 1)); + momentum_4[3].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 3); + + AVX_Data variance_4[4]; + variance_4[0].data = SIMD_LOAD(_exp_avg_sq + i); + variance_4[1].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH); + variance_4[2].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + variance_4[3].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 3); + + AVX_Data param_4[4]; + param_4[0].data = SIMD_LOAD(_params + i); + param_4[1].data = SIMD_LOAD(_params + i + SIMD_WIDTH); + param_4[2].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 3); + + if (_weight_decay > 0 && !_adamw_mode) { + grad_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, grad_4[3].data); + } + + momentum_4[0].data = SIMD_MUL(momentum_4[0].data, betta1_4.data); + momentum_4[0].data = SIMD_FMA(grad_4[0].data, betta1_minus1_4.data, momentum_4[0].data); + momentum_4[1].data = SIMD_MUL(momentum_4[1].data, betta1_4.data); + momentum_4[1].data = SIMD_FMA(grad_4[1].data, betta1_minus1_4.data, momentum_4[1].data); + momentum_4[2].data = SIMD_MUL(momentum_4[2].data, betta1_4.data); + momentum_4[2].data = SIMD_FMA(grad_4[2].data, betta1_minus1_4.data, momentum_4[2].data); + momentum_4[3].data = SIMD_MUL(momentum_4[3].data, betta1_4.data); + momentum_4[3].data = SIMD_FMA(grad_4[3].data, betta1_minus1_4.data, momentum_4[3].data); + + variance_4[0].data = SIMD_MUL(variance_4[0].data, betta2_4.data); + variance_4[1].data = SIMD_MUL(variance_4[1].data, betta2_4.data); + variance_4[2].data = SIMD_MUL(variance_4[2].data, betta2_4.data); + variance_4[3].data = SIMD_MUL(variance_4[3].data, betta2_4.data); + grad_4[0].data = SIMD_MUL(grad_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_MUL(grad_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_MUL(grad_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_MUL(grad_4[3].data, grad_4[3].data); + variance_4[0].data = SIMD_FMA(grad_4[0].data, betta2_minus1_4.data, variance_4[0].data); + variance_4[1].data = SIMD_FMA(grad_4[1].data, betta2_minus1_4.data, variance_4[1].data); + variance_4[2].data = SIMD_FMA(grad_4[2].data, betta2_minus1_4.data, variance_4[2].data); + 
variance_4[3].data = SIMD_FMA(grad_4[3].data, betta2_minus1_4.data, variance_4[3].data); + + grad_4[0].data = SIMD_SQRT(variance_4[0].data); + grad_4[1].data = SIMD_SQRT(variance_4[1].data); + grad_4[2].data = SIMD_SQRT(variance_4[2].data); + grad_4[3].data = SIMD_SQRT(variance_4[3].data); + + grad_4[0].data = SIMD_FMA(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = SIMD_FMA(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = SIMD_FMA(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = SIMD_FMA(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = SIMD_DIV(momentum_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_DIV(momentum_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_DIV(momentum_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_DIV(momentum_4[3].data, grad_4[3].data); + + if (_weight_decay > 0 && _adamw_mode) { + param_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, param_4[3].data); + } + + param_4[0].data = SIMD_FMA(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(grad_4[3].data, step_size_4.data, param_4[3].data); + + SIMD_STORE(_params + i, param_4[0].data); + SIMD_STORE(_params + i + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 3, param_4[3].data); + + if (dev_params) { + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), + param_4[2].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3].data); + } + + SIMD_STORE(_exp_avg + i, momentum_4[0].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH, momentum_4[1].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 1), momentum_4[2].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 3, momentum_4[3].data); + + SIMD_STORE(_exp_avg_sq + i, variance_4[0].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH, variance_4[1].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 1), variance_4[2].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 3, variance_4[3].data); + } + + if (dev_params) { + launch_param_update(_doubled_buffer[_buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); + _buf_index = !_buf_index; + } + } +#endif + if (_param_size > rounded_size) + Step((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? (dev_params + rounded_size) : dev_params)); +} + +int create_adam_optimizer(int optimizer_id, + float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0, + bool adamw_mode = true) +{ + auto opt = + std::make_shared(alpha, betta1, betta2, eps, weight_decay, adamw_mode); + + s_optimizers[optimizer_id] = opt; +#if defined(__AVX512__) + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with AVX512 arithmetic capability." 
<< std::endl; + printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n", + alpha, + betta1, + betta2, + weight_decay, + (int)adamw_mode); +#else +#if defined(__AVX256__) + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with AVX2 arithmetic capability." << std::endl; + printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n", + alpha, + betta1, + betta2, + weight_decay, + (int)adamw_mode); +#else + std::cout << "Adam Optimizer #" << optimizer_id + << " is created with scalar arithmetic capability." << std::endl; + printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n", + alpha, + betta1, + betta2, + weight_decay, + (int)adamw_mode); +#endif +#endif + return 0; +} + +void Adam_Optimizer::Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(_bias_correction2); + + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + float step_size = -1 * _alpha / _bias_correction1; + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + float w_decay = -1 * _alpha * _weight_decay; + AVX_Data weight_decay4; + if (_weight_decay > 0) + weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); + rounded_size = ROUND_DOWN(_param_size, (SIMD_WIDTH << 3)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + AVX_Data grad_4[8]; + grad_4[0].data = SIMD_LOAD(grads + i); + grad_4[1].data = SIMD_LOAD(grads + i + SIMD_WIDTH); + grad_4[2].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 1)); + grad_4[3].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 3); + grad_4[4].data = SIMD_LOAD(grads + i + (SIMD_WIDTH << 2)); + grad_4[5].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 5); + grad_4[6].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 6); + grad_4[7].data = SIMD_LOAD(grads + i + SIMD_WIDTH * 7); + + AVX_Data momentum_4[8]; + momentum_4[0].data = SIMD_LOAD(_exp_avg + i); + momentum_4[1].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH); + momentum_4[2].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 1)); + momentum_4[3].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 3); + momentum_4[4].data = SIMD_LOAD(_exp_avg + i + (SIMD_WIDTH << 2)); + momentum_4[5].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 5); + momentum_4[6].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 6); + momentum_4[7].data = SIMD_LOAD(_exp_avg + i + SIMD_WIDTH * 7); + + AVX_Data variance_4[8]; + variance_4[0].data = SIMD_LOAD(_exp_avg_sq + i); + variance_4[1].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH); + variance_4[2].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 1)); + variance_4[3].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 3); + variance_4[4].data = SIMD_LOAD(_exp_avg_sq + i + (SIMD_WIDTH << 2)); + variance_4[5].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 5); + variance_4[6].data = SIMD_LOAD(_exp_avg_sq + i + 
SIMD_WIDTH * 6); + variance_4[7].data = SIMD_LOAD(_exp_avg_sq + i + SIMD_WIDTH * 7); + + AVX_Data param_4[8]; + param_4[0].data = SIMD_LOAD(_params + i); + param_4[1].data = SIMD_LOAD(_params + i + SIMD_WIDTH); + param_4[2].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 1)); + param_4[3].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 3); + param_4[4].data = SIMD_LOAD(_params + i + (SIMD_WIDTH << 2)); + param_4[5].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 5); + param_4[6].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 6); + param_4[7].data = SIMD_LOAD(_params + i + SIMD_WIDTH * 7); + + if (_weight_decay > 0 && !_adamw_mode) { + grad_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, grad_4[0].data); + grad_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, grad_4[1].data); + grad_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, grad_4[2].data); + grad_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, grad_4[3].data); + grad_4[4].data = SIMD_FMA(param_4[4].data, weight_decay4.data, grad_4[4].data); + grad_4[5].data = SIMD_FMA(param_4[5].data, weight_decay4.data, grad_4[5].data); + grad_4[6].data = SIMD_FMA(param_4[6].data, weight_decay4.data, grad_4[6].data); + grad_4[7].data = SIMD_FMA(param_4[7].data, weight_decay4.data, grad_4[7].data); + } + + momentum_4[0].data = SIMD_MUL(momentum_4[0].data, betta1_4.data); + momentum_4[0].data = SIMD_FMA(grad_4[0].data, betta1_minus1_4.data, momentum_4[0].data); + momentum_4[1].data = SIMD_MUL(momentum_4[1].data, betta1_4.data); + momentum_4[1].data = SIMD_FMA(grad_4[1].data, betta1_minus1_4.data, momentum_4[1].data); + momentum_4[2].data = SIMD_MUL(momentum_4[2].data, betta1_4.data); + momentum_4[2].data = SIMD_FMA(grad_4[2].data, betta1_minus1_4.data, momentum_4[2].data); + momentum_4[3].data = SIMD_MUL(momentum_4[3].data, betta1_4.data); + momentum_4[3].data = SIMD_FMA(grad_4[3].data, betta1_minus1_4.data, momentum_4[3].data); + momentum_4[4].data = SIMD_MUL(momentum_4[4].data, betta1_4.data); + momentum_4[4].data = SIMD_FMA(grad_4[4].data, betta1_minus1_4.data, momentum_4[4].data); + momentum_4[5].data = SIMD_MUL(momentum_4[5].data, betta1_4.data); + momentum_4[5].data = SIMD_FMA(grad_4[5].data, betta1_minus1_4.data, momentum_4[5].data); + momentum_4[6].data = SIMD_MUL(momentum_4[6].data, betta1_4.data); + momentum_4[6].data = SIMD_FMA(grad_4[6].data, betta1_minus1_4.data, momentum_4[6].data); + momentum_4[7].data = SIMD_MUL(momentum_4[7].data, betta1_4.data); + momentum_4[7].data = SIMD_FMA(grad_4[7].data, betta1_minus1_4.data, momentum_4[7].data); + + variance_4[0].data = SIMD_MUL(variance_4[0].data, betta2_4.data); + variance_4[1].data = SIMD_MUL(variance_4[1].data, betta2_4.data); + variance_4[2].data = SIMD_MUL(variance_4[2].data, betta2_4.data); + variance_4[3].data = SIMD_MUL(variance_4[3].data, betta2_4.data); + variance_4[4].data = SIMD_MUL(variance_4[4].data, betta2_4.data); + variance_4[5].data = SIMD_MUL(variance_4[5].data, betta2_4.data); + variance_4[6].data = SIMD_MUL(variance_4[6].data, betta2_4.data); + variance_4[7].data = SIMD_MUL(variance_4[7].data, betta2_4.data); + grad_4[0].data = SIMD_MUL(grad_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_MUL(grad_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_MUL(grad_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_MUL(grad_4[3].data, grad_4[3].data); + grad_4[4].data = SIMD_MUL(grad_4[4].data, grad_4[4].data); + grad_4[5].data = SIMD_MUL(grad_4[5].data, grad_4[5].data); + grad_4[6].data = SIMD_MUL(grad_4[6].data, grad_4[6].data); + grad_4[7].data = 
SIMD_MUL(grad_4[7].data, grad_4[7].data); + variance_4[0].data = SIMD_FMA(grad_4[0].data, betta2_minus1_4.data, variance_4[0].data); + variance_4[1].data = SIMD_FMA(grad_4[1].data, betta2_minus1_4.data, variance_4[1].data); + variance_4[2].data = SIMD_FMA(grad_4[2].data, betta2_minus1_4.data, variance_4[2].data); + variance_4[3].data = SIMD_FMA(grad_4[3].data, betta2_minus1_4.data, variance_4[3].data); + variance_4[4].data = SIMD_FMA(grad_4[4].data, betta2_minus1_4.data, variance_4[4].data); + variance_4[5].data = SIMD_FMA(grad_4[5].data, betta2_minus1_4.data, variance_4[5].data); + variance_4[6].data = SIMD_FMA(grad_4[6].data, betta2_minus1_4.data, variance_4[6].data); + variance_4[7].data = SIMD_FMA(grad_4[7].data, betta2_minus1_4.data, variance_4[7].data); + + grad_4[0].data = SIMD_SQRT(variance_4[0].data); + grad_4[1].data = SIMD_SQRT(variance_4[1].data); + grad_4[2].data = SIMD_SQRT(variance_4[2].data); + grad_4[3].data = SIMD_SQRT(variance_4[3].data); + grad_4[4].data = SIMD_SQRT(variance_4[4].data); + grad_4[5].data = SIMD_SQRT(variance_4[5].data); + grad_4[6].data = SIMD_SQRT(variance_4[6].data); + grad_4[7].data = SIMD_SQRT(variance_4[7].data); + + grad_4[0].data = SIMD_FMA(grad_4[0].data, bias2_sqrt.data, eps_4.data); + grad_4[1].data = SIMD_FMA(grad_4[1].data, bias2_sqrt.data, eps_4.data); + grad_4[2].data = SIMD_FMA(grad_4[2].data, bias2_sqrt.data, eps_4.data); + grad_4[3].data = SIMD_FMA(grad_4[3].data, bias2_sqrt.data, eps_4.data); + grad_4[4].data = SIMD_FMA(grad_4[4].data, bias2_sqrt.data, eps_4.data); + grad_4[5].data = SIMD_FMA(grad_4[5].data, bias2_sqrt.data, eps_4.data); + grad_4[6].data = SIMD_FMA(grad_4[6].data, bias2_sqrt.data, eps_4.data); + grad_4[7].data = SIMD_FMA(grad_4[7].data, bias2_sqrt.data, eps_4.data); + grad_4[0].data = SIMD_DIV(momentum_4[0].data, grad_4[0].data); + grad_4[1].data = SIMD_DIV(momentum_4[1].data, grad_4[1].data); + grad_4[2].data = SIMD_DIV(momentum_4[2].data, grad_4[2].data); + grad_4[3].data = SIMD_DIV(momentum_4[3].data, grad_4[3].data); + grad_4[4].data = SIMD_DIV(momentum_4[4].data, grad_4[4].data); + grad_4[5].data = SIMD_DIV(momentum_4[5].data, grad_4[5].data); + grad_4[6].data = SIMD_DIV(momentum_4[6].data, grad_4[6].data); + grad_4[7].data = SIMD_DIV(momentum_4[7].data, grad_4[7].data); + + if (_weight_decay > 0 && _adamw_mode) { + param_4[0].data = SIMD_FMA(param_4[0].data, weight_decay4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(param_4[1].data, weight_decay4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(param_4[2].data, weight_decay4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(param_4[3].data, weight_decay4.data, param_4[3].data); + param_4[4].data = SIMD_FMA(param_4[4].data, weight_decay4.data, param_4[4].data); + param_4[5].data = SIMD_FMA(param_4[5].data, weight_decay4.data, param_4[5].data); + param_4[6].data = SIMD_FMA(param_4[6].data, weight_decay4.data, param_4[6].data); + param_4[7].data = SIMD_FMA(param_4[7].data, weight_decay4.data, param_4[7].data); + } + + param_4[0].data = SIMD_FMA(grad_4[0].data, step_size_4.data, param_4[0].data); + param_4[1].data = SIMD_FMA(grad_4[1].data, step_size_4.data, param_4[1].data); + param_4[2].data = SIMD_FMA(grad_4[2].data, step_size_4.data, param_4[2].data); + param_4[3].data = SIMD_FMA(grad_4[3].data, step_size_4.data, param_4[3].data); + param_4[4].data = SIMD_FMA(grad_4[4].data, step_size_4.data, param_4[4].data); + param_4[5].data = SIMD_FMA(grad_4[5].data, step_size_4.data, param_4[5].data); + param_4[6].data = SIMD_FMA(grad_4[6].data, 
step_size_4.data, param_4[6].data); + param_4[7].data = SIMD_FMA(grad_4[7].data, step_size_4.data, param_4[7].data); + + SIMD_STORE(_params + i, param_4[0].data); + SIMD_STORE(_params + i + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 1), param_4[2].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 3, param_4[3].data); + SIMD_STORE(_params + i + (SIMD_WIDTH << 2), param_4[4].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 5, param_4[5].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 6, param_4[6].data); + SIMD_STORE(_params + i + SIMD_WIDTH * 7, param_4[7].data); + + if (dev_params) { + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t), param_4[0].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH, param_4[1].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 1), + param_4[2].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 3, param_4[3].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + (SIMD_WIDTH << 2), + param_4[4].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 5, param_4[5].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 6, param_4[6].data); + SIMD_STORE(_doubled_buffer[_buf_index] + (i - t) + SIMD_WIDTH * 7, param_4[7].data); + } + + SIMD_STORE(_exp_avg + i, momentum_4[0].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH, momentum_4[1].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 1), momentum_4[2].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 3, momentum_4[3].data); + SIMD_STORE(_exp_avg + i + (SIMD_WIDTH << 2), momentum_4[4].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 5, momentum_4[5].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 6, momentum_4[6].data); + SIMD_STORE(_exp_avg + i + SIMD_WIDTH * 7, momentum_4[7].data); + + SIMD_STORE(_exp_avg_sq + i, variance_4[0].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH, variance_4[1].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 1), variance_4[2].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 3, variance_4[3].data); + SIMD_STORE(_exp_avg_sq + i + (SIMD_WIDTH << 2), variance_4[4].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 5, variance_4[5].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 6, variance_4[6].data); + SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 7, variance_4[7].data); + } + if (dev_params) { + launch_param_update(_doubled_buffer[_buf_index], + dev_params + t, + copy_size, + Context::Instance().GetCurrentStream()); + _buf_index = !_buf_index; + } + } +#endif + if (_param_size > rounded_size) + Step_4((_params + rounded_size), + (grads + rounded_size), + (_exp_avg + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params)); +} + +int ds_adam_step(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float epsilon, + float weight_decay, + bool bias_correction, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq) +{ + auto params_c = params.contiguous(); + auto grads_c = grads.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + auto exp_avg_sq_c = exp_avg_sq.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + opt->IncrementStep(step, beta1, beta2); + opt->update_state(lr, epsilon, weight_decay, bias_correction); + opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0)); + + return 0; +} + +int ds_adam_step_plus_copy(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float epsilon, + float weight_decay, + bool bias_correction, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, + torch::Tensor& gpu_params) +{ + auto params_c = params.contiguous(); + auto gpu_params_c = gpu_params.contiguous(); + auto exp_avg_c = exp_avg.contiguous(); + auto exp_avg_sq_c = exp_avg_sq.contiguous(); + auto grads_c = grads.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); + float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + opt->IncrementStep(step, beta1, beta2); + opt->update_state(lr, epsilon, weight_decay, bias_correction); + opt->Step_8( + params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr); + + return 0; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); + m.def("adam_update_copy", + &ds_adam_step_plus_copy, + "DeepSpeed CPU Adam update and param copy (C++)"); + m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); +} diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu new file mode 100755 index 000000000000..2f282aff1aca --- /dev/null +++ b/csrc/adam/custom_cuda_kernel.cu @@ -0,0 +1,20 @@ + + +#include "custom_cuda_layers.h" + +__global__ void param_update_kernel(const float* input, __half* output, int size) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < size) { output[id] = (__half)input[id]; } +} + +void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) +{ + int threads = 1024; + + dim3 grid_dim((size - 1) / threads + 1); + dim3 block_dim(threads); + + param_update_kernel<<>>(input, output, size); +} diff --git a/csrc/adam/fused_adam_frontend.cpp b/csrc/adam/fused_adam_frontend.cpp new file mode 100644 index 000000000000..b06531c53002 --- /dev/null +++ b/csrc/adam/fused_adam_frontend.cpp @@ -0,0 +1,20 @@ +#include + +void multi_tensor_adam_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const float epsilon, + const int step, + const int mode, + const int bias_correction, + const float 
weight_decay); + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("multi_tensor_adam", + &multi_tensor_adam_cuda, + "Compute and apply gradient update to parameters for Adam optimizer"); +} diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu new file mode 100644 index 000000000000..3cb9763befce --- /dev/null +++ b/csrc/adam/multi_tensor_adam.cu @@ -0,0 +1,163 @@ +/* Copyright 2020 The Microsoft DeepSpeed Team + Copyright NVIDIA/apex + This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include +#include +// Another possibility: +// #include + +#include + +#include "multi_tensor_apply.cuh" +#include "type_shim.h" + +#define BLOCK_SIZE 512 +#define ILP 4 + +typedef enum { + ADAM_MODE_0 = 0, // L2 regularization mode + ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) +} adamMode_t; + +using MATH_T = float; + +template +struct AdamFunctor { + __device__ __forceinline__ void operator()(int chunk_size, + volatile int* noop_gmem, + TensorListMetadata<4>& tl, + const float beta1, + const float beta2, + const float beta1_correction, + const float beta2_correction, + const float epsilon, + const float lr, + adamMode_t mode, + const float decay) + { + // I'd like this kernel to propagate infs/nans. + // if(*noop_gmem == 1) + // return; + + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + + // potentially use to pass in list of scalar + // int tensor_num = tl.start_tensor_this_launch + tensor_loc; + + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* g = (T*)tl.addresses[0][tensor_loc]; + g += chunk_idx * chunk_size; + + T* p = (T*)tl.addresses[1][tensor_loc]; + p += chunk_idx * chunk_size; + + T* m = (T*)tl.addresses[2][tensor_loc]; + m += chunk_idx * chunk_size; + + T* v = (T*)tl.addresses[3][tensor_loc]; + v += chunk_idx * chunk_size; + + n -= chunk_idx * chunk_size; + + // see note in multi_tensor_scale_kernel.cu + for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { + MATH_T r_g[ILP]; + MATH_T r_p[ILP]; + MATH_T r_m[ILP]; + MATH_T r_v[ILP]; +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) { + r_g[ii] = g[i]; + r_p[ii] = p[i]; + r_m[ii] = m[i]; + r_v[ii] = v[i]; + } else { + r_g[ii] = MATH_T(0); + r_p[ii] = MATH_T(0); + r_m[ii] = MATH_T(0); + r_v[ii] = MATH_T(0); + } + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + if (mode == ADAM_MODE_0) { // L2 + r_g[ii] = r_g[ii] + (decay * r_p[ii]); + r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; + r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; + MATH_T next_m_unbiased = r_m[ii] / beta1_correction; + MATH_T next_v_unbiased = r_v[ii] / beta2_correction; + MATH_T denom = sqrtf(next_v_unbiased) + epsilon; + MATH_T update = next_m_unbiased / denom; + r_p[ii] = r_p[ii] - (lr * update); + } else { // weight decay + r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; + r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; + MATH_T next_m_unbiased = r_m[ii] / beta1_correction; + MATH_T next_v_unbiased = r_v[ii] / beta2_correction; + MATH_T denom = sqrtf(next_v_unbiased) + epsilon; + MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]); + r_p[ii] = r_p[ii] - (lr * update); + } + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) { + p[i] = r_p[ii]; + m[i] = r_m[ii]; + v[i] = r_v[ii]; + } + } + } + } +}; + 
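Editor's note: the per-element arithmetic applied by `AdamFunctor` above (and computed in vectorized form by the AVX/scalar paths in `csrc/adam/cpu_adam.cpp`) can be summarized with a short scalar sketch. This is an illustrative Python reference only, not part of the patch; the name `adam_reference` and its argument list are invented for the example, with `adamw_mode=True` corresponding to `ADAM_MODE_1` (decoupled weight decay) and `False` to `ADAM_MODE_0` (L2 regularization).

```python
import math

def adam_reference(p, g, m, v, *, lr, beta1, beta2, eps, step,
                   weight_decay=0.0, adamw_mode=True, bias_correction=True):
    """Scalar sketch of the per-element update performed by AdamFunctor.

    adamw_mode=False mirrors ADAM_MODE_0 (L2 regularization: decay is folded
    into the gradient); adamw_mode=True mirrors ADAM_MODE_1 (AdamW: decay is
    added to the final update, bypassing the moment estimates).
    """
    # Bias-correction terms, matching the host-side setup in multi_tensor_adam_cuda.
    bc1 = 1.0 - beta1 ** step if bias_correction else 1.0
    bc2 = 1.0 - beta2 ** step if bias_correction else 1.0

    if weight_decay > 0 and not adamw_mode:
        g = g + weight_decay * p            # L2 mode: decay enters the moments

    m = beta1 * m + (1.0 - beta1) * g       # first moment (exp_avg)
    v = beta2 * v + (1.0 - beta2) * g * g   # second moment (exp_avg_sq)

    update = (m / bc1) / (math.sqrt(v / bc2) + eps)
    if weight_decay > 0 and adamw_mode:
        update += weight_decay * p          # AdamW mode: decoupled weight decay

    return p - lr * update, m, v
```

The fused kernel applies this same recurrence in registers across ILP elements per thread, and the CPU path applies it with AVX512/AVX2 intrinsics when those instruction sets are available.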
+void multi_tensor_adam_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const float epsilon, + const int step, + const int mode, + const int bias_correction, + const float weight_decay) +{ + using namespace at; + + // Handle bias correction mode + float bias_correction1 = 1.0f, bias_correction2 = 1.0f; + if (bias_correction == 1) { + bias_correction1 = 1 - std::pow(beta1, step); + bias_correction2 = 1 - std::pow(beta2, step); + } + + // Assume single type across p,g,m1,m2 now + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh new file mode 100644 index 000000000000..13af4b7578f6 --- /dev/null +++ b/csrc/adam/multi_tensor_apply.cuh @@ -0,0 +1,127 @@ +/* Copyright 2020 The Microsoft DeepSpeed Team + Copyright NVIDIA/apex + This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include +#include +#include +#include "compat.h" + +#include + +// #include + +// This header is the one-stop shop for all your multi-tensor apply needs. + +// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) +constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; + +template +struct TensorListMetadata { + void* addresses[n][depth_to_max_tensors[n - 1]]; + int sizes[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. + int start_tensor_this_launch; +}; + +template +__global__ void multi_tensor_apply_kernel(int chunk_size, + volatile int* noop_flag, + T tl, + U callable, + ArgTypes... args) +{ + // Hand the chunk information to the user-supplied functor to process however it likes. + callable(chunk_size, noop_flag, tl, args...); +} + +template +void multi_tensor_apply(int block_size, + int chunk_size, + const at::Tensor& noop_flag, + const std::vector>& tensor_lists, + T callable, + ArgTypes... args) +{ + TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); + int len0 = tensor_lists[0].size(); + TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); + auto ref_device = tensor_lists[0][0].device(); + TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda"); + for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices + { + TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); + for (int t = 0; t < tensor_lists[l].size(); t++) { + // TODO: Print which tensor fails. 
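Note that multi_tensor_adam_cuda expects tensor_lists in the fixed order {grads, params, exp_avg, exp_avg_sq}, matching the addresses[0..3] indexing inside AdamFunctor, and only computes the bias-correction terms when bias_correction == 1. A minimal, hypothetical host-side caller is sketched below; the wrapper name, chunk size, and mode value are placeholders and not part of this patch.

#include <torch/extension.h>
#include <vector>

// Hypothetical convenience wrapper around the multi_tensor_adam_cuda entry point above.
void apply_fused_adam(std::vector<at::Tensor> grads,
                      std::vector<at::Tensor> params,
                      std::vector<at::Tensor> exp_avg,
                      std::vector<at::Tensor> exp_avg_sq,
                      float lr, float beta1, float beta2, float eps,
                      int step, float weight_decay)
{
    // Flag tensor used by the multi-tensor machinery; the Adam kernel here intentionally
    // ignores it so that infs/nans propagate into the parameters.
    auto noop_flag = at::zeros({1}, params[0].options().dtype(at::kInt));
    // Order must be {g, p, m, v} to match AdamFunctor's addresses[0..3].
    std::vector<std::vector<at::Tensor>> tensor_lists = {grads, params, exp_avg, exp_avg_sq};
    const int chunk_size = 2048 * 32;  // illustrative chunk size
    const int mode = 1;                // ADAM_MODE_1: decoupled weight decay (AdamW)
    const int bias_correction = 1;
    multi_tensor_adam_cuda(chunk_size, noop_flag, tensor_lists,
                           lr, beta1, beta2, eps, step, mode,
                           bias_correction, weight_decay);
}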
+ bool contiguous_memory = tensor_lists[l][t].is_contiguous(); +#ifdef VERSION_GE_1_5 + contiguous_memory = (contiguous_memory || + tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); +#endif + TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); + TORCH_CHECK(tensor_lists[l][t].device() == ref_device, + "A tensor was not on the same device as the first tensor"); + TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); + } + } + + int ntensors = tensor_lists[0].size(); + + TensorListMetadata tl; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + auto stream = at::cuda::getCurrentCUDAStream(); + + tl.start_tensor_this_launch = 0; + int loc_block_info = 0; + int loc_tensor_info = 0; + for (int t = 0; t < ntensors; t++) { + tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) + tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + loc_tensor_info++; + + int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + + for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + // std::cout << chunks_this_tensor << std::endl; + tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tl.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks_this_tensor - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); + bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); + if (tensors_full || blocks_full || last_chunk) { + // using accscalar_t = acc_type; + multi_tensor_apply_kernel<<>>( + chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. The control flow possibilities here make my brain hurt. 
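The launch bookkeeping above is easier to see with concrete numbers: each tensor contributes ceil(numel / chunk_size) chunks, every chunk becomes one kernel block, and block_to_tensor / block_to_chunk let each block recover which tensor and which chunk it owns. A small host-only illustration follows (plain C++, made-up sizes, not part of the patch).

#include <cstdio>
#include <vector>

int main()
{
    const int chunk_size = 4;
    const std::vector<int> numels = {10, 3, 7};  // three hypothetical tensors

    std::vector<int> block_to_tensor, block_to_chunk;
    for (size_t t = 0; t < numels.size(); t++) {
        int chunks_this_tensor = (numels[t] + chunk_size - 1) / chunk_size;  // ceil-division
        for (int c = 0; c < chunks_this_tensor; c++) {
            block_to_tensor.push_back((int)t);
            block_to_chunk.push_back(c);
        }
    }
    // Each CUDA block b would process elements starting at block_to_chunk[b] * chunk_size
    // of tensor block_to_tensor[b], clamped to that tensor's remaining size.
    for (size_t b = 0; b < block_to_tensor.size(); b++)
        std::printf("block %zu -> tensor %d, chunk %d\n", b, block_to_tensor[b], block_to_chunk[b]);
    return 0;
}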
+ loc_block_info = 0; + if (chunk == chunks_this_tensor - 1) { + // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + loc_tensor_info = 0; + tl.start_tensor_this_launch = t + 1; + } else { + // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) + tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; + loc_tensor_info = 1; + tl.start_tensor_this_launch = t; + } + } + } + } +} diff --git a/csrc/includes/context.h b/csrc/includes/context.h old mode 100644 new mode 100755 index 1e4820177c5d..c2e26cdfa708 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -29,12 +29,12 @@ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) #define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 4096 +#define DS_MAXIMUM_NUM_BLOCKS 262144 inline int DS_GET_BLOCKS(const int N) { - return std::max( - std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), + return (std::max)( + (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), // Use at least 1 block, since CUDA does not allow empty block 1); } @@ -64,17 +64,10 @@ class Context { return _ctx; } - void GenWorkSpace(size_t size) + void SetWorkSpace(void* workspace) { - if (!_workspace) { - assert(_workspace == nullptr); - cudaMalloc(&_workspace, size); - } else if (_workSpaceSize != size) { - cudaFree(_workspace); - cudaMalloc(&_workspace, size); - } - - _workSpaceSize = size; + if (!workspace) { throw std::runtime_error("Workspace is null."); } + _workspace = workspace; } void* GetWorkSpace() { return _workspace; } @@ -172,6 +165,5 @@ class Context { void* _workspace; uint64_t _seed; uint64_t _curr_offset; - size_t _workSpaceSize; std::vector> _gemm_algos; }; diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h new file mode 100755 index 000000000000..0f45409186c1 --- /dev/null +++ b/csrc/includes/cpu_adam.h @@ -0,0 +1,155 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "context.h" +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) { \ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ + assert(0); \ + } \ + } + +#define TILE (1024 * 1024 * 1024) + +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_WIDTH 16 +#else +#if defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_WIDTH 8 +#endif +#endif + +class Adam_Optimizer { +public: + Adam_Optimizer(float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0, + bool adamw_mode = true) + : _alpha(alpha), + _betta1(betta1), + _betta2(betta2), + _eps(eps), + _weight_decay(weight_decay), + _betta1_t(1.0), + 
_betta2_t(1.0), + _step(0), + _buf_index(false), + _adamw_mode(adamw_mode) + { + cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); + cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); + } + ~Adam_Optimizer() + { + cudaFreeHost(_doubled_buffer[0]); + cudaFreeHost(_doubled_buffer[1]); + } + void Step(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t param_size, + __half* dev_param = nullptr); + void Step_4(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sa, + size_t param_size, + __half* dev_param = nullptr); + void Step_8(float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + __half* dev_params = nullptr); + + inline void IncrementStep(size_t step, float beta1, float beta2) + { + if (beta1 != _betta1 || beta2 != _betta2) { + _step = step; + _betta1 = beta1; + _betta2 = beta2; + _betta1_t = std::pow(_betta1, step); + _betta2_t = std::pow(_betta2, step); + } else { + _step++; + if (_step != step) { + _betta1_t = std::pow(_betta1, step); + _betta2_t = std::pow(_betta2, step); + _step = step; + } else { + _betta1_t *= _betta1; + _betta2_t *= _betta2; + } + } + } + inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction) + { + _alpha = lr; + _eps = epsilon; + _weight_decay = weight_decay; + + _bias_correction1 = 1.0f; + _bias_correction2 = 1.0f; + if (bias_correction == 1) { + _bias_correction1 = 1 - _betta1_t; + _bias_correction2 = 1 / sqrt(1 - _betta2_t); + } + } + +private: +#if defined(__AVX512__) or defined(__AVX256__) + union AVX_Data { +#if defined(__AVX512__) + __m512 data; +#else + __m256 data; +#endif + // float data_f[16]; + }; +#endif + + float _alpha; + float _betta1; + float _betta2; + float _eps; + float _weight_decay; + + float _betta1_t; + float _betta2_t; + size_t _step; + + float _bias_correction1; + float _bias_correction2; + + float* _doubled_buffer[2]; + bool _buf_index; + bool _adamw_mode; +}; diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index d9d0a8a68b38..88b51161d8d6 100755 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -22,6 +22,8 @@ #define MAX_THREAD_ITERATIONS 8 // Maximum 8K #define MAX_WARP_NUM 32 +#define MAX_REGISTERS 256 + // Fused bias add with gelu activation template void launch_bias_gelu(const T* input, @@ -29,7 +31,6 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); template @@ -37,7 +38,6 @@ void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); template @@ -46,7 +46,6 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream); // Custom fused bias add with layer normalization @@ -57,14 +56,12 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, - bool training = false, - T* vars = nullptr, - T* means = nullptr, - T* vals_hat = nullptr); + bool training, + T* vars, + T* means); template void launch_bias_residual_layer_norm(T* vals, @@ -73,14 +70,11 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, - bool training = false, - T* vars = nullptr, - T* 
vals_hat = nullptr, - bool save_vals = false); + bool training, + T* vars); template void launch_layerNorm_backward_fused_add(const T* out_grad1, @@ -93,7 +87,6 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); template @@ -106,7 +99,6 @@ void launch_layerNorm_backward_fused_add(const T* out_grad1, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2], bool invertible = false, @@ -122,7 +114,6 @@ void launch_layerNorm_backward(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); @@ -135,7 +126,6 @@ void launch_layerNorm_backward(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2], bool invertible = false, @@ -153,7 +143,6 @@ void launch_layerNorm_backward_nreversible(const T* out_grad, T* betta_grad, T* inp_grad, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream[2]); @@ -264,3 +253,5 @@ void launch_fuse_transpose_bias_kernel(const T* inp, int rows, int cols, cudaStream_t stream); + +void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h index 090df3a0abf8..f6e32af5608d 100644 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -9,15 +9,13 @@ class Dropout { public: struct Config { float ratio; - uint32_t batch, dim; + uint32_t dim; bool training; - Config(float r, uint32_t batch, uint32_t dim) - : ratio(r), batch(batch), dim(dim), training(true) - { - } + Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {} float RATIO() const { return training ? 
ratio : 0.0; } + inline void SetDim(uint32_t d) { dim = d; } }; Dropout(const Config& config) : _config(config), _mask(nullptr) {} @@ -70,6 +68,8 @@ class Dropout { Config GetConfig() const { return _config; } + inline void SetDimension(uint32_t dim) { _config.SetDim(dim); } + private: uint8_t* _mask; Config _config; diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index 896dce8c26db..dbae797a8ecd 100755 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -121,13 +121,22 @@ class BertTransformerLayer { void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr); + uint8_t* layer_output_dropout_mask_ptr, + T* layer_norm_var, + T* layer_norm_mean, + T* attn_layer_norm_var, + T* attn_layer_norm_mean); inline int GetBatchSize() const { return _batch_size; } inline int GetNumHeads() const { return _heads; } inline int GetSeqLength() const { return _seq_length; } + inline int GetIntermediateSize() const { return _intermediate_size; } + + void SetSeqLength(int seq_len); inline int GetHiddenSize() const { return _hidden_size; } void SetTrainingMode(bool training); + inline bool IsTrainingMode() const { return _training; } + inline bool GeluCheckpoint() const { return _gelu_checkpoint; } private: void Initialize(); @@ -150,8 +159,8 @@ class BertTransformerLayer { // layers FeedForward _qkv_linear; FeedForward _attn_out_linear; - Normalize_Layer _norm_layer2; - Normalize_Layer _norm_layer3; + Normalize_Layer _attn_layer_norm; + Normalize_Layer _layer_norm; Normalize_Layer* _last_normalize; FeedForward _ff1, _ff2; Softmax _softmax; diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 247bfb273de0..41cf6f2a68a7 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -9,13 +9,8 @@ template class Gelu { public: struct Config { - uint32_t batch_size; - uint32_t seq_length; uint32_t intermediate_size; - Config(uint32_t batch, uint32_t seq, uint32_t inter_size) - : batch_size(batch), seq_length(seq), intermediate_size(inter_size) - { - } + Config(uint32_t inter_size) : intermediate_size(inter_size) {} }; Gelu(const Config& config) : _config(config) {} @@ -28,14 +23,12 @@ class Gelu { T* output, cudaStream_t stream) { - launch_bias_gelu( - input_buf, bias, output, _config.intermediate_size, bsz, _config.seq_length, stream); + launch_bias_gelu(input_buf, bias, output, _config.intermediate_size, bsz, stream); } void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream) { - launch_d_gelu( - d_output, input_buf, bias, _config.intermediate_size, bsz, _config.seq_length, stream); + launch_d_gelu(d_output, input_buf, bias, _config.intermediate_size, bsz, stream); } private: diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index ff06f884351c..b920896b419e 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -97,7 +97,7 @@ class GemmTest { template int Run(int loops, Func f) { - float fast_latency = std::numeric_limits::max(); + float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; @@ -252,7 +252,7 @@ class StridedGemmTest { template int Run(int loops, Func f) { - float fast_latency = std::numeric_limits::max(); + float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; diff --git a/csrc/includes/normalize_layer.h 
b/csrc/includes/normalize_layer.h index 37ee752c88b5..bfe84636ddb9 100644 --- a/csrc/includes/normalize_layer.h +++ b/csrc/includes/normalize_layer.h @@ -16,57 +16,27 @@ class Normalize_Layer { uint32_t seqLength; uint32_t hiddenDim; float epsilon; - bool training, save_vals; - bool allocateGrad; + bool training; bool useMean; - Config(uint32_t batch, - uint32_t seq, - uint32_t h, - bool training, - bool save_vals = true, - bool allocateGrad = true, - bool useMean = true) + Config(uint32_t batch, uint32_t seq, uint32_t h, bool training, bool useMean = true) : batchSize(batch), seqLength(seq), hiddenDim(h), epsilon(1e-12), training(training), - save_vals(save_vals), - allocateGrad(allocateGrad), useMean(useMean) { } }; - Normalize_Layer(Config config) : config_(config), vars(nullptr), vals_hat(nullptr) + Normalize_Layer(Config config) + : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr) { - if (config_.training) { - cudaMalloc((void**)&vars, config_.batchSize * config_.seqLength * sizeof(T)); - - if (config_.useMean) - cudaMalloc((void**)&means, config_.batchSize * config_.seqLength * sizeof(T)); - - if (config_.save_vals) - cudaMalloc((void**)&vals_hat, - config_.batchSize * config_.seqLength * config_.hiddenDim * sizeof(T)); - - if (config_.allocateGrad) - cudaMalloc((void**)&inp_grad, - config_.batchSize * config_.seqLength * config_.hiddenDim * sizeof(T)); - } } - ~Normalize_Layer() - { - if (config_.training) { - cudaFree(vars); - if (config_.useMean) cudaFree(means); - if (config_.save_vals) cudaFree(vals_hat); - if (config_.allocateGrad) cudaFree(inp_grad); - } - } + ~Normalize_Layer() {} - void ForwardCheckpoint(int bsz, + void ForwardCheckpoint(int bsz, // batch * seq T* vals, const T* residual, const T* gamma, @@ -80,14 +50,12 @@ class Normalize_Layer { betta, config_.epsilon, bsz, - config_.seqLength, config_.hiddenDim, stream, preLayerNorm, config_.training, vars, - means, - vals_hat); + means); } void Forward(int bsz, @@ -104,14 +72,11 @@ class Normalize_Layer { betta, config_.epsilon, bsz, - config_.seqLength, config_.hiddenDim, stream, preLayerNorm, config_.training, - vars, - vals_hat, - config_.save_vals); + vars); } void Backward(int bsz, @@ -120,7 +85,7 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, + T* inp_grad_out, const T* norm_in = nullptr) { launch_layerNorm_backward(out_grad, @@ -130,9 +95,8 @@ class Normalize_Layer { gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream); } @@ -144,21 +108,20 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, - const T* norm_out = nullptr) + T* inp_grad_out, + const T* norm_out) { launch_layerNorm_backward(out_grad, - (config_.save_vals ? vals_hat : norm_out), + norm_out, vars, gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream, - config_.save_vals, + !config_.useMean, betta); } @@ -169,7 +132,7 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, + T* inp_grad_out, const T* norm_in = nullptr) { launch_layerNorm_backward_fused_add(out_grad1, @@ -180,9 +143,8 @@ class Normalize_Layer { gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? 
inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream); } @@ -195,33 +157,41 @@ class Normalize_Layer { T* gamma_grad, T* betta_grad, cudaStream_t stream[2], - T* inp_grad_out = nullptr, - const T* norm_out = nullptr) + T* inp_grad_out, + const T* norm_out) { launch_layerNorm_backward_fused_add(out_grad1, out_grad2, - (config_.save_vals ? vals_hat : norm_out), + norm_out, vars, gamma, gamma_grad, betta_grad, - (config_.allocateGrad ? inp_grad : inp_grad_out), + inp_grad_out, bsz, - config_.seqLength, config_.hiddenDim, stream, - config_.save_vals, + !config_.useMean, betta); } - inline T* GetInputGrad() const { return inp_grad; } - inline bool UseMean() const { return config_.useMean; } + inline void SetVar(T* variance) + { + if (!variance) { throw std::runtime_error("Normalize variance is null."); } + vars = variance; + } + + inline void SetMean(T* mean) + { + if (!mean) { throw std::runtime_error("Normalize mean is null."); } + means = mean; + } + private: Config config_; T* vars; T* means; T* vals_hat; - T* inp_grad; }; diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h old mode 100644 new mode 100755 index 2a18daee0b78..2bc2f67059cf --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -45,13 +45,15 @@ class Softmax { out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream); } - inline int GetProbDepth() const { return config_.prob_depth; } + inline size_t GetProbDepth() const { return config_.prob_depth; } - inline int GetBatchSize() const { return config_.batchSize; } + inline size_t GetBatchSize() const { return config_.batchSize; } - inline int GetNumHeads() const { return config_.heads; } + inline size_t GetNumHeads() const { return config_.heads; } - inline int GetSeqLength() const { return config_.seq_length; } + inline size_t GetSeqLength() const { return config_.seq_length; } + + inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; } private: Config config_; diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 8c43608e2ecf..44a1b313b986 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -3,6 +3,7 @@ #include #include #include +#include "context.h" template class StridedBatchGemm { @@ -38,6 +39,12 @@ class StridedBatchGemm { gemm_algos(algos) { } + void SetConfig(int mm, int nn, int kk) + { + m = mm; + n = nn; + k = kk; + } }; StridedBatchGemm(const Config& config) : _config(config) {} @@ -163,6 +170,8 @@ class StridedBatchGemm { inline const T* GetBufferB() const { return q_buf; } + inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); } + private: Config _config; const T* q_buf; diff --git a/csrc/sparse_attention/utils.cpp b/csrc/sparse_attention/utils.cpp new file mode 100644 index 000000000000..a802025e92ed --- /dev/null +++ b/csrc/sparse_attention/utils.cpp @@ -0,0 +1,120 @@ +// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +// https://github.com/ptillet/torch-blocksparse/blob/master/csrc/utils.cpp + +#include +#include +#include +#include +#ifdef _OPENMP +#include +#endif + +typedef std::vector> ret_t; + +void segment_blocks(torch::Tensor layout, + torch::Tensor idx, + torch::Tensor scratch, + int max_width, + ret_t& ret) +{ + size_t H = layout.size(0); + size_t M = layout.size(1); + size_t N = layout.size(2); + torch::Tensor tmp = torch::zeros_like(layout); + + auto _tmp = tmp.accessor(); + auto _layout = layout.accessor(); + auto 
_idx = idx.accessor(); + auto _scratch = scratch.accessor(); + std::vector current(H, 0); + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t h = 0; h < H; h++) { + // surrounding indices + std::vector ii_left(max_width, -1); + std::vector> ii_top(max_width, std::vector(N, -1)); + + for (size_t m = 0; m < M; m++) { + for (size_t n = 0; n < N; n++) { + int v = _layout[h][m][n]; + if (v == 0) continue; + int n_left = ii_left[max_width - 1]; + int m_top = ii_top[max_width - 1][n]; + int top = (m_top >= 0) ? _tmp[h][m_top][n] : 0; + int left = (n_left >= 0) ? _tmp[h][m][n_left] : 0; + int topleft = (m_top >= 0 && n_left >= 0) ? _tmp[h][m_top][n_left] : 0; + int width = std::min(left, std::min(top, topleft)) + 1; + + // reset width if blocks cannot be + // packed together (i.e., there's a 1 "in the middle") + for (int nn = n_left + 1; nn < n; nn++) + if (ii_top[max_width - 1][nn] > ii_top[max_width - 1][n]) width = 1; + _tmp[h][m][n] = width; + + // update n_left ring buffer + for (int k = 0; k < max_width - 1; k++) ii_left[k] = ii_left[k + 1]; + ii_left[max_width - 1] = n; + + // update ii_top ring buffer + for (int k = 0; k < max_width - 1; k++) ii_top[k][n] = ii_top[k + 1][n]; + ii_top[max_width - 1][n] = m; + + // block is too small -- skip + if (width != max_width) continue; + + // retained blocks are set to zeros + for (size_t km = 0; km < max_width; km++) + for (size_t kn = 0; kn < max_width; kn++) { + int mm = ii_top[km][n]; + int nn = ii_left[kn]; + if (mm < 0 || nn < 0) continue; + _layout[h][mm][nn] = 0; + _tmp[h][mm][nn] = 0; + _scratch[h][current[h]][0] = (int)h; + _scratch[h][current[h]][1] = (int)mm; + _scratch[h][current[h]][2] = (int)nn; + _scratch[h][current[h]][3] = _idx[h][mm][nn]; + current[h]++; + } + } + } + } + std::vector to_cat; + for (size_t h = 0; h < H; h++) + if (current[h] > 0) to_cat.push_back(scratch[h].slice(0, 0, current[h])); + if (!to_cat.empty()) ret.push_back({max_width, torch::cat(to_cat)}); +} + +ret_t sdd_segment(torch::Tensor layout, int start_width) +{ + ret_t ret; + + // block index + torch::Tensor idx = torch::zeros_like(layout); + int current = 0; + size_t H = layout.size(0); + size_t M = layout.size(1); + size_t N = layout.size(2); + auto _layout = layout.accessor(); + auto _idx = idx.accessor(); + for (size_t h = 0; h < H; h++) + for (size_t m = 0; m < M; m++) + for (size_t n = 0; n < N; n++) { + if (_layout[h][m][n] == 0) continue; + _idx[h][m][n] = current++; + } + + // scratch memory + torch::Tensor scratch = torch::empty({H, layout.sum().item(), 4}, layout.dtype()); + + for (int max_width = start_width; max_width > 0; max_width /= 2) + segment_blocks(layout, idx, scratch, max_width, ret); + return ret; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("sdd_segment", &sdd_segment, "SDD segmentation handler"); +} diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 7b0016bcae5e..72b62386ea6d 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -34,7 +34,12 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -74,7 +79,12 @@ int cublas_gemm_ex(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! 
kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -122,7 +132,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n", + batch, + m, + n, + k, + (int)status); return EXIT_FAILURE; } return 0; @@ -170,7 +186,12 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo); if (status != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr, "!!!! kernel execution error.\n"); + fprintf(stderr, + "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", + m, + n, + k, + (int)status); return EXIT_FAILURE; } diff --git a/csrc/transformer/dropout_kernels.cu b/csrc/transformer/dropout_kernels.cu index faf39b44eb91..6b0655b788eb 100644 --- a/csrc/transformer/dropout_kernels.cu +++ b/csrc/transformer/dropout_kernels.cu @@ -1,5 +1,7 @@ #include "custom_cuda_layers.h" +const int unroll_factor = 4; + __global__ void dropout_kernel(const int N, const float ratio, float* out, @@ -13,17 +15,17 @@ __global__ void dropout_kernel(const int N, curandStatePhilox4_32_10_t state; curand_init(seed.first, idx, seed.second, &state); - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { float4 rand = curand_uniform4(&state); - uint8_t m[4]; + uint8_t m[unroll_factor]; m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); m[2] = (uint8_t)(rand.z > ratio); m[3] = (uint8_t)(rand.w > ratio); - int i = j * 4; + int i = j * unroll_factor; mask[i] = (uint8_t)m[0]; mask[i + 1] = (uint8_t)m[1]; @@ -35,6 +37,18 @@ __global__ void dropout_kernel(const int N, out[i + 2] = Xdata[i + 2] * scale * m[2]; out[i + 3] = Xdata[i + 3] * scale * m[3]; } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + out[i] = Xdata[i] * scale * m; + mask[i] = m; + } + } } __global__ void dropout_kernel(const int N, @@ -66,7 +80,7 @@ __global__ void dropout_kernel(const int N, __half2 mask_h[2]; float2 mask_f[2]; - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { float2 x_f = x_cast[j]; __half2* x_h = reinterpret_cast<__half2*>(&x_f); @@ -80,7 +94,7 @@ __global__ void dropout_kernel(const int N, float* mask_f_data = &mask_f[0].x; #pragma unroll - for (int i = 0; i < 4; i++) mask_f_data[i] = (float)(m[i]); + for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); mask_h[0] = __float22half2_rn(mask_f[0]); mask_h[1] = __float22half2_rn(mask_f[1]); @@ -95,16 +109,16 @@ __global__ void dropout_kernel(const int N, #else - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { - int i = j * 4; + int i = j * unroll_factor; const __half2* vals_half = reinterpret_cast(Xdata + i); float2 vals_half_f[2]; vals_half_f[0] = __half22float2(vals_half[0]); vals_half_f[1] = __half22float2(vals_half[1]); - uint8_t m[4]; + uint8_t m[unroll_factor]; float4 rand = curand_uniform4(&state); m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); @@ -123,6 +137,18 @@ __global__ void dropout_kernel(const int N, } #endif + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + 
threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + out[i] = __float2half((float)Xdata[i] * scale * m); + mask[i] = m; + } + } } __global__ void dropout_kernel_bwd(const int N, @@ -133,15 +159,20 @@ __global__ void dropout_kernel_bwd(const int N, std::pair seed) { const float scale = 1. / (1. - ratio); - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { - int i = j * 4; + int i = j * unroll_factor; out[i] = mask[i] ? Xdata[i] * scale : 0.0; out[i + 1] = mask[i + 1] ? Xdata[i + 1] * scale : 0.0; out[i + 2] = mask[i + 2] ? Xdata[i + 2] * scale : 0.0; out[i + 3] = mask[i + 3] ? Xdata[i + 3] * scale : 0.0; } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + for (int i = high_index; i < N; i++) { out[i] = mask[i] ? Xdata[i] * scale : 0.0; } + } } __global__ void dropout_kernel_bwd(const int N, @@ -161,18 +192,20 @@ __global__ void dropout_kernel_bwd(const int N, float2* out_cast = reinterpret_cast(out); uint32_t* mask_cast = reinterpret_cast(mask); - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { float2 x_f = x_cast[j]; __half2* x_h = reinterpret_cast<__half2*>(&x_f); - uint8_t* m = reinterpret_cast(mask_cast + j); + uint32_t m_32 = mask_cast[j]; + uint8_t* m = (uint8_t*)&m_32; + __half2 mask_h[2]; float2 mask_f[2]; float* mask_f_data = &mask_f[0].x; #pragma unroll - for (int i = 0; i < 4; i++) mask_f_data[i] = (float)(m[i]); + for (int i = 0; i < unroll_factor; i++) mask_f_data[i] = (float)(m[i]); #pragma unroll for (int i = 0; i < 2; i++) mask_h[i] = __float22half2_rn(mask_f[i]); @@ -191,9 +224,9 @@ __global__ void dropout_kernel_bwd(const int N, const __half h_scale = __float2half(scale); const __half h_zero = __float2half(0.0); - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { - int i = j * 4; + int i = j * unroll_factor; const __half2* vals_half = reinterpret_cast(Xdata + i); @@ -211,6 +244,13 @@ __global__ void dropout_kernel_bwd(const int N, } #endif + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + for (int i = high_index; i < N; i++) { + out[i] = __float2half((float)Xdata[i] * scale * mask[i]); + } + } } template @@ -223,7 +263,9 @@ void launch_dropout(T* out, cudaStream_t stream, bool bwd) { - dim3 grid_dim = DS_GET_BLOCKS(total_count / 4); + assert(unroll_factor == 4); + + dim3 grid_dim = DS_GET_BLOCKS(total_count / unroll_factor); dim3 block_dim = DS_CUDA_NUM_THREADS; if (dim > 512) { @@ -264,55 +306,70 @@ __global__ void dropout_grad_kernel(const int N, const float scale, float* Xdata __global__ void dropout_grad_kernel(const int N, const float scale, __half* Xdata, uint8_t* mask) { -#ifdef __STOCHASTIC_MODE__ - const __half2 h_scale = __float2half2_rn(scale); float2* x_cast = reinterpret_cast(Xdata); uint32_t* mask_cast = reinterpret_cast(mask); - CUDA_1D_KERNEL_LOOP(j, N / 4) + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { - uint8_t* m = reinterpret_cast(mask_cast + j); + float2 x_data = x_cast[j]; + uint32_t m_32 = mask_cast[j]; + uint8_t* m = (uint8_t*)&m_32; + + float2 result_f; + __half2* result_h = reinterpret_cast<__half2*>(&result_f); + +#ifdef __STOCHASTIC_MODE__ + + __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); __half2 mask_h[2]; float2 
mask_f[2]; float* mask_f_data = &mask_f[0].x; #pragma unroll - for (int i = 0; i < 4; i++) *(mask_f_data++) = (float)(m[i]); + for (int i = 0; i < unroll_factor; i++) *(mask_f_data++) = (float)(m[i]); mask_h[0] = __float22half2_rn(mask_f[0]); mask_h[1] = __float22half2_rn(mask_f[1]); - float2 x_data = x_cast[j]; - __half2* x_data_h = reinterpret_cast<__half2*>(&x_data); - - float2 result_f; - __half2* result_h = reinterpret_cast<__half2*>(&result_f); - result_h[0] = x_data_h[0] * h_scale * mask_h[0]; result_h[1] = x_data_h[1] * h_scale * mask_h[1]; - x_cast[j] = result_f; - } - #else - CUDA_1D_KERNEL_LOOP(j, N / 2) - { - int i = j * 2; - Xdata[i] = (__half)((float)Xdata[i] * scale * mask[i]); - Xdata[i + 1] = (__half)((float)Xdata[i + 1] * scale * mask[i + 1]); - } + __half* x_data_h = reinterpret_cast<__half*>(&x_data); + float2 result[2]; + + result[0].x = (float)x_data_h[0] * scale * m[0]; + result[0].y = (float)x_data_h[1] * scale * m[1]; + result[1].x = (float)x_data_h[2] * scale * m[2]; + result[1].y = (float)x_data_h[3] * scale * m[3]; + + result_h[0] = __float22half2_rn(result[0]); + result_h[1] = __float22half2_rn(result[1]); #endif + x_cast[j] = result_f; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + for (int i = high_index; i < N; i++) { + Xdata[i] = __float2half((float)Xdata[i] * scale * mask[i]); + } + } } template void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream) { + assert(unroll_factor == 4); + const float scale = 1. / (1. - ratio); - dropout_grad_kernel<<>>( - total_count, scale, vals, mask); + dropout_grad_kernel<<>>(total_count, scale, vals, mask); } template void launch_dropout_grad(float* vals, @@ -341,11 +398,38 @@ __global__ void dropout_grad_kernel(const int N, __half* out, uint8_t* mask) { - CUDA_1D_KERNEL_LOOP(j, N / 2) + const float2* x_cast = reinterpret_cast(Xdata); + float2* out_cast = reinterpret_cast(out); + const uint32_t* mask_cast = reinterpret_cast(mask); + + float2 result_f; + __half2* result_h = reinterpret_cast<__half2*>(&result_f); + + CUDA_1D_KERNEL_LOOP(j, N / unroll_factor) { - int i = j * 2; - out[i] = (__half)((float)Xdata[i] * scale * mask[i]); - out[i + 1] = (__half)((float)Xdata[i + 1] * scale * mask[i + 1]); + float2 x_data = x_cast[j]; + uint32_t m_32 = mask_cast[j]; + uint8_t* m = (uint8_t*)&m_32; + + __half* x_data_h = reinterpret_cast<__half*>(&x_data); + float2 result[2]; + + result[0].x = (float)x_data_h[0] * scale * m[0]; + result[0].y = (float)x_data_h[1] * scale * m[1]; + result[1].x = (float)x_data_h[2] * scale * m[2]; + result[1].y = (float)x_data_h[3] * scale * m[3]; + + result_h[0] = __float22half2_rn(result[0]); + result_h[1] = __float22half2_rn(result[1]); + + out_cast[j] = result_f; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + for (int i = high_index; i < N; i++) { + out[i] = __float2half((float)Xdata[i] * scale * mask[i]); + } } } @@ -357,9 +441,13 @@ void launch_dropout_grad(T* vals_out, float ratio, cudaStream_t stream) { + assert(unroll_factor == 4); + const float scale = 1. / (1. 
- ratio); - dropout_grad_kernel<<>>( - total_count, scale, vals, vals_out, mask); + dropout_grad_kernel<<>>(total_count, scale, vals, vals_out, mask); } template void launch_dropout_grad(float*, const float* vals, @@ -374,7 +462,8 @@ template void launch_dropout_grad(__half*, float ratio, cudaStream_t stream); -__global__ void dropout_kernel(const int dim, +__global__ void dropout_kernel(const int N, + const int dim, const float ratio, const float* bias, float* Xdata, @@ -383,26 +472,27 @@ __global__ void dropout_kernel(const int dim, { const float scale = 1. / (1. - ratio); int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x; + int tid = threadIdx.x % (dim / unroll_factor); curandStatePhilox4_32_10_t state; curand_init(seed.first, idx, seed.second, &state); float4* Xdata_cast = reinterpret_cast(Xdata); + uint32_t* mask_32 = reinterpret_cast(mask); const float4* bias_cast = reinterpret_cast(bias); + CUDA_1D_KERNEL_LOOP(j, N) { float4 rand = curand_uniform4(&state); - uint8_t m[4]; + uint32_t m_32; + uint8_t* m = (uint8_t*)&m_32; m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); m[2] = (uint8_t)(rand.z > ratio); m[3] = (uint8_t)(rand.w > ratio); - int i = blockIdx.x * dim + tid * 4; - - float4 x_data = Xdata_cast[idx]; + float4 x_data = Xdata_cast[j]; float4 b_data = bias_cast[tid]; x_data.x += b_data.x; @@ -415,16 +505,26 @@ __global__ void dropout_kernel(const int dim, x_data.z = x_data.z * scale * m[2]; x_data.w = x_data.w * scale * m[3]; - mask[i] = (uint8_t)m[0]; - mask[i + 1] = (uint8_t)m[1]; - mask[i + 2] = (uint8_t)m[2]; - mask[i + 3] = (uint8_t)m[3]; - - Xdata_cast[idx] = x_data; + mask_32[j] = m_32; + Xdata_cast[j] = x_data; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + float x_data = Xdata[i] + bias[threadIdx.x % dim]; + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + Xdata[i] = x_data * scale * m; + mask[i] = m; + } } } -__global__ void dropout_kernel(const int dim, +__global__ void dropout_kernel(const int N, + const int dim, const float ratio, const __half* bias, __half* Xdata, @@ -433,17 +533,17 @@ __global__ void dropout_kernel(const int dim, { const float scale = 1. / (1. 
- ratio); int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x; + int tid = threadIdx.x % (dim / unroll_factor); curandStatePhilox4_32_10_t state; curand_init(seed.first, idx, seed.second, &state); float2* Xdata_cast = reinterpret_cast(Xdata); + uint32_t* mask_32 = reinterpret_cast(mask); const float2* bias_cast = reinterpret_cast(bias); + CUDA_1D_KERNEL_LOOP(j, N) { - int i = blockIdx.x * dim + tid * 4; - float4 rand = curand_uniform4(&state); float2 data_f; @@ -452,7 +552,7 @@ __global__ void dropout_kernel(const int dim, float2 bias_f; __half2* bias_h = reinterpret_cast<__half2*>(&bias_f); - data_f = Xdata_cast[idx]; + data_f = Xdata_cast[j]; bias_f = bias_cast[tid]; float2 data_h_0 = __half22float2(data_h[0]); @@ -466,7 +566,8 @@ __global__ void dropout_kernel(const int dim, data_h_1.x += bias_h_1.x; data_h_1.y += bias_h_1.y; - uint8_t m[4]; // = mask + i; + uint32_t m_32; + uint8_t* m = (uint8_t*)&m_32; m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); @@ -484,12 +585,21 @@ __global__ void dropout_kernel(const int dim, result_h[0] = __float22half2_rn(data_h_0); result_h[1] = __float22half2_rn(data_h_1); - Xdata_cast[idx] = result_f; - - mask[i] = m[0]; - mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; + Xdata_cast[j] = result_f; + mask_32[j] = m_32; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + float x_data = (float)Xdata[i] + (float)bias[threadIdx.x % dim]; + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + Xdata[i] = __float2half(x_data * scale * m); + mask[i] = m; + } } } @@ -502,13 +612,18 @@ void launch_dropout(T* out, float ratio, cudaStream_t stream) { - dim3 grid_dim(batch); // DS_GET_BLOCKS(total_count/4); - dim3 block_dim(dim / 4); // DS_CUDA_NUM_THREADS; + assert(unroll_factor == 4); + + int total_count = batch * dim / unroll_factor; + + dim3 grid_dim = DS_GET_BLOCKS(total_count); + dim3 block_dim = DS_CUDA_NUM_THREADS; uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; std::pair seed = Context::Instance().IncrementOffset(inc); - dropout_kernel<<>>(dim, ratio, bias, out, mask, seed); + dropout_kernel<<>>( + total_count, dim, ratio, bias, out, mask, seed); } template void launch_dropout(float*, @@ -526,7 +641,8 @@ template void launch_dropout(__half*, float ratio, cudaStream_t stream); -__global__ void dropout_kernel(const int dim, +__global__ void dropout_kernel(const int N, + const int dim, const float ratio, const float* input, const float* residual, @@ -537,31 +653,34 @@ __global__ void dropout_kernel(const int dim, { const float scale = 1. / (1. 
- ratio); int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x; + int tid = threadIdx.x % (dim / unroll_factor); curandStatePhilox4_32_10_t state; curand_init(seed.first, idx, seed.second, &state); float4* out_cast = reinterpret_cast(out); + uint32_t* mask_32 = reinterpret_cast(mask); + const float4* bias_cast = reinterpret_cast(bias); const float4* residual_cast = reinterpret_cast(residual); const float4* input_cast = reinterpret_cast(input); + CUDA_1D_KERNEL_LOOP(j, N) { float4 rand = curand_uniform4(&state); - uint8_t m[4]; + + uint32_t m_32; + uint8_t* m = (uint8_t*)&m_32; + m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); m[2] = (uint8_t)(rand.z > ratio); m[3] = (uint8_t)(rand.w > ratio); - // int bid = k * blockDim.x + tid; - int i = blockIdx.x * dim + tid * 4; - - float4 out_data = out_cast[idx]; + float4 out_data; float4 b_data = bias_cast[tid]; - float4 res_data = residual_cast[idx]; - float4 inp_data = input_cast[idx]; + float4 res_data = residual_cast[j]; + float4 inp_data = input_cast[j]; out_data.x = (b_data.x + inp_data.x); out_data.y = (b_data.y + inp_data.y); @@ -578,16 +697,29 @@ __global__ void dropout_kernel(const int dim, out_data.z += res_data.z; out_data.w += res_data.w; - mask[i] = m[0]; - mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; - - out_cast[idx] = out_data; + mask_32[j] = m_32; + out_cast[j] = out_data; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + float x_data = input[i] + bias[threadIdx.x % dim]; + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + x_data = x_data * scale * m; + x_data += residual[i]; + + out[i] = x_data; + mask[i] = m; + } } } -__global__ void dropout_kernel(const int dim, +__global__ void dropout_kernel(const int N, + const int dim, const float ratio, const __half* input, const __half* residual, @@ -598,19 +730,20 @@ __global__ void dropout_kernel(const int dim, { const float scale = 1. / (1. 
- ratio); int idx = blockIdx.x * blockDim.x + threadIdx.x; - int tid = threadIdx.x; + int tid = threadIdx.x % (dim / unroll_factor); curandStatePhilox4_32_10_t state; curand_init(seed.first, idx, seed.second, &state); float2* out_cast = reinterpret_cast(out); + uint32_t* mask_32 = reinterpret_cast(mask); + const float2* bias_cast = reinterpret_cast(bias); const float2* residual_cast = reinterpret_cast(residual); const float2* input_cast = reinterpret_cast(input); + CUDA_1D_KERNEL_LOOP(j, N) { - int i = blockIdx.x * dim + tid * 4; - float4 rand = curand_uniform4(&state); float2 data_f; @@ -625,10 +758,9 @@ __global__ void dropout_kernel(const int dim, float2 input_f; __half2* input_h = reinterpret_cast<__half2*>(&input_f); - data_f = out_cast[idx]; bias_f = bias_cast[tid]; - residual_f = residual_cast[idx]; - input_f = input_cast[idx]; + residual_f = residual_cast[j]; + input_f = input_cast[j]; float2 data_h_0 = __half22float2(data_h[0]); float2 data_h_1 = __half22float2(data_h[1]); @@ -647,7 +779,8 @@ __global__ void dropout_kernel(const int dim, data_h_1.x = (bias_h_1.x + input_h_1.x); data_h_1.y = (bias_h_1.y + input_h_1.y); - uint8_t m[4]; // = mask + i; + uint32_t m_32; + uint8_t* m = (uint8_t*)&m_32; m[0] = (uint8_t)(rand.x > ratio); m[1] = (uint8_t)(rand.y > ratio); @@ -670,12 +803,24 @@ __global__ void dropout_kernel(const int dim, result_h[0] = __float22half2_rn(data_h_0); result_h[1] = __float22half2_rn(data_h_1); - out_cast[idx] = result_f; - - mask[i] = m[0]; - mask[i + 1] = m[1]; - mask[i + 2] = m[2]; - mask[i + 3] = m[3]; + out_cast[j] = result_f; + mask_32[j] = m_32; + } + int high_index = + ((((N / unroll_factor) - 1) / blockDim.x + 1) * (unroll_factor * blockDim.x)) + threadIdx.x; + if (N > high_index) { + float4 rand = curand_uniform4(&state); + float* rand_data = &(rand.x); + int k = 0; + for (int i = high_index; i < N; i++) { + float x_data = (float)input[i] + (float)bias[threadIdx.x % dim]; + uint8_t m = (uint8_t)(rand_data[k++] > ratio); + x_data = x_data * scale * m; + x_data += (float)residual[i]; + + out[i] = __float2half(x_data); + mask[i] = m; + } } } @@ -690,14 +835,17 @@ void launch_dropout(T* out, float ratio, cudaStream_t stream) { - dim3 grid_dim(batch); // DS_GET_BLOCKS(total_count/4); - dim3 block_dim(dim / 4); // DS_CUDA_NUM_THREADS; + assert(unroll_factor == 4); + + int total_count = batch * dim / unroll_factor; + dim3 grid_dim = DS_GET_BLOCKS(total_count); + dim3 block_dim = DS_CUDA_NUM_THREADS; uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; std::pair seed = Context::Instance().IncrementOffset(inc); dropout_kernel<<>>( - dim, ratio, input, residual, bias, out, mask, seed); + total_count, dim, ratio, input, residual, bias, out, mask, seed); } template void launch_dropout(float*, diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index 269468bdfdb4..f22b8a0743f1 100755 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -14,23 +14,26 @@ static std::unordered_map> s_transformer_layers; +const int init_seq_length = 128; + // C++ interface template size_t get_workspace_size(int maxBatchSize, int seq_len, int hidden_size, + int intermediate_size, int heads, bool training, bool gelu_checkpoint) { size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); if (training) { - workSpacesize += (std::max((4 * size_t(maxBatchSize) * seq_len * hidden_size), - 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); + workSpacesize += 
((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), + 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); if (gelu_checkpoint) workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); } - return workSpacesize * sizeof(T); + return workSpacesize; // * sizeof(T); } // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. @@ -77,39 +80,29 @@ BertTransformerLayer::BertTransformerLayer(int layer_id, hidden_size, hidden_size, gemm_algos[0])), - _norm_layer2(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - true, - false, - false, - !normalize_invertible)), - _norm_layer3(typename Normalize_Layer::Config(batch_size, - seq_length, - hidden_size, - true, - false, - false, - !normalize_invertible)), + _attn_layer_norm(typename Normalize_Layer::Config(batch_size, + seq_length, + hidden_size, + true, + !normalize_invertible)), + _layer_norm(typename Normalize_Layer::Config(batch_size, + seq_length, + hidden_size, + true, + !normalize_invertible)), _ff1(typename FeedForward::Config(batch_size * seq_length, - 4 * hidden_size, + _intermediate_size, hidden_size, gemm_algos[1])), _ff2(typename FeedForward::Config(batch_size * seq_length, hidden_size, - 4 * hidden_size, + _intermediate_size, gemm_algos[2])), _softmax(typename Softmax::Config(batch_size, num_heads, seq_length)), - _gelu(typename Gelu::Config(_batch_size, _seq_length, _intermediate_size)), - _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, - _batch_size * _heads * _seq_length, - _seq_length)), - _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, - _batch_size * _seq_length, - _hidden_size)), - _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, - _batch_size * _seq_length, - _hidden_size)), + _gelu(typename Gelu::Config(_intermediate_size)), + _attn_prob_dropout(typename Dropout::Config(attn_prob_dropout_ratio, _seq_length)), + _attn_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), + _layer_output_dropout(typename Dropout::Config(hidden_output_dropout_ratio, _hidden_size)), _attn_scores(typename StridedBatchGemm::Config(_batch_size * _heads, _seq_length, _seq_length, @@ -130,7 +123,6 @@ BertTransformerLayer::BertTransformerLayer(int layer_id, gemm_algos[4])) { assert(_hidden_size % _heads == 0); - assert(_seq_length <= 1024); Initialize(); } @@ -143,9 +135,6 @@ BertTransformerLayer::~BertTransformerLayer() template void BertTransformerLayer::Initialize() { - Context::Instance().GenWorkSpace(get_workspace_size( - _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint)); - if (std::is_same::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); } @@ -190,18 +179,18 @@ void BertTransformerLayer::Forward(int bsz, if (_normalize_invertible) add_res_ptr = buf_1 + 3 * small_buf_size; if (_attn_dropout_checkpoint) ctx_bufB_ptr = buf_1 + 4 * small_buf_size; + int bsz_seq = bsz * _seq_length; + if (_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.ForwardCheckpoint( - bsz, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + if (_layer_norm.UseMean()) + _layer_norm.ForwardCheckpoint( + bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward( - bsz, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + _layer_norm.Forward( + bsz_seq, inp_norm_ptr, input_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } - int bsz_seq = bsz * _seq_length; - if (_pre_or_postLayerNorm) 
_qkv_linear.Forward(bsz_seq, inp_norm_ptr, attn_qkvw_ptr, buf_0, _cublasHandle); else @@ -241,19 +230,19 @@ void BertTransformerLayer::Forward(int bsz, bsz_seq, add_res_ptr, ff1_inp_ptr, input_ptr, attn_ob_ptr, _stream); if (_pre_or_postLayerNorm) { - if (_norm_layer2.UseMean()) - _norm_layer2.ForwardCheckpoint( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.ForwardCheckpoint( + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else - _norm_layer2.Forward( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + _attn_layer_norm.Forward( + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } else { - if (_norm_layer2.UseMean()) - _norm_layer2.ForwardCheckpoint( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.ForwardCheckpoint( + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); else - _norm_layer2.Forward( - bsz, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); + _attn_layer_norm.Forward( + bsz_seq, ff1_inp_ptr, add_res_ptr, attn_nw_ptr, attn_nb_ptr, _stream, true); } _ff1.Forward(bsz_seq, @@ -262,7 +251,7 @@ void BertTransformerLayer::Forward(int bsz, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), _cublasHandle); - _gelu.ForwardWithBiasAdd(bsz, + _gelu.ForwardWithBiasAdd(bsz_seq, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, (_gelu_checkpoint ? ctx_bufB_ptr : ff2_inp_ptr), @@ -283,11 +272,12 @@ void BertTransformerLayer::Forward(int bsz, bsz_seq, inp_norm_ptr, out_ptr, ff1_inp_ptr, output_b_ptr, _stream); if (!_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.ForwardCheckpoint( - bsz, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + if (_layer_norm.UseMean()) + _layer_norm.ForwardCheckpoint( + bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); else - _norm_layer3.Forward(bsz, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); + _layer_norm.Forward( + bsz_seq, out_ptr, inp_norm_ptr, norm_w_ptr, norm_b_ptr, _stream, true); } } @@ -343,7 +333,8 @@ void BertTransformerLayer::Backward(int bsz, T* buf_2 = buf_1 + small_buf_size; T* buf_3 = buf_2 + small_buf_size; - T* ff2_buf = buf_3 + (_gelu_checkpoint ? 3 : 1) * small_buf_size; + T* ff2_buf = (_gelu_checkpoint ? buf_2 + (bsz * _seq_length * _intermediate_size) + : buf_3 + small_buf_size); T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads); cudaStream_t streams[2] = {_stream, _stream}; @@ -352,26 +343,26 @@ void BertTransformerLayer::Backward(int bsz, int bsz_heads = bsz * _heads; if (!_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.Backward(bsz, - grad_output_ptr, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - inp_norm_ptr); + if (_layer_norm.UseMean()) + _layer_norm.Backward(bsz_seq, + grad_output_ptr, + norm_w_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + buf_1, + inp_norm_ptr); else - _norm_layer3.Backward(bsz, - grad_output_ptr, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - buf_1, - output_ptr); + _layer_norm.Backward(bsz_seq, + grad_output_ptr, + norm_w_ptr, + norm_b_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + buf_1, + output_ptr); } if (_pre_or_postLayerNorm) @@ -383,7 +374,8 @@ void BertTransformerLayer::Backward(int bsz, ? 
buf_0 : (_pre_or_postLayerNorm ? grad_output_ptr : buf_1); - if (_gelu_checkpoint) _gelu.ForwardWithBiasAdd(bsz, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); + if (_gelu_checkpoint) + _gelu.ForwardWithBiasAdd(bsz_seq, ff2_inp_ptr, inter_b_ptr, buf_2, _stream); _ff2.Backward(bsz_seq, layer_dropout_buf, (_gelu_checkpoint ? buf_2 : ff2_inp_ptr), @@ -395,7 +387,7 @@ void BertTransformerLayer::Backward(int bsz, ff2_buf); _gelu.Backward( - bsz, ff2_buf, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); + bsz_seq, ff2_buf, (_gelu_checkpoint ? ff2_inp_ptr : gelu_inp_ptr), inter_b_ptr, _stream); _ff1.Backward(bsz_seq, ff2_buf, @@ -411,49 +403,49 @@ void BertTransformerLayer::Backward(int bsz, launch_fused_add2(buf_2, buf_3, buf_1, bsz, _seq_length, _hidden_size, _stream); if (_pre_or_postLayerNorm) { - if (_norm_layer2.UseMean()) - _norm_layer2.BackwardFusedAdd(bsz, - buf_3, - grad_output_ptr, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.BackwardFusedAdd(bsz_seq, + buf_3, + grad_output_ptr, + attn_nw_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + add_res_ptr); else - _norm_layer2.BackwardFusedAdd(bsz, - buf_3, - grad_output_ptr, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); + _attn_layer_norm.BackwardFusedAdd(bsz_seq, + buf_3, + grad_output_ptr, + attn_nw_ptr, + attn_nb_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + ff1_inp_ptr); } else { - if (_norm_layer2.UseMean()) - _norm_layer2.Backward(bsz, - buf_2, - attn_nw_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - add_res_ptr); + if (_attn_layer_norm.UseMean()) + _attn_layer_norm.Backward(bsz_seq, + buf_2, + attn_nw_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + add_res_ptr); else - _norm_layer2.Backward(bsz, - buf_2, - attn_nw_ptr, - attn_nb_ptr, - grad_attn_nw_ptr, - grad_attn_nb_ptr, - streams, - buf_0, - ff1_inp_ptr); + _attn_layer_norm.Backward(bsz_seq, + buf_2, + attn_nw_ptr, + attn_nb_ptr, + grad_attn_nw_ptr, + grad_attn_nb_ptr, + streams, + buf_0, + ff1_inp_ptr); } _attn_output_dropout.Backward(bsz_seq, buf_2, buf_0, _stream); @@ -518,28 +510,28 @@ void BertTransformerLayer::Backward(int bsz, buf_2); if (_pre_or_postLayerNorm) { - if (_norm_layer3.UseMean()) - _norm_layer3.BackwardFusedAdd(bsz, - buf_2, - buf_0, - norm_w_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - input_ptr); + if (_layer_norm.UseMean()) + _layer_norm.BackwardFusedAdd(bsz_seq, + buf_2, + buf_0, + norm_w_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + grad_input_ptr, + input_ptr); else - _norm_layer3.BackwardFusedAdd(bsz, - buf_2, - buf_0, - norm_w_ptr, - norm_b_ptr, - grad_norm_w_ptr, - grad_norm_b_ptr, - streams, - grad_input_ptr, - inp_norm_ptr); + _layer_norm.BackwardFusedAdd(bsz_seq, + buf_2, + buf_0, + norm_w_ptr, + norm_b_ptr, + grad_norm_w_ptr, + grad_norm_b_ptr, + streams, + grad_input_ptr, + inp_norm_ptr); } else launch_fused_add2(grad_input_ptr, buf_2, buf_0, bsz, _seq_length, _hidden_size, _stream); } @@ -556,11 +548,31 @@ void BertTransformerLayer::SetTrainingMode(bool training) template void BertTransformerLayer::SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr, uint8_t* attn_output_dropout_mask_ptr, - uint8_t* layer_output_dropout_mask_ptr) + uint8_t* layer_output_dropout_mask_ptr, + T* attn_layer_norm_var, + T* attn_layer_norm_mean, + T* layer_norm_var, + T* 
layer_norm_mean) { _attn_prob_dropout.SetMask(attn_prob_dropout_mask_ptr); _attn_output_dropout.SetMask(attn_output_dropout_mask_ptr); _layer_output_dropout.SetMask(layer_output_dropout_mask_ptr); + + _attn_layer_norm.SetVar(attn_layer_norm_var); + _attn_layer_norm.SetMean(attn_layer_norm_mean); + _layer_norm.SetVar(layer_norm_var); + _layer_norm.SetMean(layer_norm_mean); +} + +template +void BertTransformerLayer::SetSeqLength(int seq_len) +{ + _seq_length = seq_len; + + _softmax.SetSeqLength(_seq_length); + _attn_prob_dropout.SetDimension(_seq_length); + _attn_scores.SetConfig(_seq_length, _seq_length, _hidden_size / _heads); + _attn_context.SetConfig(_hidden_size / _heads, _seq_length, _seq_length); } template @@ -569,7 +581,6 @@ int create_transformer_layer(int layer_id, int hidden_dim, int num_heads, int intermediate_size, - int seq_length, float attn_dropout_ratio, float hidden_dropout_ratio, int seed, @@ -582,14 +593,14 @@ int create_transformer_layer(int layer_id, { Context::Instance().SetSeed(seed); Context::Instance().TestGemmFP16( - test_gemm, batch_size, seq_length, num_heads, hidden_dim / num_heads); + test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); auto layer = std::make_shared>(layer_id, batch_size, hidden_dim, num_heads, intermediate_size, - seq_length, + init_seq_length, attn_dropout_ratio, hidden_dropout_ratio, pre_or_postLayerNorm, @@ -681,54 +692,71 @@ std::vector ds_transformer_forward(int layer_id, std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); + int seq_len = layer->GetSeqLength(); + if (input.size(1) != seq_len) { + seq_len = input.size(1); + layer->SetSeqLength(seq_len); + } + + auto workspace = torch::empty({get_workspace_size(bsz, + seq_len, + layer->GetHiddenSize(), + layer->GetIntermediateSize(), + layer->GetNumHeads(), + layer->IsTrainingMode(), + layer->GeluCheckpoint())}, + options); + Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); + auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); auto attn_o_inp = torch::empty_like(input); - auto qkv_tf = torch::empty({(bsz * layer->GetSeqLength()), output_w.size(0) * 3}, options); + auto qkv_tf = torch::empty({(bsz * seq_len), output_w.size(0) * 3}, options); auto attn_prob_dropout_mask = - torch::empty({(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, - uint8_options); + torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, uint8_options); auto attn_output_dropout_mask = - torch::empty({(bsz * layer->GetSeqLength()), layer->GetHiddenSize()}, uint8_options); + torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); auto layer_output_dropout_mask = - torch::empty({(bsz * layer->GetSeqLength()), layer->GetHiddenSize()}, uint8_options); + torch::empty({(bsz * seq_len), layer->GetHiddenSize()}, uint8_options); + + auto attn_layer_norm_var = torch::empty({(bsz * seq_len)}, options); + auto attn_layer_norm_mean = torch::empty({(bsz * seq_len)}, options); + auto layer_norm_var = torch::empty({(bsz * seq_len)}, options); + auto layer_norm_mean = torch::empty({(bsz * seq_len)}, options); T* inp_norm_ptr = (T*)inp_norm.data_ptr(); T* add_res_ptr = (T*)add_res.data_ptr(); T* q_tf_ptr = (T*)qkv_tf.data_ptr(); - T* k_tf_ptr = - q_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(T*)k_tf.data_ptr(); - T* v_tf_ptr = - k_tf_ptr + (bsz * layer->GetSeqLength() * output_w.size(0)); //(T*)v_tf.data_ptr(); + T* k_tf_ptr = q_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)k_tf.data_ptr(); + T* v_tf_ptr = k_tf_ptr + (bsz * seq_len * output_w.size(0)); //(T*)v_tf.data_ptr(); T* attn_o_inp_ptr = (T*)attn_o_inp.data_ptr(); - torch::Tensor ff2_inp = - torch::empty({(bsz * layer->GetSeqLength()), output_w.size(1)}, options); + torch::Tensor ff2_inp = torch::empty({(bsz * seq_len), output_w.size(1)}, options); torch::Tensor gelu_inp = - (gelu_checkpoint - ? ff2_inp - : torch::empty({(bsz * layer->GetSeqLength()), output_w.size(1)}, options)); + (gelu_checkpoint ? ff2_inp : torch::empty({(bsz * seq_len), output_w.size(1)}, options)); auto ff1_inp = torch::empty_like(input); T* ff2_inp_ptr = (T*)ff2_inp.data_ptr(); T* gelu_inp_ptr = (T*)gelu_inp.data_ptr(); T* ff1_inp_ptr = (T*)ff1_inp.data_ptr(); - torch::Tensor soft_out = torch::empty( - {(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, options); + torch::Tensor soft_out = + torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options); torch::Tensor ctx_bufB = (attn_dropout_checkpoint ? 
soft_out - : torch::empty( - {(bsz * layer->GetNumHeads() * layer->GetSeqLength()), layer->GetSeqLength()}, - options)); + : torch::empty({(bsz * layer->GetNumHeads() * seq_len), seq_len}, options)); T* soft_out_ptr = (T*)soft_out.data_ptr(); T* ctx_bufB_ptr = (T*)ctx_bufB.data_ptr(); layer->SetTrainingMode(training_mode); layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr()); + (uint8_t*)layer_output_dropout_mask.data_ptr(), + (T*)attn_layer_norm_var.data_ptr(), + (T*)attn_layer_norm_mean.data_ptr(), + (T*)layer_norm_var.data_ptr(), + (T*)layer_norm_mean.data_ptr()); layer->Forward(bsz, input_ptr, @@ -770,7 +798,11 @@ std::vector ds_transformer_forward(int layer_id, ff2_inp, attn_prob_dropout_mask, attn_output_dropout_mask, - layer_output_dropout_mask}; + layer_output_dropout_mask, + attn_layer_norm_var, + attn_layer_norm_mean, + layer_norm_var, + layer_norm_mean}; } template @@ -789,6 +821,10 @@ std::vector ds_transformer_backward(int layer_id, const torch::Tensor& attn_prob_dropout_mask, const torch::Tensor& attn_output_dropout_mask, const torch::Tensor& layer_output_dropout_mask, + const torch::Tensor& attn_layer_norm_var, + const torch::Tensor& attn_layer_norm_mean, + const torch::Tensor& layer_norm_var, + const torch::Tensor& layer_norm_mean, const torch::Tensor& input, const torch::Tensor& input_mask, const torch::Tensor& attn_qkvw, @@ -832,9 +868,26 @@ std::vector ds_transformer_backward(int layer_id, CHECK_INPUT(norm_b); int bsz = g_output.size(0); + std::shared_ptr> layer = std::static_pointer_cast>(s_transformer_layers[layer_id]); + int seq_len = layer->GetSeqLength(); + if (g_output.size(1) != seq_len) { + seq_len = g_output.size(1); + layer->SetSeqLength(seq_len); + } + + auto workspace = torch::empty({get_workspace_size(bsz, + seq_len, + layer->GetHiddenSize(), + layer->GetIntermediateSize(), + layer->GetNumHeads(), + layer->IsTrainingMode(), + layer->GeluCheckpoint())}, + grad_output.options()); + Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); + auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); auto grad_attn_qkvb = torch::empty_like(attn_qkvb); @@ -894,7 +947,11 @@ std::vector ds_transformer_backward(int layer_id, layer->SetIntermediateBuffers((uint8_t*)attn_prob_dropout_mask.data_ptr(), (uint8_t*)attn_output_dropout_mask.data_ptr(), - (uint8_t*)layer_output_dropout_mask.data_ptr()); + (uint8_t*)layer_output_dropout_mask.data_ptr(), + (T*)attn_layer_norm_var.data_ptr(), + (T*)attn_layer_norm_mean.data_ptr(), + (T*)layer_norm_var.data_ptr(), + (T*)layer_norm_mean.data_ptr()); layer->Backward(bsz, grad_output_ptr, diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu old mode 100755 new mode 100644 index f0e65e3829b5..209b64a90902 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -279,13 +279,12 @@ void launch_bias_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); fused_bias_gelu<<>>(input, bias, output, intermediate_size); } @@ -295,24 +294,26 @@ void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int 
iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); gelu_kernel<<>>(input, output, intermediate_size); } -template void -launch_bias_gelu(const float*, const float*, float*, int, int, int, cudaStream_t); -template void -launch_bias_gelu<__half>(const __half*, const __half*, __half*, int, int, int, cudaStream_t); +template void launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); +template void launch_bias_gelu<__half>(const __half*, + const __half*, + __half*, + int, + int, + cudaStream_t); -template void launch_gelu(const float*, float*, int, int, int, cudaStream_t); -template void launch_gelu<__half>(const __half*, __half*, int, int, int, cudaStream_t); +template void launch_gelu(const float*, float*, int, int, cudaStream_t); +template void launch_gelu<__half>(const __half*, __half*, int, int, cudaStream_t); template void launch_d_gelu(T* d_output, @@ -320,17 +321,15 @@ void launch_d_gelu(T* d_output, const T* bias, int intermediate_size, int batch_size, - int sequence_length, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; int threads = intermediate_size / iterations / 4; dim3 block_dims(threads); - dim3 grid_dims(sequence_length * batch_size); + dim3 grid_dims(batch_size); d_gelu_func<<>>(d_output, input, bias, intermediate_size); } -template void launch_d_gelu(float*, const float*, const float*, int, int, int, cudaStream_t); -template void -launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, int, cudaStream_t); +template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); +template void launch_d_gelu<__half>(__half*, const __half*, const __half*, int, int, cudaStream_t); diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu index 0ce280a702ab..fbe4d0536789 100644 --- a/csrc/transformer/general_kernels.cu +++ b/csrc/transformer/general_kernels.cu @@ -14,15 +14,18 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, cg::thread_block_tile g = cg::tiled_partition(b); int idx = blockDim.x * blockIdx.x + threadIdx.x; - int offset = threadIdx.y * width + idx; + int y_stride = width * TILE_DIM; float localSum = 0; // Loop across matrix height - for (int r = threadIdx.y; r < rows; r += TILE_DIM) { - localSum += (float)inp[offset]; - offset += y_stride; + if (idx < width) { + int offset = threadIdx.y * width + idx; + for (int r = threadIdx.y; r < rows; r += TILE_DIM) { + localSum += (float)inp[offset]; + offset += y_stride; + } } tile[threadIdx.x][threadIdx.y] = localSum; @@ -40,7 +43,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp, if (threadIdx.x == 0) { int pos = blockIdx.x * TILE_DIM + threadIdx.y; - out[pos] = sum; + if (pos < (rows * width)) out[pos] = sum; } } @@ -58,10 +61,10 @@ void launch_fuse_transpose_bias_kernel(const float* inp, int cols, cudaStream_t stream) { - assert(rows % TILE_DIM == 0); - assert(cols % TILE_DIM == 0); + // assert(rows % TILE_DIM == 0); + // assert(cols % TILE_DIM == 0); - dim3 grid_dim(cols / TILE_DIM); + dim3 grid_dim((cols - 1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<<>>(inp, out, rows, cols); @@ -74,49 +77,38 @@ void launch_fuse_transpose_bias_kernel<__half>(const __half* inp, int cols, cudaStream_t stream) { - assert(rows % TILE_DIM == 0); - assert(cols % TILE_DIM == 0); + // assert(rows % TILE_DIM == 0); + // 
assert(cols % TILE_DIM == 0); - dim3 grid_dim(cols / TILE_DIM); + dim3 grid_dim((cols - 1) / TILE_DIM + 1); dim3 block_dim(TILE_DIM, TILE_DIM); column_sum_reduce<__half><<>>(inp, out, rows, cols); } -__global__ void fused_add2_kernel(float* out, - const float* inp1, - const float* inp2, - int size, - int row_stride) +__global__ void fused_add2_kernel(const int N, float* out, const float* inp1, const float* inp2) { - int row = blockIdx.x; - int id = threadIdx.x; - const float4* inp1_4 = reinterpret_cast(inp1); const float4* inp2_4 = reinterpret_cast(inp2); float4* out_4 = reinterpret_cast(out); - float4 val; - float4 inp1_reg = inp1_4[row * row_stride + id]; - float4 inp2_reg = inp2_4[row * row_stride + id]; + CUDA_1D_KERNEL_LOOP(j, N) + { + float4 val; + float4 inp1_reg = inp1_4[j]; + float4 inp2_reg = inp2_4[j]; - val.x = inp1_reg.x + inp2_reg.x; - val.y = inp1_reg.y + inp2_reg.y; - val.z = inp1_reg.z + inp2_reg.z; - val.w = inp1_reg.w + inp2_reg.w; + val.x = inp1_reg.x + inp2_reg.x; + val.y = inp1_reg.y + inp2_reg.y; + val.z = inp1_reg.z + inp2_reg.z; + val.w = inp1_reg.w + inp2_reg.w; - out_4[row * row_stride + id] = val; + out_4[j] = val; + } } -__global__ void fused_add2_kernel(__half* out, - const __half* inp1, - const __half* inp2, - int size, - int row_stride) +__global__ void fused_add2_kernel(const int N, __half* out, const __half* inp1, const __half* inp2) { - int row = blockIdx.x; - int id = threadIdx.x; - float2 inp1_4; float2 inp2_4; @@ -126,28 +118,31 @@ __global__ void fused_add2_kernel(__half* out, const float2* inp1_arr = reinterpret_cast(inp1); const float2* inp2_arr = reinterpret_cast(inp2); - inp1_4 = inp1_arr[row * row_stride + id]; - inp2_4 = inp2_arr[row * row_stride + id]; + CUDA_1D_KERNEL_LOOP(j, N) + { + inp1_4 = inp1_arr[j]; + inp2_4 = inp2_arr[j]; - float2 inp1_h_f_0 = __half22float2(inp1_h[0]); - float2 inp1_h_f_1 = __half22float2(inp1_h[1]); + float2 inp1_h_f_0 = __half22float2(inp1_h[0]); + float2 inp1_h_f_1 = __half22float2(inp1_h[1]); - float2 inp2_h_f_0 = __half22float2(inp2_h[0]); - float2 inp2_h_f_1 = __half22float2(inp2_h[1]); + float2 inp2_h_f_0 = __half22float2(inp2_h[0]); + float2 inp2_h_f_1 = __half22float2(inp2_h[1]); - inp1_h_f_0.x += inp2_h_f_0.x; - inp1_h_f_0.y += inp2_h_f_0.y; - inp1_h_f_1.x += inp2_h_f_1.x; - inp1_h_f_1.y += inp2_h_f_1.y; + inp1_h_f_0.x += inp2_h_f_0.x; + inp1_h_f_0.y += inp2_h_f_0.y; + inp1_h_f_1.x += inp2_h_f_1.x; + inp1_h_f_1.y += inp2_h_f_1.y; - float2 val_f; - __half2* val_h = reinterpret_cast<__half2*>(&val_f); + float2 val_f; + __half2* val_h = reinterpret_cast<__half2*>(&val_f); - val_h[0] = __float22half2_rn(inp1_h_f_0); - val_h[1] = __float22half2_rn(inp1_h_f_1); + val_h[0] = __float22half2_rn(inp1_h_f_0); + val_h[1] = __float22half2_rn(inp1_h_f_1); - float2* out_4 = reinterpret_cast(out); - out_4[row * row_stride + id] = val_f; + float2* out_4 = reinterpret_cast(out); + out_4[j] = val_f; + } } template <> @@ -159,12 +154,12 @@ void launch_fused_add2(float* out, int hidden_dim, cudaStream_t& stream) { - dim3 grid_dim(batch_size * seq_length); + int total_count = batch_size * seq_length * hidden_dim / 4; + dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - dim3 block_dim(hidden_dim / 4); + dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - fused_add2_kernel<<>>( - out, inp1, inp2, (batch_size * seq_length * hidden_dim), hidden_dim / 4); + fused_add2_kernel<<>>(total_count, out, inp1, inp2); } template <> @@ -176,12 +171,12 @@ void launch_fused_add2<__half>(__half* out, int hidden_dim, 
cudaStream_t& stream) { - dim3 grid_dim(batch_size * seq_length); + int total_count = batch_size * seq_length * hidden_dim / 4; + dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length); - dim3 block_dim(hidden_dim / 4); + dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4); - fused_add2_kernel<<>>( - out, inp1, inp2, (batch_size * seq_length * hidden_dim), hidden_dim / 4); + fused_add2_kernel<<>>(total_count, out, inp1, inp2); } __global__ void fused_add3_kernel(float* out, diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu old mode 100755 new mode 100644 index 7345175694bf..a0e512c73d44 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -5,51 +5,53 @@ namespace cg = cooperative_groups; /* Fused bias add, residual (elementwise) add, and normalization layer. -Unlike the GELU, which doesn't require template parameters, this layer does since it -does rely fairly heavily on unrolling loops. Currently, I exclude bounds checks and -assume that the number of elements is a multiple of a power of 2. Default behavior -for our purposes uses 256 threads for floats, and 128 threads for __half. This restriction -is a result of using the shift parameter to perform the minimum number of register file -shuffles necessary, which requires the number of threads in the secondary reduction to -be 1, 2, 4, 8, 16, or 32. The number of threads here corresponds to the number of complete -warps in the threadblock. - For FP16, this kernel does not promote to FP32 in order to utilize the 2x throughput for __half2 instructions, and avoid the conversion overhead (1/8 of __hal2 arithmetic). For specific launch constraints, see the launch functions. */ -template +#define NORM_REG (MAX_REGISTERS / 4) + __global__ void fused_bias_residual_layer_norm(float* vals, const float* residual, const float* gamma, const float* beta, float epsilon, bool preLayerNorm, - bool training = false, - float* vars = nullptr, - float* means = nullptr, - float* vals_hat = nullptr) + bool training, + float* vars, + float* means, + int row_stride) { - constexpr int iteration_stride = row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); + cg::thread_block_tile g = cg::tiled_partition(b); int row = blockIdx.x; int id = threadIdx.x; - int gid = id / 32; + int gid = id / WARP_SIZE; + + float vals_arr[NORM_REG]; + __shared__ float shr[MAX_WARP_NUM]; - float vals_arr[iterations]; - __shared__ float shr[iteration_stride >> 5]; + residual += (row * row_stride); + vals += (row * row_stride); float sum = 0.f; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = residual[i * iteration_stride + id]; sum += vals_arr[i]; } + if (high_index < row_stride) { + vals_arr[iterations] = residual[high_index]; + sum += vals_arr[iterations]; + iterations++; + } for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } @@ -71,7 +73,8 @@ __global__ void fused_bias_residual_layer_norm(float* vals, if (g.thread_rank() == 0) means[row] = mean; float variance = 0.f; for (int i = 0; i < iterations; i++) { - variance += (vals_arr[i] - mean) * (vals_arr[i] - mean); + vals_arr[i] -= mean; + variance += vals_arr[i] * vals_arr[i]; } for (int i = 1; i < 32; i *= 2) { 
variance += g.shfl_down(variance, i); } @@ -93,28 +96,34 @@ __global__ void fused_bias_residual_layer_norm(float* vals, if (training) if (g.thread_rank() == 0) vars[row] = variance; + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { - vals_arr[i] = (vals_arr[i] - mean) * rsqrtf(variance); + vals_arr[i] = vals_arr[i] * rsqrtf(variance); vals_arr[i] = vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[row * row_stride + i * iteration_stride + id] = vals_arr[i]; + vals[i * iteration_stride + id] = vals_arr[i]; + } + if ((high_index) < row_stride) { + vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); + vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; + vals[high_index] = vals_arr[iterations]; } } -template __global__ void fused_bias_residual_layer_norm(__half* vals, const __half* residual, const __half* gamma, const __half* beta, float epsilon, bool preLayerNorm, - bool training = false, - __half* vars = nullptr, - __half* means = nullptr, - __half* vals_hat = nullptr) + bool training, + __half* vars, + __half* means, + int row_stride) { #if __CUDA_ARCH__ >= 700 - constexpr int iteration_stride = row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -123,20 +132,29 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, int id = threadIdx.x; int gid = id >> 5; - __half2 vals_arr[iterations]; - float2 vals_f[iterations]; - __shared__ float shr[iteration_stride >> 5]; + float2 vals_f[NORM_REG]; + __shared__ float shr[MAX_WARP_NUM]; __half2* vals_cast = reinterpret_cast<__half2*>(vals); const __half2* residual_cast = reinterpret_cast(residual); + residual_cast += (row * row_stride); + vals_cast += (row * row_stride); + float sum = 0.f; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[row * row_stride + i * iteration_stride + id]); + vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); sum += vals_f[i].x; sum += vals_f[i].y; } + if ((high_index) < row_stride) { + vals_f[iterations] = __half22float2(residual_cast[high_index]); + sum += vals_f[iterations].x; + sum += vals_f[iterations].y; + iterations++; + } for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } @@ -156,8 +174,10 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, float variance = 0.f; for (int i = 0; i < iterations; i++) { - variance += (vals_f[i].x - mean) * (vals_f[i].x - mean); - variance += (vals_f[i].y - mean) * (vals_f[i].y - mean); + vals_f[i].x -= mean; + vals_f[i].y -= mean; + variance += vals_f[i].x * vals_f[i].x; + variance += vals_f[i].y * vals_f[i].y; } for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } @@ -177,7 +197,6 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, variance /= (row_stride * 2); variance += epsilon; - __half2 mean_h = __float2half2_rn(mean); __half2 variance_h = __float2half2_rn(variance); const __half2* gamma_cast = reinterpret_cast(gamma); const __half2* beta_cast = reinterpret_cast(beta); @@ -186,13 +205,19 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, vars[row] = __float2half(variance); means[row] = __float2half(mean); } - + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { - 
vals_arr[i] = __float22half2_rn(vals_f[i]); - vals_arr[i] = (vals_arr[i] - mean_h) * h2rsqrt(variance_h); - vals_arr[i] = vals_arr[i] * gamma_cast[i * iteration_stride + id] + - beta_cast[i * iteration_stride + id]; - vals_cast[row * row_stride + i * iteration_stride + id] = vals_arr[i]; + __half2 vals_arr = __float22half2_rn(vals_f[i]); + vals_arr = vals_arr * h2rsqrt(variance_h); + vals_arr = + vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; + vals_cast[i * iteration_stride + id] = vals_arr; + } + if ((high_index) < row_stride) { + __half2 vals_arr = __float22half2_rn(vals_f[iterations]); + vals_arr = vals_arr * h2rsqrt(variance_h); + vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; + vals_cast[high_index] = vals_arr; } #endif } @@ -204,14 +229,12 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, T* vars, - T* means, - T* vals_hat); + T* means); template <> void launch_bias_residual_layer_norm(float* vals, @@ -220,42 +243,28 @@ void launch_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, float* vars, - float* means, - float* vals_hat) + float* means) { - constexpr int threads = THREADS; + int threads = THREADS; - dim3 grid_dim(batch_size * sequence_length); + dim3 grid_dim(batch_size); + + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) + throw std::runtime_error("Unsupport hidden_dim."); dim3 block_dim(threads); - // There are some limitations to call below functions, now just enumerate the situations. 
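The float launcher above now selects its block size at runtime instead of dispatching on a compile-time hidden size, and it hands hidden_dim to the kernel as row_stride. A minimal sketch of that selection logic, assuming the same THREADS base and 16K/32K/64K brackets shown in the hunk; pick_norm_threads is an illustrative name, not a helper added by this patch:

#include <stdexcept>

// Double the block size as the row gets wider, mirroring the float path above.
static int pick_norm_threads(int hidden_dim, int base_threads)
{
    int threads = base_threads;  // THREADS in the launcher above
    if (hidden_dim > 16384 && hidden_dim <= 32768)
        threads <<= 1;           // 16K < hidden_dim <= 32K: 2x threads
    else if (hidden_dim > 32768 && hidden_dim <= 65536)
        threads <<= 2;           // 32K < hidden_dim <= 64K: 4x threads
    else if (hidden_dim > 65536)
        throw std::runtime_error("Unsupported hidden_dim.");
    return threads;
}

With one block per row (grid_dim(batch_size), where the batch_size argument now carries the flattened bsz_seq row count, as the callers earlier in this patch show), the kernel derives iterations = row_stride / blockDim.x at runtime rather than from a template parameter.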
- if (hidden_dim == 768) - fused_bias_residual_layer_norm<768, 3><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 512) - fused_bias_residual_layer_norm<512, 2><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<1024, 4><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<1536, 6><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<2048, 8><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<2560, 10><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else - throw std::runtime_error("Unsupport hidden_dim."); + fused_bias_residual_layer_norm<<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim); } template <> @@ -265,56 +274,44 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, const __half* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, __half* vars, - __half* means, - __half* vals_hat) + __half* means) { - constexpr int threads = 128; + int threads = 128; - dim3 grid_dim(batch_size * sequence_length); - dim3 block_dim(threads); + dim3 grid_dim(batch_size); - // There are some limitations to call below functions, now just enumerate the situations. - if (hidden_dim == 768) - fused_bias_residual_layer_norm<384, 3><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 512) - fused_bias_residual_layer_norm<256, 2><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<512, 4><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<768, 6><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<1024, 8><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<1280, 10><<>>( - vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, vals_hat); - else + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim(threads); + + fused_bias_residual_layer_norm<<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, means, hidden_dim / 2); } -template __global__ void fused_bias_residual_layer_norm(float* vals, const float* residual, const float* gamma, const float* beta, float epsilon, bool preLayerNorm, - bool training = false, - float* vars = nullptr, - float* vals_hat = nullptr, - bool save_vals = false) + bool training, + float* vars, + int row_stride) { - constexpr int iteration_stride = row_stride / 
iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -323,15 +320,24 @@ __global__ void fused_bias_residual_layer_norm(float* vals, int id = threadIdx.x; int gid = id / 32; - float vals_arr[iterations]; - __shared__ float shr[iteration_stride >> 5]; + float vals_arr[NORM_REG]; + __shared__ float shr[MAX_WARP_NUM]; + + residual += (row * row_stride); + vals += (row * row_stride); float sum = 0.f; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { - vals_arr[i] = residual[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = residual[i * iteration_stride + id]; sum += vals_arr[i]; } + if ((high_index) < row_stride) { + vals_arr[iterations] = residual[high_index]; + sum += vals_arr[iterations]; + iterations++; + } for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } @@ -351,7 +357,8 @@ __global__ void fused_bias_residual_layer_norm(float* vals, float mean = sum / row_stride; float variance = 0.f; for (int i = 0; i < iterations; i++) { - variance += (vals_arr[i] - mean) * (vals_arr[i] - mean); + vals_arr[i] -= mean; + variance += vals_arr[i] * vals_arr[i]; } for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } @@ -373,28 +380,34 @@ __global__ void fused_bias_residual_layer_norm(float* vals, if (training) if (g.thread_rank() == 0) vars[row] = variance; + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { - vals_arr[i] = (vals_arr[i] - mean) * rsqrtf(variance); + vals_arr[i] = vals_arr[i] * rsqrtf(variance); vals_arr[i] = vals_arr[i] * gamma[i * iteration_stride + id] + beta[i * iteration_stride + id]; - vals[row * row_stride + i * iteration_stride + id] = vals_arr[i]; + vals[i * iteration_stride + id] = vals_arr[i]; + } + if ((high_index) < row_stride) { + vals_arr[iterations] = vals_arr[iterations] * rsqrtf(variance); + vals_arr[iterations] = vals_arr[iterations] * gamma[high_index] + beta[high_index]; + vals[high_index] = vals_arr[iterations]; } } -template __global__ void fused_bias_residual_layer_norm(__half* vals, const __half* residual, const __half* gamma, const __half* beta, float epsilon, bool preLayerNorm, - bool training = false, - __half* vars = nullptr, - __half* vals_hat = nullptr, - bool save_vals = false) + bool training, + __half* vars, + int row_stride) { #if __CUDA_ARCH__ >= 700 - constexpr int iteration_stride = row_stride / iterations; + + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -403,20 +416,29 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, int id = threadIdx.x; int gid = id >> 5; - __half2 vals_arr[iterations]; - float2 vals_f[iterations]; - __shared__ float shr[iteration_stride >> 5]; + float2 vals_f[NORM_REG]; + __shared__ float shr[MAX_WARP_NUM]; __half2* vals_cast = reinterpret_cast<__half2*>(vals); const __half2* residual_cast = reinterpret_cast(residual); + residual_cast += (row * row_stride); + vals_cast += (row * row_stride); + float sum = 0.f; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { - vals_f[i] = __half22float2(residual_cast[row * row_stride + i * iteration_stride + id]); + vals_f[i] = __half22float2(residual_cast[i * iteration_stride + id]); sum += 
vals_f[i].x; sum += vals_f[i].y; } + if ((high_index) < row_stride) { + vals_f[iterations] = __half22float2(residual_cast[high_index]); + sum += vals_f[iterations].x; + sum += vals_f[iterations].y; + iterations++; + } for (int i = 1; i < 32; i *= 2) { sum += g.shfl_down(sum, i); } @@ -436,8 +458,10 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, float variance = 0.f; for (int i = 0; i < iterations; i++) { - variance += (vals_f[i].x - mean) * (vals_f[i].x - mean); - variance += (vals_f[i].y - mean) * (vals_f[i].y - mean); + vals_f[i].x -= mean; + vals_f[i].y -= mean; + variance += vals_f[i].x * vals_f[i].x; + variance += vals_f[i].y * vals_f[i].y; } for (int i = 1; i < 32; i *= 2) { variance += g.shfl_down(variance, i); } @@ -457,19 +481,25 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, variance /= (row_stride * 2); variance += epsilon; - __half2 mean_h = __float2half2_rn(mean); __half2 variance_h = __float2half2_rn(variance); const __half2* gamma_cast = reinterpret_cast(gamma); const __half2* beta_cast = reinterpret_cast(beta); if (training && g.thread_rank() == 0) vars[row] = __float2half(variance); + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { - vals_arr[i] = __float22half2_rn(vals_f[i]); - vals_arr[i] = (vals_arr[i] - mean_h) * h2rsqrt(variance_h); - vals_arr[i] = vals_arr[i] * gamma_cast[i * iteration_stride + id] + - beta_cast[i * iteration_stride + id]; - vals_cast[row * row_stride + i * iteration_stride + id] = vals_arr[i]; + __half2 vals_arr = __float22half2_rn(vals_f[i]); + vals_arr = vals_arr * h2rsqrt(variance_h); + vals_arr = + vals_arr * gamma_cast[i * iteration_stride + id] + beta_cast[i * iteration_stride + id]; + vals_cast[i * iteration_stride + id] = vals_arr; + } + if ((high_index) < row_stride) { + __half2 vals_arr = __float22half2_rn(vals_f[iterations]); + vals_arr = vals_arr * h2rsqrt(variance_h); + vals_arr = vals_arr * gamma_cast[high_index] + beta_cast[high_index]; + vals_cast[high_index] = vals_arr; } #endif } @@ -481,14 +511,11 @@ void launch_bias_residual_layer_norm(T* vals, const T* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - T* vars, - T* vals_hat, - bool save_vals); + T* vars); /* To tune this launch the following restrictions must be met: @@ -512,90 +539,29 @@ void launch_bias_residual_layer_norm(float* vals, const float* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - float* vars, - float* vals_hat, - bool save_vals) + float* vars) { - constexpr int threads = THREADS; - - dim3 grid_dim(batch_size * sequence_length); + int threads = THREADS; - dim3 block_dim(threads); + dim3 grid_dim(batch_size); // There are some limitations to call below functions, now just enumerate the situations. 
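Because the block size no longer divides the row exactly, every patched kernel adds the same tail pattern: each thread processes iterations full strides, then at most one leftover element at high_index, and iterations is reset before the write-back loop. A stripped-down skeleton of that pattern, assuming the per-thread register array is large enough (the real kernels bound it with NORM_REG); strided_row_copy is only an illustration, not a kernel in this patch:

__global__ void strided_row_copy(const float* in, float* out, int row_stride)
{
    int iteration_stride = blockDim.x;
    int iterations = row_stride / iteration_stride;
    int id = threadIdx.x;

    in += blockIdx.x * row_stride;   // one block per row
    out += blockIdx.x * row_stride;

    float vals[16];                  // assumes iterations + 1 <= 16, like the NORM_REG bound
    int high_index = iterations * iteration_stride + id;

    for (int i = 0; i < iterations; i++) vals[i] = in[i * iteration_stride + id];
    if (high_index < row_stride) {   // this thread owns one extra element past the full strides
        vals[iterations] = in[high_index];
        iterations++;
    }

    // ... row-wide reductions over vals[] happen here in the real kernels ...

    iterations = row_stride / iteration_stride;  // reset so the tail slot stays at vals[iterations]
    for (int i = 0; i < iterations; i++) out[i * iteration_stride + id] = vals[i];
    if (high_index < row_stride) out[high_index] = vals[iterations];
}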
- if (hidden_dim == 768) - fused_bias_residual_layer_norm<768, 3><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 512) - fused_bias_residual_layer_norm<512, 2><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<1024, 4><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<1536, 6><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<2048, 8><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<2560, 10><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else + + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim(threads); + + fused_bias_residual_layer_norm<<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim); } template <> @@ -605,89 +571,30 @@ void launch_bias_residual_layer_norm<__half>(__half* vals, const __half* beta, float epsilon, int batch_size, - int sequence_length, int hidden_dim, cudaStream_t stream, bool preLayerNorm, bool training, - __half* vars, - __half* vals_hat, - bool save_vals) + __half* vars) { - constexpr int threads = 128; + int threads = 128; - dim3 grid_dim(batch_size * sequence_length); - dim3 block_dim(threads); + dim3 grid_dim(batch_size); // There are some limitations to call below functions, now just enumerate the situations. 
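The __half launcher below follows the same scheme but works in __half2 units, so it passes hidden_dim / 2 as row_stride and starts doubling the thread count one bracket earlier. A short sketch under those assumptions; pick_half_threads is an illustrative name and the launch line in the comment paraphrases the hunk below:

#include <stdexcept>

// __half path: 128 threads to start, brackets begin at 8K because each thread
// touches two elements per __half2 load.
static int pick_half_threads(int hidden_dim)
{
    int threads = 128;
    if (hidden_dim > 8192 && hidden_dim <= 16384)
        threads <<= 1;
    else if (hidden_dim > 16384 && hidden_dim <= 32768)
        threads <<= 2;
    else if (hidden_dim > 32768 && hidden_dim <= 65536)
        threads <<= 3;
    else if (hidden_dim > 65536)
        throw std::runtime_error("Unsupported hidden_dim.");
    return threads;
}
// Launch with one block per row and row_stride = hidden_dim / 2, e.g.:
//   fused_bias_residual_layer_norm<<<batch_size, pick_half_threads(hidden_dim), 0, stream>>>(
//       vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2);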
- if (hidden_dim == 768) - fused_bias_residual_layer_norm<384, 3><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 512) - fused_bias_residual_layer_norm<256, 2><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 1024) - fused_bias_residual_layer_norm<512, 4><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 1536) - fused_bias_residual_layer_norm<768, 6><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 2048) - fused_bias_residual_layer_norm<1024, 8><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else if (hidden_dim == 2560) - fused_bias_residual_layer_norm<1280, 10><<>>(vals, - residual, - gamma, - beta, - epsilon, - preLayerNorm, - training, - vars, - vals_hat, - save_vals); - else + + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim(threads); + fused_bias_residual_layer_norm<<>>( + vals, residual, gamma, beta, epsilon, preLayerNorm, training, vars, hidden_dim / 2); } /* Normalize Gamma & Betta gradients @@ -830,17 +737,17 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad, * We do the backward using the X_hat (X - u) / sqrt(variance) or the output of Normalization. */ -template // Hidden_Dim __global__ void LayerNormBackward2(const float* out_grad, const float* vals_hat, const float* gamma, const float* betta, const float* vars, float* inp_grad, - bool invertible) + bool invertible, + int row_stride) { - constexpr int iterations = row_stride / THREADS; - constexpr int iteration_stride = THREADS; // row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -848,21 +755,34 @@ __global__ void LayerNormBackward2(const float* out_grad, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - float vals_arr[iterations]; - float vals_hat_arr[iterations]; + out_grad += (row * row_stride); + vals_hat += (row * row_stride); + inp_grad += (row * row_stride); + float vals_arr[NORM_REG]; + float vals_hat_arr[NORM_REG]; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = (invertible ? (vals_hat[row * row_stride + i * iteration_stride + id] - - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[row * row_stride + i * iteration_stride + id]); + vals_hat_arr[i] = + (invertible ? 
(vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / + gamma_reg + : vals_hat[i * iteration_stride + id]); + } + if ((high_index) < row_stride) { + float gamma_reg = gamma[high_index]; + vals_arr[iterations] = out_grad[high_index]; + vals_arr[iterations] *= gamma_reg; + vals_hat_arr[iterations] = + (invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg + : vals_hat[high_index]); + iterations++; } float var_reg = vars[row]; @@ -912,21 +832,22 @@ __global__ void LayerNormBackward2(const float* out_grad, sum = g.shfl(sum, 0); sum /= row_stride; - for (int i = 0; i < iterations; i++) - inp_grad[row * row_stride + i * iteration_stride + id] = (vals_arr[i] - sum); + iterations = row_stride / iteration_stride; + for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); + if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); } -template // Hidden_Dim __global__ void LayerNormBackward2(const __half* out_grad, const __half* vals_hat, const __half* gamma, const __half* betta, const __half* vars, __half* inp_grad, - bool invertible) + bool invertible, + int row_stride) { - constexpr int iteration_stride = THREADS / 2; // row_stride / iterations; - constexpr int iterations = row_stride / iteration_stride; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -934,30 +855,43 @@ __global__ void LayerNormBackward2(const __half* out_grad, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = - (iteration_stride < row_stride ? iteration_stride : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (iteration_stride < row_stride ? iteration_stride : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - __half2 vals_arr[iterations]; - float2 vals_arr_f[iterations]; - __half2 vals_hat_arr[iterations]; + __half2 vals_arr[NORM_REG]; + float2 vals_arr_f[NORM_REG]; + __half2 vals_hat_arr[NORM_REG]; __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); const __half2* out_grad_h = reinterpret_cast(out_grad); const __half2* vals_hat_h = reinterpret_cast(vals_hat); + inp_grad_h += (row * row_stride); + out_grad_h += (row * row_stride); + vals_hat_h += (row * row_stride); + const __half2* gamma_h = reinterpret_cast(gamma); const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad_h[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = (invertible ? (vals_hat_h[row * row_stride + i * iteration_stride + id] - - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[row * row_stride + i * iteration_stride + id]); + vals_hat_arr[i] = + (invertible + ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / + gamma_reg + : vals_hat_h[i * iteration_stride + id]); + } + if ((high_index) < row_stride) { + __half2 gamma_reg = gamma_h[high_index]; + vals_arr[iterations] = out_grad_h[high_index]; + vals_arr[iterations] *= gamma_reg; + vals_hat_arr[iterations] = + (invertible ? 
(vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg + : vals_hat_h[high_index]); + iterations++; } __half var_h = vars[row]; __half2 var_reg = __halves2half2(var_h, var_h); @@ -1020,12 +954,20 @@ __global__ void LayerNormBackward2(const __half* out_grad, sum = g.shfl(sum, 0); sum /= (2 * row_stride); + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { vals_arr_f[i].x -= sum; vals_arr_f[i].y -= sum; __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[row * row_stride + i * iteration_stride + id] = temp; + inp_grad_h[i * iteration_stride + id] = temp; + } + if ((high_index) < row_stride) { + vals_arr_f[iterations].x -= sum; + vals_arr_f[iterations].y -= sum; + __half2 temp = __float22half2_rn(vals_arr_f[iterations]); + + inp_grad_h[high_index] = temp; } } @@ -1037,15 +979,13 @@ void launch_layerNorm_backward(const float* out_grad, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const float* betta) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1054,28 +994,18 @@ void launch_layerNorm_backward(const float* out_grad, out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); dim3 grid_dim2(batch); - dim3 block_dim2(threads); - if (hidden_dim == 768) - LayerNormBackward2<768><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 512) - LayerNormBackward2<512><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1024) - LayerNormBackward2<1024><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1536) - LayerNormBackward2<1536><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2048) - LayerNormBackward2<2048><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2560) - LayerNormBackward2<2560><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads); + + LayerNormBackward2<<>>( + out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); } template <> @@ -1086,15 +1016,13 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const __half* betta) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1103,28 +1031,20 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, out_grad, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); dim3 grid_dim2(batch); - dim3 block_dim2(threads / 2); - if (hidden_dim == 768) - LayerNormBackward2<384><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 512) - LayerNormBackward2<256><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if 
(hidden_dim == 1024) - LayerNormBackward2<512><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1536) - LayerNormBackward2<768><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2048) - LayerNormBackward2<1024><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2560) - LayerNormBackward2<1280><<>>( - out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible); - else + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads / 2); + + LayerNormBackward2<<>>( + out_grad, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); } /* Backward Normalize (Input-Gradient) @@ -1133,16 +1053,16 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, * We do the backward using the input (X) */ -template // Hidden_Dim __global__ void LayerNormBackward2(const float* out_grad, const float* X_vals, const float* gamma, const float* vars, const float* means, - float* inp_grad) + float* inp_grad, + int row_stride) { - constexpr int iterations = row_stride / THREADS; - constexpr int iteration_stride = THREADS; // row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1150,25 +1070,35 @@ __global__ void LayerNormBackward2(const float* out_grad, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (THREADS < row_stride ? 
THREADS : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - float vals_arr[iterations]; + out_grad += (row * row_stride); + X_vals += (row * row_stride); + inp_grad += (row * row_stride); + float vals_arr[NORM_REG]; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; } + if ((high_index) < row_stride) { + float gamma_reg = gamma[high_index]; + vals_arr[iterations] = out_grad[high_index]; + vals_arr[iterations] *= gamma_reg; + iterations++; + } float var_reg = vars[row]; float mean_reg = means[row]; float sum = 0; - float xu[iterations]; + float xu[NORM_REG]; for (int i = 0; i < iterations; i++) { - xu[i] = (X_vals[row * row_stride + i * iteration_stride + id] - mean_reg); + xu[i] = (X_vals[i * iteration_stride + id] - mean_reg); sum += vals_arr[i] * xu[i]; vals_arr[i] *= rsqrtf(var_reg); } @@ -1213,20 +1143,21 @@ __global__ void LayerNormBackward2(const float* out_grad, sum = g.shfl(sum, 0); sum /= row_stride; - for (int i = 0; i < iterations; i++) - inp_grad[row * row_stride + i * iteration_stride + id] = (vals_arr[i] - sum); + iterations = row_stride / iteration_stride; + for (int i = 0; i < iterations; i++) inp_grad[i * iteration_stride + id] = (vals_arr[i] - sum); + if ((high_index) < row_stride) inp_grad[high_index] = (vals_arr[iterations] - sum); } -template // Hidden_Dim __global__ void LayerNormBackward2(const __half* out_grad, const __half* X_vals, const __half* gamma, const __half* vars, const __half* means, - __half* inp_grad) + __half* inp_grad, + int row_stride) { - constexpr int iteration_stride = THREADS / 2; // row_stride / iterations; - constexpr int iterations = row_stride / iteration_stride; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1234,35 +1165,44 @@ __global__ void LayerNormBackward2(const __half* out_grad, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = - (iteration_stride < row_stride ? iteration_stride : row_stride) / WARP_SIZE; + int warp_num = (iteration_stride < row_stride ? 
iteration_stride : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + __shared__ float partialSum[MAX_WARP_NUM]; - __half2 vals_arr[iterations]; - float2 vals_arr_f[iterations]; + __half2 vals_arr[NORM_REG]; + float2 vals_arr_f[NORM_REG]; __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); const __half2* out_grad_h = reinterpret_cast(out_grad); const __half2* vals_hat_h = reinterpret_cast(X_vals); - const __half2* gamma_h = reinterpret_cast(gamma); + inp_grad_h += (row * row_stride); + out_grad_h += (row * row_stride); + vals_hat_h += (row * row_stride); + const __half2* gamma_h = reinterpret_cast(gamma); + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad_h[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; // out_grad * gamma } + if ((high_index) < row_stride) { + __half2 gamma_reg = gamma_h[high_index]; + vals_arr[iterations] = out_grad_h[high_index]; + vals_arr[iterations] *= gamma_reg; // out_grad * gamma + iterations++; + } __half mean_h = means[row]; __half var_h = vars[row]; __half2 var_reg = __halves2half2(var_h, var_h); __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[iterations]; + __half2 xu[NORM_REG]; float sum = 0.f; for (int i = 0; i < iterations; i++) { - xu[i] = (vals_hat_h[row * row_stride + i * iteration_stride + id] - mean_reg); + xu[i] = (vals_hat_h[i * iteration_stride + id] - mean_reg); __half2 result_h = (xu[i] * vals_arr[i]); float2 result_f = __half22float2(result_h); sum += result_f.x; @@ -1319,11 +1259,18 @@ __global__ void LayerNormBackward2(const __half* out_grad, sum = g.shfl(sum, 0); sum /= (2 * row_stride); + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { vals_arr_f[i].x -= sum; vals_arr_f[i].y -= sum; __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[row * row_stride + i * iteration_stride + id] = temp; + inp_grad_h[i * iteration_stride + id] = temp; + } + if ((high_index) < row_stride) { + vals_arr_f[iterations].x -= sum; + vals_arr_f[iterations].y -= sum; + __half2 temp = __float22half2_rn(vals_arr_f[iterations]); + inp_grad_h[high_index] = temp; } } @@ -1336,13 +1283,11 @@ void launch_layerNorm_backward(const float* out_grad, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1351,28 +1296,17 @@ void launch_layerNorm_backward(const float* out_grad, out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); dim3 grid_dim2(batch); - dim3 block_dim2(threads); - if (hidden_dim == 768) - LayerNormBackward2<768><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 512) - LayerNormBackward2<512><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1024) - LayerNormBackward2<1024><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1536) - LayerNormBackward2<1536><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2048) - LayerNormBackward2<2048><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2560) - 
LayerNormBackward2<2560><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads); + LayerNormBackward2<<>>( + out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim); } template <> @@ -1384,13 +1318,11 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1399,28 +1331,19 @@ void launch_layerNorm_backward<__half>(const __half* out_grad, out_grad, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); dim3 grid_dim2(batch); - dim3 block_dim2(threads / 2); - if (hidden_dim == 768) - LayerNormBackward2<384><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 512) - LayerNormBackward2<256><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1024) - LayerNormBackward2<512><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1536) - LayerNormBackward2<768><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2048) - LayerNormBackward2<1024><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2560) - LayerNormBackward2<1280><<>>( - out_grad, X_data, gamma, vars, means, inp_grad); - else + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads / 2); + LayerNormBackward2<<>>( + out_grad, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); } template @@ -1546,7 +1469,6 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1, } } -template // Hidden_Dim __global__ void LayerNormBackward2_fused_add(const float* out_grad1, const float* out_grad2, const float* vals_hat, @@ -1554,10 +1476,11 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1, const float* betta, const float* vars, float* inp_grad, - bool invertible) + bool invertible, + int row_stride) { - constexpr int iterations = row_stride / THREADS; - constexpr int iteration_stride = THREADS; // row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1565,21 +1488,35 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (THREADS < row_stride ? 
THREADS : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - float vals_arr[iterations]; - float vals_hat_arr[iterations]; + out_grad1 += (row * row_stride); + out_grad2 += (row * row_stride); + vals_hat += (row * row_stride); + inp_grad += (row * row_stride); + float vals_arr[NORM_REG]; + float vals_hat_arr[NORM_REG]; + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad1[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = (invertible ? (vals_hat[row * row_stride + i * iteration_stride + id] - - betta[i * iteration_stride + id]) / - gamma_reg - : vals_hat[row * row_stride + i * iteration_stride + id]); + vals_hat_arr[i] = + (invertible ? (vals_hat[i * iteration_stride + id] - betta[i * iteration_stride + id]) / + gamma_reg + : vals_hat[i * iteration_stride + id]); + } + if ((high_index) < row_stride) { + float gamma_reg = gamma[high_index]; + vals_arr[iterations] = out_grad1[high_index]; + vals_arr[iterations] *= gamma_reg; + vals_hat_arr[iterations] = + (invertible ? (vals_hat[high_index] - betta[high_index]) / gamma_reg + : vals_hat[high_index]); + iterations++; } float var_reg = vars[row]; @@ -1628,12 +1565,14 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1, sum = g.shfl(sum, 0); sum /= row_stride; + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) - inp_grad[row * row_stride + i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[row * row_stride + i * iteration_stride + id]; + inp_grad[i * iteration_stride + id] = + (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; + if ((high_index) < row_stride) + inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; } -template // Hidden_Dim __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, const __half* out_grad2, const __half* vals_hat, @@ -1641,10 +1580,11 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, const __half* betta, const __half* vars, __half* inp_grad, - bool invertible) + bool invertible, + int row_stride) { - constexpr int iteration_stride = THREADS / 2; // row_stride / iterations; - constexpr int iterations = row_stride / iteration_stride; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1652,13 +1592,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = - (iteration_stride < row_stride ? iteration_stride : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (iteration_stride < row_stride ? 
iteration_stride : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - __half2 vals_arr[iterations]; - float2 vals_arr_f[iterations]; - __half2 vals_hat_arr[iterations]; + __half2 vals_arr[NORM_REG]; + float2 vals_arr_f[NORM_REG]; + __half2 vals_hat_arr[NORM_REG]; // float2 result[iterations]; @@ -1667,18 +1606,33 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, const __half2* out_grad_h2 = reinterpret_cast(out_grad2); const __half2* vals_hat_h = reinterpret_cast(vals_hat); + inp_grad_h += (row * row_stride); + out_grad_h1 += (row * row_stride); + out_grad_h2 += (row * row_stride); + vals_hat_h += (row * row_stride); + const __half2* gamma_h = reinterpret_cast(gamma); const __half2* betta_h = (invertible ? reinterpret_cast(betta) : nullptr); - + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad_h1[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = (invertible ? (vals_hat_h[row * row_stride + i * iteration_stride + id] - - betta_h[i * iteration_stride + id]) / - gamma_reg - : vals_hat_h[row * row_stride + i * iteration_stride + id]); + vals_hat_arr[i] = + (invertible + ? (vals_hat_h[i * iteration_stride + id] - betta_h[i * iteration_stride + id]) / + gamma_reg + : vals_hat_h[i * iteration_stride + id]); + } + if ((high_index) < row_stride) { + __half2 gamma_reg = gamma_h[high_index]; + vals_arr[iterations] = out_grad_h1[high_index]; + vals_arr[iterations] *= gamma_reg; // out_grad * gamma + vals_hat_arr[iterations] = + (invertible ? (vals_hat_h[high_index] - betta_h[high_index]) / gamma_reg + : vals_hat_h[high_index]); + iterations++; } __half var_h = vars[row]; __half2 var_reg = __halves2half2(var_h, var_h); @@ -1740,13 +1694,20 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, sum = g.shfl(sum, 0); sum /= (2 * row_stride); + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { vals_arr_f[i].x -= sum; vals_arr_f[i].y -= sum; __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[row * row_stride + i * iteration_stride + id] = - temp + out_grad_h2[row * row_stride + i * iteration_stride + id]; + inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; + } + if ((high_index) < row_stride) { + vals_arr_f[iterations].x -= sum; + vals_arr_f[iterations].y -= sum; + __half2 temp = __float22half2_rn(vals_arr_f[iterations]); + + inp_grad_h[high_index] = temp + out_grad_h2[high_index]; } } @@ -1759,15 +1720,13 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const float* betta) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1775,28 +1734,17 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); dim3 grid_dim2(batch); - dim3 block_dim2(threads); - if (hidden_dim == 768) - LayerNormBackward2_fused_add<768><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if 
(hidden_dim == 512) - LayerNormBackward2_fused_add<512><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1024) - LayerNormBackward2_fused_add<1024><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1536) - LayerNormBackward2_fused_add<1536><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2048) - LayerNormBackward2_fused_add<2048><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2560) - LayerNormBackward2_fused_add<2560><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads); + LayerNormBackward2_fused_add<<>>( + out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim); } template <> @@ -1808,15 +1756,13 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2], bool invertible, const __half* betta) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -1825,28 +1771,19 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, out_grad1, vals_hat, gamma, betta, gamma_grad, betta_grad, batch, hidden_dim, invertible); dim3 grid_dim2(batch); - dim3 block_dim2(threads / 2); - if (hidden_dim == 768) - LayerNormBackward2_fused_add<384><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 512) - LayerNormBackward2_fused_add<256><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1024) - LayerNormBackward2_fused_add<512><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 1536) - LayerNormBackward2_fused_add<768><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2048) - LayerNormBackward2_fused_add<1024><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else if (hidden_dim == 2560) - LayerNormBackward2_fused_add<1280><<>>( - out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible); - else + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads / 2); + LayerNormBackward2_fused_add<<>>( + out_grad1, out_grad2, vals_hat, gamma, betta, vars, inp_grad, invertible, hidden_dim / 2); } /* Backward Normalize (Input-Gradient) @@ -1855,17 +1792,17 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, * We do the backward using the input (X) */ -template // Hidden_Dim __global__ void LayerNormBackward2_fused_add(const float* out_grad1, const float* out_grad2, const float* X_vals, const float* gamma, const float* vars, 
const float* means, - float* inp_grad) + float* inp_grad, + int row_stride) { - constexpr int iterations = row_stride / THREADS; - constexpr int iteration_stride = THREADS; // row_stride / iterations; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1873,25 +1810,37 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + int warp_num = (THREADS < row_stride ? THREADS : row_stride) / WARP_SIZE; + __shared__ float partialSum[MAX_WARP_NUM]; - float vals_arr[iterations]; - float vals_hat_arr[iterations]; + float vals_arr[NORM_REG]; + float vals_hat_arr[NORM_REG]; + out_grad1 += (row * row_stride); + out_grad2 += (row * row_stride); + X_vals += (row * row_stride); + inp_grad += (row * row_stride); + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { float gamma_reg = gamma[i * iteration_stride + id]; - vals_arr[i] = out_grad1[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad1[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; - vals_hat_arr[i] = X_vals[row * row_stride + i * iteration_stride + id]; + vals_hat_arr[i] = X_vals[i * iteration_stride + id]; + } + if ((high_index) < row_stride) { + float gamma_reg = gamma[high_index]; + vals_arr[iterations] = out_grad1[high_index]; + vals_arr[iterations] *= gamma_reg; + vals_hat_arr[iterations] = X_vals[high_index]; + iterations++; } float var_reg = vars[row]; float mean_reg = means[row]; float sum = 0; - float xu[iterations]; + float xu[NORM_REG]; for (int i = 0; i < iterations; i++) { xu[i] = (vals_hat_arr[i] - mean_reg); sum += vals_arr[i] * xu[i]; @@ -1938,23 +1887,25 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1, sum = g.shfl(sum, 0); sum /= row_stride; + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) - inp_grad[row * row_stride + i * iteration_stride + id] = - (vals_arr[i] - sum) + out_grad2[row * row_stride + i * iteration_stride + id]; - ; + inp_grad[i * iteration_stride + id] = + (vals_arr[i] - sum) + out_grad2[i * iteration_stride + id]; + if ((high_index) < row_stride) + inp_grad[high_index] = (vals_arr[iterations] - sum) + out_grad2[high_index]; } -template // Hidden_Dim __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, const __half* out_grad2, const __half* X_vals, const __half* gamma, const __half* vars, const __half* means, - __half* inp_grad) + __half* inp_grad, + int row_stride) { - constexpr int iteration_stride = THREADS / 2; // row_stride / iterations; - constexpr int iterations = row_stride / iteration_stride; + int iteration_stride = blockDim.x; + int iterations = row_stride / iteration_stride; cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); @@ -1962,35 +1913,46 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, int row = blockIdx.x; int id = threadIdx.x; int wid = id / WARP_SIZE; - constexpr int warp_num = - (iteration_stride < row_stride ? iteration_stride : row_stride) / WARP_SIZE; + int warp_num = (iteration_stride < row_stride ? 
iteration_stride : row_stride) / WARP_SIZE; - __shared__ float partialSum[warp_num]; + __shared__ float partialSum[MAX_WARP_NUM]; - __half2 vals_arr[iterations]; - float2 vals_arr_f[iterations]; - __half2 vals_hat_arr[iterations]; + __half2 vals_arr[NORM_REG]; + float2 vals_arr_f[NORM_REG]; + __half2 vals_hat_arr[NORM_REG]; __half2* inp_grad_h = reinterpret_cast<__half2*>(inp_grad); const __half2* out_grad_h1 = reinterpret_cast(out_grad1); const __half2* out_grad_h2 = reinterpret_cast(out_grad2); const __half2* vals_hat_h = reinterpret_cast(X_vals); - const __half2* gamma_h = reinterpret_cast(gamma); + out_grad_h1 += (row * row_stride); + out_grad_h2 += (row * row_stride); + inp_grad_h += (row * row_stride); + vals_hat_h += (row * row_stride); + const __half2* gamma_h = reinterpret_cast(gamma); + int high_index = iterations * iteration_stride + id; #pragma unroll for (int i = 0; i < iterations; i++) { __half2 gamma_reg = gamma_h[i * iteration_stride + id]; - vals_arr[i] = out_grad_h1[row * row_stride + i * iteration_stride + id]; + vals_arr[i] = out_grad_h1[i * iteration_stride + id]; vals_arr[i] *= gamma_reg; // out_grad * gamma - vals_hat_arr[i] = vals_hat_h[row * row_stride + i * iteration_stride + id]; + vals_hat_arr[i] = vals_hat_h[i * iteration_stride + id]; + } + if ((high_index) < row_stride) { + __half2 gamma_reg = gamma_h[high_index]; + vals_arr[iterations] = out_grad_h1[high_index]; + vals_arr[iterations] *= gamma_reg; // out_grad * gamma + vals_hat_arr[iterations] = vals_hat_h[high_index]; + iterations++; } __half mean_h = means[row]; __half var_h = vars[row]; __half2 var_reg = __halves2half2(var_h, var_h); __half2 mean_reg = __halves2half2(mean_h, mean_h); - __half2 xu[iterations]; + __half2 xu[NORM_REG]; float sum = 0.f; for (int i = 0; i < iterations; i++) { @@ -2051,12 +2013,18 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, sum = g.shfl(sum, 0); sum /= (2 * row_stride); + iterations = row_stride / iteration_stride; for (int i = 0; i < iterations; i++) { vals_arr_f[i].x -= sum; vals_arr_f[i].y -= sum; __half2 temp = __float22half2_rn(vals_arr_f[i]); - inp_grad_h[row * row_stride + i * iteration_stride + id] = - temp + out_grad_h2[row * row_stride + i * iteration_stride + id]; + inp_grad_h[i * iteration_stride + id] = temp + out_grad_h2[i * iteration_stride + id]; + } + if ((high_index) < row_stride) { + vals_arr_f[iterations].x -= sum; + vals_arr_f[iterations].y -= sum; + __half2 temp = __float22half2_rn(vals_arr_f[iterations]); + inp_grad_h[high_index] = temp + out_grad_h2[high_index]; } } @@ -2070,13 +2038,11 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, float* gamma_grad, float* betta_grad, float* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -2085,28 +2051,17 @@ void launch_layerNorm_backward_fused_add(const float* out_grad1, out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); dim3 grid_dim2(batch); - dim3 block_dim2(threads); - if (hidden_dim == 768) - LayerNormBackward2_fused_add<768><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 512) - LayerNormBackward2_fused_add<512><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1024) - LayerNormBackward2_fused_add<1024><<>>( - 
out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1536) - LayerNormBackward2_fused_add<1536><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2048) - LayerNormBackward2_fused_add<2048><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2560) - LayerNormBackward2_fused_add<2560><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else + if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 1; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 2; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads); + LayerNormBackward2_fused_add<<>>( + out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim); } template <> @@ -2119,13 +2074,11 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, __half* gamma_grad, __half* betta_grad, __half* inp_grad, - int batch_size, - int sequence_length, + int batch, int hidden_dim, cudaStream_t stream[2]) { - constexpr int threads = THREADS; - int batch = batch_size * sequence_length; + int threads = THREADS; dim3 grid_dim(hidden_dim / TILE_DIM); dim3 block_dim(TILE_DIM, TILE_DIM); @@ -2134,26 +2087,17 @@ void launch_layerNorm_backward_fused_add<__half>(const __half* out_grad1, out_grad1, X_data, vars, means, gamma_grad, betta_grad, batch, hidden_dim); dim3 grid_dim2(batch); - dim3 block_dim2(threads / 2); - if (hidden_dim == 768) - LayerNormBackward2_fused_add<384><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 512) - LayerNormBackward2_fused_add<256><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1024) - LayerNormBackward2_fused_add<512><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 1536) - LayerNormBackward2_fused_add<768><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2048) - LayerNormBackward2_fused_add<1024><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else if (hidden_dim == 2560) - LayerNormBackward2_fused_add<1280><<>>( - out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad); - else + if (hidden_dim > 8192 && hidden_dim <= 16384) + threads <<= 1; + else if (hidden_dim > 16384 && hidden_dim <= 32768) + threads <<= 2; + else if (hidden_dim > 32768 && hidden_dim <= 65536) + threads <<= 3; + else if (hidden_dim > 65536) throw std::runtime_error("Unsupport hidden_dim."); + + dim3 block_dim2(threads / 2); + LayerNormBackward2_fused_add<<>>( + out_grad1, out_grad2, X_data, gamma, vars, means, inp_grad, hidden_dim / 2); } diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index 8e2b86901609..be776b0c074d 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -1,3 +1,4 @@ +#include #include "custom_cuda_layers.h" #include "general_kernels.h" @@ -79,7 +80,8 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -112,7 +114,8 @@ __global__ void attn_softmax(float* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = 
warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -215,7 +218,8 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { auto temp = g.shfl_xor(max_val, i); @@ -251,7 +255,8 @@ __global__ void attn_softmax(__half* vals, #endif int iters = warp_num; - if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length); + if (seq_length < iteration_stride) + iters = warp_num / (iteration_stride / max_threads_in_sequence); for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); } @@ -282,7 +287,7 @@ __global__ void attn_softmax(__half* vals, } template -void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t, bool); +void launch_attn_softmax(T*, const T*, int, int, int, cudaStream_t); template <> void launch_attn_softmax(float* vals, @@ -294,11 +299,10 @@ void launch_attn_softmax(float* vals, { const int threads = 128; int seq_length4 = sequence_length / 4; - int seq2 = sequence_length * seq_length4; int block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -330,15 +334,18 @@ void launch_attn_softmax(float* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) + : 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - + iterations = + (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads + : MAX_THREAD_ITERATIONS); if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); @@ -362,11 +369,10 @@ void launch_attn_softmax<__half>(__half* vals, { const int threads = 128; int seq_length4 = sequence_length / 4; - int seq2 = sequence_length * seq_length4; int block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? (int)pow(2.0, floor(log2((float)(threads / seq_length4)))) : 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; @@ -399,15 +405,18 @@ void launch_attn_softmax<__half>(__half* vals, else { const int threads = 256; block_compute_size = - (seq_length4 < threads ? ((threads / seq_length4) * seq_length4) : seq_length4); - dim3 grid_dim(batch_size, heads * seq2 / block_compute_size); + (seq_length4 < threads ? 
(int)pow(2.0, floor(log2((float)(threads / seq_length4)))) + : 1); + dim3 grid_dim(batch_size, heads * sequence_length / block_compute_size); int subblock_max_workload = MAX_THREAD_ITERATIONS * 4 * threads; dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) / subblock_max_workload * threads) : threads); - + iterations = + (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads + : MAX_THREAD_ITERATIONS); if (sequence_length <= 512) attn_softmax<32, (threads / 128), 128><<>>( vals, attn_mask, heads, seq_length4, iterations); @@ -531,55 +540,41 @@ void launch_attn_softmax_backward_v2(T* out_grad, int seq_length, cudaStream_t stream) { - if ((seq_length % WARP_SIZE) != 0 || seq_length > 2048) - throw std::runtime_error("Invalid sequence length found in softmax backward."); - const int warps_per_block = 4; dim3 grid_dim(batch_size * heads * seq_length / warps_per_block); dim3 block_dim(WARP_SIZE, warps_per_block); - switch (seq_length) { - case 32: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 64: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 128: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 256: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 384: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 512: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 768: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 1024: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - case 2048: - softmax_backward_kernel_v2 - <<>>(out_grad, soft_inp, seq_length); - break; - default: - throw std::runtime_error( - std::string("Special sequence length found in softmax backward, seq_length: ") + - std::to_string(seq_length)); - } + if (seq_length <= 32) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 64) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 128) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 256) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 384) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 512) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 768) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 1024) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else if (seq_length <= 2048) + softmax_backward_kernel_v2 + <<>>(out_grad, soft_inp, seq_length); + else + throw std::runtime_error( + std::string("Special sequence length found in softmax backward, seq_length: ") + + std::to_string(seq_length)); } template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu index 33422601cbbd..691b877771b9 100644 --- a/csrc/transformer/transform_kernels.cu +++ b/csrc/transformer/transform_kernels.cu @@ -53,27 +53,33 @@ void Transpose(const float* inp_mat, float* out_mat, int rows, int cols, } template -__global__ void transform_0213(T* output, const T* vals, int hidden_dim, int seq_length, int heads); +__global__ void transform_0213(T* output, + 
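In the softmax launchers above, the number of attention rows packed into one thread block is now forced to a power of two, and the backward pass selects its tile by range instead of requiring an exact sequence-length match. Both pieces of arithmetic are reimplemented below purely for illustration (nothing here is imported from the extension):

import math

def softmax_fwd_rows_per_block(sequence_length, threads=128):
    """Power-of-two row grouping used by launch_attn_softmax above."""
    seq_length4 = sequence_length // 4                  # rows are processed in packs of 4
    if seq_length4 >= threads:
        return 1
    return 2 ** int(math.floor(math.log2(threads // seq_length4)))

def softmax_bwd_tile(seq_length):
    """Smallest supported tile covering seq_length (previously an exact-match switch)."""
    for tile in (32, 64, 128, 256, 384, 512, 768, 1024, 2048):
        if seq_length <= tile:
            return tile
    raise RuntimeError("Special sequence length found in softmax backward, "
                       f"seq_length: {seq_length}")

# softmax_fwd_rows_per_block(128) == 4: four 32-element packed rows share a 128-thread block.
# softmax_bwd_tile(300) == 384: sequence lengths no longer have to hit a tile size exactly.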
const T* vals, + int hidden_dim, + int seq_length, + int heads, + int head_ext); template <> __global__ void transform_0213(float* output, const float* vals, int hidden_dim, int seq_length, - int heads) + int heads, + int head_ext) { - int d0_stride = hidden_dim * seq_length / 4; - int d1_stride = hidden_dim / 4; - int d2_stride = hidden_dim / heads / 4; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; int d0_out_stride = d0_stride; int d1_out_stride = d2_stride; int d2_out_stride = d2_stride * seq_length; - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int d2 = threadIdx.y; // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) + int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) const float4* vals_vec = reinterpret_cast(vals); float4* output_vec = reinterpret_cast(output); @@ -87,22 +93,23 @@ __global__ void transform_0213<__half>(__half* output, const __half* vals, int hidden_dim, int seq_length, - int heads) + int heads, + int head_ext) { #if __CUDA_ARCH__ >= 700 - int d0_stride = hidden_dim * seq_length / 8; - int d1_stride = hidden_dim / 8; - int d2_stride = hidden_dim / heads / 8; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; int d0_out_stride = d0_stride; int d1_out_stride = d2_stride; int d2_out_stride = d2_stride * seq_length; - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int d2 = threadIdx.y; // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y / head_ext; // Sequence ID (0-127) + int d2 = threadIdx.y + (blockIdx.y % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) float4 vals_arr[1]; @@ -123,10 +130,13 @@ void launch_transform_0213(float* output, int heads, cudaStream_t stream) { - dim3 block_dim(hidden_dim / heads / 4, heads); - dim3 grid_dim(batch_size, seq_length); + hidden_dim >>= 2; + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, (seq_length * head_ext)); + transform_0213 - <<>>(output, vals, hidden_dim, seq_length, heads); + <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); } template <> @@ -138,10 +148,12 @@ void launch_transform_0213<__half>(__half* output, int heads, cudaStream_t stream) { - dim3 block_dim(hidden_dim / heads / 8, heads); - dim3 grid_dim(batch_size, seq_length); + hidden_dim >>= 3; + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, (seq_length * head_ext)); transform_0213<__half> - <<>>(output, vals, hidden_dim, seq_length, heads); + <<>>(output, vals, hidden_dim, seq_length, heads, head_ext); } // Bias add @@ -151,7 +163,8 @@ __global__ void bias_add_transform_0213(T* output, const T* bias, int hidden_dim, int seq_length, - int heads); + int heads, + int head_ext); template <> __global__ void bias_add_transform_0213(float* output, @@ -159,28 +172,29 @@ __global__ void bias_add_transform_0213(float* output, const float* bias, int hidden_dim, int seq_length, - int heads) + int heads, + int head_ext) { - int d0_stride = hidden_dim * seq_length / 4; - int d1_stride = hidden_dim / 4; - int d2_stride 
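The head_ext factor above exists to keep these transpose blocks under the per-block thread limit once hidden_dim gets large: after dividing hidden_dim by the vector width, the heads for one token are spread across head_ext blocks, and each kernel recovers its sequence and head indices from blockIdx.y. A Python sketch of the geometry, assuming MAX_THREADS is the usual 1024-thread limit (the macro itself is defined outside this diff):

MAX_THREADS = 1024  # assumed value of the MAX_THREADS macro

def transform_0213_geometry(batch_size, seq_length, hidden_dim, heads, vec_width=4):
    """Launch shape used by launch_transform_0213 above (vec_width=4 for fp32, 8 for fp16)."""
    hidden_vec = hidden_dim // vec_width
    head_ext = (hidden_vec - 1) // MAX_THREADS + 1        # ceil(hidden_vec / MAX_THREADS)
    block_dim = (hidden_vec // heads, heads // head_ext)  # (vectors per head, heads per block)
    grid_dim = (batch_size, seq_length * head_ext)
    return head_ext, block_dim, grid_dim

def decode_seq_and_head(block_y, thread_y, heads, head_ext):
    """How the kernel maps blockIdx.y / threadIdx.y back to (sequence position, head)."""
    seq = block_y // head_ext
    head = thread_y + (block_y % head_ext) * (heads // head_ext)
    return seq, head

# Example: hidden_dim=8192, heads=32 on the fp32 path -> hidden_vec=2048, head_ext=2,
# block=(64, 16): each block covers half of the 32 heads for a single token.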
= hidden_dim / heads / 4; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; int d0_out_stride = d0_stride; int d1_out_stride = d2_stride; int d2_out_stride = d2_stride * seq_length; - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y; // Sequence ID (0-127) - int cnt = blockIdx.z; // Hidden count - int d2 = threadIdx.y; // Head (0-11) - int d3 = threadIdx.x; // Values (groups of 4) + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // Hidden count + int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) const float4* vals_vec = reinterpret_cast(vals); const float4* bias_vec = reinterpret_cast(bias); float4* output_vec = reinterpret_cast(output); - float4 inputs = vals_vec[d0 * d0_stride * gridDim.z + cnt * d1_stride + - d1 * d1_stride * gridDim.z + d2 * d2_stride + d3]; + float4 inputs = vals_vec[d0 * d0_stride * (gridDim.z / head_ext) + cnt * d1_stride + + d1 * d1_stride * (gridDim.z / head_ext) + d2 * d2_stride + d3]; float4 biases = bias_vec[cnt * d1_stride + d2 * d2_stride + d3]; float4 outputs; @@ -202,14 +216,73 @@ __global__ void bias_add_transform_0213<__half>(__half* output, const __half* bias, int hidden_dim, int seq_length, - int heads) + int heads, + int head_ext) +{ +#if __CUDA_ARCH__ >= 700 + + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; + + int d2_out_stride = d2_stride * seq_length; + + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y; // Sequence ID (0-127) + int cnt = blockIdx.z / head_ext; // Hidden count + int d2 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head (0-11) + int d3 = threadIdx.x; // Values (groups of 4) + + float4 vals_arr; + float4 bias_arr; + float4 output_arr; + __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); + __half2* bias_half = reinterpret_cast<__half2*>(&bias_arr); + __half2* output_half = reinterpret_cast<__half2*>(&output_arr); + + const float4* vals_vec = reinterpret_cast(vals); + const float4* bias_vec = reinterpret_cast(bias); + float4* output_vec = reinterpret_cast(output); + + vals_vec += (d0 * d0_stride * (gridDim.z / head_ext)); + vals_vec += (d1 * d1_stride * (gridDim.z / head_ext)); + vals_vec += (cnt * d1_stride); + vals_vec += (d2 * d2_stride); + + bias_vec += (cnt * d1_stride); + bias_vec += (d2 * d2_stride); + + output_vec += (cnt * d0_stride * gridDim.x); + output_vec += (d1 * d2_stride); + output_vec += (d0 * d0_stride); + output_vec += (d2 * d2_out_stride); + + bias_arr = bias_vec[d3]; + vals_arr = vals_vec[d3]; + + output_half[0] = vals_half[0] + bias_half[0]; + output_half[1] = vals_half[1] + bias_half[1]; + output_half[2] = vals_half[2] + bias_half[2]; + output_half[3] = vals_half[3] + bias_half[3]; + + output_vec[d3] = output_arr; + +#endif +} + +__global__ void bias_add_transform_0213_v2(__half* output, + const __half* vals, + const __half* bias, + int hidden_dim, + int seq_length, + int heads) { #if __CUDA_ARCH__ >= 700 __shared__ float4 in_data[3072]; - int d0_stride = hidden_dim * seq_length / 8; - int d1_stride = hidden_dim / 8; - int d2_stride = hidden_dim / heads / 8; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; int iteration_stride = d1_stride * blockDim.z; // Hidden * 3 / 8 int batch_stride = d0_stride * blockDim.z; // Hidden * S * 3 / 8 @@ -237,6 +310,8 @@ 
__global__ void bias_add_transform_0213<__half>(__half* output, int iter_index = cnt * d1_stride + d2 * d2_stride + d3; int input_offset = d0 * batch_stride + d1 * (iteration_stride << 1); bias_arr[0] = bias_vec[iter_index]; + +#pragma unroll for (int iter = 0; iter < 2; iter++) { int iter_id = iter * iteration_stride + iter_index; vals_arr[0] = vals_vec[input_offset + iter_id]; @@ -255,6 +330,8 @@ __global__ void bias_add_transform_0213<__half>(__half* output, int head_count = (d2 >> 1) + cnt * (blockDim.y >> 1); int out_index = d0 * d0_out_stride + d1 * (d1_out_stride << 1) + d3 + (d2 % 2) * d2_stride; + +#pragma unroll for (int iter = 0; iter < 2; iter++) { int iter_row = (iter * iteration_stride) + head_count; int iter_offset = @@ -277,10 +354,14 @@ void launch_bias_add_transform_0213(float* output, cudaStream_t stream, int trans_count) { - dim3 block_dim(hidden_dim / heads / 4, heads); - dim3 grid_dim(batch_size, seq_length, trans_count); - bias_add_transform_0213 - <<>>(output, vals, bias, hidden_dim, seq_length, heads); + hidden_dim >>= 2; + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + + bias_add_transform_0213<<>>( + output, vals, bias, hidden_dim, seq_length, heads, head_ext); } template <> @@ -294,32 +375,47 @@ void launch_bias_add_transform_0213<__half>(__half* output, cudaStream_t stream, int trans_count) { - dim3 block_dim(hidden_dim / heads / 8, heads, trans_count); - dim3 grid_dim(batch_size, seq_length / 2); - bias_add_transform_0213<__half> - <<>>(output, vals, bias, hidden_dim, seq_length, heads); + hidden_dim >>= 3; + if (hidden_dim > 128 || hidden_dim < 16) { + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 block_dim(hidden_dim / heads, (heads / head_ext)); + dim3 grid_dim(batch_size, seq_length, (trans_count * head_ext)); + bias_add_transform_0213<__half><<>>( + output, vals, bias, hidden_dim, seq_length, heads, head_ext); + } else { + dim3 block_dim(hidden_dim / heads, heads, trans_count); + dim3 grid_dim(batch_size, seq_length / 2); + bias_add_transform_0213_v2<<>>( + output, vals, bias, hidden_dim, seq_length, heads); + } } template -__global__ void transform4d_0213(T* out, const T* in, int heads, int seq_length, int hidden_dim); +__global__ void transform4d_0213(T* out, + const T* in, + int heads, + int seq_length, + int hidden_dim, + int head_ext); template <> __global__ void transform4d_0213(float* out, const float* in, int heads, int seq_length, - int hidden_dim) + int hidden_dim, + int head_ext) { - int d0_stride = hidden_dim * seq_length / 4; + int d0_stride = hidden_dim * seq_length; int d1_stride = d0_stride / heads; - int d2_stride = hidden_dim / heads / 4; + int d2_stride = hidden_dim / heads; int d0_out_stride = d0_stride; int d1_out_stride = d2_stride; - int d2_out_stride = hidden_dim / 4; + int d2_out_stride = hidden_dim; - int d0 = blockIdx.x; // Batch - int d1 = blockIdx.y / ((seq_length + blockDim.y - 1) / blockDim.y); // Head + int d0 = blockIdx.x; // Batch + int d1 = blockIdx.y / ((seq_length - 1) / blockDim.y + 1); // Head int d2 = (threadIdx.y + blockDim.y * blockIdx.y) % seq_length; int cnt = blockIdx.z; int d3 = threadIdx.x; // Values (groups of 8) @@ -340,14 +436,51 @@ __global__ void transform4d_0213<__half>(__half* out, const __half* in, int heads, int seq_length, - int hidden_dim) + int hidden_dim, + int head_ext) +{ +#if __CUDA_ARCH__ >= 700 + + int d0_stride = hidden_dim * (seq_length / 
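For the fp16 path, launch_bias_add_transform_0213 above now chooses between two kernels: the shared-memory _v2 variant is kept for the mid-sized hidden dimensions it was written for (16 to 128 float4 groups, i.e. 128 to 1024 half values), and everything outside that window falls back to the generic head_ext kernel. A short illustrative chooser, under the same MAX_THREADS assumption as before:

MAX_THREADS = 1024  # assumed macro value, as in the earlier sketch

def pick_bias_add_half_kernel(hidden_dim, heads, seq_length, batch_size, trans_count):
    """Which __half kernel and launch shape the launcher above would select."""
    hidden_vec = hidden_dim >> 3                           # a float4 holds 8 halves
    if hidden_vec > 128 or hidden_vec < 16:
        head_ext = (hidden_vec - 1) // MAX_THREADS + 1
        block = (hidden_vec // heads, heads // head_ext)
        grid = (batch_size, seq_length, trans_count * head_ext)
        return "bias_add_transform_0213", block, grid
    block = (hidden_vec // heads, heads, trans_count)      # _v2 folds trans_count into the block
    grid = (batch_size, seq_length // 2)                   # and handles two tokens per block
    return "bias_add_transform_0213_v2", block, grid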
head_ext); + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; + + int d0 = blockIdx.x; // Batch + int d1 = threadIdx.y + (blockIdx.z % head_ext) * (heads / head_ext); // Head + int d2 = blockIdx.z / head_ext; // Sequence + int cnt = blockIdx.y; // Hidden count + int d3 = threadIdx.x; // Values (groups of 8) + + const float4* in_vec = reinterpret_cast(in); + float4* out_vec = reinterpret_cast(out); + + in_vec += (cnt * d0_stride * gridDim.x); + in_vec += (d0 * d0_stride); + in_vec += (d2 * d2_stride); + in_vec += (d1 * d2_stride * seq_length); + + out_vec += (cnt * d1_stride); + out_vec += (d1 * d2_stride); + out_vec += (d0 * d0_stride * gridDim.y); + out_vec += (d2 * d1_stride * gridDim.y); + + out_vec[d3] = in_vec[d3]; + +#endif +} + +__global__ void transform4d_0213_v2(__half* out, + const __half* in, + int heads, + int seq_length, + int hidden_dim) { #if __CUDA_ARCH__ >= 700 __shared__ float4 in_data[3072]; - int d0_stride = hidden_dim * seq_length / 8; - int d1_stride = hidden_dim / 8; - int d2_stride = hidden_dim / heads / 8; + int d0_stride = hidden_dim * seq_length; + int d1_stride = hidden_dim; + int d2_stride = hidden_dim / heads; int d0 = blockIdx.x; // Batch int d1 = threadIdx.y; // Head @@ -358,11 +491,12 @@ __global__ void transform4d_0213<__half>(__half* out, const float4* in_vec = reinterpret_cast(in); float4* out_vec = reinterpret_cast(out); - int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + d1 % 2 * d2_stride; + int input_offset = d0 * d0_stride + d2 * (d2_stride << 1) + d3 + (d1 % 2) * d2_stride; int head_count = (d1 >> 1) + cnt * (blockDim.y >> 1); int iteration_stride = blockDim.z * (blockDim.y >> 1); int matrix_stride = (d0_stride * gridDim.x); +#pragma unroll for (int iter = 0; iter < 2; iter++) { int iter_row = iter * iteration_stride + head_count; int iter_offset = (iter_row % blockDim.y) * d2_stride; @@ -377,6 +511,7 @@ __global__ void transform4d_0213<__half>(__half* out, int iter_index = cnt * d1_stride + d1 * d2_stride + d3; int output_offset = d0 * d0_stride * blockDim.z + d2 * (iteration_stride << 1); +#pragma unroll for (int iter = 0; iter < 2; iter++) { int iter_id = iter * iteration_stride + iter_index; out_vec[output_offset + iter_id] = in_data[iter_id]; @@ -395,10 +530,11 @@ void launch_transform4d_0213(float* out, cudaStream_t stream, int trans_count) { - dim3 grid_dims(batch_size, heads * ((seq_length + 7) / 8), trans_count); - dim3 block_dims(hidden_dim / heads / 4, 8); + hidden_dim >>= 2; + dim3 grid_dims(batch_size, heads * ((seq_length - 1) / 8 + 1), trans_count); + dim3 block_dims(hidden_dim / heads, 8); transform4d_0213 - <<>>(out, in, heads, seq_length, hidden_dim); + <<>>(out, in, heads, seq_length, hidden_dim, 1); } template <> @@ -411,8 +547,17 @@ void launch_transform4d_0213<__half>(__half* out, cudaStream_t stream, int trans_count) { - dim3 grid_dims(batch_size, seq_length / 2); - dim3 block_dims(hidden_dim / heads / 8, heads, trans_count); - transform4d_0213<__half> - <<>>(out, in, heads, seq_length, hidden_dim); + hidden_dim >>= 3; + if (hidden_dim > 128 || hidden_dim < 16) { + int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; + dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); + dim3 block_dims(hidden_dim / heads, (heads / head_ext)); + transform4d_0213<__half><<>>( + out, in, heads, seq_length, hidden_dim, head_ext); + } else { + dim3 grid_dims(batch_size, seq_length / 2); + dim3 block_dims(hidden_dim / heads, heads, trans_count); + transform4d_0213_v2<<>>( + out, in, heads, seq_length, 
hidden_dim); + } } diff --git a/csrc/utils/flatten_unflatten.cpp b/csrc/utils/flatten_unflatten.cpp new file mode 100644 index 000000000000..7d16c5c00a5f --- /dev/null +++ b/csrc/utils/flatten_unflatten.cpp @@ -0,0 +1,25 @@ +/* + Copyright 2020 The Microsoft DeepSpeed Team + Copyright NVIDIA/apex + This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +// https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h + +at::Tensor flatten(std::vector tensors) +{ + return torch::utils::flatten_dense_tensors(tensors); +} + +std::vector unflatten(at::Tensor flat, std::vector tensors) +{ + return torch::utils::unflatten_dense_tensors(flat, tensors); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("flatten", &flatten, "Flatten dense tensors"); + m.def("unflatten", &unflatten, "Unflatten dense tensors"); +} diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index b1970ac4ebbe..31e901d8ec59 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -1,34 +1,51 @@ ''' Copyright 2020 The Microsoft DeepSpeed Team ''' +import sys +import types -from deepspeed.pt.deepspeed_light import DeepSpeedLight -from deepspeed.pt.deepspeed_light import ADAM_OPTIMIZER, LAMB_OPTIMIZER -from deepspeed.pt.deepspeed_lr_schedules import add_tuning_arguments -from deepspeed.pt.log_utils import logger -from deepspeed.pt.deepspeed_cuda import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from deepspeed.pt.deepspeed_config import DeepSpeedConfig +from . import ops -import deepspeed.pt.deepspeed_checkpointing as checkpointing +from .runtime.engine import DeepSpeedEngine +from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER +from .runtime.pipe.engine import PipelineEngine +from .runtime.lr_schedules import add_tuning_arguments +from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError +from .runtime.activation_checkpointing import checkpointing +from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from .utils import log_dist +from .utils.distributed import init_distributed + +from .pipe import PipelineModule + +from .git_version_info import version, git_hash, git_branch + + +def _parse_version(version_str): + '''Parse a version string and extract the major, minor, and patch versions.''' + import re + matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str) + return int(matched.group(1)), int(matched.group(2)), int(matched.group(3)) -try: - from deepspeed.git_version_info import git_hash, git_branch -except ImportError: - git_hash = None - git_branch = None # Export version information -__version_major__ = 0 -__version_minor__ = 2 -__version_patch__ = 0 -__version__ = '.'.join( - map(str, - [__version_major__, - __version_minor__, - __version_patch__])) +__version__ = version +__version_major__, __version_minor__, __version_patch__ = _parse_version(__version__) __git_hash__ = git_hash __git_branch__ = git_branch +# Provide backwards compatability with old deepspeed.pt module structure, should hopefully not be used +pt = types.ModuleType('pt', 'dummy pt module for backwards compatability') +deepspeed = sys.modules[__name__] +setattr(deepspeed, 'pt', pt) +setattr(deepspeed.pt, 'deepspeed_utils', deepspeed.runtime.utils) +sys.modules['deepspeed.pt'] = deepspeed.pt +sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils +setattr(deepspeed.pt, 'deepspeed_config', deepspeed.runtime.config) +sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config 
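The new flatten_unflatten.cpp above is a thin pybind11 wrapper around torch's dense-tensor flatten/unflatten utilities. A hypothetical stand-alone usage sketch, building it as a throwaway JIT extension; DeepSpeed's own op builder loads it differently, so the module name here is purely illustrative:

import torch
from torch.utils.cpp_extension import load

# Compile the wrapper above on the fly; "flatten_util" is an arbitrary name.
flatten_util = load(name="flatten_util", sources=["csrc/utils/flatten_unflatten.cpp"])

tensors = [torch.randn(3, 4), torch.randn(5)]
flat = flatten_util.flatten(tensors)               # one contiguous 1-D buffer (17 elements)
restored = flatten_util.unflatten(flat, tensors)   # tensors shaped like the originals

assert flat.numel() == sum(t.numel() for t in tensors)
assert all(r.shape == t.shape for r, t in zip(restored, tensors))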
+setattr(deepspeed.pt, 'loss_scaler', deepspeed.runtime.fp16.loss_scaler) +sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler + def initialize(args, model, @@ -83,23 +100,35 @@ def initialize(args, * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``. """ - logger.info( - "DeepSpeed info: version={}, git-hash={}, git-branch={}".format( - __version__, - __git_hash__, - __git_branch__), - ) - - engine = DeepSpeedLight(args=args, - model=model, - optimizer=optimizer, - model_parameters=model_parameters, - training_data=training_data, - lr_scheduler=lr_scheduler, - mpu=mpu, - dist_init_required=dist_init_required, - collate_fn=collate_fn, - config_params=config_params) + log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( + __version__, + __git_hash__, + __git_branch__), + ranks=[0]) + + if not isinstance(model, PipelineModule): + engine = DeepSpeedEngine(args=args, + model=model, + optimizer=optimizer, + model_parameters=model_parameters, + training_data=training_data, + lr_scheduler=lr_scheduler, + mpu=mpu, + dist_init_required=dist_init_required, + collate_fn=collate_fn, + config_params=config_params) + else: + assert mpu is None, "mpu must be None with pipeline parallelism" + engine = PipelineEngine(args=args, + model=model, + optimizer=optimizer, + model_parameters=model_parameters, + training_data=training_data, + lr_scheduler=lr_scheduler, + mpu=model.mpu(), + dist_init_required=dist_init_required, + collate_fn=collate_fn, + config_params=config_params) return_items = [ engine, diff --git a/deepspeed/constants.py b/deepspeed/constants.py new file mode 100644 index 000000000000..467e85aefcb6 --- /dev/null +++ b/deepspeed/constants.py @@ -0,0 +1,8 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +############################################# +# Torch distributed constants +############################################# +TORCH_DISTRIBUTED_DEFAULT_PORT = 29500 diff --git a/deepspeed/elasticity/__init__.py b/deepspeed/elasticity/__init__.py new file mode 100644 index 000000000000..be517de7df93 --- /dev/null +++ b/deepspeed/elasticity/__init__.py @@ -0,0 +1 @@ +from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config diff --git a/deepspeed/elasticity/config.py b/deepspeed/elasticity/config.py new file mode 100644 index 000000000000..47a428ead16a --- /dev/null +++ b/deepspeed/elasticity/config.py @@ -0,0 +1,110 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +import json +from .constants import * + + +class ElasticityError(Exception): + """ + Base exception for all elasticity related errors + """ + pass + + +class ElasticityConfigError(ElasticityError): + """ + Elasticity configuration error + """ + pass + + +class ElasticityIncompatibleWorldSize(ElasticityError): + """ + Attempting to run a world size that is incompatible with a given elastic config + """ + pass + + +class ElasticityConfig: + """ + Elastic config object, constructed from a param dictionary that only contains elastic + config parameters, example below: + + If elasticity is enabled, user must specify (at least) max_train_batch_size + and micro_batch_sizes. 
+ + { + "enabled": true, + "max_train_batch_size": 2000, + "micro_batch_sizes": [2,4,6], + "min_gpus": 1, + "max_gpus" : 10000 + "min_time": 20 + "ignore_non_elastic_batch_info": false + "version": 0.1 + } + """ + def __init__(self, param_dict): + self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT) + if self.enabled: + if MAX_ACCEPTABLE_BATCH_SIZE in param_dict: + self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE] + else: + raise ElasticityConfigError( + f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}") + if MICRO_BATCHES in param_dict: + self.micro_batches = param_dict[MICRO_BATCHES] + else: + raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}") + else: + self.max_acceptable_batch_size = param_dict.get( + MAX_ACCEPTABLE_BATCH_SIZE, + MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) + self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT) + + if not isinstance(self.micro_batches, list): + raise ElasticityConfigError( + f"Elasticity expected value of {MICRO_BATCHES} to be a " + f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}" + ) + + if not all(map(lambda m: isinstance(m, int), self.micro_batches)): + raise ElasticityConfigError( + f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, " + f"instead contains: f{self.micro_batches}") + + if not all(map(lambda m: m > 0, self.micro_batches)): + raise ElasticityConfigError( + f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, " + f"instead contains: f{self.micro_batches}") + + self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT) + self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT) + if self.min_gpus < 1 or self.max_gpus < 1: + raise ElasticityConfigError( + "Elasticity min/max gpus must be > 0, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + if self.max_gpus < self.min_gpus: + raise ElasticityConfigError( + "Elasticity min_gpus cannot be greater than max_gpus, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + + self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT) + if self.min_time < 0: + raise ElasticityConfigError( + f"Elasticity min time needs to be >= 0: given {self.min_time}") + + self.version = param_dict.get(VERSION, VERSION_DEFAULT) + self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, + PREFER_LARGER_BATCH_DEFAULT) + self.ignore_non_elastic_batch_info = param_dict.get( + IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + + def repr(self): + return self.__dict__ + + def __repr__(self): + return json.dumps(self.__dict__, sort_keys=True, indent=4) diff --git a/deepspeed/elasticity/constants.py b/deepspeed/elasticity/constants.py new file mode 100644 index 000000000000..03cba725fa87 --- /dev/null +++ b/deepspeed/elasticity/constants.py @@ -0,0 +1,74 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +######################################### +# Elasticity +######################################### +''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible +with a large number of GPUs. 
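As a quick illustration of the checks ElasticityConfig runs above, here is a hedged sketch of constructing it from the documented parameter dictionary (keys taken from the docstring's example; the import path follows the new deepspeed/elasticity package layout):

from deepspeed.elasticity.config import ElasticityConfig, ElasticityConfigError

cfg = ElasticityConfig({
    "enabled": True,
    "max_train_batch_size": 2000,
    "micro_batch_sizes": [2, 4, 6],
    "min_gpus": 1,
    "max_gpus": 10000,
    "min_time": 20,
})
print(cfg)   # __repr__ dumps the parsed fields as sorted, indented JSON

# When elasticity is enabled, omitting a required field raises immediately:
try:
    ElasticityConfig({"enabled": True, "max_train_batch_size": 2000})
except ElasticityConfigError as err:
    print(err)   # reports the missing micro_batch_sizes entry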
For elastic jobs, DeepSpeed will provide a batch size that +can support a large number of GPUs based on the user specified parameters +''' +FORMAT = ''' +Elasticity should be enabled as: +"elasticity": { + "enabled": true, + "max_train_batch_size": 2000, + "micro_batch_sizes": [2,4,6], + "min_gpus": 1, + "max_gpus" : 10000 + "min_time": 20, + "prefer_larger_batch": true, + "ignore_non_elastic_batch_info": false, + "version": 0.1 +} +''' + +ELASTICITY = 'elasticity' + +# Current elasticity version +LATEST_ELASTICITY_VERSION = 0.1 + +ENABLED = 'enabled' +ENABLED_DEFAULT = False + +# Max acceptable train_batch_size +MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size' +MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000 + +# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu +MICRO_BATCHES = 'micro_batch_sizes' +MICRO_BATCHES_DEFAULT = [2, 4, 6] + +# Min/max of GPUs to search over +MIN_GPUS = 'min_gpus' +MIN_GPUS_DEFAULT = 1 +MAX_GPUS = 'max_gpus' +MAX_GPUS_DEFAULT = 10000 + +# Minimum running time (minutes) before the scheduler will scale us, 0 implies it's unknown +MIN_TIME = "min_time" +MIN_TIME_DEFAULT = 0 + +# When finding a suitable batch size, attempt to find one that is closest +# to the max train batch size given. +PREFER_LARGER_BATCH = 'prefer_larger_batch' +PREFER_LARGER_BATCH_DEFAULT = True + +# In order to reduce confusion, if elastic mode is enabled we +# require (via assert) that no batch info is set outside of the +# elastic config. You can turn off this assert via this config +# but keep in mind that all batch info defined outside the +# elastic mode *will be ignored*. +IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info' +IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False + +# Version of elastic logic to use +VERSION = "version" +VERSION_DEFAULT = LATEST_ELASTICITY_VERSION + +# Minimum deepspeed version to use elasticity +MINIMUM_DEEPSPEED_VERSION = "0.3.8" + +# Environment variable storing elastic config from resource scheduler +DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG" diff --git a/deepspeed/elasticity/elasticity.py b/deepspeed/elasticity/elasticity.py new file mode 100644 index 000000000000..ae91877f5f24 --- /dev/null +++ b/deepspeed/elasticity/elasticity.py @@ -0,0 +1,334 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" +import os +import re +import json +import numpy as np + +from .config import ElasticityConfig, ElasticityConfigError, ElasticityError, \ + ElasticityIncompatibleWorldSize +from .constants import ELASTICITY, ENABLED, ENABLED_DEFAULT, LATEST_ELASTICITY_VERSION, \ + MINIMUM_DEEPSPEED_VERSION, IGNORE_NON_ELASTIC_BATCH_INFO, \ + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT, DEEPSPEED_ELASTICITY_CONFIG +from ..git_version_info import version as __version__ +from ..utils import logger + +# Thirty eight smallest highly composite numbers. The list should +# be enough to support up to 720K batch size. +HCN_LIST = [ + 1, + 2, + 4, + 6, + 12, + 24, + 36, + 48, + 60, + 120, + 180, + 240, + 360, + 720, + 840, + 1260, + 1680, + 2520, + 5040, + 7560, + 10080, + 15120, + 20160, + 25200, + 27720, + 45360, + 50400, + 55440, + 83160, + 110880, + 166320, + 221760, + 277200, + 332640, + 498960, + 554400, + 665280, + 720720 +] + + +def get_candidate_batch_sizes(base_list, max_acceptable_batch_size): + candidate_batch_size = [] + + #brute force is fine here. 
We are working with very small lists + for base in base_list: + batch_size = base + for hcn in HCN_LIST: + new_batch_size = base * hcn + if new_batch_size > max_acceptable_batch_size: + break + batch_size = new_batch_size + candidate_batch_size.append(batch_size) + return list(set(candidate_batch_size)) + + +def get_valid_gpus(batch_size, micro_batches, min_valid_gpus, max_valid_gpus): + valid_gpus = [] + for micro_batch in micro_batches: + if batch_size % micro_batch == 0: + + max_gpus = batch_size // micro_batch + if max_gpus >= min_valid_gpus and max_gpus <= max_valid_gpus: + valid_gpus.append(max_gpus) + + for i in range(1, max_gpus // 2 + 1): + if max_gpus % i == 0: + if i >= min_valid_gpus and i <= max_valid_gpus: + valid_gpus.append(i) + valid_gpus = set(valid_gpus) + valid_gpus = sorted(list(valid_gpus)) + return valid_gpus + + +def get_best_candidates(candidate_batch_sizes, + micro_batches, + min_gpus, + max_gpus, + prefer_larger): + + max_valid_gpus = 0 + valid_gpus = None + final_batch_size = int(min(micro_batches)) + + for batch_size in candidate_batch_sizes: + + current_valid_gpus = get_valid_gpus(batch_size, + micro_batches, + min_gpus, + max_gpus) + + if (len(current_valid_gpus) > max_valid_gpus + or (len(current_valid_gpus) == max_valid_gpus and + ((prefer_larger and batch_size > final_batch_size) or + (not prefer_larger and batch_size < final_batch_size)))): + max_valid_gpus = len(current_valid_gpus) + valid_gpus = current_valid_gpus + final_batch_size = batch_size + + return final_batch_size, valid_gpus + + +def _get_compatible_gpus_v01(micro_batches, + max_acceptable_batch_size, + min_gpus=None, + max_gpus=None, + prefer_larger=True): + '''We use two heuristics to compute the batch size + 1. We use the Lowest Common Multiple of the micro-batches + as the base batch size and scale it by a HCN such that the result is + the largest batch size less than the max_acceptable batch size + 2. We use each of the micro batches as a base and scale it + by a HCN such that the result is the largest batch size less than the + max_acceptable batch size. + + We then use brute force to count the number of compatible GPU count for + each of the aforementioned cases, and return the batch size with the most number of + compatible GPU counts in the min-max GPU range if provided, other wise + we return the batch size with the most number of total compatible GPU counts. 
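To make the two heuristics described in the docstring above concrete, here is a hand-traced example that calls the helper functions defined earlier in this new module (the numbers were worked out by hand, so treat them as illustrative):

from deepspeed.elasticity.elasticity import get_candidate_batch_sizes, get_best_candidates

micro_batches = [2, 4, 6]                 # lcm is 12
base_list = micro_batches + [12]

candidates = get_candidate_batch_sizes(base_list, max_acceptable_batch_size=2000)
# base 2 scales to 2 * 840 = 1680, while bases 4, 6 and 12 all top out at 1440,
# so candidates is the de-duplicated set {1440, 1680} (list order is not guaranteed).

best, valid_gpus = get_best_candidates(candidates,
                                       micro_batches,
                                       min_gpus=1,
                                       max_gpus=10000,
                                       prefer_larger=True)
# 1680 splits into 840 / 420 / 280 GPUs for micro batches 2 / 4 / 6, plus all of their
# divisors, which yields more compatible GPU counts than 1440, so best == 1680.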
+ + Returns: + final_batch_size + valid_gpus + ''' + + if min_gpus is None: + min_gpus = int(1) + + if max_gpus is None: + max_gpus = int(max_acceptable_batch_size / min(micro_batches)) + + assert all(mb <= max_acceptable_batch_size for mb in micro_batches ), \ + f"All micro batches must be less than \ + or equal to max_acceptable_batch_size: {max_acceptable_batch_size}" + + lcm = np.lcm.reduce(micro_batches) + + base_list = [] + base_list.extend(micro_batches) + base_list.append(lcm) + + candidate_batch_sizes = get_candidate_batch_sizes(base_list, + max_acceptable_batch_size) + + final_batch_size, valid_gpus = get_best_candidates( + candidate_batch_sizes, + micro_batches, + min_gpus, + max_gpus, + prefer_larger) + + return final_batch_size, valid_gpus + + +def _parse_version(version_str): + '''Parse a version string and extract the major and minor versions (and possibly patch version).''' + matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str) + if matched: + return int(matched.group(1)), int(matched.group(2)), int(matched.group(3)) + else: + matched = re.search('^(\d+)\.(\d+)', version_str) + assert matched != None, "Unable to parse version number, expecting" \ + f"major.minor[.patch] format but received {version_str}" + return int(matched.group(1)), int(matched.group(2)), 0 + + +def _compatible_ds_version_check(target_deepspeed_version: str): + min_major, min_minor, min_patch = _parse_version(MINIMUM_DEEPSPEED_VERSION) + trg_major, trg_minor, trg_patch = _parse_version(target_deepspeed_version) + + err_str = f"Target deepspeed version of {target_deepspeed_version} is not compatible " \ + f"with minimum version {MINIMUM_DEEPSPEED_VERSION} supporting elasticity." + if trg_major < min_major: + raise ElasticityError(err_str) + if trg_minor < min_minor: + raise ElasticityError(err_str) + if trg_patch < min_patch: + raise ElasticityError(err_str) + return True + + +def elasticity_enabled(ds_config: dict): + if ELASTICITY not in ds_config: + return False + return ds_config[ELASTICITY].get(ENABLED, ENABLED_DEFAULT) + + +def ensure_immutable_elastic_config(runtime_elastic_config_dict: dict): + """ + Ensure the resource scheduler saw the same elastic config we are using at runtime + """ + if DEEPSPEED_ELASTICITY_CONFIG in os.environ: + scheduler_elastic_config_dict = json.loads( + os.environ[DEEPSPEED_ELASTICITY_CONFIG]) + scheduler_elastic_config = ElasticityConfig(scheduler_elastic_config_dict) + runtime_elastic_config = ElasticityConfig(runtime_elastic_config_dict) + err_str = "Elastic config '{}={}' seen by resource scheduler does not match config passed to runtime {}={}" + if runtime_elastic_config.max_acceptable_batch_size != scheduler_elastic_config.max_acceptable_batch_size: + raise ElasticityConfigError( + err_str.format('max_acceptable_batch_size', + scheduler_elastic_config.max_acceptable_batch_size, + 'max_acceptable_batch_size', + runtime_elastic_config.max_acceptable_batch_size)) + if runtime_elastic_config.micro_batches != scheduler_elastic_config.micro_batches: + raise ElasticityConfigError( + err_str.format('micro_batches', + scheduler_elastic_config.micro_batches, + 'micro_batches', + runtime_elastic_config.micro_batches)) + if runtime_elastic_config.version != scheduler_elastic_config.version: + raise ElasticityConfigError( + err_str.format('version', + scheduler_elastic_config.version, + 'version', + runtime_elastic_config.version)) + else: + logger.warning("Unable to find DEEPSPEED_ELASTICITY_CONFIG environment variable, cannot " \ + "guarantee resource scheduler will 
scale this job using compatible GPU counts.") + + +def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world_size=0): + """Core deepspeed elasticity API. Given an elastic config (similar to the example below) + DeepSpeed will compute a total train batch size corresponding valid GPU count list that + provides a high level of elasticity. Elasticity in this case means we are safe to scale + the training job up/down across the GPU count list *without* any negative impacts on + training convergence. This is achievable primarily due to DeepSpeed's gradient accumulation + feature which allows us to decompose a global training batch size into: + micro-batch-size * gradient-accumulation-steps * world-size. + + "elasticity": { + "enabled": true, + "max_train_batch_size": 2000, + "micro_batch_sizes": [2,4,6], + "min_gpus": 1, + "max_gpus" : 10000 + "min_time": 20 + "version": 0.1 + } + + Intended to be called both by scheduling infrastructure and deepspeed runtime. + For the same `ds_config` we should return deterministic results. + + Args: + ds_config (dict): DeepSpeed config dictionary/json + target_deepspeed_version (str): When called from scheduling + infrastructure we want to ensure that the target deepspeed version is + compatible with the elasticity version used in the backend. + world_size (int, optional): Intended/current world size, will do some sanity + checks to ensure world size is actually valid with the config. + + Raises: + ElasticityConfigError: Missing required elasticity config or elasticity disabled + ElasticityError: If target deepspeed version is not compatible with current version + + Returns: + final_batch_size (int): total batch size used for training + valid_gpus (list(int)): list of valid GPU counts with this config + micro_batch_size (int, optional): if world_size is provided will return + specific micro batch size + """ + if not isinstance(ds_config, dict): + raise ValueError("Expected ds_config to be a dictionary but received " \ + f"a {type(ds_config)}, containing: {ds_config}") + + if ELASTICITY not in ds_config: + raise ElasticityConfigError(f"'{ELASTICITY}' is missing from config json," \ + " please add it if running an elastic training job.") + + elastic_config_dict = ds_config[ELASTICITY] + if not elastic_config_dict.get(ENABLED, ENABLED_DEFAULT): + raise ElasticityConfigError("Elasticity is disabled, please enable it " \ + "('enabled':true) if running an elastic training job.") + + elastic_config = ElasticityConfig(elastic_config_dict) + + if float(elastic_config.version) > LATEST_ELASTICITY_VERSION: + raise ElasticityConfigError("Attempting to run elasticity version " \ + f"{elastic_config.version} but runtime only supports up " \ + f"to {LATEST_ELASTICITY_VERSION}") + + # Ensure target deepspeed version works with intended elasticity version + if not _compatible_ds_version_check(target_deepspeed_version): + raise ElasticityError("Unable to run elasticity on target deepspeed version of" \ + f" {target_deepspeed_version}, currently {__version__}") + + if float(elastic_config.version) == 0.1: + final_batch_size, valid_gpus = _get_compatible_gpus_v01( + micro_batches=elastic_config.micro_batches, + max_acceptable_batch_size=elastic_config.max_acceptable_batch_size, + min_gpus=elastic_config.min_gpus, + max_gpus=elastic_config.max_gpus, + prefer_larger=elastic_config.prefer_larger_batch_size) + # ensure batch size is int dtype + final_batch_size = int(final_batch_size) + else: + raise NotImplementedError( + f"Unable to find elastic logic for 
version: {elastic_config.version}") + + if world_size > 0: + if world_size not in valid_gpus: + raise ElasticityIncompatibleWorldSize(f"World size ({world_size}) is not valid " \ + f"with the current list of valid GPU counts: {valid_gpus}") + + # Pick largest valid micro batch size + micro_batch_size = None + for mbsz in sorted(list(set(elastic_config.micro_batches)), reverse=True): + if final_batch_size // world_size % mbsz == 0: + micro_batch_size = mbsz + break + assert micro_batch_size is not None, "Unable to find divisible micro batch size" \ + f" world_size={world_size}, final_batch_size={final_batch_size}, and " \ + f" micro_batches={elastic_config.micro_batches}." + return final_batch_size, valid_gpus, micro_batch_size + + return final_batch_size, valid_gpus diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py new file mode 100644 index 000000000000..b14ac4464835 --- /dev/null +++ b/deepspeed/env_report.py @@ -0,0 +1,109 @@ +import torch +import deepspeed +import subprocess +from .ops.op_builder import ALL_OPS +from .git_version_info import installed_ops, torch_info +from .ops import __compatible_ops__ as compatible_ops + +GREEN = '\033[92m' +RED = '\033[91m' +YELLOW = '\033[93m' +END = '\033[0m' +SUCCESS = f"{GREEN} [SUCCESS] {END}" +OKAY = f"{GREEN}[OKAY]{END}" +WARNING = f"{YELLOW}[WARNING]{END}" +FAIL = f'{RED}[FAIL]{END}' +INFO = '[INFO]' + +color_len = len(GREEN) + len(END) +okay = f"{GREEN}[OKAY]{END}" +warning = f"{YELLOW}[WARNING]{END}" + + +def op_report(): + max_dots = 23 + max_dots2 = 11 + h = ["op name", "installed", "compatible"] + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + print("DeepSpeed C++/CUDA extension op report") + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + + print("NOTE: Ops not installed will be just-in-time (JIT) compiled at\n" + " runtime if needed. Op compatibility means that your system\n" + " meet the required dependencies to JIT install the op.") + + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + print("JIT compiled ops requires ninja") + ninja_status = OKAY if ninja_installed() else FAIL + print('ninja', "." * (max_dots - 5), ninja_status) + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + print(h[0], "." * (max_dots - len(h[0])), h[1], "." * (max_dots2 - len(h[1])), h[2]) + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + installed = f"{GREEN}[YES]{END}" + no = f"{YELLOW}[NO]{END}" + for op_name, builder in ALL_OPS.items(): + dots = "." * (max_dots - len(op_name)) + is_compatible = OKAY if builder.is_compatible() else no + is_installed = installed if installed_ops[op_name] else no + dots2 = '.' 
* ((len(h[1]) + (max_dots2 - len(h[1]))) - + (len(is_installed) - color_len)) + print(op_name, dots, is_installed, dots2, is_compatible) + print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) + + +def ninja_installed(): + try: + import ninja + except ImportError: + return False + return True + + +def nvcc_version(): + import torch.utils.cpp_extension + cuda_home = torch.utils.cpp_extension.CUDA_HOME + if cuda_home is None: + return f"{RED} [FAIL] cannot find CUDA_HOME via torch.utils.cpp_extension.CUDA_HOME={torch.utils.cpp_extension.CUDA_HOME} {END}" + try: + output = subprocess.check_output([cuda_home + "/bin/nvcc", + "-V"], + universal_newlines=True) + except FileNotFoundError: + return f"{RED} [FAIL] nvcc missing {END}" + output_split = output.split() + release_idx = output_split.index("release") + release = output_split[release_idx + 1].replace(',', '').split(".") + return ".".join(release) + + +def debug_report(): + max_dots = 33 + report = [ + ("torch install path", + torch.__path__), + ("torch version", + torch.__version__), + ("torch cuda version", + torch.version.cuda), + ("nvcc version", + nvcc_version()), + ("deepspeed install path", + deepspeed.__path__), + ("deepspeed info", + f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}" + ), + ("deepspeed wheel compiled w.", + f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"), + ] + print("DeepSpeed general environment info:") + for name, value in report: + print(name, "." * (max_dots - len(name)), value) + + +def main(): + op_report() + debug_report() + + +if __name__ == "__main__": + main() diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py new file mode 100644 index 000000000000..f04982c74f0d --- /dev/null +++ b/deepspeed/git_version_info.py @@ -0,0 +1,17 @@ +try: + # This is populated by setup.py + from .git_version_info_installed import * +except ModuleNotFoundError: + import os + if os.path.isfile('version.txt'): + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + version = open('version.txt', 'r').read().strip() + else: + version = "0.0.0" + git_hash = '[none]' + git_branch = '[none]' + + from .ops.op_builder import ALL_OPS + installed_ops = dict.fromkeys(ALL_OPS.keys(), False) + compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) + torch_info = {'version': "0.0", "cuda_version": "0.0"} diff --git a/deepspeed/pt/__init__.py b/deepspeed/launcher/__init__.py similarity index 100% rename from deepspeed/pt/__init__.py rename to deepspeed/launcher/__init__.py diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py new file mode 100644 index 000000000000..fd56facc4343 --- /dev/null +++ b/deepspeed/launcher/constants.py @@ -0,0 +1,9 @@ +# Copyright 2020 The Microsoft DeepSpeed Team + +PDSH_LAUNCHER = 'pdsh' +PDSH_MAX_FAN_OUT = 1024 + +OPENMPI_LAUNCHER = 'openmpi' + +MVAPICH_LAUNCHER = 'mvapich' +MVAPICH_TMP_HOSTFILE = '/tmp/deepspeed_mvapich_hostfile' diff --git a/deepspeed/pt/deepspeed_launch.py b/deepspeed/launcher/launch.py similarity index 88% rename from deepspeed/pt/deepspeed_launch.py rename to deepspeed/launcher/launch.py index 55399194d23d..0958295efe06 100755 --- a/deepspeed/pt/deepspeed_launch.py +++ b/deepspeed/launcher/launch.py @@ -1,5 +1,11 @@ +# Copyright 2020 The Microsoft DeepSpeed Team """ -Copyright 2020 The Microsoft DeepSpeed Team: deepspeed@microsoft.com +DeepSpeed launcher, this is similar to torch.distributed.launch but supports +additional features such as abitrary gpu 
exclusion. + +deepspeed.launcher.launch is intended to be run on a single worker node and +will spawn several worker sub-processes depending on how many devices/ranks +are on the worker. """ import sys @@ -10,7 +16,8 @@ from collections import defaultdict from argparse import ArgumentParser, REMAINDER -from deepspeed.pt.log_utils import logger +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from ..utils import logger def parse_args(): @@ -32,7 +39,7 @@ def parse_args(): " single node multi-proc training, the" " --master_addr can simply be 127.0.0.1") parser.add_argument("--master_port", - default=29500, + default=TORCH_DISTRIBUTED_DEFAULT_PORT, type=int, help="Master node (rank 0)'s free port that needs to " "be used for communication during distributed " @@ -106,6 +113,7 @@ def main(): # each process's rank dist_rank = global_rank_mapping[local_node][local_rank] current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) # spawn the processes cmd = [ diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py new file mode 100644 index 000000000000..a45cc6a56505 --- /dev/null +++ b/deepspeed/launcher/multinode_runner.py @@ -0,0 +1,189 @@ +import os +import sys +import shutil +import subprocess +import warnings +from abc import ABC, abstractmethod + +from ..utils import logger +from .constants import PDSH_MAX_FAN_OUT, MVAPICH_TMP_HOSTFILE + + +class MultiNodeRunner(ABC): + def __init__(self, args, world_info_base64): + self.args = args + self.user_arguments = self.parse_user_args() + self.user_script = args.user_script + self.world_info_base64 = world_info_base64 + self.exports = {} + + @abstractmethod + def backend_exists(self): + pass + + @abstractmethod + def get_cmd(self, environment, active_resources): + pass + + def add_export(self, key, var): + self.exports[key.strip()] = var.strip() + + def parse_user_args(self): + return self.args.user_args + + +class PDSHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64): + super().__init__(args, world_info_base64) + + def backend_exists(self): + return shutil.which('pdsh') + + def parse_user_args(self): + return list( + map(lambda x: x if x.startswith("-") else "'{}'".format(x), + self.args.user_args)) + + def get_cmd(self, environment, active_resources): + environment['PDSH_RCMD_TYPE'] = 'ssh' + + active_workers = ",".join(active_resources.keys()) + logger.info("Running on the following workers: %s" % active_workers) + + # PDSH flags for max node fan out and specific hosts to launch on + # See https://linux.die.net/man/1/pdsh for flag details + pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers] + + exports = "" + for key, val in self.exports.items(): + exports += "export {}={}; ".format(key, val) + + deepspeed_launch = [ + exports, + "cd {};".format(os.path.abspath('.')), + sys.executable, + "-u", + "-m", + "deepspeed.launcher.launch", + '--world_info={}'.format(self.world_info_base64), + "--node_rank=%n", + "--master_addr={}".format(self.args.master_addr), + "--master_port={}".format(self.args.master_port) + ] + + return pdsh_cmd_args + deepspeed_launch + [self.user_script + ] + self.user_arguments + + +class OpenMPIRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): + super().__init__(args, world_info_base64) + self.resource_pool = resource_pool + self.add_export('UCX_TLS', 'tcp') + + def backend_exists(self): + #TODO: if IB is available we should suggestion mvapich + return shutil.which('ompi_info') 
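The MultiNodeRunner base class above fixes the contract every launcher backend implements: report whether its binary exists, collect environment exports, and assemble the final launch command. The sketch below is a hypothetical runner that is not part of this patch; the name EchoRunner is invented purely to show where backend_exists, get_cmd, and the collected exports plug in.

```python
# Hypothetical backend used only to illustrate the MultiNodeRunner contract;
# it is not a launcher shipped by DeepSpeed.
import shutil
import sys

from deepspeed.launcher.multinode_runner import MultiNodeRunner


class EchoRunner(MultiNodeRunner):
    def backend_exists(self):
        # Real backends probe for their launcher binary, e.g. shutil.which('pdsh').
        return shutil.which('echo') is not None

    def get_cmd(self, environment, active_resources):
        # Exports gathered via add_export() become part of the command line,
        # and every backend ultimately invokes deepspeed.launcher.launch.
        exports = [f'{k}={v}' for k, v in self.exports.items()]
        launch = [
            sys.executable, '-u', '-m', 'deepspeed.launcher.launch',
            f'--world_info={self.world_info_base64}'
        ]
        return ['echo'] + exports + launch + [self.user_script] + self.user_arguments
```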
+ + def get_cmd(self, environment, active_resources): + #TODO: Allow for include/exclude at node-level but not gpu-level + assert self.args.include == "" and self.args.exclude == "", 'openmpi backend does not support worker include/exclusion' + assert self.args.num_nodes == -1 and self.args.num_gpus == -1, 'openmpi backend does not support limiting num nodes/gpus' + total_process_count = sum(self.resource_pool.values()) + + mpirun_cmd = [ + 'mpirun', + '-n', + f'{total_process_count}', + '-hostfile', + f'{self.args.hostfile}', + '--mca', + 'btl', + '^openib', + '--mca', + 'btl_tcp_if_include', + 'eth0', + ] + + export_cmd = [] + for k, v in self.exports.items(): + export_cmd += ['-x', f'{k}={v}'] + + python_exec = [sys.executable, "-u"] + + return mpirun_cmd + export_cmd + python_exec + [self.user_script + ] + self.user_arguments + + +class MVAPICHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): + super().__init__(args, world_info_base64) + self.resource_pool = resource_pool + + # Disable the CMA kernel module, not available on Ubuntu systems + self.add_export('MV2_SMP_USE_CMA', '0') + + # If we fail this will output more verbose logging + self.add_export('MV2_DEBUG_SHOW_BACKTRACE', '1') + + # Enabled cuda-aware communication + self.add_export('MV2_USE_CUDA', '1') + + # Support deep learning frameworks: http://hidl.cse.ohio-state.edu/userguide/horovod/ + self.add_export('MV2_SUPPORT_DL', '1') + + # Support MPI_THREAD_MULTIPLE + self.add_export('MV2_ENABLE_AFFINITY', '0') + + # Performance tuning flags for allgather + self.add_export('MV2_INTER_ALLGATHER_TUNING', '5') + self.add_export('MV2_CUDA_USE_NAIVE', '0') + + def backend_exists(self): + #TODO: if IB is available we should suggestion mvapich + mpiname_exists = shutil.which('mpiname') + exists = False + if not mpiname_exists: + warnings.warn("mpiname does not exist, mvapich is not installed properly") + else: + results = subprocess.check_output('mpiname', shell=True) + mpiname_results = results.decode('utf-8').strip() + if "MVAPICH2-GDR" in mpiname_results: + exists = True + else: + warnings.warn( + f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}" + ) + return exists + + def get_cmd(self, environment, active_resources): + #TODO: Allow for include/exclude at node-level but not gpu-level + assert self.args.include == "" and self.args.exclude == "", 'mvapich backend does not support worker include/exclusion' + assert self.args.num_nodes == -1 and self.args.num_gpus == -1, 'mvapich backend does not support limiting num nodes/gpus' + devices_per_node = self.resource_pool.values() + total_process_count = sum(devices_per_node) + process_per_node = list(devices_per_node)[0] + assert all([n == process_per_node for n in devices_per_node]), "mvapich requires same number of devices per node" + + with open(MVAPICH_TMP_HOSTFILE, 'w') as fd: + for host in self.resource_pool.keys(): + fd.write(f'{host}\n') + + mpirun_cmd = [ + 'mpirun', + '-np', + f'{total_process_count}', + '-ppn', + f'{process_per_node}', + '--hostfile', + f'{MVAPICH_TMP_HOSTFILE}', + ] + + export_cmd = [] + for k, v in self.exports.items(): + export_cmd += ['-env', f'{k}={v}'] + + python_exec = [sys.executable, "-u"] + + return mpirun_cmd + export_cmd + python_exec + [self.user_script + ] + self.user_arguments diff --git a/deepspeed/pt/deepspeed_run.py b/deepspeed/launcher/runner.py similarity index 79% rename from deepspeed/pt/deepspeed_run.py rename to deepspeed/launcher/runner.py index 02a5f351cd6d..6ce482060358 
100755 --- a/deepspeed/pt/deepspeed_run.py +++ b/deepspeed/launcher/runner.py @@ -1,5 +1,9 @@ +# Copyright 2020 The Microsoft DeepSpeed Team """ -Copyright 2020 The Microsoft DeepSpeed Team +DeepSpeed runner is the main front-end to launching multi-worker +training jobs with DeepSpeed. By default this uses pdsh to parallel +ssh into multiple worker nodes and launch all the neccisary processes +per rank for training. """ import os @@ -14,13 +18,16 @@ import torch.cuda -from deepspeed.pt.deepspeed_constants import TORCH_DISTRIBUTED_DEFAULT_PORT -from deepspeed.pt.log_utils import logger +from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner +from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT +from ..utils import logger DLTS_HOSTFILE = "/job/hostfile" -EXPORT_ENVS = ["NCCL", "PYTHON"] +EXPORT_ENVS = ["NCCL", "PYTHON", "MV2", 'UCX'] DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env" DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] +PDSH_MAX_FAN_OUT = 1024 def parse_args(args=None): @@ -61,12 +68,20 @@ def parse_args(args=None): resources except slot 0 on worker-1. ''') - parser.add_argument("--num_nodes", type=int, default=-1, help="") + parser.add_argument("--num_nodes", + type=int, + default=-1, + help="Total number of worker nodes to run on, this will use " + "the top N hosts from the given hostfile.") - parser.add_argument("--num_gpus", type=int, default=-1, help="") + parser.add_argument("--num_gpus", + type=int, + default=-1, + help="Max number of GPUs to use on each node, will use " + "[0:N) GPU ids on each node.") parser.add_argument("--master_port", - default=int(TORCH_DISTRIBUTED_DEFAULT_PORT), + default=TORCH_DISTRIBUTED_DEFAULT_PORT, type=int, help="(optional) Port used by PyTorch distributed for " "communication during training.") @@ -77,6 +92,18 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be " "inferred via 'hostname -I' if not specified.") + parser.add_argument("--launcher", + default=PDSH_LAUNCHER, + type=str, + help="(optional) choose launcher backend for multi-node" + "training. 
Options currently include PDSH, OpenMPI, MVAPICH.") + + parser.add_argument("--launcher_args", + default="", + type=str, + help="(optional) pass launcher specific arguments as a " + "single quoted argument.") + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " @@ -93,9 +120,12 @@ def fetch_hostfile(hostfile_path): # e.g., worker-0 slots=16 with open(hostfile_path, 'r') as fd: - resource_pool = collections.OrderedDict() for line in fd.readlines(): + line = line.strip() + if line == '': + # skip empty lines + continue try: hostname, slots = line.split() _, slot_count = slots.split("=") @@ -246,9 +276,6 @@ def main(args=None): args.include, args.exclude) - if multi_node_exec and not shutil.which('pdsh'): - raise RuntimeError("pdsh is not installed, unable to proceed") - env = os.environ.copy() if not args.master_addr: @@ -277,27 +304,35 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) + multi_node_exec = len(active_resources) > 1 + + if multi_node_exec and not shutil.which('pdsh'): + raise RuntimeError("pdsh is not installed, unable to proceed") + if not multi_node_exec: deepspeed_launch = [ sys.executable, "-u", "-m", - "deepspeed.pt.deepspeed_launch", + "deepspeed.launcher.launch", "--world_info={}".format(world_info_base64), "--master_addr={}".format(args.master_addr), "--master_port={}".format(args.master_port) ] cmd = deepspeed_launch + [args.user_script] + args.user_args else: - env['PDSH_RCMD_TYPE'] = 'ssh' - - active_workers = ",".join(active_resources.keys()) - logger.info("Running on the following workers: %s" % active_workers) - - pdsh_cmd_args = ['pdsh', '-w', active_workers] + args.launcher = args.launcher.lower() + if args.launcher == PDSH_LAUNCHER: + runner = PDSHRunner(args, world_info_base64) + elif args.launcher == OPENMPI_LAUNCHER: + runner = OpenMPIRunner(args, world_info_base64, resource_pool) + elif args.launcher == MVAPICH_LAUNCHER: + runner = MVAPICHRunner(args, world_info_base64, resource_pool) + else: + raise NotImplementedError(f"Unknown launcher {args.launcher}") - num_nodes = len(active_resources.keys()) - num_gpus_per_node = None + if not runner.backend_exists(): + raise RuntimeError(f"launcher '{args.launcher}' not installed.") curr_path = os.path.abspath('.') if 'PYTHONPATH' in env: @@ -307,33 +342,20 @@ def main(args=None): exports = "" for var in env.keys(): - if any(map(lambda name: var.startswith(name), EXPORT_ENVS)): - exports += "export {}={}; ".format(var, env[var]) + if any([var.startswith(name) for name in EXPORT_ENVS]): + runner.add_export(var, env[var]) for environ_path in DEEPSPEED_ENVIRONMENT_PATHS: environ_file = os.path.join(environ_path, DEEPSPEED_ENVIRONMENT_NAME) if os.path.isfile(environ_file): with open(environ_file, 'r') as fd: for var in fd.readlines(): - exports += "export {}; ".format(var.strip()) + key, val = var.split('=') + runner.add_export(key, val) - deepspeed_launch = [ - exports, - "cd {};".format(curr_path), - sys.executable, - "-u", - "-m", - "deepspeed.pt.deepspeed_launch", - '--world_info={}'.format(world_info_base64), - "--node_rank=%n", - "--master_addr={}".format(args.master_addr), - "--master_port={}".format(args.master_port) - ] - user_args = list( - map(lambda x: x if x.startswith("-") else "'{}'".format(x), - args.user_args)) - cmd = pdsh_cmd_args + deepspeed_launch + [args.user_script] + user_args - logger.info("cmd={}".format(cmd)) + cmd = runner.get_cmd(env, 
active_resources) + + logger.info("cmd = {}".format(' '.join(cmd))) result = subprocess.Popen(cmd, env=env) result.wait() diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py new file mode 100755 index 000000000000..a601ef10e1d2 --- /dev/null +++ b/deepspeed/module_inject/inject.py @@ -0,0 +1,122 @@ +import copy +import torch +from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig + + +def module_inject(layer_obj, + model, + config, + micro_batch_size, + max_seq_length, + seed, + preln, + fp16=True): + for name, child in model.named_children(): + if isinstance(child, layer_obj): + print('REPLACING BertLayer') + + cuda_config = DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=config.hidden_size, + heads=config.num_attention_heads, + attn_dropout_ratio=config.attention_probs_dropout_prob, + hidden_dropout_ratio=config.hidden_dropout_prob, + num_hidden_layers=config.num_hidden_layers, + initializer_range=config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln) + + new_module = DeepSpeedTransformerLayer(cuda_config) + + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + if preln: + attention_layerNorm = child.PostAttentionLayerNorm + else: + attention_layerNorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layerNorm.weight + new_module.attn_nb.data = attention_layerNorm.bias + if preln: + intermediate_FF = child.intermediate.dense_act + else: + intermediate_FF = child.intermediate.dense + new_module.inter_w.data = intermediate_FF.weight + new_module.inter_b.data = intermediate_FF.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + if preln: + transformer_LayerNorm = child.PreAttentionLayerNorm + else: + transformer_LayerNorm = child.output.LayerNorm + new_module.norm_w.data = transformer_LayerNorm.weight + new_module.norm_b.data = transformer_LayerNorm.bias + + setattr(model, name, copy.deepcopy(new_module)) + + else: + module_inject(layer_obj, + child, + config, + micro_batch_size, + max_seq_length, + seed, + preln, + fp16) + + return model + + +def test_hi(): + from turing.nvidia_modelingpreln import BertConfig as BertConfigPreLN + from turing.nvidia_modelingpreln import BertForQuestionAnswering as BertForQuestionAnsweringPreLN + from turing.nvidia_modelingpreln import BertLayer + bert_model_config = { + "vocab_size_or_config_json_file": 119547, + "hidden_size": 1024, + "num_hidden_layers": 1, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02 + } + bert_config = 
BertConfigPreLN(**bert_model_config) + base_model = BertForQuestionAnsweringPreLN(bert_config, args=None) + + #base_model = LinearStack() + + test_model = copy.deepcopy(base_model) + test_model = module_inject(BertLayer, test_model, bert_config, 4, 384, 1234) + + print('BASE', base_model) + print('TEST', test_model) + + #base_model.eval() + #test_model.eval() + + #test_input = torch.rand(1, base_model.input_dim) + + #base_output = base_model(test_input) + #test_output = test_model(test_input) + # + #assert torch.allclose(base_output, test_output, atol=3e-8) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py new file mode 100755 index 000000000000..5274d3c77f84 --- /dev/null +++ b/deepspeed/module_inject/replace_module.py @@ -0,0 +1,192 @@ +import copy +import torch +import deepspeed + + +def replace_transformer_layer(orig_layer_impl, + model, + micro_batch_size, + bert_config, + seed, + max_seq_length, + preln=False, + fp16=True, + huggingface=False, + local_rank=-1): + """ Replace bert-style transformer layers with DeepSpeed's transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + micro_batch_size (int): micro batch size per gpu used during training/eval + bert_config (dict): model config containing hidden size, attention heads, etc. + seed (int): random seed value + max_seq_length (int): max sequence length for training + preln (bool): does the original layer implementation do pre or post layer norm? + fp16 (bool): fp16 or fp32 + huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes) + + Returns: + Updated nn.module with replaced transformer layers + """ + def replace_fn(child): + transformer_config = deepspeed.DeepSpeedTransformerConfig( + batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=bert_config.hidden_size, + heads=bert_config.num_attention_heads, + attn_dropout_ratio=bert_config.attention_probs_dropout_prob, + hidden_dropout_ratio=bert_config.hidden_dropout_prob, + num_hidden_layers=bert_config.num_hidden_layers, + initializer_range=bert_config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln, + huggingface=huggingface, + local_rank=local_rank) + new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) + + # copy relevant state from child -> new module + qw = child.attention.self.query.weight + qb = child.attention.self.query.bias + kw = child.attention.self.key.weight + kb = child.attention.self.key.bias + vw = child.attention.self.value.weight + vb = child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + #qw.data,kw.data,vw.data = torch.chunk(qkvw, 3, axis=0) + #qb.data,kb.data,vb.data = torch.chunk(qkvb, 3, axis=0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = child.attention.output.dense.weight + new_module.attn_ob.data = child.attention.output.dense.bias + if preln: + attention_layernorm = child.PostAttentionLayerNorm + else: + attention_layernorm = child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layernorm.weight + new_module.attn_nb.data = attention_layernorm.bias + if preln: + intermediate_ff = child.intermediate.dense_act + else: + intermediate_ff = child.intermediate.dense + new_module.inter_w.data = 
intermediate_ff.weight + new_module.inter_b.data = intermediate_ff.bias + new_module.output_w.data = child.output.dense.weight + new_module.output_b.data = child.output.dense.bias + if preln: + transformer_layernorm = child.PreAttentionLayerNorm + else: + transformer_layernorm = child.output.LayerNorm + new_module.norm_w.data = transformer_layernorm.weight + new_module.norm_b.data = transformer_layernorm.bias + return new_module + + return replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn) + + +def revert_transformer_layer(orig_layer_impl, model, bert_config, preln=False): + """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + bert_config (dict): model config containing hidden size, attention heads, etc. + + Returns: + Updated nn.module with original bert-style transformer layers + """ + def replace_fn(child): + #from turing.nvidia_modelingpreln import BertLayer + orig_module = orig_layer_impl(bert_config) + + # copy relevant state from child -> original module + qkvw = child.attn_qkvw.data + qkvb = child.attn_qkvb.data + + qw, kw, vw = torch.chunk(qkvw, 3, axis=0) + qb, kb, vb = torch.chunk(qkvb, 3, axis=0) + + orig_module.attention.self.query.weight.data = qw + orig_module.attention.self.query.bias.data = qb + orig_module.attention.self.key.weight.data = kw + orig_module.attention.self.key.bias.data = kb + orig_module.attention.self.value.weight.data = vw + orig_module.attention.self.value.bias.data = vb + + orig_module.attention.output.dense.weight.data = child.attn_ow.data + orig_module.attention.output.dense.bias.data = child.attn_ob.data + + attn_ln_w = child.attn_nw.data + attn_ln_b = child.attn_nb.data + if preln: + orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w + orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b + else: + orig_module.attention.output.LayerNorm.weight.data = attn_ln_w + orig_module.attention.output.LayerNorm.bias.data = attn_ln_b + + inter_ff_w = child.inter_w.data + inter_ff_b = child.inter_b.data + if preln: + orig_module.intermediate.dense_act.weight.data = inter_ff_w + orig_module.intermediate.dense_act.bias.data = inter_ff_b + else: + orig_module.intermediate.dense.weight.data = inter_ff_w + orig_module.intermediate.dense.bias.data = inter_ff_b + + orig_module.output.dense.weight.data = child.output_w.data + orig_module.output.dense.bias.data = child.output_b.data + + transformer_ln_w = child.norm_w.data + transformer_ln_b = child.norm_b.data + if preln: + orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w + orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b + else: + orig_module.output.LayerNorm.weight.data = transformer_ln_w + orig_module.output.LayerNorm.bias.data = transformer_ln_b + return orig_module + + return replace_module(model=model, + orig_class=deepspeed.DeepSpeedTransformerLayer, + replace_fn=replace_fn) + + +def replace_module(model, orig_class, replace_fn): + """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. + Arguments: + model (torch.nn.Module): the model to augment + orig_class (torch.nn.Module): the module to search for + replace_fn (method): a method to convert instances of ``orig_class`` to the + desired type and return a new instance. + + Returns: + A modified ``model``. 
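Since replace_module() is generic, a tiny self-contained example shows the mechanics without a transformer at all. The toy model and replace_fn below are illustrative only; the intended production path is replace_transformer_layer() above, which builds the replace_fn for you.

```python
# Minimal, hypothetical use of replace_module(): swap every nn.ReLU in a toy
# model for nn.GELU. Illustration only; real use goes through
# replace_transformer_layer() above.
import torch.nn as nn

from deepspeed.module_inject.replace_module import replace_module

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))


def to_gelu(child):
    # replace_fn receives the matched module and returns its replacement.
    return nn.GELU()


model = replace_module(model=model, orig_class=nn.ReLU, replace_fn=to_gelu)
print(model)  # the ReLU child is now a GELU
```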
+ """ + policy = {orig_class: replace_fn} + return _replace_module(model, policy) + + +def _replace_module(model, policies): + """ Traverse model's children recursively and apply any transformations in ``policies``. + Arguments: + model (torch.nn.Module): model to augment + policies (dict): Mapping of source class to replacement function. + + Returns: + Modified ``model``. + """ + for name, child in model.named_children(): + if child.__class__ in policies: + orig = repr(child) + setattr(model, name, policies[child.__class__](child)) + new = getattr(model, name) + else: + _replace_module(child, policies) + + return model diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py new file mode 100755 index 000000000000..e6fd81fb5a13 --- /dev/null +++ b/deepspeed/ops/__init__.py @@ -0,0 +1,9 @@ +from . import adam +from . import lamb +from . import sparse_attention +from . import transformer + +from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +from .module_inject import replace_module + +from ..git_version_info import compatible_ops as __compatible_ops__ diff --git a/deepspeed/ops/adam/__init__.py b/deepspeed/ops/adam/__init__.py new file mode 100755 index 000000000000..6e620b36bd8e --- /dev/null +++ b/deepspeed/ops/adam/__init__.py @@ -0,0 +1,2 @@ +from .cpu_adam import DeepSpeedCPUAdam +from .fused_adam import FusedAdam diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py new file mode 100755 index 000000000000..ebb4548afe6c --- /dev/null +++ b/deepspeed/ops/adam/cpu_adam.py @@ -0,0 +1,147 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' + +import math +import torch +import time +from pathlib import Path +from ..op_builder import CPUAdamBuilder + + +class DeepSpeedCPUAdam(torch.optim.Optimizer): + """Fast vectorized implementation of two variations of Adam optimizer on CPU: + + - Adam: A Method for Stochastic Optimization: (https://arxiv.org/abs/1412.6980); + - AdamW: FIXING WEIGHT DECAY REGULARIZATION IN ADAM (https://arxiv.org/abs/1711.05101v1) + + DeepSpeed CPU Adam(W) provides between 5x to 7x speedu over torch.optim.adam(W). + In order to apply this optimizer, the model requires to have its master parameter (in FP32) + reside on the CPU memory. + + To train on a hetrogeneous system, such as coordinating CPU and GPU, DeepSpeed offers + the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory, + with minimal impact on training througput. DeepSpeedCPUAdam plays an important role to minimize + the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial + (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology. + + For calling step function, there are two options available: (1) update optimizer's states and (2) update + optimizer's states and copy the parameters back to GPU at the same time. We have seen that the second + option can bring 30% higher throughput than the doing the copy separately using option one. + + + Arguments: + model_params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. 
(default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in DeepSpeed CPUAdam! + adamw_mode: select between Adam and AdamW implementations (default: AdamW) + """ + + optimizer_id = 0 + + def __init__(self, + model_params, + lr=1e-3, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + adamw_mode=True): + + default_args = dict(lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + bias_correction=bias_correction, + amsgrad=amsgrad) + super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) + + self.opt_id = DeepSpeedCPUAdam.optimizer_id + DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1 + + self.ds_opt_adam = CPUAdamBuilder().load() + + self.ds_opt_adam.create_adam(self.opt_id, + lr, + betas[0], + betas[1], + eps, + weight_decay, + adamw_mode) + + def __setstate__(self, state): + super(DeepSpeedCPUAdam, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None, fp16_param_groups=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group['params']): + + if p.grad is None: + continue + + state = self.state[p] + # State initialization + if len(state) == 0: + print(f'group {group_id} param {param_id} = {p.numel()}') + state['step'] = 0 + # gradient momentums + state['exp_avg'] = torch.zeros_like(p.data, + dtype=p.dtype, + device='cpu') + #memory_format=torch.preserve_format) + # gradient variances + state['exp_avg_sq'] = torch.zeros_like(p.data, + dtype=p.dtype, + device='cpu') + #memory_format=torch.preserve_format) + + state['step'] += 1 + beta1, beta2 = group['betas'] + + if fp16_param_groups is not None: + self.ds_opt_adam.adam_update_copy( + self.opt_id, + state['step'], + group['lr'], + beta1, + beta2, + group['eps'], + group['weight_decay'], + group['bias_correction'], + p.data, + p.grad.data, + state['exp_avg'], + state['exp_avg_sq'], + fp16_param_groups[group_id][param_id].data) + else: + self.ds_opt_adam.adam_update(self.opt_id, + state['step'], + group['lr'], + beta1, + beta2, + group['eps'], + group['weight_decay'], + group['bias_correction'], + p.data, + p.grad.data, + state['exp_avg'], + state['exp_avg_sq']) + return loss diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py new file mode 100644 index 000000000000..ae7c5fac88f0 --- /dev/null +++ b/deepspeed/ops/adam/fused_adam.py @@ -0,0 +1,182 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team + +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +''' + +import torch +import importlib +from .multi_tensor_apply import MultiTensorApply +multi_tensor_applier = MultiTensorApply(2048 * 32) +from ..op_builder import FusedAdamBuilder + + +class FusedAdam(torch.optim.Optimizer): + """Implements Adam algorithm. + + Currently GPU-only. + + This version of fused Adam implements 2 fusions. + + * Fusion of the Adam update's elementwise operations + * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches. + + Adam was been proposed in `Adam: A Method for Stochastic Optimization`_. 
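Because FusedAdam is meant as a drop-in for torch.optim.Adam, a short hedged sketch of the intended call pattern follows. It assumes a CUDA device and that the fused_adam extension can be JIT-built on the system; the toy model and data are placeholders.

```python
# Hedged usage sketch: construct FusedAdam like torch.optim.Adam and call
# step() with no arguments. Assumes CUDA plus a buildable fused_adam op;
# the model and data are illustrative only.
import torch

from deepspeed.ops.adam import FusedAdam

model = torch.nn.Linear(32, 8).cuda().half()
optimizer = FusedAdam(model.parameters(), lr=1e-3, adam_w_mode=True, weight_decay=0.01)

x = torch.randn(4, 32, device='cuda', dtype=torch.float16)
loss = model(x).float().pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```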
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in FusedAdam! + adam_w_mode (boolean, optional): Apply L2 regularization or weight decay + True for decoupled weight decay(also known as AdamW) (default: True) + set_grad_none (bool, optional): whether set grad to None when zero_grad() + method is called. (default: True) + + .. _Adam - A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + lr=1e-3, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + adam_w_mode=True, + weight_decay=0., + amsgrad=False, + set_grad_none=True): + + if amsgrad: + raise RuntimeError('FusedAdam does not support the AMSGrad variant.') + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay) + super(FusedAdam, self).__init__(params, defaults) + self.adam_w_mode = 1 if adam_w_mode else 0 + self.set_grad_none = set_grad_none + + fused_adam_cuda = FusedAdamBuilder().load() + # Skip buffer + self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam + + def zero_grad(self): + if self.set_grad_none: + for group in self.param_groups: + for p in group['params']: + p.grad = None + else: + super(FusedAdam, self).zero_grad() + + def step(self, + closure=None, + grads=None, + output_params=None, + scale=None, + grad_norms=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + + The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes. + """ + if any(p is not None for p in [grads, output_params, scale, grad_norms]): + raise RuntimeError( + 'FusedAdam has been updated. Simply initialize it identically to torch.optim.Adam, and call step() with no arguments.' 
+ ) + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + bias_correction = 1 if group['bias_correction'] else 0 + beta1, beta2 = group['betas'] + + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + # create lists for multi-tensor apply + g_16, p_16, m_16, v_16 = [], [], [], [] + g_32, p_32, m_32, v_32 = [], [], [], [] + + for p in group['params']: + if p.grad is None: + continue + if p.grad.data.is_sparse: + raise RuntimeError( + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' + ) + + state = self.state[p] + # State initialization + if len(state) == 0: + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + if p.dtype == torch.float16: + g_16.append(p.grad.data) + p_16.append(p.data) + m_16.append(state['exp_avg']) + v_16.append(state['exp_avg_sq']) + elif p.dtype == torch.float32: + g_32.append(p.grad.data) + p_32.append(p.data) + m_32.append(state['exp_avg']) + v_32.append(state['exp_avg_sq']) + else: + raise RuntimeError('FusedAdam only support fp16 and fp32.') + + if (len(g_16) > 0): + multi_tensor_applier(self.multi_tensor_adam, + self._dummy_overflow_buf, + [g_16, + p_16, + m_16, + v_16], + group['lr'], + beta1, + beta2, + group['eps'], + group['step'], + self.adam_w_mode, + bias_correction, + group['weight_decay']) + if (len(g_32) > 0): + multi_tensor_applier(self.multi_tensor_adam, + self._dummy_overflow_buf, + [g_32, + p_32, + m_32, + v_32], + group['lr'], + beta1, + beta2, + group['eps'], + group['step'], + self.adam_w_mode, + bias_correction, + group['weight_decay']) + + return loss diff --git a/deepspeed/ops/adam/multi_tensor_apply.py b/deepspeed/ops/adam/multi_tensor_apply.py new file mode 100644 index 000000000000..173288ee76ab --- /dev/null +++ b/deepspeed/ops/adam/multi_tensor_apply.py @@ -0,0 +1,15 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team + +Copyright NVIDIA/apex +This file is adapted from NVIDIA/apex, commit a109f85 +''' +import torch + + +class MultiTensorApply(object): + def __init__(self, chunk_size): + self.chunk_size = chunk_size + + def __call__(self, op, noop_flag_buffer, tensor_lists, *args): + return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args) diff --git a/deepspeed/ops/csrc b/deepspeed/ops/csrc new file mode 120000 index 000000000000..338652302505 --- /dev/null +++ b/deepspeed/ops/csrc @@ -0,0 +1 @@ +../../csrc \ No newline at end of file diff --git a/deepspeed/ops/lamb/__init__.py b/deepspeed/ops/lamb/__init__.py new file mode 100644 index 000000000000..9d3448dd68ea --- /dev/null +++ b/deepspeed/ops/lamb/__init__.py @@ -0,0 +1 @@ +from .fused_lamb import FusedLamb diff --git a/deepspeed/pt/deepspeed_fused_lamb.py b/deepspeed/ops/lamb/fused_lamb.py similarity index 77% rename from deepspeed/pt/deepspeed_fused_lamb.py rename to deepspeed/ops/lamb/fused_lamb.py index 387421ffe9b8..e9210cdda9bc 100644 --- a/deepspeed/pt/deepspeed_fused_lamb.py +++ b/deepspeed/ops/lamb/fused_lamb.py @@ -3,46 +3,37 @@ Copyright NVIDIA/apex This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer - ''' import types import torch -import importlib +from ..op_builder import FusedLambBuilder class FusedLamb(torch.optim.Optimizer): - 
"""Implements LAMB algorithm. Currently GPU-only. Requires DeepSpeed adapted Apex to be installed via - ``python setup.py install --cuda_ext --cpp_ext``. - - For usage example please see, TODO DeepSpeed Tutorial + """Implements the LAMB algorithm. Currently GPU-only. - It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. + LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962 - Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. lr (float, optional): learning rate. (default: 1e-3) + bias_correction (bool, optional): bias correction (default: True) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square. (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) - min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in FusedAdam! eps_inside_sqrt (boolean, optional): in the 'update parameters' step, adds eps to the bias-corrected second moment estimate before evaluating square root instead of adding it to the square root of second moment estimate as in the original paper. (default: False) - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_grad_norm (float, optional): value used to clip global grad norm + (default: 0.0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): NOT SUPPORTED in FusedLamb! 
""" def __init__(self, params, @@ -57,8 +48,7 @@ def __init__(self, max_coeff=10.0, min_coeff=0.01, amsgrad=False): - global fused_lamb_cuda - fused_lamb_cuda = importlib.import_module("deepspeed_lamb_cuda") + self.fused_lamb_cuda = FusedLambBuilder().load() if amsgrad: raise RuntimeError('FusedLamb does not support the AMSGrad variant.') @@ -153,9 +143,7 @@ def step(self, if grad is None: grad = p.grad.data if grad.is_sparse: - raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + raise RuntimeError('FusedLamb does not support sparse gradients') state = self.state[p] @@ -177,22 +165,22 @@ def step(self, out_p = torch.tensor( [], dtype=torch.float) if output_param is None else output_param - lamb_coeff = fused_lamb_cuda.lamb(p.data, - out_p, - exp_avg, - exp_avg_sq, - grad, - group['lr'], - beta1, - beta2, - max_coeff, - min_coeff, - group['eps'], - combined_scale, - state['step'], - self.eps_mode, - bias_correction, - group['weight_decay']) + lamb_coeff = self.fused_lamb_cuda.lamb(p.data, + out_p, + exp_avg, + exp_avg_sq, + grad, + group['lr'], + beta1, + beta2, + max_coeff, + min_coeff, + group['eps'], + combined_scale, + state['step'], + self.eps_mode, + bias_correction, + group['weight_decay']) self.lamb_coeffs.append(lamb_coeff) return loss diff --git a/deepspeed/ops/module_inject.py b/deepspeed/ops/module_inject.py new file mode 100755 index 000000000000..6b0d47cb6733 --- /dev/null +++ b/deepspeed/ops/module_inject.py @@ -0,0 +1,216 @@ +import copy +import torch +import deepspeed + +from deepspeed.ops import DeepSpeedTransformerConfig + + +def _copy_child_transformer_state(new_module, orig_child, pre_layer_norm): + # copy relevant state from original child -> new module + qw = orig_child.attention.self.query.weight + qb = orig_child.attention.self.query.bias + kw = orig_child.attention.self.key.weight + kb = orig_child.attention.self.key.bias + vw = orig_child.attention.self.value.weight + vb = orig_child.attention.self.value.bias + + qkvw = torch.cat((qw, kw, vw), 0) + qkvb = torch.cat((qb, kb, vb), 0) + + #qw.data,kw.data,vw.data = torch.chunk(qkvw, 3, axis=0) + #qb.data,kb.data,vb.data = torch.chunk(qkvb, 3, axis=0) + + new_module.attn_qkvw.data = qkvw + new_module.attn_qkvb.data = qkvb + new_module.attn_ow.data = orig_child.attention.output.dense.weight + new_module.attn_ob.data = orig_child.attention.output.dense.bias + if pre_layer_norm: + attention_layernorm = orig_child.PostAttentionLayerNorm + else: + attention_layernorm = orig_child.attention.output.LayerNorm + new_module.attn_nw.data = attention_layernorm.weight + new_module.attn_nb.data = attention_layernorm.bias + if pre_layer_norm: + intermediate_ff = orig_child.intermediate.dense_act + else: + intermediate_ff = orig_child.intermediate.dense + new_module.inter_w.data = intermediate_ff.weight + new_module.inter_b.data = intermediate_ff.bias + new_module.output_w.data = orig_child.output.dense.weight + new_module.output_b.data = orig_child.output.dense.bias + if pre_layer_norm: + transformer_layernorm = orig_child.PreAttentionLayerNorm + else: + transformer_layernorm = orig_child.output.LayerNorm + new_module.norm_w.data = transformer_layernorm.weight + new_module.norm_b.data = transformer_layernorm.bias + + +def _replace_transformer_layer(orig_layer_impl, model, transformer_config): + """ Replace bert-style transformer layers with DeepSpeed's transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look 
for, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + transformer_config (dict): deepspeed transformer layer config containing hidden size, attention heads, etc. + Returns: + Updated nn.module with replaced transformer layers + """ + def replace_fn(child): + new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) + _copy_child_transformer_state(new_module, + child, + transformer_config.pre_layer_norm) + + return new_module + + return _replace_module(model=model, + orig_class=orig_layer_impl, + replace_fn=replace_fn) + + +def replace_module(orig_module_impl, model, replacement_module_config): + """ Replace client module + Arguments: + orig_module_impl (torch.nn.Module): original module implementation to replace, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + replacement_module_config (dict): deepspeed replacement module config (e.g., DeepSpeedTransformerConfig) . + + Returns: + Updated nn.module with replaced modules + """ + assert isinstance(replacement_module_config, DeepSpeedTransformerConfig), \ + 'Only DeepSpeedTransformerConfig is currently supported as replacement config' + + return _replace_transformer_layer(orig_layer_impl=orig_module_impl, + model=model, + transformer_config=replacement_module_config) + + +def _revert_transformer_layer(orig_layer_impl, model, bert_config, transformer_config): + """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer + Arguments: + orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + bert_config (dict): model config containing hidden size, attention heads, etc. 
+ transformer_config (dict): deepspeed tranformer config used for replacement + + Returns: + Updated nn.module with original bert-style transformer layers + """ + def replace_fn(child): + #from turing.nvidia_modelingpreln import BertLayer + orig_module = orig_layer_impl(bert_config) + + # copy relevant state from child -> original module + qkvw = child.attn_qkvw.data + qkvb = child.attn_qkvb.data + + qw, kw, vw = torch.chunk(qkvw, 3, axis=0) + qb, kb, vb = torch.chunk(qkvb, 3, axis=0) + + orig_module.attention.self.query.weight.data = qw + orig_module.attention.self.query.bias.data = qb + orig_module.attention.self.key.weight.data = kw + orig_module.attention.self.key.bias.data = kb + orig_module.attention.self.value.weight.data = vw + orig_module.attention.self.value.bias.data = vb + + orig_module.attention.output.dense.weight.data = child.attn_ow.data + orig_module.attention.output.dense.bias.data = child.attn_ob.data + + attn_ln_w = child.attn_nw.data + attn_ln_b = child.attn_nb.data + if transformer_config.pre_layer_norm: + orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w + orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b + else: + orig_module.attention.output.LayerNorm.weight.data = attn_ln_w + orig_module.attention.output.LayerNorm.bias.data = attn_ln_b + + inter_ff_w = child.inter_w.data + inter_ff_b = child.inter_b.data + if transformer_config.pre_layer_norm: + orig_module.intermediate.dense_act.weight.data = inter_ff_w + orig_module.intermediate.dense_act.bias.data = inter_ff_b + else: + orig_module.intermediate.dense.weight.data = inter_ff_w + orig_module.intermediate.dense.bias.data = inter_ff_b + + orig_module.output.dense.weight.data = child.output_w.data + orig_module.output.dense.bias.data = child.output_b.data + + transformer_ln_w = child.norm_w.data + transformer_ln_b = child.norm_b.data + if transformer_config.pre_layer_norm: + orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w + orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b + else: + orig_module.output.LayerNorm.weight.data = transformer_ln_w + orig_module.output.LayerNorm.bias.data = transformer_ln_b + return orig_module + + return _replace_module(model=model, + orig_class=deepspeed.DeepSpeedTransformerLayer, + replace_fn=replace_fn) + + +def revert_module(orig_module_impl, + model, + orig_module_config, + replacement_module_config): + """ Revert DeepSpeed's module back to original client module + Arguments: + orig_module_impl (torch.nn.Module): the original module that was replaced, + e.g., transformers.modeling_bert.BertLayer. + model (torch.nn.Module): user's nn.module representing their model + orig_module_config (dict): original module configuration + replacement_module_config (dict): replacement deepspeed module configuration + + Returns: + Updated nn.module with original bert-style transformer layers + """ + assert isinstance(replacement_module_config, DeepSpeedTransformerConfig), \ + 'Only DeepSpeedTransformerConfig is currently supported as replacement config' + + return _revert_transformer_layer(orig_layer_impl=orig_module_impl, + model=model, + bert_config=orig_module_config, + transformer_config=replacement_module_config) + + +def _replace_module(model, orig_class, replace_fn): + """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. 
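        Example (editor's illustrative sketch, not part of the original patch;
        ``MyBlock`` and ``make_fast_block`` are hypothetical placeholders):

            # swap every MyBlock instance for whatever make_fast_block(child) returns
            model = _replace_module(model, orig_class=MyBlock, replace_fn=make_fast_block)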
+ Arguments: + model (torch.nn.Module): the model to augment + orig_class (torch.nn.Module): the module to search for + replace_fn (method): a method to convert instances of ``orig_class`` to the + desired type and return a new instance. + + Returns: + A modified ``model``. + """ + policy = {orig_class: replace_fn} + return _replace_module_using_policies(model, policy) + + +def _replace_module_using_policies(model, policies): + """ Traverse model's children recursively and apply any transformations in ``policies``. + Arguments: + model (torch.nn.Module): model to augment + policies (dict): Mapping of source class to replacement function. + + Returns: + Modified ``model``. + """ + for name, child in model.named_children(): + if child.__class__ in policies: + orig = repr(child) + setattr(model, name, policies[child.__class__](child)) + new = getattr(model, name) + else: + _replace_module_using_policies(child, policies) + + return model diff --git a/deepspeed/ops/op_builder b/deepspeed/ops/op_builder new file mode 120000 index 000000000000..db4f9c335065 --- /dev/null +++ b/deepspeed/ops/op_builder @@ -0,0 +1 @@ +../../op_builder \ No newline at end of file diff --git a/deepspeed/ops/sparse_attention/__init__.py b/deepspeed/ops/sparse_attention/__init__.py new file mode 100644 index 000000000000..604ab29a8d6b --- /dev/null +++ b/deepspeed/ops/sparse_attention/__init__.py @@ -0,0 +1,6 @@ +from .sparsity_config import SparsityConfig, DenseSparsityConfig, FixedSparsityConfig, VariableSparsityConfig, BigBirdSparsityConfig, BSLongformerSparsityConfig +from .softmax import Softmax +from .matmul import MatMul +from .sparse_self_attention import SparseSelfAttention +from .bert_sparse_self_attention import BertSparseSelfAttention +from .sparse_attention_utils import SparseAttentionUtils diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py new file mode 100755 index 000000000000..40dc697e11b1 --- /dev/null +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -0,0 +1,78 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +from torch import nn +from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig + + +class BertSparseSelfAttention(nn.Module): + """Implements Sparse Self Attention layer of Bert model based on https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373 + + For more information please see, TODO DeepSpeed Sparse Transformer. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. + """ + def __init__( + self, + config, + # SparsityConfig parameters needs to be set accordingly + sparsity_config=FixedSparsityConfig(num_heads=4)): + """Initialize the bert sparse self attention layer. + + Note) you can use any of the provided sparsity configs or simply add yours! + + Arguments: + config: required: Bert model config + sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on FixedSparsityConfig class. 
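        Example (editor's illustrative sketch, not part of the original patch; assumes a
        HuggingFace-style BertConfig whose hidden_size is divisible by num_attention_heads):

            sparse_attn = BertSparseSelfAttention(
                bert_config,
                sparsity_config=FixedSparsityConfig(num_heads=bert_config.num_attention_heads,
                                                    block=16))
            context_layer = sparse_attn(hidden_states, attention_mask)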
+ """ + + super(BertSparseSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, + config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.sparse_self_attention = SparseSelfAttention(sparsity_config) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + """Applies forward phase of bert sparse self attention + + Arguments: + hidden_states: required: hidde_states tensor of the bert model + attn_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported + + Return: + context_layer: a dense tensor containing attnetion context + """ + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + context_layer = self.sparse_self_attention(query_layer, + key_layer, + value_layer, + key_padding_mask=attention_mask) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer diff --git a/deepspeed/ops/sparse_attention/matmul.py b/deepspeed/ops/sparse_attention/matmul.py new file mode 100644 index 000000000000..db5b774c3243 --- /dev/null +++ b/deepspeed/ops/sparse_attention/matmul.py @@ -0,0 +1,750 @@ +# DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +# https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py +import importlib +import warnings +import torch +import math +from .trsrc import matmul +from ..op_builder import SparseAttnBuilder + +triton = None + + +############## +# MAIN API # +############## +class _sparse_matmul(torch.autograd.Function): + + sdd_cache = dict() + dsd_cache = dict() + dds_cache = dict() + locks = dict() + + # Given an array sizes representing reduction size for each + # column of a block-mode matrix multiplication, + # performs load-balancing to achieve more smaller reductions + # between `seg_size` elements + @staticmethod + def load_balance(sizes, block): + global triton + if triton is None: + triton = importlib.import_module('triton') + # segment size + # heuristics taken from OpenAI blocksparse code + # https://github.com/openai/blocksparse/blob/master/blocksparse/matmul.py#L95 + max_size = sizes.max() + min_size = sizes[sizes != 0].min() + #if max_size > min_size * 2.0: + # seg_max = max(triton.cdiv(max_size, 4), min_size*2) + #else: + # seg_max = max_size + seg_max = max_size + seg_min = max(triton.cdiv(seg_max, 4), 4) + # split reduction into segments + div = sizes // seg_max + rem 
= sizes % seg_max + packs = div + (sizes < seg_min).long() + (rem >= seg_min).long() + width = packs.sum() + segments = torch.empty(width, dtype=sizes.dtype) + column = torch.empty_like(segments) + lockid = torch.zeros_like(segments) + maxid = torch.zeros_like(segments) + nlocks = 0 + current = 0 + col_idx = 0 + for i in range(len(sizes)): + d, r = div[i], rem[i] + isempty = sizes[i] < seg_min + last = current + d + (r >= seg_min) + isempty + # column id + column[current:last] = col_idx + # lock id + if d > 1 or (d == 1 and r >= seg_min): + nlocks += 1 + lockid[current:last] = nlocks + maxid[current:last] = last - current + # segment size + segments[current:current + d] = seg_max + if r < seg_min and not isempty: + segments[current + d - 1] += r + if r >= seg_min or isempty: + segments[current + d] = r + current = last + col_idx += 1 + offsets = torch.zeros_like(segments) + offsets[1:] = torch.cumsum(segments[:-1], dim=0) + return segments, column, lockid, maxid, offsets + + @staticmethod + def get_locks(size, dev): + if dev not in _sparse_matmul.locks or \ + size > _sparse_matmul.locks[dev].size(0): + _sparse_matmul.locks[dev] = torch.zeros(size, dtype=torch.int32, device=dev) + return _sparse_matmul.locks[dev] + + ########################## + # SPARSE = DENSE x DENSE # + ########################## + cpp_utils = None + sdd_segment = None + + @staticmethod + def _load_utils(): + if _sparse_matmul.cpp_utils is None: + _sparse_matmul.cpp_utils = SparseAttnBuilder().load() + _sparse_matmul.sdd_segment = _sparse_matmul.cpp_utils.sdd_segment + + @staticmethod + def make_sdd_lut(layout, block, dtype, device): + _sparse_matmul._load_utils() + start_width = 64 // block + segmented = _sparse_matmul.sdd_segment(layout.type(torch.int32), start_width) + luts, widths, packs = [], [], [] + for size, nnz in segmented: + width = nnz.shape[0] // (size * size) + h = nnz[:, 0] + i = nnz[:, 1] + j = nnz[:, 2] + b = nnz[:, 3] + lut = torch.stack((h, i, j, b), dim=1).view(-1).contiguous() + luts.append(lut.type(torch.int32).to(device)) + widths.append(width) + packs.append(size) + # create locks + return luts, None, widths, packs + + @staticmethod + def _sdd_matmul(a, + b, + trans_a, + trans_b, + trans_c, + spdims, + block, + luts, + num_locks, + widths, + packs, + bench, + time): + global triton + if triton is None: + triton = importlib.import_module('triton') + + if trans_c: + a, b = b, a + trans_a, trans_b = not trans_b, not trans_a + AS0 = a.size(0) + AS1 = a.size(1) + AS2 = a.size(3 if trans_a else 2) + AS3 = a.size(2 if trans_a else 3) + BS0 = b.size(0) + BS1 = b.size(1) + BS2 = b.size(3 if trans_b else 2) + BS3 = b.size(2 if trans_b else 3) + dtype = a.dtype + is_16_multiple = AS3 % 16 == 0 + is_32_multiple = AS3 % 32 == 0 + is_64_multiple = AS3 % 64 == 0 + if not is_16_multiple: + raise ValueError('Reduction size for SDD must be a multiple of 16') + # create kernel + total_width = sum([width * pack * pack for width, pack in zip(widths, packs)]) + c = torch.empty((AS0, total_width, block, block), dtype=dtype, device=a.device) + for lut, width, pack in zip(luts, widths, packs): + num_lock = 1 + key = (block, + a.dtype, + b.dtype, + trans_a, + trans_b, + trans_c, + pack, + is_32_multiple, + is_64_multiple) + if key not in _sparse_matmul.sdd_cache: + F32TK = [8, 16] + F16TK = [16] + F16TK += [32] if is_32_multiple else [] + F16TK += [64] if is_64_multiple else [] + TK = {torch.float32: F32TK, torch.float16: F16TK}[dtype] + defines = { + 'TM': block * pack, + 'TN': block * pack, + 'TMN': block * block * pack 
* pack, + 'BLOCK': block, + 'TK': TK, + 'TYPE': dtype, + 'STRIDE_AM': '1' if trans_a else 'lda', + 'STRIDE_AK': 'lda' if trans_a else '1', + 'STRIDE_BN': 'ldb' if trans_b else '1', + 'STRIDE_BK': '1' if trans_b else 'ldb', + 'STRIDE_CM': 'ldc', + 'STRIDE_CN': '1', + 'SDD': True, + 'TZ': 1, + 'NAME': 'sdd_kernel' + } + _sparse_matmul.sdd_cache[key] = triton.kernel(matmul, + defines=defines, + num_warps=[1, + 2, + 4]) + #_sparse_matmul.sdd_cache[key] = triton.kernel(src, defines=defines, num_warps=[1, 2, 4]) + + kernel = _sparse_matmul.sdd_cache[key] + # create output + locks = _sparse_matmul.get_locks(2 * width * AS0 * num_lock, a.device) + # maximum grid size is 65535 + # so operation might be decomposed into multiple + # kernel calls + max_width = 49152 + total = 0 if bench else None + for off_width in range(0, width, max_width): + current = kernel(a, + b, + c, + a.stride(2), + b.stride(2), + block, + a.stride(0), + b.stride(0), + c.stride(0), + a.stride(1), + b.stride(1), + c.stride(0), + AS2, + AS2, + AS3, + off_width, + lut, + locks, + num_lock, + grid=lambda opt: + [opt.d('TZ'), + min(max_width, + width - off_width), + AS0], + bench=bench) + total = total + current if bench else None + time[0] = total + # save for backward pass + return c + + ########################## + # DENSE = DENSE x SPARSE # + ########################## + + # Given a binary layout of 0s and 1s, + # Construct look-up table for efficient execution on GPUs + @staticmethod + def make_dxx_lut(layout, block, step, trans, device, transform=lambda idx: idx): + # load-balancing + _empty = torch.tensor([], dtype=torch.int64, device=layout.device) + segments = _empty.clone() + column = _empty.clone() + depth = _empty.clone() + lockid = _empty.clone() + maxid = _empty.clone() + offsets = _empty.clone() + current_offset = 0 + current_maxid = 0 + for z in range(layout.size(0)): + if trans: + sizes = torch.sum(layout[z, :, :], 1) + else: + sizes = torch.sum(layout[z, :, :], 0) + z_segments, z_column, z_lockid, z_maxid, z_offsets = _sparse_matmul.load_balance(sizes, block) + z_depth = z * torch.ones_like(z_segments) + z_lockid[z_lockid > 0] += current_maxid + current_maxid = z_lockid.max() + # concatenate depth + segments = torch.cat((segments, z_segments)) + column = torch.cat((column, z_column)) + depth = torch.cat((depth, z_depth)) + maxid = torch.cat((maxid, z_maxid)) + offsets = torch.cat((offsets, current_offset + z_offsets)) + lockid = torch.cat((lockid, z_lockid)) + current_offset += layout[z, :, :].sum() + segments *= step + # pointer increments + if trans: + nnz = layout.nonzero() + else: + nnz = layout.transpose(1, 2).nonzero() + num_blocks = nnz.size(0) + offsets = torch.min(offsets, (num_blocks - 1) * torch.ones_like(offsets)) + idx = transform(nnz[:, 2] * block) + xincs = idx.clone() + xincs[1:] -= idx[:-1] + # divide block into multiple steps + div = block // step + xincs = xincs.view(-1, 1).repeat(1, div) + xincs[:, 1:] = step + xincs[:, 0] -= (div - 1) * step + # first increment for each reduction is actually the offset + xincs[offsets[segments > 0], 0] = idx[offsets[segments > 0]] + xincs = xincs.view(-1) + # block-mode input increments + if trans: + widx = torch.arange(num_blocks) + else: + widx = _empty.clone() + current_offset = 0 + for z in range(layout.size(0)): + layoutw = layout[z, :, :].clone() + msum = layoutw.sum() + layoutw[layoutw > 0] = 1 + torch.arange(msum) + widx = torch.cat((widx, current_offset + layoutw.T[layoutw.T > 0] - 1)) + current_offset += msum + widx = widx + wincs = widx * block * 
block + wincs[1:] -= widx[:-1] * block * block + wincs = wincs.view(-1, 1).repeat(1, div) + if trans: + wincs[:, 1:] = step + wincs[:, 0] -= (div - 1) * step + else: + wincs[:, 1:] = step * block + wincs[:, 0] -= (div - 1) * step * block + wincs[offsets[segments > 0], 0] = widx[offsets[segments > 0]] + wincs = wincs.view(-1) + # adjust offset and segment size + offsets *= 2 * div + segments *= div + # create header + width = column.size(0) + offsets += 6 * width + header = torch.stack((offsets, + segments, + column, + depth, + lockid, + maxid), + dim=1).view(-1).contiguous() + incs = torch.stack((xincs, wincs), dim=1).view(-1).contiguous() + incs = torch.cat((incs, torch.zeros(2, device=incs.device, dtype=incs.dtype))) + # create lut + lut = torch.cat((header, incs)) + lut = lut.type(torch.int32).to(device) + # create locks + num_locks = max(1, lockid.max()) + return lut, num_locks, width, None + + @staticmethod + def _dds_matmul(a, + b, + trans_a, + trans_b, + trans_c, + spdims, + block, + lut, + num_locks, + width, + packs, + bench, + time): + global triton + if triton is None: + triton = importlib.import_module('triton') + + # shapes / dtypes + AS0 = a.size(0) + AS1 = a.size(1) + AS2 = a.size(3 if trans_a else 2) + AS3 = a.size(2 if trans_a else 3) + BS0 = spdims[0] + BS1 = block * spdims[2 if trans_b else 1] + BS2 = block * spdims[1 if trans_b else 2] + dtype = a.dtype + # kernel + key = (block, a.dtype, b.dtype, trans_a, trans_b, trans_c) + if key not in _sparse_matmul.dds_cache: + TM = [64, 128] if dtype == torch.float32 else [64, 128, 256] + TK = [8] if dtype == torch.float32 else [16] + defines = { + 'TM': TM, + 'TN': block, + 'TK': TK, + 'BLOCK': block, + 'TYPE': dtype, + 'STRIDE_AM': 1 if trans_a else 'lda', + 'STRIDE_AK': 'lda' if trans_a else 1, + 'STRIDE_BN': block if trans_b else 1, + 'STRIDE_BK': 1 if trans_b else block, + 'STRIDE_CM': '1' if trans_c else 'ldc', + 'STRIDE_CN': 'ldc' if trans_c else '1', + 'NAME': 'dds_kernel', + 'DDS': True + } + _sparse_matmul.dds_cache[key] = triton.kernel(matmul, + defines=defines, + num_warps=[4]) + #_sparse_matmul.dds_cache[key] = triton.kernel(src, defines=defines, num_warps=[4]) + kernel = _sparse_matmul.dds_cache[key] + # output + CS0 = AS0 + CS1 = AS1 + CS2 = BS2 if trans_c else AS2 + CS3 = AS2 if trans_c else BS2 + locks = _sparse_matmul.get_locks(2 * AS0 * AS2 // 32 * num_locks, a.device) + c = torch.empty((CS0, CS1, CS2, CS3), dtype=dtype, device=a.device) + time[0] = kernel(a, + b, + c, + a.stride(2), + block, + c.stride(2), + a.stride(0), + b.stride(0), + c.stride(0), + a.stride(1), + b.stride(1), + c.stride(1), + AS2, + BS2, + 0, + 0, + lut, + locks, + num_locks, + grid=lambda opt: [width, + triton.cdiv(AS2, + opt.d('TM')), + AS0], + bench=bench) + return c + + @staticmethod + def _dsd_matmul(a, + b, + trans_a, + trans_b, + trans_c, + spdims, + block, + lut, + num_locks, + width, + packs, + bench, + time): + global triton + if triton is None: + triton = importlib.import_module('triton') + + # shapes / dtypes + AS0 = spdims[0] + AS1 = block * spdims[2 if trans_a else 1] + AS2 = block * spdims[1 if trans_a else 2] + BS0 = b.size(0) + BS1 = b.size(1) + BS2 = b.size(3 if trans_b else 2) + BS3 = b.size(2 if trans_b else 3) + dtype = a.dtype + # kernel + key = (block, a.dtype, b.dtype, trans_a, trans_b, trans_c) + if key not in _sparse_matmul.dsd_cache: + TN = [64, 128] if dtype == torch.float32 else [64, 128, 256] + TK = [8] if dtype == torch.float32 else [16] + defines = { + 'TM': block, + 'TN': TN, + 'TK': TK, + 'BLOCK': block, + 
'TYPE': dtype, + 'STRIDE_AM': 1 if trans_a else block, + 'STRIDE_AK': block if trans_a else 1, + 'STRIDE_BN': 'ldb' if trans_b else '1', + 'STRIDE_BK': '1' if trans_b else 'ldb', + 'STRIDE_CM': '1' if trans_c else 'ldc', + 'STRIDE_CN': 'ldc' if trans_c else '1', + 'NAME': 'dsd_kernel', + 'DSD': True + } + _sparse_matmul.dsd_cache[key] = triton.kernel(matmul, + defines=defines, + num_warps=[4]) + #_sparse_matmul.dsd_cache[key] = triton.kernel(src, defines=defines, num_warps=[4]) + kernel = _sparse_matmul.dsd_cache[key] + # output + CS0 = BS0 + CS1 = BS1 + CS2 = BS3 if trans_c else AS1 + CS3 = AS1 if trans_c else BS3 + locks = _sparse_matmul.get_locks(2 * BS0 * BS3 // 32 * num_locks, a.device) + c = torch.empty((CS0, CS1, CS2, CS3), dtype=dtype, device=a.device) + time[0] = kernel(a, + b, + c, + block, + b.stride(2), + c.stride(2), + a.stride(0), + b.stride(0), + c.stride(0), + a.stride(1), + b.stride(1), + c.stride(1), + BS3, + AS1, + 0, + 0, + lut, + locks, + num_locks, + grid=lambda opt: [width, + triton.cdiv(BS3, + opt.d('TN')), + BS0], + bench=bench) + return c + + fn = { + 'sdd': _sdd_matmul.__get__(object), + 'dsd': _dsd_matmul.__get__(object), + 'dds': _dds_matmul.__get__(object) + } + + @staticmethod + def forward(ctx, + a, + b, + trans_a, + trans_b, + trans_c, + mode, + spdims, + block, + c_lut, + c_num_locks, + c_width, + c_packs, + c_bench, + c_time, + da_lut, + da_num_locks, + da_width, + da_packs, + da_bench, + da_time, + db_lut, + db_num_locks, + db_width, + db_packs, + db_bench, + db_time): + c = _sparse_matmul.fn[mode](a, + b, + trans_a, + trans_b, + trans_c, + spdims, + block, + c_lut, + c_num_locks, + c_width, + c_packs, + c_bench, + c_time) + # save for backward + ctx.save_for_backward(a, b) + ctx.da_num_locks = da_num_locks + ctx.da_lut = da_lut + ctx.da_width = da_width + ctx.da_packs = da_packs + ctx.da_bench = da_bench + ctx.da_time = da_time + ctx.db_lut = db_lut + ctx.db_num_locks = db_num_locks + ctx.db_width = db_width + ctx.db_bench = db_bench + ctx.db_packs = db_packs + ctx.db_time = db_time + ctx.mode = mode + ctx.spdims = spdims + ctx.block = block + ctx.trans_a = trans_a + ctx.trans_b = trans_b + return c + + @staticmethod + def backward(ctx, dc): + # saved for backward + a, b = ctx.saved_tensors + mode = ctx.mode + # gradients w.r.t. a + if ctx.needs_input_grad[0]: + mode_da = mode[1] + mode[0] + mode[2] + da = _sparse_matmul.fn[mode_da](dc, + b, + False, + not ctx.trans_b, + ctx.trans_a, + ctx.spdims, + ctx.block, + ctx.da_lut, + ctx.da_num_locks, + ctx.da_width, + ctx.da_packs, + ctx.da_bench, + ctx.da_time) + # gradients w.r.t. 
b + if ctx.needs_input_grad[1]: + mode_db = mode[2] + mode[1] + mode[0] + db = _sparse_matmul.fn[mode_db](a, + dc, + not ctx.trans_a, + False, + ctx.trans_b, + ctx.spdims, + ctx.block, + ctx.db_lut, + ctx.db_num_locks, + ctx.db_width, + ctx.db_packs, + ctx.db_bench, + ctx.db_time) + return da, db, None, None, None,\ + None, None, None, None,\ + None, None, None, None, None, None,\ + None, None, None, None, None, None,\ + None, None, None, None, None, None + + +class MatMul: + """Block-Sparse MatMul class; this class handles three types of matrix-multiplication: + - sparse = dense X dense + - dense = sparse X dense + - dense = dense X sparse + + For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 + """ + def make_lut(self, dtype, device): + """Generates the sparsity layout/s used in block-sparse matmul + """ + key = (dtype, device) + if key in self.lut_cache: + return self.lut_cache[key] + # C look-up table + layout, block = self.layout, self.block + step = 8 if dtype == torch.float32 else 16 + if self.mode == 'sdd': + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) + elif self.mode == 'dsd': + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_a, device) + elif self.mode == 'dds': + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_b, device) + # DA look-up table + if self.mode == 'sdd': + da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, True, device) + elif self.mode == 'dsd': + da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) + elif self.mode == 'dds': + da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_b, device) + # DB look-up table + if self.mode == 'sdd': + db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, False, device) + elif self.mode == 'dsd': + db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_a, device) + elif self.mode == 'dds': + db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) + self.lut_cache[key] = (c_lut, c_num_locks, c_width, c_packs,\ + da_lut, da_num_locks, da_width, da_packs,\ + db_lut, db_num_locks, db_width, db_packs) + return self.lut_cache[key] + + def __init__(self, layout, block, mode, trans_a=False, trans_b=False, bench=False): + """Initialize the Block-Sparse MatMul class. + + Arguments: + layout: required: sparsity layout tensor + block: required: an integer determining the block size. 
+ mode: required: a string determining type of matmul; ('sdd') sparse = dense X dense, ('dsd') dense = sparse X dense, ('dds') dense = dense X sparse + trans_a: optional: a boolean determining if multiplication needs to be applied on transpose of input a; default is false + trans_b: optional: a boolean determining if multiplication needs to be applied on transpose of input b; default is false + bench: optional: set if you want to do benchmarking + """ + + if mode not in ['sdd', 'dsd', 'dds']: + raise NotImplementedError('Supported modes are: sdd, dsd, dds') + # look-up table cache + self.lut_cache = dict() + # attributes + self.trans_a = trans_a + self.trans_b = trans_b + self.mode = mode + self.spdims = layout.shape + self.block = block + self.layout = layout + # timings + self.bench = bench + self.time_c = None + self.time_da = None + self.time_db = None + + # pad shapes of a tensor to make it + # compatible with kernel calls + @staticmethod + def _pad_shape(x, is_sparse): + max_dim = 3 if is_sparse else 4 + for i in range(max_dim - x.dim()): + x = x.unsqueeze(0) + return x + + def __call__(self, a, b): + """Applies Block-Sparse MatMul. + + For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 + + Arguments: + a: required: a dense/block-sparse tensor; first input of mat-mul + b: required: a dense/block-sparse tensor; second input of mat-mul + + Return: + c: a dense/block-sparse tensor result of a X b + """ + + + c_lut, c_num_locks, c_width, c_packs,\ + da_lut, da_num_locks, da_width, da_packs,\ + db_lut, db_num_locks, db_width, db_packs = self.make_lut(a.dtype, a.device) + # timings + time_c = [None] + time_da = [None] + time_db = [None] + # pad shapes with ones + a = MatMul._pad_shape(a, self.mode == 'dsd') + b = MatMul._pad_shape(b, self.mode == 'dds') + # execute + c = _sparse_matmul.apply(a, + b, + self.trans_a, + self.trans_b, + False, + self.mode, + self.spdims, + self.block, + c_lut, + c_num_locks, + c_width, + c_packs, + self.bench, + time_c, + da_lut, + da_num_locks, + da_width, + da_packs, + self.bench, + time_da, + db_lut, + db_num_locks, + db_width, + db_packs, + self.bench, + time_db) + self.time_c = time_c[0] + self.time_da = time_da[0] + self.time_db = time_db[0] + return c diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py new file mode 100644 index 000000000000..a0805ada4bc0 --- /dev/null +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -0,0 +1,304 @@ +# DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +# https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py + +import warnings +import importlib +import torch +import math +from .trsrc import softmax_fwd, softmax_bwd + +fwd_kernels = dict() +bwd_kernels = dict() + +# Delay importing triton unless we need it +triton = None + + +class _sparse_softmax(torch.autograd.Function): + + bwd_kernels = dict() + + @staticmethod + def make_lut(layout, block, device): + _empty = torch.tensor([], dtype=torch.int64, device=layout.device) + sizes = _empty.clone() + # sizes along rows + for h in range(layout.shape[0]): + sizes = torch.cat((sizes, layout[h, :, :].sum(-1))) + # offsets in block format + offsets = torch.zeros_like(sizes) + offsets[1:] = torch.cumsum(sizes[:-1], dim=0) + # block indices + idx = torch.arange(layout.sum()) + head = layout.nonzero()[:, 0] + rows = layout.nonzero()[:, 1] + columns = layout.nonzero()[:, 2] + 
core = torch.stack((idx, columns, rows, head), dim=1).view(-1) + # construct look-up table + offsets = offsets * 4 + 2 * sizes.numel() + header = torch.stack((sizes, offsets), dim=1).view(-1) + lut = torch.cat((header, core)).type(torch.int32).to(device) + return lut, int(sizes.max()) + + @staticmethod + def make_kernel(cache, + src, + max_k, + dtype, + block, + apply_scale, + apply_rpe, + apply_kp_mask, + apply_attn_mask, + kp_mask_mode, + attn_mask_mode): + global triton + if triton is None: + triton = importlib.import_module('triton') + + if max_k >= 32768: + raise NotImplementedError('Reductions larger than 32768 elements '\ + 'are not yet implemented') + num_warps = 4 if max_k < 512 else (8 if max_k < 2048 else 16) + pad = num_warps * 32 * 2 + TN = (int(max_k) + pad - 1) // pad * pad + # just-in-time compile kernel + key = (block, + dtype, + num_warps, + TN, + apply_scale, + apply_rpe, + apply_kp_mask, + apply_attn_mask, + kp_mask_mode, + attn_mask_mode) + if key not in cache: + defines = { + 'TM': [1], + 'TN': [TN], + 'TYPE': dtype, + 'BLOCK': block, + 'INFINITY': { + torch.float32: 'F32_INFINITY', + torch.float16: 'F16_INFINITY' + }[dtype] + } + if apply_scale: + defines['APPLY_SCALE'] = True + if apply_rpe: + defines['APPLY_RPE'] = True + if apply_kp_mask: + defines['APPLY_KP_MASK'] = True + if kp_mask_mode == 'mul': + defines['KP_MASK_MUL'] = True + if apply_attn_mask: + defines['APPLY_ATTN_MASK'] = True + if attn_mask_mode == 'mul': + defines['ATTN_MASK_MUL'] = True + kernel = triton.kernel(src, defines=defines, num_warps=[num_warps]) + cache[key] = kernel + return cache[key] + + @staticmethod + def forward(ctx, + x, + scale, + rpe, + key_padding_mask, + attn_mask, + kp_mask_mode, + attn_mask_mode, + spdims, + block, + lut, + num_blocks, + maxlut, + bench, + time): + global triton + if triton is None: + triton = importlib.import_module('triton') + + apply_scale = False if scale == 1.0 else True + + # handle None rpe + if rpe is None: + apply_rpe = False + stride_zrpe, stride_hrpe, stride_srpe = 0, 0, 0 + rpe = torch.empty(0, dtype=x.dtype, device=x.device) + else: + apply_rpe = True + stride_zrpe, stride_hrpe, stride_srpe = rpe.stride(0), rpe.stride(1), rpe.stride(2) + + # handle None key_padding_mask + if key_padding_mask is None: + apply_kp_mask = False + stride_zkpm = 0 + key_padding_mask = torch.empty(0, dtype=x.dtype, device=x.device) + else: + apply_kp_mask = True + stride_zkpm = key_padding_mask.stride(0) + + # handle None attention_mask + if attn_mask is None: + apply_attn_mask = False + stride_zattnm = 0 + attn_mask = torch.empty(0, dtype=x.dtype, device=x.device) + else: + apply_attn_mask = True + stride_zattnm = attn_mask.stride(0) + + # run kernel + kernel = _sparse_softmax.make_kernel(fwd_kernels, + softmax_fwd, + maxlut * block, + x.dtype, + block, + apply_scale, + apply_rpe, + apply_kp_mask, + apply_attn_mask, + kp_mask_mode, + attn_mask_mode) + M = x.shape[0] + grid = lambda opt: [triton.cdiv(spdims[0] * spdims[1] * block, opt.d('TM')), M] + + # run kernel + time[0] = kernel(x, scale, lut, rpe, key_padding_mask, attn_mask,\ + num_blocks, maxlut,\ + x.stride(0),\ + stride_zrpe, stride_hrpe, stride_srpe,\ + stride_zkpm, stride_zattnm,\ + grid=grid, bench=bench) + # save to context + ctx.mark_dirty(x) + ctx.save_for_backward(x, lut) + ctx.spdims = spdims + ctx.block = block + ctx.maxlut = maxlut + ctx.scale = scale + ctx.apply_scale = apply_scale + ctx.apply_rpe = apply_rpe + ctx.apply_kp_mask = apply_kp_mask + ctx.apply_attn_mask = apply_attn_mask + ctx.kp_mask_mode 
= kp_mask_mode + ctx.attn_mask_mode = attn_mask_mode + return x + + @staticmethod + def backward(ctx, dx): + global triton + if triton is None: + triton = importlib.import_module('triton') + + # retrieve from context + x, lut = ctx.saved_tensors + # run kernel + kernel = _sparse_softmax.make_kernel(bwd_kernels, + softmax_bwd, + ctx.maxlut * ctx.block, + x.dtype, + ctx.block, + ctx.apply_scale, + ctx.apply_rpe, + ctx.apply_kp_mask, + ctx.apply_attn_mask, + ctx.kp_mask_mode, + ctx.attn_mask_mode) + M = x.shape[0] + grid = lambda opt: [ + triton.cdiv(ctx.spdims[0] * ctx.spdims[1] * ctx.block, + opt.d('TM')), + M + ] + kernel(x, ctx.scale, dx, lut, ctx.maxlut, x.stride(0), dx.stride(0), grid=grid) + return dx, None, None, None, None, None, None, None, None, None, None, None, None, None, None + + +class Softmax: + """Block-Sparse Softmax class; this class computes softmax on a block sparse matrix. It is also able to apply either/all of the following masks: + - relative position embedding + - key padding mask + - attention mask + + For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 + """ + def sparse_softmax(*args, **kwargs): + return _sparse_softmax.apply(*args, **kwargs) + + def make_lut(self, device): + """Generates the sparsity layout used in block-sparse softmax + """ + key = (device, ) + if key not in self.lut_cache: + self.lut_cache[key] = _sparse_softmax.make_lut(self.layout, + self.block, + device) + return self.lut_cache[key] + + def __init__(self, layout, block, bench=False): + """Initialize the Block-Sparse Softmax class. + + Arguments: + layout: required: sparsity layout tensor + block: required: an integer determining the block size. + bench: optional: set if you want to do benchmarking + """ + + self.num_blocks = layout.sum().item() + self.spdims = layout.shape + self.layout = layout + self.block = block + self.bench = bench + self.lut_cache = dict() + + def __call__(self, + x, + scale=1., + rpe=None, + key_padding_mask=None, + attn_mask=None, + key_padding_mask_mode='add', + attn_mask_mode='add'): + """Applies softmax on a Block-Sparse input tensor. + + For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 + + Arguments: + x: required: a block-sparse tensor that softmax is applied on it; computation will be in place and result will be returned in the same tensor + scale: optional: a float value; x values will be multiplied by this value before normalization. Default value is 1.0. 
+ rpe: optional: a tensor same dimension as x that is used as relative position embedding + key_padding_mask: optional: a mask tensor of size (BatchSize X SequenceLength) + attn_mask: optional: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported + key_padding_mask_mode: optional: a boolean determining if key_padding_mask needs to be added or multiplied + attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied + + Return: + x: a block-sparse tensor contains normalized input x using softmax; and masks applied if given + """ + + time_y = [None] + if rpe is not None and rpe.dtype != x.dtype: + raise ValueError('relative position embedding must be %s' % x.dtype) + if attn_mask is not None and attn_mask.dtype != x.dtype: + raise ValueError('Attention mask must be %s' % x.dtype) + if key_padding_mask is not None and key_padding_mask.dtype != x.dtype: + raise ValueError('Key padding mask must be %s' % x.dtype) + lut, maxlut = self.make_lut(x.device) + x = Softmax.sparse_softmax(x, + scale, + rpe, + key_padding_mask, + attn_mask, + key_padding_mask_mode, + attn_mask_mode, + self.spdims, + self.block, + lut, + self.num_blocks, + maxlut, + self.bench, + time_y) + self.time_y = time_y[0] + return x diff --git a/deepspeed/ops/sparse_attention/sparse_attention_utils.py b/deepspeed/ops/sparse_attention/sparse_attention_utils.py new file mode 100644 index 000000000000..32999f62323f --- /dev/null +++ b/deepspeed/ops/sparse_attention/sparse_attention_utils.py @@ -0,0 +1,225 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +from torch import nn +from torch.nn import functional as F +from deepspeed.ops.sparse_attention import BertSparseSelfAttention, SparsityConfig +''' +This file contains few utility functions to handle adapting pretrained model with sparse self-attention module. +''' + + +class SparseAttentionUtils: + """This class provides some utility functions that are use integrating sparse attention into transformer models. + Such utilities include extending position embeddings, replacing current self-attention layer with sparse attention, padding sequences to multiple of block size, etc. + + """ + @staticmethod + def extend_position_embedding(model, max_position): + """This function extends the position embedding weights of a model loaded from a checkpoint. + It assumes the new max position is bigger than the original max length. 
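        Example (editor's illustrative sketch, not part of the original patch; assumes
        ``model`` is a loaded BERT- or RoBERTa-style checkpoint whose original maximum
        position is smaller than 1024):

            model = SparseAttentionUtils.extend_position_embedding(model, max_position=1024)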
+ + Arguments: + model: required: a transformer model + max_position: required: an integer determining new position embedding size + Return: + model: updated model; in which position embedding weights have been extended based on new size + """ + + if hasattr(model, 'bert'): + original_max_position = model.bert.embeddings.position_embeddings.weight.size( + 0) + assert max_position > original_max_position + extend_multiples = max(1, max_position // original_max_position) + model.bert.embeddings.position_embeddings.weight.data = model.bert.embeddings.position_embeddings.weight.repeat( + extend_multiples, + 1) + elif hasattr(model, 'roberta'): + # RoBERTa has positions 0 & 1 reserved, so embedding size is max position + 2 + original_max_position, embed_size = model.roberta.embeddings.position_embeddings.weight.shape + original_max_position -= 2 + extend_multiples = max(1, max_position // original_max_position) + assert max_position > original_max_position + max_position += 2 + extended_position_embedding = model.roberta.embeddings.position_embeddings.weight.new_empty( + max_position, + embed_size) + k = 2 + for i in range(extend_multiples): + extended_position_embedding[k:( + k + original_max_position + )] = model.roberta.embeddings.position_embeddings.weight[2:] + k += original_max_position + model.roberta.embeddings.position_embeddings.weight.data = extended_position_embedding + else: + raise ValueError( + 'Please extend \"extend_position_embedding\" function to support your model type. It currently only supports \"bert\" & \"roberta\"!' + ) + + model.config.max_position_embeddings = max_position + print( + f'Extended position embeddings to {original_max_position * extend_multiples}' + ) + + return model + + @staticmethod + def update_tokenizer_model_max_length(tokenizer, max_position): + """This function updates the position embedding length of a tokenizer to a new max position. + + Arguments: + tokenizer: required: a transformer tokenizer + max_position: required: an integer determining new position embedding size + Return: + tokenizer: updated tokenizer; in which model maximum length has been extended based on new size + """ + + tokenizer.model_max_length = max_position + tokenizer.init_kwargs['model_max_length'] = max_position + print(f'updated tokenizer model max imum length to {max_position}') + + return tokenizer + + @staticmethod + def replace_model_self_attention_with_sparse_self_attention( + model, + max_position, + # SparsityConfig parameters needs to be set accordingly + sparsity_config=SparsityConfig(num_heads=4)): + """This function replaces the self attention layers in model encoder with sparse self attention. + It currently supports bert and roberta model and can be easily extended to any other models following similar steps here. + For sparsityConfig, refer to the config class. + + Arguments: + model: required: a transformer model + max_position: required: an integer determining new position embedding size + sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class + + Return: + model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer. 
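        Example (editor's illustrative sketch, not part of the original patch; assumes a
        HuggingFace-style BERT model with 12 attention heads):

            model = SparseAttentionUtils.replace_model_self_attention_with_sparse_self_attention(
                model,
                max_position=1024,
                sparsity_config=FixedSparsityConfig(num_heads=12, block=16))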
+ """ + + if hasattr(model, 'bert'): + model.config.max_position_embeddings = max_position + replace_self_attention_layer_with_sparse_self_attention_layer( + model.config, + model.bert.encoder.layer, + sparsity_config) + elif hasattr(model, 'roberta'): + model.config.max_position_embeddings = max_position + 2 + replace_self_attention_layer_with_sparse_self_attention_layer( + model.config, + model.roberta.encoder.layer, + sparsity_config) + else: + raise ValueError( + 'Please extend \"update_model_self_attention_to_sparse_self_attention\" function to support \ + your model type. It currently only supports \"bert\" & \"roberta\"!' + ) + return model + + @staticmethod + def replace_self_attention_layer_with_sparse_self_attention_layer( + config, + layers, + # SparsityConfig parameters needs to be set accordingly + sparsity_config=SparsityConfig(num_heads=4)): + """This function replaces the self attention layers in attention layer with sparse self attention. + For sparsityConfig, refer to the config class. + + Arguments: + config: required: transformer model config + layers: required: transformer model attention layers + sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class + + Return: + layers: updated attention layers; in which self attention layers have been repleaced with DeepSpeed Sparse Self Attention layer. + """ + + for layer in layers: + deepspeed_sparse_self_attn = BertSparseSelfAttention(config, sparsity_config) + deepspeed_sparse_self_attn.query = layer.attention.self.query + deepspeed_sparse_self_attn.key = layer.attention.self.key + deepspeed_sparse_self_attn.value = layer.attention.self.value + + layer.attention.self = deepspeed_sparse_self_attn + + return layers + + @staticmethod + def pad_to_block_size(block_size, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + pad_token_id, + model_mbeddings): + """This function pads input tokens and attention mask on sequence length dimension to be multiple of block size. + This is a requirement for Sparse Transformer in which the self attention layer works on sequences of length multiple of block size. + It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs. + Note) + 1- instead of passing your embedding layer to this function, you can simply add this function to your model. It can be more simplified if given attention_mask and/or token_type_ids are none. + 2- you need to call unpdad function before returning your model output to unpad the encoder sequence output. + + Arguments: + block_size: required: an integer determining the block size of sparsity config. + pad_token_id: required: an integer determining the pad token from the model config; such as bert.config.pad_token_id. + input_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary + attention_mask: a torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. + token_type_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). 
+ position_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the indices of positions of each input sequence tokens in the position embeddings. + inputs_embeds: an optional torch.FloatTensor of shape [batch_size, sequence_length, hidden_size] that contains embedded representation and can be passed instead of input_ids directly. + model_embeddings: an optional object. If inputs_embeds are not none, this will be your model embeddings such as BertEmbeddings from your model such as BertModel. You can move this function inside your model and use self.embeddings instead of passing this parameter. + + Return: + pad_len: an integer determining how much inputs have been padded to transfer sequence length dimension to multiple of block size. + input_ids: if input_ids are not none padded input_ids otherwise none. + attention_mask: if attention_mask is not none padded attention_mask otherwise none. + token_type_ids: if token_type_ids are not none padded token_type_ids otherwise none. + position_ids: if position_ids are not none padded position_ids otherwise none. + inputs_embeds: if inputs_embeds are not none padded inputs_embeds otherwise none. + """ + + batch_size, seq_len = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1] + + pad_len = (block_size - seq_len % block_size) % block_size + if pad_len > 0: + if inputs_embeds is not None: + pad_input_ids = inputs_embeds.new_full((batch_size, + pad_len), + pad_token_id, + dtype=torch.long) + pad_inputs_embeds = model_embeddings(pad_input_ids) + inputs_embeds = torch.cat([inputs_embeds, pad_inputs_embeds], dim=-2) + # may not be needed as input_ids are not used if inputs_embeds are given + if input_ids is not None: + input_ids = F.pad(input_ids, (0, pad_len), value=pad_token_id) + if position_ids is not None: + # pad position_id with pad_token_id + position_ids = F.pad(position_ids, (0, pad_len), value=pad_token_id) + # pad attention mask without attention on the padding tokens + attention_mask = F.pad(attention_mask, (0, pad_len), value=False) + # pad token_type_ids with token_type_id = 0 + token_type_ids = F.pad(token_type_ids, (0, pad_len), value=0) + + return pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds + + @staticmethod + def unpad_sequence_output(pad_len, sequence_output): + """This function unpads sequence output if inputs of the model were padded. + This is a requirement for Sparse Transformer in which the self attention layer works on sequences of length multiple of block size. + It needs to be called in your model, such as BertModel, right before you return the model outputs. + + Arguments: + pad_len: required: an integer determining how much model inputs have been padded to transfer sequence length dimension to multiple of block size. + sequence_output: required: sequence output of the encoder layer. + + Return: + sequence_output: unpaded sequence output of the encoder layer. 
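        Example (editor's illustrative sketch, not part of the original patch; assumes a
        BERT-style forward pass using token-id inputs with block size 16, so
        inputs_embeds and the embedding module are passed as None):

            pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = \
                SparseAttentionUtils.pad_to_block_size(16, input_ids, attention_mask,
                                                       token_type_ids, position_ids,
                                                       None, pad_token_id, None)
            # ... run the sparse-attention encoder on the padded inputs ...
            sequence_output = SparseAttentionUtils.unpad_sequence_output(pad_len, sequence_output)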
+ """ + + if (pad_len > 0): + sequence_output = sequence_output[:, :-pad_len] + return sequence_output diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py new file mode 100644 index 000000000000..6e7d8905e0a8 --- /dev/null +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -0,0 +1,164 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +import torch.nn as nn +from torch.nn.functional import * +import torch +from torch import distributed as dist +from collections import namedtuple +from deepspeed.ops.sparse_attention import MatMul, Softmax, SparsityConfig +import sys + + +class SparseSelfAttention(nn.Module): + """Implements an efficient Sparse Self Attention of Transformer layer based on `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 + + For more information please see, TODO DeepSpeed Sparse Transformer. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. + """ + def __init__( + self, + # SparsityConfig parameters needs to be set accordingly + sparsity_config=SparsityConfig(num_heads=4), + key_padding_mask_mode='add', + attn_mask_mode='mul', + max_seq_length=2048): + """Initialize the sparse self attention layer. + Arguments: + sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class. + key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`. + attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`. + max_seq_length: optional: the maximum sequence length this sparse attention module will be applied to; it controls the size of the master_layout. + """ + super().__init__() + + # sparsity information + self.sparsity_config = sparsity_config + + # initialize sparse layout and register as buffer + master_layout = self.sparsity_config.make_layout(max_seq_length) + self.register_buffer("master_layout", master_layout) + self._need_layout_synchronization = True + + # mask modes + self.key_padding_mask_mode = key_padding_mask_mode + self.attn_mask_mode = attn_mask_mode + + ops = dict() + + def get_layout(self, L): + # if layout is never synchronized across GPUs, broadcast the layout from global rank 0 + if self._need_layout_synchronization and dist.is_initialized(): + dist.broadcast(self.master_layout, src=0) + self._need_layout_synchronization = False + + if (L % self.sparsity_config.block != 0): + raise ValueError( + f'Sequence Length, {L}, needs to be dividable by Block size {self.sparsity_config.block}!' 
+ ) + + num_blocks = L // self.sparsity_config.block + return self.master_layout[..., :num_blocks, :num_blocks].cpu() # layout needs to be a CPU tensor + + # add to cache + def get_ops(self, H, L): + import sys + if L not in SparseSelfAttention.ops: + sparsity_layout = self.get_layout(L) + sparse_dot_sdd_nt = MatMul(sparsity_layout, + self.sparsity_config.block, + 'sdd', + trans_a=False, + trans_b=True) + + sparse_dot_dsd_nn = MatMul(sparsity_layout, + self.sparsity_config.block, + 'dsd', + trans_a=False, + trans_b=False) + + sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block) + + SparseSelfAttention.ops[L] = (sparse_dot_sdd_nt, + sparse_dot_dsd_nn, + sparse_softmax) + return SparseSelfAttention.ops[L] + + def transpose_key_for_scores(self, x, L): + bsz, num_heads, seq_len, head_dim = x.size() + if seq_len != L: + return x.permute(0, 1, 3, 2) + return x + + def transpose_mask_for_sparse(self, qtype, x, is_key_padding_mask=False): + x = x.type(qtype) + if is_key_padding_mask: + xdim = x.dim() + for d in range(xdim - 1, 0, -1): + x = x.squeeze(dim=d) + return x + return x.squeeze() + + # forward pass + def forward(self, + query, + key, + value, + rpe=None, + key_padding_mask=None, + attn_mask=None): + """Applies forward phase of sparse self attention + + Arguments: + query: required: query tensor + key: required: key tensor + value: required: value tensor + rpe: optional: a tensor same dimension as x that is used as relative position embedding + key_padding_mask: optional: a mask tensor of size (BatchSize X SequenceLength) + attn_mask: optional: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported + key_padding_mask_mode: optional: a boolean determining if key_padding_mask needs to be added or multiplied + attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied + + Return: + attn_output: a dense tensor containing attnetion context + """ + bsz, num_heads, tgt_len, head_dim = query.size() + + # transpose back key if it is already transposed + key = self.transpose_key_for_scores(key, tgt_len) + + # check that operation is supported + if query.shape != key.shape or key.shape != value.shape: + raise NotImplementedError('only self-attention is supported for now') + + # squeeze key_padding_mask if it is given + if key_padding_mask is not None: + key_padding_mask = self.transpose_mask_for_sparse(query.dtype, + key_padding_mask, + is_key_padding_mask=True) + + # squeeze attn_mask if it is given + if attn_mask is not None: + attn_mask = self.transpose_mask_for_sparse(query.dtype, attn_mask) + + # cache look-up table computations etc + sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops(num_heads, tgt_len) + + scaling = float(head_dim)**-0.5 + + # attention scores + attn_output_weights = sparse_dot_sdd_nt(query, key) + attn_output_weights = sparse_softmax( + attn_output_weights, + scale=scaling, + rpe=rpe, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + key_padding_mask_mode=self.key_padding_mask_mode, + attn_mask_mode=self.attn_mask_mode) + + # outputs + attn_output = sparse_dot_dsd_nn(attn_output_weights, value) + return attn_output diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py new file mode 100644 index 000000000000..5d8e2fa5fa46 --- /dev/null +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -0,0 +1,663 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" + +import torch +import random + + +class 
SparsityConfig: + """Abstract Configuration class to store `sparsity configuration of a self attention layer`. + It contains shared property of different block-sparse sparsity patterns. However, each class needs to extend it based on required property and functionality. + """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): + """Initialize the Sparsity Pattern Config. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. + different_layout_per_head: optional: a boolean determining if each head should be assigned a different sparsity layout; default is false and this will be satisfied based on availability. + """ + + self.num_heads = num_heads + self.block = block + self.different_layout_per_head = different_layout_per_head + self.num_layout_heads = num_heads if different_layout_per_head else 1 + + def setup_layout(self, seq_len): + """Create layout tensor for the given sequence length + + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) for sparsity layout of all head; initialized with zero + """ + + if (seq_len % self.block != 0): + raise ValueError( + f'Sequence Length, {seq_len}, needs to be dividable by Block size {self.block}!' + ) + num_blocks = seq_len // self.block + # TODO Currently we allocate layout per head; needs to be updated if heads share a single layout. + layout = torch.zeros((self.num_heads, num_blocks, num_blocks), dtype=torch.int64) + return layout + + def check_and_propagate_first_head_layout(self, layout): + """If all heads require same sparsity layout, it propagate first head layout to all heads + + Arguments: + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head + """ + + if not self.different_layout_per_head: + layout[1:self.num_heads, :, :] = layout[0, :, :] + return layout + + +class DenseSparsityConfig(SparsityConfig): + """Configuration class to store `Dense` configuration. + In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. + """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): + """Initialize the Dense Sparsity Pattern Config. + In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + seq_len: required: an integer determining number of attention heads of the layer. + different_layout_per_head: optional: this is just for the sake of consistency with other sparsity formats; can ignore it for DenseSparsityConfig + """ + + super().__init__(num_heads, block, different_layout_per_head) + + def make_layout(self, seq_len): + """Set 1 to all blocks of the layout meanins the pattern is dense; not sparse. 
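        Example (editor's note, not part of the original patch): with
        ``DenseSparsityConfig(num_heads=4, block=16)``, calling ``make_layout(64)`` returns
        a layout of shape (4, 64 // 16, 64 // 16) = (4, 4, 4) filled with ones, i.e. every
        16 x 16 block of the attention matrix is kept for every head.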
+ + Arguments: + seq_len: required: an integer determining the underling sequence length; must be <= max sequence length + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; for dense everything is 1 + """ + + layout = self.setup_layout(seq_len) + layout[:, :, :] = 1 + return layout + + +class FixedSparsityConfig(SparsityConfig): + """Configuration class to store `Fixed` sparsity configuration. + For more details about this sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. + This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. + """ + def __init__(self, + num_heads, + block=16, + different_layout_per_head=False, + num_local_blocks=4, + num_global_blocks=1, + attention='bidirectional', + horizontal_global_attention=False, + num_different_global_patterns=1): + """Initialize `Fixed` Sparsity Pattern Config. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. + different_layout_per_head: optional: a boolean determining if each head should be assigned a different sparsity layout; default is false and this will be satisfied based on availability. + num_local_blocks: optional: an integer determining the number of blocks in local attention window. + num_global_blocks: optional: an integer determining how many consecutive blocks in a local window is used as the representative of the window for global attention. + attention: optional: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure. + horizontal_global_attention: optional: a boolean determining if blocks that are global representative of a local window, also attend to all other blocks. This is valid only if attention type is `bidirectional`. Looking at the attention matrix, that means global attention not only includes the vertical blocks, but also horizontal blocks. + num_different_global_patterns: optional: an integer determining number of different global attentions layouts. While global attention can be fixed by which block/s are representative of any local window, since there are multi-heads, each head can use a different global representative. For example, with 4 blocks local window and global attention size of 1 block, we can have 4 different versions in which the first, Second, third, or forth block of each local window can be global representative of that window. This parameter determines how many of such patterns we want. Of course, there is a limitation based on num_local_blocks and num_global_blocks. 
+ """ + + super().__init__(num_heads, block, different_layout_per_head) + + self.num_local_blocks = num_local_blocks + + if (num_local_blocks % num_global_blocks != 0): + raise ValueError( + f'Number of blocks in a local window, {num_local_blocks}, must be dividable by number of global blocks, {num_global_blocks}!' + ) + self.num_global_blocks = num_global_blocks + + if (attention != 'unidirectional' and attention != 'bidirectional'): + raise NotImplementedError( + 'only \"uni/bi-directional\" attentions are supported for now!') + self.attention = attention + + if (attention != 'bidirectional' and horizontal_global_attention): + raise ValueError( + 'only \"bi-directional\" attentions can support horizontal global attention!' + ) + self.horizontal_global_attention = horizontal_global_attention + + if (num_different_global_patterns > 1 and not different_layout_per_head): + raise ValueError( + f'Number of different layouts cannot be more than one when you have set a single layout for all heads! Set different_layout_per_head to True.' + ) + if (num_different_global_patterns > (num_local_blocks // num_global_blocks)): + raise ValueError( + f'Number of layout versions (num_different_global_patterns), {num_different_global_patterns}, cannot be larger than number of local window blocks divided by number of global blocks, {num_local_blocks} / {num_global_blocks} = {num_local_blocks//num_global_blocks}!' + ) + self.num_different_global_patterns = num_different_global_patterns + + def set_local_layout(self, h, layout): + """Sets local attantion layout used by the given head in the sparse attention. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set + """ + + num_blocks = layout.shape[1] + for i in range(0, num_blocks, self.num_local_blocks): + end = min(i + self.num_local_blocks, num_blocks) + for row in range(i, end): + for col in range( + i, + (row + 1 if self.attention == 'unidirectional' else end)): + layout[h, row, col] = 1 + return layout + + def set_global_layout(self, h, layout): + """Sets global attantion layout used by the given head in the sparse attention. + + Currently we set global blocks starting from the last block of a local window to the first one. That means if a local window consists of 4 blocks and global attention size is one block, we use block #4 in each local window as global. If we have different layout per head, then other heads will get #3, #2, and #1. And if we have more heads (and different layout has set) than num of global attentions, multiple head may have same global attentions. + Note) if horizontal_global_attention is set, global blocks will be set both horizontally and vertically. 
+ + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set + """ + + num_blocks = layout.shape[1] + first_global_block_idx = self.num_local_blocks - ( + 1 + h % self.num_different_global_patterns) * self.num_global_blocks + + # set all global blocks except the last one if (in last local window) + end = num_blocks - (num_blocks % self.num_local_blocks) + for i in range(first_global_block_idx, end, self.num_local_blocks): + + # vertical global attention + first_row = 0 if self.attention == 'bidirectional' else i + #(((i // self.num_local_blocks) + 1) * self.num_local_blocks) + #if (first_row < num_blocks): + layout[h, first_row:, i:i + self.num_global_blocks] = 1 + + # horizontal global attention; only in bidirectional attention + if (self.horizontal_global_attention): + layout[h, i:i + self.num_global_blocks, :] = 1 + + # set last global blocks; handle possible short last local window + if (end < num_blocks): + start = min(end + first_global_block_idx, + num_blocks - self.num_global_blocks) + end = start + self.num_global_blocks + + # vertical global attention + first_row = 0 if self.attention == 'bidirectional' else start + #(((start // self.num_local_blocks) + 1) * self.num_local_blocks) + #if (first_row < num_blocks): + layout[h, first_row:, start:end] = 1 + + # horizontal global attention + if (self.horizontal_global_attention): + layout[h, start:end, :] = 1 + return layout + + def make_layout(self, seq_len): + """Generates `Fixed` sparsity layout used by each head in the sparse attention. + + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `Fixed` sparsity layout of all head + """ + + layout = self.setup_layout(seq_len) + for h in range(0, self.num_layout_heads): + layout = self.set_local_layout(h, layout) + layout = self.set_global_layout(h, layout) + + layout = self.check_and_propagate_first_head_layout(layout) + return layout + + +class VariableSparsityConfig(SparsityConfig): + """Configuration class to store `Variable` sparsity configuration. + This layout is an extension of FixedSparsityConfig in which: + - user can set random layout; default value is zero means no random block + - user can provide a list of local block sizes + - user can provide a list of global block indices. + + For more details about `Fixed` sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. + This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. + """ + def __init__(self, + num_heads, + block=16, + different_layout_per_head=False, + num_random_blocks=0, + local_window_blocks=[4], + global_block_indices=[0], + global_block_end_indices=None, + attention='bidirectional', + horizontal_global_attention=False): + """Initialize `Variable` Sparsity Pattern Config. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. 
The current implementation of sparse self-attention is based on blocked sparse matrices, in which this parameter defines the size of such blocks, `Block X Block`.
+             different_layout_per_head: optional: a boolean determining if each head should be assigned a different sparsity layout; default is false. Currently this sparsity config can only assign a single layout to all heads; it needs to be extended to support a different layout per head.
+             num_random_blocks: optional: an integer determining the number of random blocks in each block row.
+             local_window_blocks: optional: a list of integers determining the number of blocks in each local attention window. The first number determines the number of blocks in the first local window, the second number the second window, ..., and the last number determines the number of blocks in all remaining local windows.
+             global_block_indices: optional: a list of integers determining which blocks are considered as global attention. The given indices determine the blocks that all other token blocks attend to and that attend to all other token blocks. Default value is only index 0. Note that if the global_block_end_indices parameter is set, this parameter is used as the starting index of each global window.
+             global_block_end_indices: optional: a list of integers determining the end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as the global_block_indices parameter, and combining these two parameters, for each index i, blocks from global_block_indices[i] to global_block_end_indices[i] (exclusive) are considered as global attention.
+             attention: optional: a string determining the attention type. Attention can be `unidirectional`, as in autoregressive models, in which tokens attend only to tokens that appear before them in the context; in that case, the upper triangular part of the attention matrix is empty. Or it can be `bidirectional`, as in BERT, in which tokens can attend to any other tokens before or after them; then, the upper triangular part of the attention matrix is the mirror of the lower triangular part.
+             horizontal_global_attention: optional: a boolean determining if blocks that are the global representative of a local window also attend to all other blocks. This is valid only if the attention type is `bidirectional`. Looking at the attention matrix, that means global attention includes not only the vertical blocks but also the horizontal blocks.
+        """
+
+        super().__init__(num_heads, block, different_layout_per_head)
+
+        self.num_random_blocks = num_random_blocks
+        self.local_window_blocks = local_window_blocks
+        self.global_block_indices = global_block_indices
+
+        if (global_block_end_indices is not None):
+            if (len(global_block_indices) != len(global_block_end_indices)):
+                raise ValueError(
+                    f'Global block start indices length, {len(global_block_indices)}, must be same as global block end indices length, {len(global_block_end_indices)}!'
+                )
+            for _, (start_idx, end_idx) in enumerate(zip(global_block_indices, global_block_end_indices)):
+                if start_idx >= end_idx:
+                    raise ValueError(
+                        f'Global block start index, {start_idx}, must be smaller than global block end index, {end_idx}!'
+ ) + self.global_block_end_indices = global_block_end_indices + + if (attention != 'unidirectional' and attention != 'bidirectional'): + raise NotImplementedError( + 'only \"uni/bi-directional\" attentions are supported for now!') + self.attention = attention + + if (attention != 'bidirectional' and horizontal_global_attention): + raise ValueError( + 'only \"bi-directional\" attentions can support horizontal global attention!' + ) + self.horizontal_global_attention = horizontal_global_attention + + def set_random_layout(self, h, layout): + """Sets random attantion layout used by the given head in the sparse attention. + Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_random_blocks): + raise ValueError( + f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + ) + for row in range(0, num_blocks): + rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks) + layout[h, row, rnd_cols] = 1 + return layout + + def set_local_layout(self, h, layout): + """Sets local attantion layout used by the given head in the sparse attention. + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set + """ + + num_blocks = layout.shape[1] + start_block_idx = 0 + end_block_idx = 0 + for block_size in self.local_window_blocks: + end_block_idx += block_size + end_block_idx = min(end_block_idx, num_blocks) + for row in range(start_block_idx, end_block_idx): + for col in range( + start_block_idx, + (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + layout[h, row, col] = 1 + start_block_idx += block_size + + # if there is any remaining not attended part, use the lats local window block size as local window for the remaining applicable local windows + for i in range(start_block_idx, num_blocks, block_size): + end_block_idx = min(i + block_size, num_blocks) + for row in range(i, end_block_idx): + for col in range( + i, + (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + layout[h, row, col] = 1 + return layout + + def set_global_layout(self, h, layout): + """Sets global attantion layout used by the given head in the sparse attention. 
+ + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set + """ + + num_blocks = layout.shape[1] + if (self.global_block_end_indices is None): + for idx in self.global_block_indices: + # if global block idx is in the range of the sequnce blocks + if (idx < num_blocks): + #global rows + if (self.horizontal_global_attention): + layout[h, idx, :] = 1 + + #global columns + first_row = 0 if self.attention == 'bidirectional' else idx + layout[h, first_row:, idx] = 1 + else: + for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)): + # if global block idx is in the range of the sequnce blocks + if (start_idx < num_blocks): + end_idx = min(end_idx, num_blocks) + #global rows + if (self.horizontal_global_attention): + layout[h, start_idx:end_idx, :] = 1 + + #global columns + first_row = 0 if self.attention == 'bidirectional' else start_idx + layout[h, first_row:, start_idx:end_idx] = 1 + return layout + + def make_layout(self, seq_len): + """Generates `Variable` sparsity layout used by each head in the sparse attention. + + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `Variable` sparsity layout of all head + """ + + layout = self.setup_layout(seq_len) + for h in range(0, self.num_layout_heads): + layout = self.set_random_layout(h, layout) + layout = self.set_local_layout(h, layout) + layout = self.set_global_layout(h, layout) + + layout = self.check_and_propagate_first_head_layout(layout) + return layout + + +class BigBirdSparsityConfig(SparsityConfig): + """Configuration class to store `BigBird` sparsity configuration. + For more details about this sparsity config, please see `Big Bird: Transformers for Longer Sequences`: https://arxiv.org/pdf/2007.14062.pdf + This class extends parent class of `SparsityConfig` and customizes it for `BigBird` sparsity. + """ + def __init__(self, + num_heads, + block=16, + different_layout_per_head=False, + num_random_blocks=1, + num_sliding_window_blocks=3, + num_global_blocks=1): + """Initialize the BigBird Sparsity Pattern Config. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. + different_layout_per_head: optional: a boolean determining if each head should be assigned a different sparsity layout; default is false and this will be satisfied based on availability. + num_random_blocks: optional: an integer determining the number of random blocks in each block row. + num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding local attention window. + num_global_blocks: optional: an integer determining how many consecutive blocks, starting from index 0, are considered as global attention. Global block tokens will be attended by all other block tokens and will attend to all other block tokens as well. 
+ """ + + super().__init__(num_heads, block, different_layout_per_head) + + self.num_random_blocks = num_random_blocks + self.num_sliding_window_blocks = num_sliding_window_blocks + self.num_global_blocks = num_global_blocks + + def set_random_layout(self, h, layout): + """Sets random attantion layout used by the given head in the sparse attention. + Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_random_blocks): + raise ValueError( + f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + ) + + for row in range(0, num_blocks): + rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks) + layout[h, row, rnd_cols] = 1 + return layout + + def set_sliding_window_layout(self, h, layout): + """Sets sliding local attantion layout used by the given head in the sparse attention. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_sliding_window_blocks): + raise ValueError( + f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + ) + + w = self.num_sliding_window_blocks // 2 + for row in range(0, num_blocks): + start = max(0, row - w) + end = min(row + w + 1, num_blocks) + layout[h, row, start:end] = 1 + return layout + + def set_global_layout_itc(self, h, layout): + """Sets global attantion layout used by the given head in the sparse attention. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_global_blocks): + raise ValueError( + f'Number of global blocks, {self.num_global_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + ) + + #global rows + layout[h, 0:self.num_global_blocks, :] = 1 + + #global columns + layout[h, :, 0:self.num_global_blocks] = 1 + + return layout + + def make_layout(self, seq_len): + """Generates `BigBird` sparsity layout used by each head in the sparse attention. + + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. 
+ + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `BigBird` sparsity layout of all head + """ + + layout = self.setup_layout(seq_len) + for h in range(0, self.num_layout_heads): + layout = self.set_random_layout(h, layout) + layout = self.set_sliding_window_layout(h, layout) + layout = self.set_global_layout_itc(h, layout) + + layout = self.check_and_propagate_first_head_layout(layout) + return layout + + +class BSLongformerSparsityConfig(SparsityConfig): + """Configuration class to store edited `Longformer` sparsity configuration. + + Note) this is a block-sparse version of the Longformer which is slightly different than original Longformer; which is element-wise sparsity. + + For more details about this sparsity config, please see `Longformer: The Long-Document Transformer`: https://arxiv.org/pdf/2004.05150.pdf + This class extends parent class of `SparsityConfig` and customizes it for `Longformer` sparsity. + """ + def __init__(self, + num_heads, + block=16, + different_layout_per_head=False, + num_sliding_window_blocks=3, + global_block_indices=[0], + global_block_end_indices=None): + """Initialize the edited `Longformer` Sparsity Pattern Config. + + For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial + + Arguments: + num_heads: required: an integer determining number of attention heads of the layer. + block: optional: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. + different_layout_per_head: optional: a boolean determining if each head should be assigned a different sparsity layout; default is false and this will be satisfied based on availability. + + num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding local attention window. + global_block_indices: optional: a list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Default value is only index 0. Notice that if global_block_end_indices parameter is set, this parameter is used as starting index of each global window. + global_block_end_indices: optional: a list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size of global_block_indices parameter, and combining this two parameters, for each index i, blocks from global_block_indices[i] to global_block_end_indices[i] (exclusive) are considered as global attention. + """ + + super().__init__(num_heads, block, different_layout_per_head) + + self.num_sliding_window_blocks = num_sliding_window_blocks + self.global_block_indices = global_block_indices + + if (global_block_end_indices is not None): + if (len(global_block_indices) != len(global_block_end_indices)): + raise ValueError( + f'Global block start indices length, {len(global_block_indices)}, must be same as global block end indices length, {len(global_block_end_indices)}!' + ) + for _, (start_idx, end_idx) in enumerate(zip(global_block_indices, global_block_end_indices)): + if start_idx >= end_idx: + raise ValueError( + f'Global block start index, {start_idx}, must be smaller than global block end index, {end_idx}!' 
+ ) + self.global_block_end_indices = global_block_end_indices + + def set_sliding_window_layout(self, h, layout): + """Sets sliding local attantion layout used by the given head in the sparse attention. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set + """ + + num_blocks = layout.shape[1] + if (num_blocks < self.num_sliding_window_blocks): + raise ValueError( + f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + ) + + w = self.num_sliding_window_blocks // 2 + for row in range(0, num_blocks): + start = max(0, row - w) + end = min(row + w + 1, num_blocks) + layout[h, row, start:end] = 1 + return layout + + def set_global_layout(self, h, layout): + """Sets global attantion layout used by the given head in the sparse attention. + + Arguments: + h: required: an integer determining head index + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set + """ + + num_blocks = layout.shape[1] + if (self.global_block_end_indices is None): + for idx in self.global_block_indices: + # if global block idx is in the range of the sequnce blocks + if (idx < num_blocks): + #global rows + layout[h, idx, :] = 1 + + #global columns + layout[h, :, idx] = 1 + else: + for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)): + # if global block idx is in the range of the sequnce blocks + if (start_idx < num_blocks): + end_idx = min(end_idx, num_blocks) + #global rows + layout[h, start_idx:end_idx, :] = 1 + + #global columns + layout[h, :, start_idx:end_idx] = 1 + return layout + + def make_layout(self, seq_len): + """Generates edited `Longformer` sparsity layout used by each head in the sparse attention. + + Arguments: + seq_len: required: an integer determining number of attention heads of the layer. + + Return: + layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `BSLongformer` sparsity layout of all head + """ + + layout = self.setup_layout(seq_len) + for h in range(0, self.num_layout_heads): + layout = self.set_sliding_window_layout(h, layout) + layout = self.set_global_layout(h, layout) + + layout = self.check_and_propagate_first_head_layout(layout) + return layout diff --git a/deepspeed/ops/sparse_attention/trsrc/__init__.py b/deepspeed/ops/sparse_attention/trsrc/__init__.py new file mode 100644 index 000000000000..765d34f574a2 --- /dev/null +++ b/deepspeed/ops/sparse_attention/trsrc/__init__.py @@ -0,0 +1,32 @@ +import sys +import os + + +def _build_file_index(directory, suffix='.tr'): + """Build an index of source files and their basenames in a given directory. + + Args: + directory (string): the directory to index + suffix (string): index files with this suffix + + Returns: + list: A list of tuples of the form [(basename, absolute path), ...] 
+ """ + + index = [] + + for fname in os.listdir(directory): + if fname.endswith(suffix): + basename = fname[:fname.rfind(suffix)] # strip the suffix + path = os.path.join(directory, fname) + index.append((basename, path)) + + return index + + +# Go over all local source files and parse them as strings +_module = sys.modules[_build_file_index.__module__] +_directory = os.path.dirname(os.path.realpath(__file__)) +for name, fname in _build_file_index(_directory): + with open(fname, 'r') as fin: + setattr(_module, name, fin.read()) diff --git a/deepspeed/ops/sparse_attention/trsrc/matmul.tr b/deepspeed/ops/sparse_attention/trsrc/matmul.tr new file mode 100644 index 000000000000..bf87e993feda --- /dev/null +++ b/deepspeed/ops/sparse_attention/trsrc/matmul.tr @@ -0,0 +1,201 @@ +// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py + +__global__ void NAME (TYPE* A __readonly __noalias __aligned(16), + TYPE* B __readonly __noalias __aligned(16), + TYPE* C __noalias __aligned(16), + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc __multipleof(8), + long stride_za __multipleof(8), + long stride_zb __multipleof(8), + long stride_zc __multipleof(8), + long stride_ha __multipleof(8), + long stride_hb __multipleof(8), + long stride_hc __multipleof(8), + int DS0, int DS1, + int SDD_K __multipleof(16), + int SDD_off_width, + int* lut, int* locks, int nlocks) { + /* ---------------- */ + /* Prologue */ + /* ---------------- */ + // program ids + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int pidz = get_program_id(2); +#ifdef SDD + // load LUT header + pid1 = pid1 + SDD_off_width; + int blockidm[TM] = (0 ... TM) / BLOCK; + int blockidn[TN] = (0 ... TN) / BLOCK; + int offlutm[TM] = blockidm*(TN/BLOCK)*4; + int offlutn[TN] = blockidn*4; + int *header = lut + pid1 * (TM/BLOCK) * (TN/BLOCK) * 4; + int z = *(header + 0); + int i[TM] = *(header + 1 + offlutm); + int j[TN] = *(header + 2 + offlutn); + int AS1 = SDD_K / TZ; + int lockid = select(TZ > 1, 1, 0); + int offka = pid0 * AS1; + int offkb = pid0 * AS1; + int offmc = 0; + int offnc = 0; + int offpa = 0; + int offpb = 0; + int maxid = TZ; + int offhc = 0; + int offha = z; + int offhb = z; + int ram[TM] = i*BLOCK + ((0 ... TM) % BLOCK); + int rbn[TN] = j*BLOCK + ((0 ... TN) % BLOCK); +#else + // load LUT header + int *header = lut + pid0 * 6; + int offset = *(header + 0); + int AS1 = *(header + 1); + int column = *(header + 2); + int depth = *(header + 3); + int lockid = *(header + 4); + int maxid = *(header + 5); + int *pinc = lut + offset; + int offhc = depth; +#ifdef DSD + // output offset + int offnc = pid1 * TN; + int offmc = column * TM; + int offpc = 0; + // dense input offset + int offnb = pid1 * TN; + int offkb __multipleof(8) = *pinc; + int offpb = 0; + // sparse input offset + int offma = 0; + int offka = 0; + long offpa __multipleof(8) = *(pinc + 1); + offpa = offpa * BLOCK * BLOCK; + int offha = 0; + int offhb = depth; +#endif +#ifdef DDS + // output offset + int offmc = pid1 * TM; + int offnc = column * TN; + int offpc = 0; + // dense input offset + int offma = pid1 * TM; + int offka __multipleof(8) = *pinc; + int offpa = 0; + // sparse input offset + int offnb = 0; + int offkb = 0; + long offpb __multipleof(8) = *(pinc + 1); + offpb = offpb * BLOCK * BLOCK; + int offha = depth; + int offhb = 0; +#endif + int ram[TM] = offma + 0 ... TM; + int rbn[TN] = offnb + 0 ... 
TN; +#endif + // initialize a, b pointers + int rka[TK] = offka + 0 ... TK; + int rkb[TK] = offkb + 0 ... TK; + TYPE* pa[TM, TK] = A + pidz * stride_za + offha * stride_ha + offpa + ram[:, newaxis] * STRIDE_AM + rka[newaxis, :] * STRIDE_AK; + TYPE* pb[TK, TN] = B + pidz * stride_zb + offhb * stride_hb + offpb + rbn[newaxis, :] * STRIDE_BN + rkb[:, newaxis] * STRIDE_BK; + // pre-fetch +#ifdef DDS + bool checkam[TM, TK] = ram[:, newaxis] < DS0; +#else + bool checkam[TM, TK] = AS1 > 0; +#endif +#ifdef DSD + bool checkbn[TK, TN] = rbn[newaxis, :] < DS0; +#else + bool checkbn[TK, TN] = AS1 > 0; +#endif + TYPE a[TM, TK] = checkam ? *pa : 0; + TYPE b[TK, TN] = checkbn ? *pb : 0; + + /* ---------------- */ + /* Inner Loop */ + /* ---------------- */ + // create result tile + float acc[TM, TN] = 0; + int step = TK; + for(int k = AS1; k > 0; k -= step) { + acc += a @ b; + // update pointers +#ifdef SDD + int inc_a = TK * STRIDE_AK; + int inc_b = TK * STRIDE_BK; +#else + pinc += 2; +#ifdef DSD + int inc_b __multipleof(8) = *pinc; + int inc_a __multipleof(8) = *(pinc + 1); + inc_b = inc_b * STRIDE_BK; +#endif +#ifdef DDS + int inc_a __multipleof(8) = *pinc; + int inc_b __multipleof(8) = *(pinc + 1); + inc_a = inc_a * STRIDE_AK; +#endif +#endif + pa += inc_a; + pb += inc_b; + // pre-fetch + bool checkak[TM, TK] = k > TK; + bool checkbk[TK, TN] = k > TK; + bool checka[TM, TK] = checkam && checkak; + bool checkb[TK, TN] = checkbk && checkbn; + a = *?(checka)pa; + b = *?(checkb)pb; + } + TYPE c[TM, TN] = acc; + + /* ---------------- */ + /* Epilogue */ + /* ---------------- */ + // initialize c pointers +#ifdef SDD + bool checkc[TM, TN] = 1; + // rematerialize + int rr_blockidm[TM] = (0 ... TM) / BLOCK; + int rr_blockidn[TN] = (0 ... TN) / BLOCK; + int rr_offlutm[TM] = rr_blockidm*(TN/BLOCK)*4; + int rr_offlutn[TN] = rr_blockidn*4; + int off_bkid[TM, TN] = 3 + rr_offlutm[:, newaxis] + rr_offlutn[newaxis, :]; + int bkid[TM, TN] = *(header + off_bkid); + long offpc[TM, TN] = bkid * BLOCK * BLOCK; + // range within blocks + int rcm[TM] = (0 ... TM) % BLOCK; + int rcn[TN] = (0 ... TN) % BLOCK; +#else + int rcm[TM] = offmc + 0 ... TM; + int rcn[TN] = offnc + 0 ... 
TN; +#ifdef DSD + bool checkc[TM, TN] = rcn[newaxis, :] < DS0; +#endif +#ifdef DDS + bool checkc[TM, TN] = rcm[:, newaxis] < DS0; +#endif +#endif + TYPE* pc[TM, TN] = C + offpc + offhc*stride_hc + pidz*stride_zc + rcm[:, newaxis]*STRIDE_CM + rcn[newaxis, :]*STRIDE_CN; + // write-back directly + if(lockid == 0) { + *?(checkc) pc = c; + } + // accumulate partial result using spin-locks + else { + int *plock = locks + get_program_id(2)*nlocks*get_num_programs(1) + get_program_id(1)*nlocks + lockid - 1; + int *pcount = plock + get_num_programs(2)*get_num_programs(1)*nlocks; + for(int repeat = 1; repeat == 1; repeat = atomic_cas(plock, 0, 1)); + int count = *pcount; + if(count == 0) + *?(checkc) pc = c; + else + *?(checkc) pc = c + *?(checkc)pc; + atomic_xchg(pcount, (count + 1) % maxid); + atomic_xchg(plock, 0); + } + } diff --git a/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr b/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr new file mode 100644 index 000000000000..25d15a99e468 --- /dev/null +++ b/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr @@ -0,0 +1,54 @@ +// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py + +__global__ void softmax_bwd(TYPE * X __readonly __noalias __aligned(16), + float scale, + TYPE* DX __readonly __noalias __aligned(16), + int* LUT, + int sizemax, + long stride_zx __multipleof(BLOCK), + long stride_zdx __multipleof(BLOCK)) { + int pidhm = get_program_id(0); + int pidz = get_program_id(1); + + // create index ranges + int rxm = pidhm % BLOCK; + int rbm = pidhm / BLOCK; + int rxn[TN] = (0 ... TN) % BLOCK; + int rbn[TN] = (0 ... TN) / BLOCK; + + // extract information from look-up table + int* header = LUT + rbm * 2; + int size = *(header + 0); + int offset = *(header + 1); + + // bounds checking on lut + bool check[TN] = rbn < size; + int rbmn[TN] = check ? rbn : size - 1; + + // initialize pointers to block-sparse input + long blockid[TN] = *(LUT + offset + rbmn*4); + + TYPE* px[TN] = X + pidz * stride_zx + + blockid * BLOCK * BLOCK + + rxm * BLOCK + + rxn; + + TYPE* pdx[TN] = DX + pidz * stride_zdx + + blockid * BLOCK * BLOCK + + rxm * BLOCK + + rxn; + + // compute fused softmax backward + TYPE x[TN] = check ? *px : 0; + TYPE dx[TN] = check ? *pdx : 0; + float Fdx[TN] = dx; + float Fx[TN] = x; + float Fxdx[TN] = Fdx*Fx; + float Fxdxsum = Fxdx[+]; + float Fy[TN] = Fx * (Fdx - Fxdxsum) * scale; + TYPE y[TN] = Fy; + + // write-back + *? 
(check)pdx = y; +} diff --git a/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr b/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr new file mode 100644 index 000000000000..7d5cc50b282d --- /dev/null +++ b/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr @@ -0,0 +1,136 @@ +// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a +// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py + +__global__ void softmax_fwd(TYPE *X __readonly __noalias __aligned(16), + float scale, + int *LUT __readonly __noalias __aligned(16), + TYPE *RPE __readonly __noalias __aligned(16), + TYPE *KP_M __readonly __noalias __aligned(16), + TYPE *ATTN_M __readonly __noalias __aligned(16), + int num_blocks, + int sizemax, + long stride_zx __multipleof(BLOCK), + long stride_zrpe __multipleof(BLOCK), + int stride_hrpe __multipleof(BLOCK), + int stride_srpe __multipleof(BLOCK), + int stride_zkpm __multipleof(BLOCK), + int stride_zattnm __multipleof(BLOCK)){ + int pidhm = get_program_id(0); + int pidz = get_program_id(1); + + // create index ranges + int rxm = pidhm % BLOCK; + int rbm = pidhm / BLOCK; + int rxn[TN] = (0 ... TN) % BLOCK; + int rbn[TN] = (0 ... TN) / BLOCK; + + // extract information from look-up table + int* header = LUT + rbm * 2; + int size = *(header + 0); + int offset = *(header + 1); + + bool check[TN] = rbn < size; + int rbmn[TN] = check ? rbn : size - 1; + + // block id and column id + long blockid [TN] = *(LUT + offset + rbmn*4 + 0); + long columnid[TN] = *(LUT + offset + rbmn*4 + 1); + long rowid [TN] = *(LUT + offset + rbmn*4 + 2); + long headid [TN] = *(LUT + offset + rbmn*4 + 3); + + // pointers to X + TYPE* px[TN] = X + pidz * stride_zx + + blockid * BLOCK * BLOCK + + rxm * BLOCK + + rxn; +#ifdef APPLY_RPE + // pointers to relative position embedding + TYPE* prpe[TN] = RPE + pidz * stride_zrpe + + headid * stride_hrpe + + columnid * BLOCK + + rowid * BLOCK * stride_srpe + + rxm * stride_srpe + + rxn; +#endif + +#ifdef APPLY_KP_MASK + // pointers to key padding mask + TYPE* pkp_m[TN] = KP_M + pidz * stride_zkpm + + columnid * BLOCK + + rxn; +#endif + +#ifdef APPLY_ATTN_MASK + // pointers to attention mask + TYPE* pattn_m[TN] = ATTN_M + columnid * BLOCK + + rowid * BLOCK * stride_zattnm + + rxm * stride_zattnm + + rxn; +#endif + + // load input + TYPE x[TN] = check ? *px : -INFINITY; + +#ifdef APPLY_RPE + // load relative position embedding + TYPE rpe[TN] = check ? *prpe : 0; +#endif + +#ifdef APPLY_KP_MASK + // load key-padding mask + TYPE kp_m[TN] = check ? *pkp_m : -INFINITY; +#endif + +#ifdef APPLY_ATTN_MASK + // load attention mask + TYPE attn_m[TN] = check ? *pattn_m : -INFINITY; +#endif + + // compute softmax in float +#ifdef APPLY_RPE + float Frpe[TN] = rpe; +#endif + +#ifdef APPLY_KP_MASK + float Fkp_m[TN] = kp_m; +#endif + +#ifdef APPLY_ATTN_MASK + float Fattn_m[TN] = attn_m; +#endif + +#ifdef KP_MASK_MUL + Fkp_m = (Fkp_m == 0) ? (float[TN])-INFINITY : 0; +#endif + +#ifdef ATTN_MASK_MUL + Fattn_m = (Fattn_m == 0) ? (float[TN])-INFINITY : 0; +#endif + + float Fx[TN] = x; + +#ifdef APPLY_SCALE + Fx = Fx * scale; // apply scale +#endif + +#ifdef APPLY_RPE + Fx = Fx + Frpe; // apply relative position embedding +#endif + +#ifdef APPLY_KP_MASK + Fx = Fx + Fkp_m; // apply key padding mask +#endif + +#ifdef APPLY_ATTN_MASK + Fx = Fx + Fattn_m; // apply attention mask +#endif + + float Fxmax = Fx[max]; + float Fy[TN] = exp(Fx - Fxmax); + float Fysum = (check ? 
Fy : 0)[+]; + + // write-back in half/float + TYPE y[TN] = Fy; + TYPE ysum = Fysum; + *?(check)px = y / ysum; +} diff --git a/deepspeed/ops/transformer/__init__.py b/deepspeed/ops/transformer/__init__.py new file mode 100644 index 000000000000..63c5938bb9e7 --- /dev/null +++ b/deepspeed/ops/transformer/__init__.py @@ -0,0 +1 @@ +from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig diff --git a/deepspeed/pt/deepspeed_cuda.py b/deepspeed/ops/transformer/transformer.py similarity index 73% rename from deepspeed/pt/deepspeed_cuda.py rename to deepspeed/ops/transformer/transformer.py index 3b86f06cc383..f0979f2e3f2a 100755 --- a/deepspeed/pt/deepspeed_cuda.py +++ b/deepspeed/ops/transformer/transformer.py @@ -1,17 +1,25 @@ -from torch import nn -from torch.autograd import Function -import torch +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' import json import math -import deepspeed_transformer_cuda as ds_transformer_cuda -import deepspeed_stochastic_transformer_cuda as ds_stochastic_transformer_cuda +import importlib +import torch +from torch import nn +from torch.autograd import Function + +from ..op_builder import TransformerBuilder, StochasticTransformerBuilder + +# Cuda modules will be imported if needed +transformer_cuda_module = None +stochastic_transformer_cuda_module = None class TransformerConfig(): def __init__(self, batch_size, - max_seq_length, hidden_size, + intermediate_size, heads, attn_dropout_ratio, hidden_dropout_ratio, @@ -20,7 +28,7 @@ def __init__(self, self.layer_id = -1 self.batch_size = batch_size self.hidden_size = hidden_size - self.max_seq_length = max_seq_length + self.intermediate_size = intermediate_size self.heads = heads self.attn_dropout_ratio = attn_dropout_ratio self.hidden_dropout_ratio = hidden_dropout_ratio @@ -38,6 +46,8 @@ class DeepSpeedTransformerConfig(TransformerConfig): hidden_size: The hidden size of the transformer layer + intermediate_size: The intermediate size of the feed-forward part of transformer layer + heads: The number of heads in the self-attention of the transformer layer attn_dropout_ratio: The ratio of dropout for the attention's output @@ -77,11 +87,15 @@ class DeepSpeedTransformerConfig(TransformerConfig): that by enabling it, the pretraining tasks such as BERT are not affected and can obtain a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend to turn it off in order to be able to reproduce the same result through the regular kernel execution. + + huggingface: Enbale if using the HuggingFace interface style for sending out the forward results. + + training: Enable for training rather than inference. 
""" def __init__(self, batch_size=-1, - max_seq_length=-1, hidden_size=-1, + intermediate_size=-1, heads=-1, attn_dropout_ratio=-1, hidden_dropout_ratio=-1, @@ -95,16 +109,19 @@ def __init__(self, gelu_checkpoint=False, adjust_init_range=True, attn_dropout_checkpoint=False, - stochastic_mode=False): + stochastic_mode=False, + huggingface=False, + training=True): super(DeepSpeedTransformerConfig, - self).__init__(batch_size, - max_seq_length, - hidden_size, - heads, - attn_dropout_ratio, - hidden_dropout_ratio, - num_hidden_layers, - initializer_range) + self).__init__( + batch_size, + hidden_size, + (intermediate_size if intermediate_size > 0 else 4 * hidden_size), + heads, + attn_dropout_ratio, + hidden_dropout_ratio, + num_hidden_layers, + initializer_range) self.fp16 = fp16 self.pre_layer_norm = pre_layer_norm self.local_rank = local_rank @@ -113,10 +130,11 @@ def __init__(self, self.gelu_checkpoint = gelu_checkpoint # True: if higher batch size is required self.adjust_init_range = adjust_init_range self.test_gemm = False - self.training = True + self.training = training self.is_grad_enabled = True self.attn_dropout_checkpoint = attn_dropout_checkpoint self.stochastic_mode = stochastic_mode + self.huggingface = huggingface @classmethod def from_dict(cls, json_object): @@ -127,7 +145,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding='utf-16') as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -159,9 +177,21 @@ def forward(ctx, if bsz > config.batch_size: raise ValueError('Input batch size exceeds the limit.') - cuda_module = ds_stochastic_transformer_cuda if config.stochastic_mode else ds_transformer_cuda + cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32 + inp_size = input.size() + if inp_size[1] % 16 != 0: + input = torch.cat((input, + torch.randn((inp_size[0], + (16 - (inp_size[1] % 16)), + inp_size[2]), + device=input.device, + dtype=input.dtype)), + 1) + input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \ + (16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3) + (output, inp_norm, qkv_tf, @@ -174,26 +204,30 @@ def forward(ctx, ff2_inp, attn_prob_dropout_mask, attn_output_dropout_mask, - layer_output_dropout_mask) = forward_func(config.layer_id, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config.training, - config.pre_layer_norm, - config.attn_dropout_checkpoint, - config.normalize_invertible, - config.gelu_checkpoint) + layer_output_dropout_mask, + attn_layer_norm_var, + attn_layer_norm_mean, + layer_norm_var, + layer_norm_mean) = forward_func(config.layer_id, + input, + input_mask, + attn_qkvw, + attn_qkvb, + attn_ow, + attn_ob, + attn_nw, + attn_nb, + inter_w, + inter_b, + output_w, + output_b, + norm_w, + norm_b, + config.training, + config.pre_layer_norm, + config.attn_dropout_checkpoint, + config.normalize_invertible, + config.gelu_checkpoint) # For testing only. 
if grads is not None: @@ -225,7 +259,7 @@ def forward(ctx, norm_w.register_hook(lambda x, self=self: grads.append([x, "norm_W"])) norm_b.register_hook(lambda x, self=self: grads.append([x, "norm_B"])) - if config.is_grad_enabled: + if config.is_grad_enabled and config.training: if (config.pre_layer_norm and config.normalize_invertible): ctx.save_for_backward(input_mask, attn_qkvw, @@ -270,6 +304,9 @@ def forward(ctx, if not config.normalize_invertible: ctx.add_res = add_res + ctx.attn_layer_norm_mean = attn_layer_norm_mean + ctx.layer_norm_mean = layer_norm_mean + ctx.ff1_inp = ff1_inp if not config.gelu_checkpoint: ctx.gelu_inp = gelu_inp @@ -278,12 +315,24 @@ def forward(ctx, ctx.attn_prob_dropout_mask = attn_prob_dropout_mask ctx.attn_output_dropout_mask = attn_output_dropout_mask ctx.layer_output_dropout_mask = layer_output_dropout_mask + ctx.attn_layer_norm_var = attn_layer_norm_var + ctx.layer_norm_var = layer_norm_var - return output + if inp_size[1] % 16 != 0: + output = torch.narrow(output, 1, 0, inp_size[1]) + + if config.huggingface: + return (output, ) # outputs -> (output) : outputs[0] = output + else: + return output @staticmethod def backward(ctx, grad_output): bsz = grad_output.shape[0] + grad_output_shape = grad_output.size() + if grad_output_shape[1] % 16 != 0: + grad_output = torch.cat((grad_output, torch.zeros((bsz, (16 - (grad_output_shape[1] % 16)), \ + grad_output_shape[2]), device=grad_output.device, dtype=grad_output.dtype)), 1) if bsz > ctx.config.batch_size: raise ValueError('grad_output batch size exceeds the limit.') @@ -321,7 +370,7 @@ def backward(ctx, grad_output): norm_w, norm_b) = ctx.saved_tensors - cuda_module = ds_stochastic_transformer_cuda if ctx.config.stochastic_mode else ds_transformer_cuda + cuda_module = stochastic_transformer_cuda_module if ctx.config.stochastic_mode else transformer_cuda_module backward_func = cuda_module.backward_fp16 if ctx.config.fp16 else cuda_module.backward_fp32 (grad_input, @@ -354,6 +403,10 @@ def backward(ctx, grad_output): ctx.attn_prob_dropout_mask, ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, + ctx.attn_layer_norm_var, + ctx.attn_layer_norm_mean, + ctx.layer_norm_var, + ctx.layer_norm_mean, (ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, @@ -370,6 +423,28 @@ def backward(ctx, grad_output): norm_w, norm_b) + # This appears to be an effective way to release context memory + ctx.qkv_tf = None + ctx.soft_inp = None + ctx.ctx_bufB = None + ctx.gelu_inp = None + ctx.ff2_inp = None + ctx.attn_o_inp = None + ctx.ff1_inp = None + ctx.add_res = None + ctx.inp_norm = None + ctx.config = None + ctx.attn_layer_norm_mean = None + ctx.layer_norm_mean = None + ctx.attn_prob_dropout_mask = None + ctx.attn_output_dropout_mask = None + ctx.layer_output_dropout_mask = None + ctx.attn_layer_norm_var = None + ctx.layer_norm_var = None + + if grad_output_shape[1] % 16 != 0: + grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1]) + return (grad_input, None, None, @@ -393,21 +468,24 @@ def backward(ctx, grad_output): class DeepSpeedTransformerLayer(nn.Module): """Initialize the DeepSpeed Transformer Layer. + Static variable: + layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated, + e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23. Arguments: - layer_id: The layer index starting from 0, e.g. 
if model has 24 transformer layers, - layer_id will be 0,1,2...23 when each layer object is instantiated - config: An object of DeepSpeedTransformerConfig initial_weights: Optional: Only used for unit test initial_biases: Optional: Only used for unit test """ - def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): + layer_id = 0 + + def __init__(self, config, initial_weights=None, initial_biases=None): super(DeepSpeedTransformerLayer, self).__init__() self.config = config - self.config.layer_id = layer_id + self.config.layer_id = DeepSpeedTransformerLayer.layer_id + DeepSpeedTransformerLayer.layer_id = DeepSpeedTransformerLayer.layer_id + 1 print("DeepSpeed Transformer config is ", self.config.__dict__) @@ -426,12 +504,12 @@ def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.inter_w = nn.Parameter( - torch.Tensor(4 * self.config.hidden_size, + torch.Tensor(self.config.intermediate_size, self.config.hidden_size)) - self.inter_b = nn.Parameter(torch.Tensor(4 * self.config.hidden_size)) + self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size)) self.output_w = nn.Parameter( torch.Tensor(self.config.hidden_size, - 4 * self.config.hidden_size)) + self.config.intermediate_size)) self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) @@ -457,16 +535,22 @@ def __init__(self, layer_id, config, initial_weights=None, initial_biases=None): self.norm_w = initial_weights[7] self.norm_b = initial_biases[7] + # Load cuda modules if needed + global transformer_cuda_module, stochastic_transformer_cuda_module + if transformer_cuda_module is None and not self.config.stochastic_mode: + transformer_cuda_module = TransformerBuilder().load() + if stochastic_transformer_cuda_module is None and self.config.stochastic_mode: + stochastic_transformer_cuda_module = StochasticTransformerBuilder().load() + # create the layer in cuda kernels. 
- cuda_module = ds_stochastic_transformer_cuda if self.config.stochastic_mode else ds_transformer_cuda + cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module create_layer_func = cuda_module.create_transformer_layer_fp16 if self.config.fp16 else cuda_module.create_transformer_layer_fp32 create_layer_func(self.config.layer_id, self.config.batch_size, self.config.hidden_size, self.config.heads, - 4 * self.config.hidden_size, - self.config.max_seq_length, + self.config.intermediate_size, self.config.attn_dropout_ratio, self.config.hidden_dropout_ratio, self.config.seed, @@ -497,11 +581,18 @@ def init_transformer_weights(self, adjust_init_range=False): self.norm_w.data.fill_(1.0) self.norm_b.data.zero_() - def forward(self, input, input_mask, grads=None): + def forward(self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + grads=None): self.config.training = self.training self.config.is_grad_enabled = torch.is_grad_enabled() - return DeepSpeedTransformerFunction.apply(input, - input_mask, + return DeepSpeedTransformerFunction.apply(hidden_states, + attention_mask, self, grads, self.config.layer_id, diff --git a/deepspeed/pipe/__init__.py b/deepspeed/pipe/__init__.py new file mode 100644 index 000000000000..db1308172f08 --- /dev/null +++ b/deepspeed/pipe/__init__.py @@ -0,0 +1 @@ +from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec diff --git a/deepspeed/profiling/__init__.py b/deepspeed/profiling/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py new file mode 100644 index 000000000000..017f9ec9e32e --- /dev/null +++ b/deepspeed/profiling/config.py @@ -0,0 +1,52 @@ +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.profiling.constants import * + + +class DeepSpeedFlopsProfilerConfig(object): + def __init__(self, param_dict): + """ + docstring + """ + super(DeepSpeedFlopsProfilerConfig, self).__init__() + + self.enabled = None + self.start_step = None + self.end_step = None + self.module_depth = None + self.top_modules = None + + if FLOPS_PROFILER in param_dict.keys(): + flops_profiler_dict = param_dict[FLOPS_PROFILER] + else: + flops_profiler_dict = {} + + self._initialize(flops_profiler_dict) + + def _initialize(self, flops_profiler_dict): + """ + docstring + """ + self.enabled = get_scalar_param(flops_profiler_dict, + FLOPS_PROFILER_ENABLED, + FLOPS_PROFILER_ENABLED_DEFAULT) + + self.start_step = get_scalar_param(flops_profiler_dict, + FLOPS_PROFILER_START_STEP, + FLOPS_PROFILER_START_STEP_DEFAULT) + + self.end_step = get_scalar_param(flops_profiler_dict, + FLOPS_PROFILER_END_STEP, + FLOPS_PROFILER_END_STEP_DEFAULT) + + self.module_depth = get_scalar_param(flops_profiler_dict, + FLOPS_PROFILER_MODULE_DEPTH, + FLOPS_PROFILER_MODULE_DEPTH_DEFAULT) + + self.top_modules = get_scalar_param(flops_profiler_dict, + FLOPS_PROFILER_TOP_MODULES, + FLOPS_PROFILER_TOP_MODULES_DEFAULT) diff --git a/deepspeed/profiling/constants.py b/deepspeed/profiling/constants.py new file mode 100644 index 000000000000..f4812d32d866 --- /dev/null +++ b/deepspeed/profiling/constants.py @@ -0,0 +1,39 @@ +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. 
+""" + +######################################### +# flops profiler +######################################### +# Flops profiler. By default, this feature is not enabled. +# Users can configure in ds_config.json as below example: +FLOPS_PROFILER_FORMAT = ''' +flops profiler should be enabled as: +"session_params": { + "flops_profiler": { + "enalbe": [true|false], + "start_step": 5, + "end_step": 6, + "module_depth": -1, + "top_modules": 3, + } +} +''' + +FLOPS_PROFILER = "flops_profiler" + +FLOPS_PROFILER_ENABLED = "enabled" +FLOPS_PROFILER_ENABLED_DEFAULT = False + +FLOPS_PROFILER_START_STEP = "start_step" +FLOPS_PROFILER_START_STEP_DEFAULT = 5 + +FLOPS_PROFILER_END_STEP = "end_step" +FLOPS_PROFILER_END_STEP_DEFAULT = FLOPS_PROFILER_START_STEP_DEFAULT + 1 + +FLOPS_PROFILER_MODULE_DEPTH = "module_depth" +FLOPS_PROFILER_MODULE_DEPTH_DEFAULT = -1 + +FLOPS_PROFILER_TOP_MODULES = "top_modules" +FLOPS_PROFILER_TOP_MODULES_DEFAULT = 3 diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md new file mode 100644 index 000000000000..f4584d21d604 --- /dev/null +++ b/deepspeed/profiling/flops_profiler/README.md @@ -0,0 +1,250 @@ +# flops-profiler + +> Measures the time, number of estimated flops and parameters of each module in a PyTorch Model. + +The flops-profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows how time, flops and parameters are spent in the model and which modules or layers could be the bottleneck. It also outputs the names of the top k modules in terms of aggregated time, flops, and parameters at depth l with k and l specified by the user. The output profile is computed for each batch of input. If multiple forward passes are specified by the user to caputre (in the case where the model have different paths or for more accurate timing), the average profile of the multiple batches is taken. + +The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that flops-profiler captures `torch.nn.functional` invoked in a module to estimate the flops, thus allowing customized modules in the model (e.g. `ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)). The flops-profiler also supports flops computation at module level (for RNNs). + +For models running on multi-node or multi-gpu, only the model parallelism affects the number of flops and parameters (e.g. `--model-parallel-size` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)), i.e., model_parallel_size _ flops = total_flops, model_parallel_size _ parameters = total_parameters. The number of gpus or nodes does not affect the output profile. 
+
+Below is an example output for LeNet5 with batch size 1024 on a V100 GPU:
+
+```
+LeNet5(
+  61.71 k, 100.00% Params, 439.55 MMACs, 100.00% MACs, 25.62 ms, 100.00% time, 0.034 TFLOPS,
+  (feature_extractor): Sequential(
+    50.69 k, 82.15% Params, 428.37 MMACs, 97.46% MACs, 18.41 ms, 71.85% time, 0.047 TFLOPS,
+    (0): Conv2d(156, 0.25% Params, 125.24 MMACs, 28.49% MACs, 10.56 ms, 41.21% time, 0.024 TFLOPS, 1, 6, kernel_size=(5, 5), stride=(1, 1))
+    (1): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 2.25 ms, 8.79% time, 0.0 TFLOPS, )
+    (2): AvgPool2d(0, 0.00% Params, 4.82 MMACs, 1.10% MACs, 2.47 ms, 9.63% time, 0.0039 TFLOPS, kernel_size=2, stride=2, padding=0)
+    (3): Conv2d(2.42 k, 3.92% Params, 247.4 MMACs, 56.28% MACs, 1.08 ms, 4.23% time, 0.46 TFLOPS, 6, 16, kernel_size=(5, 5), stride=(1, 1))
+    (4): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 497.39 us, 1.94% time, 0.0 TFLOPS, )
+    (5): AvgPool2d(0, 0.00% Params, 1.64 MMACs, 0.37% MACs, 758.24 us, 2.96% time, 0.0043 TFLOPS, kernel_size=2, stride=2, padding=0)
+    (6): Conv2d(48.12 k, 77.98% Params, 49.27 MMACs, 11.21% MACs, 606.35 us, 2.37% time, 0.16 TFLOPS, 16, 120, kernel_size=(5, 5), stride=(1, 1))
+    (7): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 68.86 us, 0.27% time, 0.0 TFLOPS, )
+  )
+  (classifier): Sequential(
+    11.01 k, 17.85% Params, 11.18 MMACs, 2.54% MACs, 7.03 ms, 27.43% time, 0.0032 TFLOPS,
+    (0): Linear(10.16 k, 16.47% Params, 10.32 MMACs, 2.35% MACs, 2.71 ms, 10.57% time, 0.0076 TFLOPS, in_features=120, out_features=84, bias=True)
+    (1): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 78.77 us, 0.31% time, 0.0 TFLOPS, )
+    (2): Linear(850, 1.38% Params, 860.16 KMACs, 0.20% MACs, 4.17 ms, 16.27% time, 0.00041 TFLOPS, in_features=84, out_features=10, bias=True)
+  )
+)
+Top 3 modules in flops at depth 2 are {'Conv2d': '421.91 MMACs', 'Linear': '11.18 MMACs', 'AvgPool2d': '6.46 MMACs'}
+Top 3 modules in params at depth 2 are {'Conv2d': '50.69 k', 'Linear': '11.01 k', 'Tanh': '0'}
+Top 3 modules in time at depth 2 are {'Conv2d': '12.25 ms', 'Linear': '6.88 ms', 'AvgPool2d': '3.23 ms'}
+Batch size: 1024
+Number of multiply-adds: 439.55 MMACs
+Number of parameters: 61.71 k
+Number of steps profiled: 10
+```
+
+## Installation
+
+The profiler is an integral part of DeepSpeed and can be installed with:
+
+```
+pip install deepspeed
+```
+
+Refer to the [installation of DeepSpeed](https://www.deepspeed.ai/getting-started/#installation) for more information.
+
+## Usage
+
+### With the DeepSpeed runtime
+
+If using DeepSpeed for model training, no explicit API calls are needed to use the flops-profiler.
+
+In the DeepSpeed config file, specify:
+
+```python
+  ds_config = {
+    ...  # other deepspeed configs
+    "flops_profiler": {
+      "enabled": True,
+      "start_step": 2,
+      "end_step": 3,
+      "module_depth": -1,
+      "top_modules": 3,
+    },
+  }
+```
+- `"enabled": true` to enable the flops-profiler.
+- `"start_step": 2` to start the profiler at step 2. Note that warm-up is necessary for getting accurate timing information.
+- `"end_step": 3` to end the profiler at step 3. Note that `end_step > start_step` is required.
+- `"module_depth": -1` to print aggregated module information at the maximum depth (innermost modules). Can be set to any positive number, capped by the maximum depth of the model.
+- `"top_modules": 3` to set the number of top modules included in the aggregated profile.
+
+An example is given in [test_flops_profiler](tests/unit/test_flops_profiler.py).
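+For orientation only, below is a minimal sketch of how such a config is typically wired into a training loop. It assumes the standard `deepspeed.initialize` entry point and hypothetical `model`, `data_loader`, and `ds_config` objects; with `"flops_profiler"` enabled in the config, the profile is printed automatically between `start_step` and `end_step`, so no explicit profiler calls appear in user code.
+
+```python
+import deepspeed
+
+# model, data_loader and ds_config (containing the "flops_profiler" section
+# shown above) are assumed to be defined elsewhere.
+model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
+                                                     model_parameters=model.parameters(),
+                                                     config_params=ds_config)
+
+for step, batch in enumerate(data_loader):
+    loss = model_engine(batch)   # forward pass; the profiler hooks run here
+    model_engine.backward(loss)
+    model_engine.step()
+```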
+### Without the DeepSpeed runtime
+
+The flops-profiler can also be used as a standalone package outside of the DeepSpeed runtime.
+
+#### Use the low-level APIs to profile the forward pass in an existing model training workflow
+
+- `start_profile` - starts profiling
+- `get_total_flops` - returns the total number of flops
+- `get_total_params` - returns the total number of params
+- `get_total_duration` - returns the total duration of the model forward pass
+- `get_total_steps` - returns the total number of steps (or input batches) profiled
+- `print_model_profile` - prints the annotated model profile
+- `print_model_aggregated_profile` - prints the aggregated profile for the top modules
+- `end_profile` - ends profiling and cleans up; invoked at the end of profiling and after any printing method
+
+`flops_to_string`, `params_to_string`, and `duration_to_string` are utility functions that convert the metric numbers to strings.
+
+Below is an example of this usage in a typical training workflow.
+
+```python
+from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler
+
+model = Model()
+profiler = FlopsProfiler(model)
+
+start_step = 5
+end_step = 10
+assert end_step > start_step, "should end profiling after start profiling"
+print_profile = True
+print_aggregated_profile = True
+
+for step, batch in enumerate(data_loader):
+    # start profiling at training step "start_step"
+    if step == start_step:
+        profiler.start_profile()
+
+    # end profiling and print output at training step "end_step"
+    if step == end_step:  # if using multiple nodes, check global_rank == 0 as well
+        flops = profiler.get_total_flops()
+        params = profiler.get_total_params()
+        duration = profiler.get_total_duration()
+        steps = profiler.get_total_steps()
+        if print_profile:
+            profiler.print_model_profile()
+        if print_aggregated_profile:
+            profiler.print_model_aggregated_profile(module_depth=-1, top_modules=3)
+        profiler.end_profile()
+        print(flops, params, duration, steps)
+
+    # forward() method
+    loss = model(batch)
+
+    # runs backpropagation
+    loss.backward()
+
+    # weight update
+    optimizer.step()
+```
+
+#### Use the high-level API and run the model inference for profiling purposes
+
+Examples of this usage are given below.
+
+##### Classification model example:
+
+```python
+import argparse
+import sys
+import torch
+import torchvision.models as models
+from deepspeed.profiling.flops_profiler import get_model_profile
+
+pt_models = {
+    'resnet18': models.resnet18,
+    'resnet50': models.resnet50,
+    'alexnet': models.alexnet,
+    'vgg16': models.vgg16,
+    'squeezenet': models.squeezenet1_0,
+    'densenet': models.densenet161,
+    'inception': models.inception_v3
+}
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='flops-profiler example script')
+    parser.add_argument('--device',
+                        type=int,
+                        default=0,
+                        help='Device to store the model.')
+    parser.add_argument('--model',
+                        choices=list(pt_models.keys()),
+                        type=str,
+                        default='resnet18')
+    args = parser.parse_args()
+
+    model = pt_models[args.model]()
+
+    if torch.cuda.is_available():
+        model.cuda(device=args.device)
+
+    batch_size = 256
+    macs, params, steps = get_model_profile(model, # the PyTorch model to be profiled
+                                    input_res=(batch_size, 3, 224, 224), # input shape or input to the input_constructor
+                                    input_constructor=None, # If specified, the constructor is applied to input_res and the constructor output is used as the input to the model
+                                    print_profile=True, # whether to print the model graph with the profile annotated.
Defaults to True + print_aggregated_profile=True, # whether to print the aggregated profile for top modules. Defaults to True + module_depth=-1, # the depth into the nested modules. Defaults to -1 (the inner most modules) + top_modules=3, # the number of top modules to print aggregated profile + warm_up=10, # the number of warm-up steps before measuring the time of each module. Defaults to 5 + num_steps=10, # the number of steps to profile. Defaults to 10 + as_strings=True, # whether to print the output as strings (e.g. 1k). Defaults to True + ignore_modules=None) # the list of modules to ignore during profiling. Defaults to None + + print("{:<30} {:<8}".format("Batch size: ", batch_size)) + print('{:<30} {:<8}'.format('Number of MACs: ', macs)) + print('{:<30} {:<8}'.format('Number of parameters: ', params)) + print('{:<30} {:<8}'.format('Number of steps profiled: ', steps)) + +# Output: +# Number of MACs: 466.48 GMACs +# Number of parameters: 11.69 M + +``` + +##### Bert model example: + +```python +from functools import partial + +import torch +from transformers import BertForSequenceClassification, BertTokenizer + +from deepspeed.profiling.flops_profiler import get_model_profile + + +def bert_input_constructor(input_shape, tokenizer): + inp_seq = "" + for _ in range(input_shape[1] - 2): # there are two special tokens [CLS] and [SEP] + inp_seq += tokenizer.pad_token # let's use pad token to form a fake + # sequence for subsequent flops calculation + + inputs = tokenizer([inp_seq] * input_shape[0], + padding=True, + truncation=True, + return_tensors="pt") + labels = torch.tensor([1] * input_shape[0]) + # Batch size input_shape[0], sequence length input_shape[128] + inputs = dict(inputs) + inputs.update({"labels": labels}) + return inputs + + +if __name__ == '__main__': + bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + macs, params, steps = get_model_profile( + model, + (2, 128), + input_constructor=partial(bert_input_constructor, + tokenizer=bert_tokenizer), + print_profile=True, + print_aggregated_profile=True, + ) + print("{:<30} {:<8}".format("Number of multiply-adds: ", macs)) + print("{:<30} {:<8}".format("Number of parameters: ", params)) + print("{:<30} {:<8}".format("Number of steps profiled: ", steps)) + +# Output: +# Number of multiply-adds: 21.74 GMACs +# Number of parameters: 109.48 M + +``` diff --git a/deepspeed/profiling/flops_profiler/__init__.py b/deepspeed/profiling/flops_profiler/__init__.py new file mode 100644 index 000000000000..2f033c862baa --- /dev/null +++ b/deepspeed/profiling/flops_profiler/__init__.py @@ -0,0 +1 @@ +from .profiler import * diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py new file mode 100644 index 000000000000..3e867fd9cede --- /dev/null +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -0,0 +1,767 @@ +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +module_flop_count = [] +old_functions = {} + + +class FlopsProfiler(object): + """Measures the time, number of estimated flops and parameters of each module in a PyTorch model. + + The flops-profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows how time, flops and parameters are spent in the model and which modules or layers could be the bottleneck. 
It also outputs the names of the top k modules in terms of aggregated time, flops, and parameters at depth l with k and l specified by the user. The output profile is computed for each batch of input. If multiple forward passes are specified by the user to caputre (in the case where the model have different paths or for more accurate timing), the average profile of the multiple batches is taken. + + Args: + object (torch.nn.Module): The PyTorch model to profile. + """ + def __init__(self, model): + self.model = model + + def start_profile(self, ignore_list=None): + """Starts profiling. + + Extra attributes are added recursively to all the modules and the profiled torch.nn.functionals are monkey patched. + + Args: + ignore_list (list, optional): the list of modules to ignore while profiling. Defaults to None. + """ + self.reset_profile() + _patch_functionals() + + def register_module_hooks(module, ignore_list): + if ignore_list and type(module) in ignore_list: + return + + # if computing the flops of a module directly + if type(module) in MODULE_HOOK_MAPPING: + module.__flops_handle__ = module.register_forward_hook( + MODULE_HOOK_MAPPING[type(module)]) + return + + # if computing the flops of the functionals in a module + def pre_hook(module, input): + module_flop_count.clear() + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + module.__steps__ += 1 + + module.__pre_hook_handle__ = module.register_forward_pre_hook(pre_hook) + + def post_hook(module, input, output): + module.__flops__ += sum([elem[1] for elem in module_flop_count]) + module_flop_count.clear() + + has_children = len(module._modules.items()) != 0 + if not has_children: + module.__post_hook_handle__ = module.register_forward_hook(post_hook) + + def start_time_hook(module, input): + module.__start_time__ = time.time() + + module.__start_time_hook_handle__ = module.register_forward_pre_hook( + start_time_hook) + + def end_time_hook(module, input, output): + module.__duration__ += time.time() - module.__start_time__ + + module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook) + + self.model.apply(partial(register_module_hooks, ignore_list=ignore_list)) + + def end_profile(self): + """Ends profiling. + + Added attributes and handles are removed recursively on all the modules and the torch.nn.functionals are restored. + """ + def remove_profile_attrs(module): + if hasattr(module, "__steps__"): + del module.__steps__ + if hasattr(module, "__flops__"): + del module.__flops__ + if hasattr(module, "__params__"): + del module.__params__ + if hasattr(module, "__start_time__"): + del module.__start_time__ + if hasattr(module, "__duration__"): + del module.__duration__ + if hasattr(module, "__pre_hook_handle__"): + module.__pre_hook_handle__.remove() + del module.__pre_hook_handle__ + if hasattr(module, "__post_hook_handle__"): + module.__post_hook_handle__.remove() + del module.__post_hook_handle__ + if hasattr(module, "__flops_handle__"): + module.__flops_handle__.remove() + del module.__flops_handle__ + if hasattr(module, "__start_time_hook_handle__"): + module.__start_time_hook_handle__.remove() + del module.__start_time_hook_handle__ + if hasattr(module, "__end_time_hook_handle__"): + module.__end_time_hook_handle__.remove() + del module.__end_time_hook_handle__ + + self.model.apply(remove_profile_attrs) + _reload_functionals() + + def reset_profile(self): + """Resets the profiling. + + Adds or resets the extra attributes. 
+ """ + def add_or_reset_attrs(module): + module.__flops__ = 0 + module.__params__ = sum(p.numel() for p in module.parameters() + if p.requires_grad) + module.__start_time__ = 0 + module.__duration__ = 0 + module.__steps__ = 0 + + self.model.apply(add_or_reset_attrs) + + def get_total_flops(self, in_str=False): + """Returns the total flops of the model. + + Args: + in_str (bool, optional): whether to output the flops in string. Defaults to False. + """ + if self.get_total_steps() == 0: + return 0 + sum = 0 + for module in self.model.modules(): + sum += module.__flops__ + total_flops = sum / self.get_total_steps() + return flops_to_string(total_flops) if in_str else total_flops + + def get_total_duration(self, in_str=False): + """Returns the total duration of the model forward pass. + + Args: + in_str (bool, optional): whether to output the duration in string. Defaults to False. + """ + if self.get_total_steps() == 0: + return 0 + total_duration = self.model.__duration__ / self.get_total_steps() + return duration_to_string(total_duration) if in_str else total_duration + + def get_total_params(self, in_str=False): + """Returns the total parameters of the model. + + Args: + in_str (bool, optional): whether to output the parameters in string. Defaults to False. + """ + return params_to_string( + self.model.__params__) if in_str else self.model.__params__ + + def get_total_steps(self): + """Returns the total number of steps (or input batches) profiled. + """ + def get_steps(module): + if module.__steps__ == 0: + sum = 0 + for m in module.children(): + sum += get_steps(m) + module.__steps__ = sum + return module.__steps__ + + total_steps = get_steps(self.model) + if total_steps == 0: + print("no step is profiled") + return total_steps + + def print_model_profile(self): + """Prints the model graph with the measured profile attached to each module. 
+ """ + total_flops = self.get_total_flops() + total_duration = self.get_total_duration() + total_params = self.get_total_params() + total_steps = self.get_total_steps() + + def accumulate_flops(module): + has_children = len(module._modules.items()) != 0 + if not has_children: + return module.__flops__ + else: + sum = 0 + for m in module.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(module): + params = module.__params__ + flops = 0 if total_steps == 0 else module.accumulate_flops() / total_steps + items = [ + params_to_string(params), + "{:.2%} Params".format(params / total_params), + flops_to_string(flops), + "{:.2%} MACs".format(0.0 if total_flops == 0 else flops / total_flops), + ] + duration = 0 if total_steps == 0 else module.__duration__ / total_steps + items.append(duration_to_string(duration)) + items.append("{:.2%} time".format(0.0 if total_duration == 0 else duration / + total_duration)) + # flops = 2 * MACs + items.append(("{:.2} TFLOPS".format(0.0 if duration == 0 else 2 * flops / + duration / 10**12))) + items.append(str(module.__steps__)) + items.append(module.original_extra_repr()) + return ", ".join(items) + + def add_extra_repr(module): + module.accumulate_flops = accumulate_flops.__get__(module) + flops_extra_repr = flops_repr.__get__(module) + if module.extra_repr != flops_extra_repr: + module.original_extra_repr = module.extra_repr + module.extra_repr = flops_extra_repr + assert module.extra_repr != module.original_extra_repr + + def del_extra_repr(module): + if hasattr(module, "original_extra_repr"): + module.extra_repr = module.original_extra_repr + del module.original_extra_repr + if hasattr(module, "accumulate_flops"): + del module.accumulate_flops + + self.model.apply(add_extra_repr) + print(self.model) + self.model.apply(del_extra_repr) + + def print_model_aggregated_profile(self, module_depth=-1, top_modules=3): + """Prints the names of the top top_modules modules in terms of aggregated time, flops, and parameters at depth module_depth. + + Args: + module_depth (int, optional): the depth of the modules to show. Defaults to -1 (the innermost modules). + top_modules (int, optional): the number of top modules to show. Defaults to 3. 
+ """ + info = {} + total_steps = self.get_total_steps() + if total_steps == 0: + return + if not hasattr(self.model, "__flops__"): + print( + "no __flops__ attribute in the model, call this function after start_profile and before end_profile" + ) + return + + def walk_module(module, curr_depth, info): + if curr_depth not in info: + info[curr_depth] = {} + if module.__class__.__name__ not in info[curr_depth]: + info[curr_depth][module.__class__.__name__] = [ + 0, + 0, + 0, + ] # flops, params, time + info[curr_depth][module.__class__.__name__][0] += module.__flops__ + info[curr_depth][module.__class__.__name__][1] += module.__params__ + info[curr_depth][module.__class__.__name__][2] += (module.__duration__) + has_children = len(module._modules.items()) != 0 + if has_children: + for child in module.children(): + walk_module(child, curr_depth + 1, info) + + walk_module(self.model, 0, info) + + depth = module_depth + if module_depth == -1: + depth = len(info) - 1 + + num_items = min(top_modules, len(info[depth])) + + sort_flops = { + k: flops_to_string(v[0] / total_steps) + for k, + v in sorted(info[depth].items(), + key=lambda item: item[1][0], + reverse=True)[:num_items] + } + sort_params = { + k: params_to_string(v[1]) + for k, + v in sorted(info[depth].items(), + key=lambda item: item[1][1], + reverse=True)[:num_items] + } + sort_time = { + k: duration_to_string(v[2] / total_steps) + for k, + v in sorted(info[depth].items(), + key=lambda item: item[1][2], + reverse=True)[:num_items] + } + print(f"Top {num_items} modules in flops at depth {depth} are {sort_flops}") + print(f"Top {num_items} modules in params at depth {depth} are {sort_params}") + print(f"Top {num_items} modules in time at depth {depth} are {sort_time}") + + +def _prod(dims): + p = 1 + for v in dims: + p *= v + return p + + +def _linear_flops_compute(input, weight, bias=None): + out_features = weight.shape[0] + return torch.numel(input) * out_features + + +def _relu_flops_compute(input, inplace=False): + return torch.numel(input) + + +def _pool_flops_compute( + input, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + return torch.numel(input) + + +def _conv_flops_compute(input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + assert weight.shape[1] * groups == input.shape[1] + + batch_size = input.shape[0] + in_channels = input.shape[1] + out_channels = weight.shape[0] + kernel_dims = list(weight.shape[-2:]) + input_dims = list(input.shape[2:]) + + paddings = padding if type(padding) is tuple else (padding, padding) + strides = stride if type(stride) is tuple else (stride, stride) + dilations = dilation if type(dilation) is tuple else (dilation, dilation) + + output_dims = [0, 0] + output_dims[0] = (input_dims[0] + 2 * paddings[0] - + (dilations[0] * (kernel_dims[0] - 1) + 1)) // strides[0] + 1 + output_dims[1] = (input_dims[1] + 2 * paddings[1] - + (dilations[1] * (kernel_dims[1] - 1) + 1)) // strides[1] + 1 + + filters_per_channel = out_channels // groups + conv_per_position_flops = int(_prod(kernel_dims)) * in_channels * filters_per_channel + active_elements_count = batch_size * int(_prod(output_dims)) + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + if bias is not None: + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + return int(overall_flops) + + +def _conv_trans_flops_compute( + input, + weight, + bias=None, + stride=1, + 
padding=0, + output_padding=0, + groups=1, + dilation=1, +): + batch_size = input.shape[0] + in_channels = input.shape[1] + out_channels = weight.shape[0] + kernel_dims = list(weight.shape[-2:]) + input_dims = list(input.shape[2:]) + + paddings = padding if type(padding) is tuple else (padding, padding) + strides = stride if type(stride) is tuple else (stride, stride) + dilations = dilation if type(dilation) is tuple else (dilation, dilation) + + output_dims = [0, 0] + output_dims[0] = (input_dims[0] + 2 * paddings[0] - + (dilations[0] * (kernel_dims[0] - 1) + 1)) // strides[0] + 1 + output_dims[1] = (input_dims[1] + 2 * paddings[1] - + (dilations[1] * (kernel_dims[1] - 1) + 1)) // strides[1] + 1 + + filters_per_channel = out_channels // groups + conv_per_position_flops = int(_prod(kernel_dims)) * in_channels * filters_per_channel + active_elements_count = batch_size * int(_prod(input_dims)) + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + if bias is not None: + bias_flops = out_channels * batch_size * int(_prod(output_dims)) + + overall_flops = overall_conv_flops + bias_flops + + return int(overall_flops) + + +def _batch_norm_flops_compute( + input, + running_mean, + running_var, + weight=None, + bias=None, + training=False, + momentum=0.1, + eps=1e-05, +): + # assume affine is true + flops = 2 * torch.numel(input) + return flops + + +def _upsample_flops_compute(input, + size=None, + scale_factor=None, + mode="nearest", + align_corners=None): + if size is not None: + return int(_prod(size)) + assert scale_factor is not None + flops = torch.numel(input) + if len(scale_factor) == len(input): + flops * int(_prod(scale_factor)) + else: + flops * scale_factor**len(input) + return flops + + +def _softmax_flops_compute(input, dim=None, _stacklevel=3, dtype=None): + return torch.numel(input) + + +def _embedding_flops_compute( + input, + weight, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, +): + return 0 + + +def _dropout_flops_compute(input, p=0.5, training=True, inplace=False): + return 0 + + +def wrapFunc(func, funcFlopCompute): + oldFunc = func + name = func.__name__ + old_functions[func.__name__] = oldFunc + + def newFunc(*args, **kwds): + flops = funcFlopCompute(*args, **kwds) + module_flop_count.append((name, flops)) + return oldFunc(*args, **kwds) + + return newFunc + + +def _patch_functionals(): + # FC + F.linear = wrapFunc(F.linear, _linear_flops_compute) + + # convolutions + F.conv1d = wrapFunc(F.conv1d, _conv_flops_compute) + F.conv2d = wrapFunc(F.conv2d, _conv_flops_compute) + F.conv3d = wrapFunc(F.conv3d, _conv_flops_compute) + + # conv transposed + F.conv_transpose1d = wrapFunc(F.conv_transpose1d, _conv_trans_flops_compute) + F.conv_transpose2d = wrapFunc(F.conv_transpose2d, _conv_trans_flops_compute) + F.conv_transpose3d = wrapFunc(F.conv_transpose3d, _conv_trans_flops_compute) + + # activations + F.relu = wrapFunc(F.relu, _relu_flops_compute) + F.prelu = wrapFunc(F.prelu, _relu_flops_compute) + F.elu = wrapFunc(F.elu, _relu_flops_compute) + F.leaky_relu = wrapFunc(F.leaky_relu, _relu_flops_compute) + F.relu6 = wrapFunc(F.relu6, _relu_flops_compute) + + # BatchNorms + F.batch_norm = wrapFunc(F.batch_norm, _batch_norm_flops_compute) + + # poolings + F.avg_pool1d = wrapFunc(F.avg_pool1d, _pool_flops_compute) + F.avg_pool2d = wrapFunc(F.avg_pool2d, _pool_flops_compute) + F.avg_pool3d = wrapFunc(F.avg_pool3d, _pool_flops_compute) + F.max_pool1d = wrapFunc(F.max_pool1d, _pool_flops_compute) + 
F.max_pool2d = wrapFunc(F.max_pool2d, _pool_flops_compute) + F.max_pool3d = wrapFunc(F.max_pool3d, _pool_flops_compute) + F.adaptive_avg_pool1d = wrapFunc(F.adaptive_avg_pool1d, _pool_flops_compute) + F.adaptive_avg_pool2d = wrapFunc(F.adaptive_avg_pool2d, _pool_flops_compute) + F.adaptive_avg_pool3d = wrapFunc(F.adaptive_avg_pool3d, _pool_flops_compute) + F.adaptive_max_pool1d = wrapFunc(F.adaptive_max_pool1d, _pool_flops_compute) + F.adaptive_max_pool2d = wrapFunc(F.adaptive_max_pool2d, _pool_flops_compute) + F.adaptive_max_pool3d = wrapFunc(F.adaptive_max_pool3d, _pool_flops_compute) + + # upsample + F.upsample = wrapFunc(F.upsample, _upsample_flops_compute) + F.interpolate = wrapFunc(F.interpolate, _upsample_flops_compute) + + # softmax + F.softmax = wrapFunc(F.softmax, _softmax_flops_compute) + + # embedding + F.embedding = wrapFunc(F.embedding, _embedding_flops_compute) + + +def _reload_functionals(): + # torch.nn.functional does not support importlib.reload() + F.linear = old_functions["linear"] + F.conv1d = old_functions["conv1d"] + F.conv2d = old_functions["conv2d"] + F.conv3d = old_functions["conv3d"] + F.conv_transpose1d = old_functions["conv_transpose1d"] + F.conv_transpose2d = old_functions["conv_transpose2d"] + F.conv_transpose3d = old_functions["conv_transpose3d"] + F.relu = old_functions["relu"] + F.prelu = old_functions["prelu"] + F.elu = old_functions["elu"] + F.leaky_relu = old_functions["leaky_relu"] + F.relu6 = old_functions["relu6"] + F.batch_norm = old_functions["batch_norm"] + F.avg_pool1d = old_functions["avg_pool1d"] + F.avg_pool2d = old_functions["avg_pool2d"] + F.avg_pool3d = old_functions["avg_pool3d"] + F.max_pool1d = old_functions["max_pool1d"] + F.max_pool2d = old_functions["max_pool2d"] + F.max_pool3d = old_functions["max_pool3d"] + F.adaptive_avg_pool1d = old_functions["adaptive_avg_pool1d"] + F.adaptive_avg_pool2d = old_functions["adaptive_avg_pool2d"] + F.adaptive_avg_pool3d = old_functions["adaptive_avg_pool3d"] + F.adaptive_max_pool1d = old_functions["adaptive_max_pool1d"] + F.adaptive_max_pool2d = old_functions["adaptive_max_pool2d"] + F.adaptive_max_pool3d = old_functions["adaptive_max_pool3d"] + F.upsample = old_functions["upsample"] + F.interpolate = old_functions["interpolate"] + F.softmax = old_functions["softmax"] + F.embedding = old_functions["embedding"] + + +def _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size): + # matrix matrix mult ih state and internal state + flops += w_ih.shape[0] * w_ih.shape[1] + # matrix matrix mult hh state and internal state + flops += w_hh.shape[0] * w_hh.shape[1] + if isinstance(rnn_module, (nn.RNN, nn.RNNCell)): + # add both operations + flops += rnn_module.hidden_size + elif isinstance(rnn_module, (nn.GRU, nn.GRUCell)): + # hadamard of r + flops += rnn_module.hidden_size + # adding operations from both states + flops += rnn_module.hidden_size * 3 + # last two hadamard _product and add + flops += rnn_module.hidden_size * 3 + elif isinstance(rnn_module, (nn.LSTM, nn.LSTMCell)): + # adding operations from both states + flops += rnn_module.hidden_size * 4 + # two hadamard _product and add for C state + flops += rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size + # final hadamard + flops += rnn_module.hidden_size + rnn_module.hidden_size + rnn_module.hidden_size + return flops + + +def _rnn_forward_hook(rnn_module, input, output): + flops = 0 + # input is a tuple containing a sequence to process and (optionally) hidden state + inp = input[0] + batch_size = inp.shape[0] + seq_length = 
inp.shape[1] + num_layers = rnn_module.num_layers + + for i in range(num_layers): + w_ih = rnn_module.__getattr__("weight_ih_l" + str(i)) + w_hh = rnn_module.__getattr__("weight_hh_l" + str(i)) + if i == 0: + input_size = rnn_module.input_size + else: + input_size = rnn_module.hidden_size + flops = _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size) + if rnn_module.bias: + b_ih = rnn_module.__getattr__("bias_ih_l" + str(i)) + b_hh = rnn_module.__getattr__("bias_hh_l" + str(i)) + flops += b_ih.shape[0] + b_hh.shape[0] + + flops *= batch_size + flops *= seq_length + if rnn_module.bidirectional: + flops *= 2 + rnn_module.__flops__ += int(flops) + + +def _rnn_cell_forward_hook(rnn_cell_module, input, output): + flops = 0 + inp = input[0] + batch_size = inp.shape[0] + w_ih = rnn_cell_module.__getattr__("weight_ih") + w_hh = rnn_cell_module.__getattr__("weight_hh") + input_size = inp.shape[1] + flops = _rnn_flops(flops, rnn_cell_module, w_ih, w_hh, input_size) + if rnn_cell_module.bias: + b_ih = rnn_cell_module.__getattr__("bias_ih") + b_hh = rnn_cell_module.__getattr__("bias_hh") + flops += b_ih.shape[0] + b_hh.shape[0] + + flops *= batch_size + rnn_cell_module.__flops__ += int(flops) + + +MODULE_HOOK_MAPPING = { + # RNN + nn.RNN: _rnn_forward_hook, + nn.GRU: _rnn_forward_hook, + nn.LSTM: _rnn_forward_hook, + nn.RNNCell: _rnn_cell_forward_hook, + nn.LSTMCell: _rnn_cell_forward_hook, + nn.GRUCell: _rnn_cell_forward_hook, +} + + +def flops_to_string(flops, units=None, precision=2): + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.0**9, precision)) + " GMACs" + elif flops // 10**6 > 0: + return str(round(flops / 10.0**6, precision)) + " MMACs" + elif flops // 10**3 > 0: + return str(round(flops / 10.0**3, precision)) + " KMACs" + else: + return str(flops) + " MACs" + else: + if units == "GMACs": + return str(round(flops / 10.0**9, precision)) + " " + units + elif units == "MMACs": + return str(round(flops / 10.0**6, precision)) + " " + units + elif units == "KMACs": + return str(round(flops / 10.0**3, precision)) + " " + units + else: + return str(flops) + " MACs" + + +def params_to_string(params_num, units=None, precision=2): + if units is None: + if params_num // 10**6 > 0: + return str(round(params_num / 10**6, 2)) + " M" + elif params_num // 10**3: + return str(round(params_num / 10**3, 2)) + " k" + else: + return str(params_num) + else: + if units == "M": + return str(round(params_num / 10.0**6, precision)) + " " + units + elif units == "K": + return str(round(params_num / 10.0**3, precision)) + " " + units + else: + return str(params_num) + + +def duration_to_string(duration, units=None, precision=2): + if units is None: + if duration > 1: + return str(round(duration, precision)) + " s" + elif duration * 10**3 > 1: + return str(round(duration * 10**3, precision)) + " ms" + elif duration * 10**6 > 1: + return str(round(duration * 10**6, precision)) + " us" + else: + return str(duration) + else: + if units == "us": + return str(round(duration * 10.0**6, precision)) + " " + units + elif units == "ms": + return str(round(duration * 10.0**3, precision)) + " " + units + else: + return str(round(duration, precision)) + " s" + + +def get_model_profile( + model, + input_res, + input_constructor=None, + print_profile=True, + print_aggregated_profile=True, + module_depth=-1, + top_modules=3, + warm_up=5, + num_steps=10, + as_strings=True, + ignore_modules=None, +): + """Returns the total flops, parameters, and profiled steps of a model. 
+ + Args: + model ([torch.nn.Module]): the PyTorch model to be profiled. + input_res (list): input shape or input to the input_constructor + input_constructor (func, optional): input constructor. If specified, the constructor is applied to input_res and the constructor output is used as the input to the model. Defaults to None. + print_profile (bool, optional): whether to print the model graph with the profile annotated. Defaults to True. + print_aggregated_profile (bool, optional): whether to print the aggregated profile for top modules. Defaults to True. + module_depth (int, optional): the depth into the nested modules. Defaults to -1 (the inner most modules). + top_modules (int, optional): the number of top modules to print in the aggregated profile. Defaults to 3. + warm_up (int, optional): the number of warm-up steps before measuring the time of each module. Defaults to 5. + num_steps (int, optional): the number of steps to profile. Defaults to 10. + as_strings (bool, optional): whether to print the output as strings. Defaults to True. + ignore_modules ([type], optional): the list of modules to ignore during profiling. Defaults to None. + """ + assert type(input_res) is tuple + assert len(input_res) >= 1 + assert isinstance(model, nn.Module) + prof = FlopsProfiler(model) + model.eval() + for _ in range(warm_up): + if input_constructor: + input = input_constructor(input_res) + _ = model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (*input_res), + dtype=next(model.parameters()).dtype, + device=next(model.parameters()).device, + ) + except StopIteration: + batch = torch.ones(()).new_empty((*input_res)) + _ = model(batch) + + prof.start_profile(ignore_list=ignore_modules) + + for _ in range(num_steps): + if input_constructor: + input = input_constructor(input_res) + _ = model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (*input_res), + dtype=next(model.parameters()).dtype, + device=next(model.parameters()).device, + ) + except StopIteration: + batch = torch.ones(()).new_empty((*input_res)) + _ = model(batch) + + flops = prof.get_total_flops() + params = prof.get_total_params() + steps = prof.get_total_steps() + if print_profile: + prof.print_model_profile() + if print_aggregated_profile: + prof.print_model_aggregated_profile(module_depth=module_depth, + top_modules=top_modules) + prof.end_profile() + if as_strings: + return flops_to_string(flops), params_to_string(params), steps + + return flops, params, steps diff --git a/deepspeed/pt/deepspeed_utils.py b/deepspeed/pt/deepspeed_utils.py deleted file mode 100755 index cb90a92c3c91..000000000000 --- a/deepspeed/pt/deepspeed_utils.py +++ /dev/null @@ -1,273 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team - -Copyright NVIDIA/Megatron - -Helper functions and classes from multiple sources. 
-''' - -import torch -from torch._six import inf - -from deepspeed.pt.log_utils import logger - - -class CheckOverflow(object): - '''Checks for overflow in gradient across parallel process''' - def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False): - self.mpu = mpu - self.params = [] if param_groups else None - self.zero_reduce_scatter = zero_reduce_scatter - if param_groups: - for group in param_groups: - for param in group: - self.params.append(param) - - def check_using_norm(self, norm_group): - overflow = -1 in norm_group - - if self.mpu is not None: - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) - overflow = overflow_gpu[0].item() - - return bool(overflow) - - def check(self, param_groups=None): - - #TODO: what's the equivalent here? do we need this? - # for group in self.fp32_from_fp32_groups: - # for param in group: - # params.append(param) - - params = [] - if param_groups is None: - params = self.params - else: - assert param_groups is not None, \ - "self.params and param_groups both cannot be none" - - for group in param_groups: - for param in group: - params.append(param) - - return self.has_overflow(params) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for i, p in enumerate(params): - if p.grad is not None and self._has_inf_or_nan(p.grad.data, i): - return True - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - overflow_gpu = torch.cuda.ByteTensor([overflow]) - #torch.distributed.all_reduce(overflow_gpu, - # op=torch.distributed.ReduceOp.MAX, - # group=mpu.get_model_parallel_group()) - if self.zero_reduce_scatter: - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=torch.distributed.group.WORLD) - elif self.mpu is not None: - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, i): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) - - -def get_grad_norm(parameters, norm_type=2, mpu=None): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. Taken from Nvidia Megatron. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all GPUs. - if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0. - for p in parameters: - if mpu is not None: - if (mpu.get_model_parallel_rank() == 0) or (hasattr(p, - 'model_parallel') - and p.model_parallel): - param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type - else: - param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type - - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - -def get_weight_norm(parameters, norm_type=2, mpu=None): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. Taken from Nvidia Megatron. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(p.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all GPUs. 
- if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0. - for p in parameters: - if mpu is not None: - if (mpu.get_model_parallel_rank() == 0) or (hasattr(p, - 'model_parallel') - and p.model_parallel): - try: - param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) - except TypeError as err: - param_norm = float(torch.norm(p.float(), norm_type)) - - #param_norm = p.data.float().norm(norm_type) - total_norm += param_norm**norm_type - else: - try: - param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) - except TypeError as err: - param_norm = float(torch.norm(p.float(), norm_type)) - #param_norm = p.data.float().norm(norm_type) - total_norm += param_norm**norm_type - - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - if mpu is not None: - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - -def is_model_parallel_parameter(p): - return hasattr(p, 'model_parallel') and p.model_parallel - - -def see_memory_usage(message): - return - if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: - return - - # Print message except when distributed but not rank 0 - logger.info(message) - logger.info( - "Memory Allocated %s GigaBytes ", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), - ) - logger.info( - "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), - ) diff --git a/deepspeed/runtime/__init__.py b/deepspeed/runtime/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/runtime/activation_checkpointing/__init__.py b/deepspeed/runtime/activation_checkpointing/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/pt/deepspeed_checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py old mode 100755 new mode 100644 similarity index 90% rename from deepspeed/pt/deepspeed_checkpointing.py rename to deepspeed/runtime/activation_checkpointing/checkpointing.py index 2a5bb2ab688b..c7decc0ca0b7 --- a/deepspeed/pt/deepspeed_checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -13,16 +13,17 @@ # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch -import contextlib import copy -import torch.distributed as dist import torch +import contextlib +import torch.distributed as dist + from torch import _C from torch.cuda import _lazy_call, device as device_ctx_manager -from deepspeed.pt.deepspeed_timer import SynchronizedWallClockTimer as Timers -import torch.distributed as dist -from deepspeed.pt.deepspeed_config import DeepSpeedConfig -from deepspeed.pt.log_utils import logger + +from deepspeed.runtime.config import DeepSpeedConfig +from deepspeed.utils import logger +from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers #DeepSpeed 
Checkpointing Enabled or Disabled deepspeed_checkpointing_enabled = False @@ -372,6 +373,10 @@ def forward(ctx, run_function, *args): inputs = [] for i, item in enumerate(args[:-1]): + if not torch.is_tensor(item): + inputs.append(item) + continue + partition_size = get_partition_size(item) partition = item.detach().contiguous().view(-1).narrow( 0, @@ -412,7 +417,12 @@ def forward(ctx, run_function, *args): inputs.append(args[-1]) #just in case something funky is happening such as reuse of inputs - inputs_cuda = [item.to(cuda_device) for item in args] + inputs_cuda = [] + for item in args: + if torch.is_tensor(item): + inputs_cuda.append(item.to(cuda_device)) + else: + inputs_cuda.append(item) # Copy the rng states. ctx.fwd_cpu_rng_state = torch.get_rng_state() @@ -438,6 +448,10 @@ def forward(ctx, run_function, *args): if PARTITION_ACTIVATIONS: new_args = [] for i, (arg, inp) in enumerate(zip(args, inputs)): + if not torch.is_tensor(arg): + new_args.append(arg) + continue + size = torch.tensor(arg.size()) arg.data = inp.data @@ -479,10 +493,17 @@ def forward(ctx, run_function, *args): timers.log(['forward']) if SYNCHRONIZE: torch.cuda.synchronize() + + # Tensors returned from forward() may not be differentiable. + if torch.is_tensor(outputs): + non_grad_outputs = [outputs] if not outputs.is_floating_point() else [] + else: + non_grad_outputs = [o for o in outputs if not o.is_floating_point()] + ctx.mark_non_differentiable(*non_grad_outputs) return outputs @staticmethod - def backward(ctx, *args): + def backward(ctx, *grads): global timers #see_memory_usage("In backward", force=True) #removing pointers to the contiguous buffer memory @@ -547,14 +568,32 @@ def backward(ctx, *args): if isinstance(outputs, torch.Tensor): outputs = (outputs, ) - torch.autograd.backward(outputs, args) + + # Construct arguments to autograd.backward(). + # This is usually just outputs and grads, but forward() can return tensors that + # are not differentiable. 
+ output_tensors = [] + grad_tensors = [] + for out, grad in zip(outputs, grads): + if out.requires_grad: + output_tensors.append(out) + grad_tensors.append(grad) + + torch.autograd.backward(output_tensors, grad_tensors) if PROFILE_TIME: timers('backward').stop() timers.log(['backward']) if SYNCHRONIZE: torch.cuda.synchronize() - return (None, ) + tuple(inp.grad for inp in detached_inputs) + ret_list = [None] # first None for ctx + for inp in detached_inputs: + if torch.is_tensor(inp): + ret_list.append(inp.grad) + else: + ret_list.append(None) + + return tuple(ret_list) def checkpoint(function, *args): @@ -602,11 +641,11 @@ def reset(): size_offsets = [] -def _configure_using_config_file(deepspeed_config): +def _configure_using_config_file(deepspeed_config, mpu=None): global num_layers, PARTITION_ACTIVATIONS, CONTIGUOUS_CHECKPOINTING, \ PA_TO_CPU, SYNCHRONIZE, PROFILE_TIME - config = DeepSpeedConfig(deepspeed_config).activation_checkpointing_config + config = DeepSpeedConfig(deepspeed_config, mpu=mpu).activation_checkpointing_config logger.info(config.repr()) PARTITION_ACTIVATIONS = config.partition_activations CONTIGUOUS_CHECKPOINTING = config.contiguous_memory_optimization @@ -684,12 +723,12 @@ def configure( _configure_defaults() - if deepspeed_config is not None: - _configure_using_config_file(deepspeed_config) - if mpu_ is not None: mpu = mpu_ + if deepspeed_config is not None: + _configure_using_config_file(deepspeed_config, mpu=mpu) + if partition_activations is not None: PARTITION_ACTIVATIONS = partition_activations diff --git a/deepspeed/pt/deepspeed_checkpointing_config.py b/deepspeed/runtime/activation_checkpointing/config.py similarity index 95% rename from deepspeed/pt/deepspeed_checkpointing_config.py rename to deepspeed/runtime/activation_checkpointing/config.py index 63e2b29f702a..30ac5157f843 100755 --- a/deepspeed/pt/deepspeed_checkpointing_config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -3,7 +3,7 @@ Licensed under the MIT license. 
""" -from deepspeed.pt.deepspeed_config_utils import get_scalar_param +from deepspeed.runtime.config_utils import get_scalar_param ######################################### # DeepSpeed Activation Checkpointing diff --git a/deepspeed/pt/deepspeed_config.py b/deepspeed/runtime/config.py similarity index 51% rename from deepspeed/pt/deepspeed_config.py rename to deepspeed/runtime/config.py index f618124d74c7..2aeb5135350f 100755 --- a/deepspeed/pt/deepspeed_config.py +++ b/deepspeed/runtime/config.py @@ -6,17 +6,61 @@ import torch import json import copy -from deepspeed.pt.deepspeed_constants import * -from deepspeed.pt.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE -from deepspeed.pt.deepspeed_config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys -from deepspeed.pt.deepspeed_zero_config import DeepSpeedZeroConfig -from deepspeed.pt.deepspeed_checkpointing_config import DeepSpeedActivationCheckpointingConfig -from deepspeed.pt.log_utils import logger + +from .constants import * +from .fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE +from .config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys +from .zero.config import DeepSpeedZeroConfig +from .zero.constants import * +from .activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig + +from ..git_version_info import version as __version__ +from ..utils import logger + +from ..elasticity import elasticity_enabled, compute_elastic_config, ensure_immutable_elastic_config +from ..elasticity.config import ElasticityConfigError +from ..elasticity.constants import ELASTICITY, IGNORE_NON_ELASTIC_BATCH_INFO, \ + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT + +from ..profiling.config import DeepSpeedFlopsProfilerConfig TENSOR_CORE_ALIGN_SIZE = 8 + ADAM_OPTIMIZER = 'adam' +ADAMW_OPTIMIZER = 'adamw' LAMB_OPTIMIZER = 'lamb' -DEEPSPEED_OPTIMIZERS = [ADAM_OPTIMIZER, LAMB_OPTIMIZER] +ONEBIT_ADAM_OPTIMIZER = 'onebitadam' +DEEPSPEED_OPTIMIZERS = [ + ADAM_OPTIMIZER, + ADAMW_OPTIMIZER, + LAMB_OPTIMIZER, + ONEBIT_ADAM_OPTIMIZER, +] + +# extra optimizer parameters for adam/adamw +TORCH_ADAM_PARAM = "torch_adam" + + +class DeepSpeedConfigError(Exception): + pass + + +def get_pld_enabled(param_dict): + if PROGRESSIVE_LAYER_DROP in param_dict.keys(): + return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], + PLD_ENABLED, + PLD_ENABLED_DEFAULT) + else: + return False + + +def get_pld_params(param_dict): + if PROGRESSIVE_LAYER_DROP in param_dict.keys(): + pld_params = copy.copy(param_dict[PROGRESSIVE_LAYER_DROP]) + pld_params.pop(PLD_ENABLED) + return pld_params + else: + return False def get_amp_enabled(param_dict): @@ -110,22 +154,9 @@ def get_zero_optimization(param_dict): def get_zero_reduce_scatter(param_dict): - return get_scalar_param(param_dict, ZERO_REDUCE_SCATTER, ZERO_REDUCE_SCATTER_DEFAULT) - - -def get_zero_max_elements_per_comm(param_dict): - return get_scalar_param(param_dict, - ZERO_MAX_ELEMENTS_PER_COMM, - ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT) - - -def get_allgather_size(param_dict): return get_scalar_param(param_dict, - ALLGATHER_SIZE, - ALLGATHER_SIZE_DEFAULT) if get_scalar_param( - param_dict, - ALLGATHER_SIZE, - ALLGATHER_SIZE_DEFAULT) > 0 else ALLGATHER_SIZE_DEFAULT + ZERO_OPTIMIZATION_REDUCE_SCATTER, + ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT) def get_allreduce_always_fp32(param_dict): @@ -158,6 +189,191 @@ def get_gradient_clipping(param_dict): return get_scalar_param(param_dict, GRADIENT_CLIPPING, 
GRADIENT_CLIPPING_DEFAULT) +def get_sparse_attention(param_dict): + if SPARSE_ATTENTION in param_dict.keys(): + sparsity = param_dict[SPARSE_ATTENTION] + mode = get_sparse_attention_mode(sparsity) + + if (mode == SPARSE_DENSE_MODE): + return get_sparse_dense_config(sparsity) + elif (mode == SPARSE_FIXED_MODE): + return get_sparse_fixed_config(sparsity) + elif (mode == SPARSE_VARIABLE_MODE): + return get_sparse_variable_config(sparsity) + elif (mode == SPARSE_BIGBIRD_MODE): + return get_sparse_bigbird_config(sparsity) + elif (mode == SPARSE_BSLONGFORMER_MODE): + return get_sparse_bslongformer_config(sparsity) + else: + raise NotImplementedError( + f'Given sparsity mode, {mode}, has not been implemented yet!') + + else: + return None + + +def get_sparse_dense_config(sparsity): + block = get_scalar_param(sparsity, SPARSE_BLOCK, SPARSE_BLOCK_DEFAULT) + return {SPARSE_MODE: SPARSE_DENSE_MODE, SPARSE_BLOCK: block} + + +def get_sparse_fixed_config(sparsity): + block = get_scalar_param(sparsity, SPARSE_BLOCK, SPARSE_BLOCK_DEFAULT) + different_layout_per_head = get_scalar_param( + sparsity, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT) + num_local_blocks = get_scalar_param(sparsity, + SPARSE_NUM_LOCAL_BLOCKS, + SPARSE_NUM_LOCAL_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, + SPARSE_NUM_GLOBAL_BLOCKS, + SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + attention = get_scalar_param(sparsity, + SPARSE_ATTENTION_TYPE, + SPARSE_ATTENTION_TYPE_DEFAULT) + horizontal_global_attention = get_scalar_param( + sparsity, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT) + num_different_global_patterns = get_scalar_param( + sparsity, + SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS, + SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT) + + return { + SPARSE_MODE: SPARSE_FIXED_MODE, + SPARSE_BLOCK: block, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD: different_layout_per_head, + SPARSE_NUM_LOCAL_BLOCKS: num_local_blocks, + SPARSE_NUM_GLOBAL_BLOCKS: num_global_blocks, + SPARSE_ATTENTION_TYPE: attention, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION: horizontal_global_attention, + SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS: num_different_global_patterns + } + + +def get_sparse_variable_config(sparsity): + block = get_scalar_param(sparsity, SPARSE_BLOCK, SPARSE_BLOCK_DEFAULT) + different_layout_per_head = get_scalar_param( + sparsity, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, + SPARSE_NUM_RANDOM_BLOCKS, + SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + local_window_blocks = get_scalar_param(sparsity, + SPARSE_LOCAL_WINDOW_BLOCKS, + SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT) + global_block_indices = get_scalar_param(sparsity, + SPARSE_GLOBAL_BLOCK_INDICES, + SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + global_block_end_indices = get_scalar_param(sparsity, + SPARSE_GLOBAL_BLOCK_END_INDICES, + SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT) + attention = get_scalar_param(sparsity, + SPARSE_ATTENTION_TYPE, + SPARSE_ATTENTION_TYPE_DEFAULT) + horizontal_global_attention = get_scalar_param( + sparsity, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT) + + return { + SPARSE_MODE: SPARSE_VARIABLE_MODE, + SPARSE_BLOCK: block, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD: different_layout_per_head, + SPARSE_NUM_RANDOM_BLOCKS: num_random_blocks, + SPARSE_LOCAL_WINDOW_BLOCKS: local_window_blocks, + SPARSE_GLOBAL_BLOCK_INDICES: global_block_indices, + SPARSE_GLOBAL_BLOCK_END_INDICES: global_block_end_indices, + 
SPARSE_ATTENTION_TYPE: attention, + SPARSE_HORIZONTAL_GLOBAL_ATTENTION: horizontal_global_attention + } + + +def get_sparse_bigbird_config(sparsity): + block = get_scalar_param(sparsity, SPARSE_BLOCK, SPARSE_BLOCK_DEFAULT) + different_layout_per_head = get_scalar_param( + sparsity, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, + SPARSE_NUM_RANDOM_BLOCKS, + SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + num_sliding_window_blocks = get_scalar_param( + sparsity, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, + SPARSE_NUM_GLOBAL_BLOCKS, + SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + + return { + SPARSE_MODE: SPARSE_BIGBIRD_MODE, + SPARSE_BLOCK: block, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD: different_layout_per_head, + SPARSE_NUM_RANDOM_BLOCKS: num_random_blocks, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS: num_sliding_window_blocks, + SPARSE_NUM_GLOBAL_BLOCKS: num_global_blocks + } + + +def get_sparse_bslongformer_config(sparsity): + block = get_scalar_param(sparsity, SPARSE_BLOCK, SPARSE_BLOCK_DEFAULT) + different_layout_per_head = get_scalar_param( + sparsity, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT) + num_sliding_window_blocks = get_scalar_param( + sparsity, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT) + global_block_indices = get_scalar_param(sparsity, + SPARSE_GLOBAL_BLOCK_INDICES, + SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + global_block_end_indices = get_scalar_param(sparsity, + SPARSE_GLOBAL_BLOCK_END_INDICES, + SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT) + + return { + SPARSE_MODE: SPARSE_BSLONGFORMER_MODE, + SPARSE_BLOCK: block, + SPARSE_DIFFERENT_LAYOUT_PER_HEAD: different_layout_per_head, + SPARSE_NUM_SLIDING_WINDOW_BLOCKS: num_sliding_window_blocks, + SPARSE_GLOBAL_BLOCK_INDICES: global_block_indices, + SPARSE_GLOBAL_BLOCK_END_INDICES: global_block_end_indices + } + + +def get_sparse_attention_mode(param_dict): + if SPARSE_MODE in param_dict.keys(): + return param_dict[SPARSE_MODE] + else: + return SPARSE_MODE_DEFAULT + + +def get_sparse_attention_type(param_dict): + if SPARSE_ATTENTION_TYPE in param_dict.keys(): + return param_dict[SPARSE_ATTENTION_TYPE] + else: + return SPARSE_ATTENTION_TYPE_DEFAULT + + +def get_pipeline_config(param_dict): + '''Parses pipeline engine configuration. ''' + default_pipeline = { + 'stages': 'auto', + 'partition': 'best', + 'seed_layers': False, + 'activation_checkpoint_interval': 0 + } + config = default_pipeline + for key, val in param_dict.get('pipeline', {}).items(): + config[key] = val + return config + + def get_optimizer_name(param_dict): if OPTIMIZER in param_dict.keys() and \ TYPE in param_dict[OPTIMIZER].keys(): @@ -260,6 +476,21 @@ def get_tensorboard_job_name(param_dict): return TENSORBOARD_JOB_NAME_DEFAULT +def get_checkpoint_params(param_dict): + return param_dict.get(CHECKPOINT, {}) + + +def get_checkpoint_tag_validation_mode(checkpoint_params): + tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, + CHECKPOINT_TAG_VALIDATION_DEFAULT) + tag_validation_mode = tag_validation_mode.upper() + if tag_validation_mode in CHECKPOINT_TAG_VALIDATION_MODES: + return tag_validation_mode + else: + raise DeepSpeedConfigError("Checkpoint config contains invalid tag_validation " \ + f"value of {tag_validation_mode}, expecting one of {CHECKPOINT_TAG_VALIDATION_MODES}") + + '''Write deepspeed config files by modifying basic templates. 
Can be used for quicly changing parameters via command line parameters.''' @@ -303,6 +534,59 @@ def __init__(self, json_file, mpu=None, param_dict=None): self.global_rank = 0 self.world_size = 1 + # If elastic-mode enabled, update compute + update _param_dict + self.elasticity_enabled = elasticity_enabled(self._param_dict) + if self.elasticity_enabled: + logger.info("DeepSpeed elasticity support enabled") + final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config( + ds_config=self._param_dict, + target_deepspeed_version=__version__, + world_size=self.world_size) + + elastic_dict = self._param_dict[ELASTICITY] + + # Ensure the resource scheduler saw the same elastic config we are using at runtime + ensure_immutable_elastic_config(runtime_elastic_config_dict=elastic_dict) + + ignore_non_elastic_batch_info = elastic_dict.get( + IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + + if not ignore_non_elastic_batch_info: + batch_params = [ + TRAIN_BATCH_SIZE, + TRAIN_MICRO_BATCH_SIZE_PER_GPU, + GRADIENT_ACCUMULATION_STEPS + ] + if any(map(lambda t: t in self._param_dict, batch_params)): + raise ElasticityConfigError("One or more batch related parameters were found in your " \ + f"ds_config ({TRAIN_BATCH_SIZE}, {TRAIN_MICRO_BATCH_SIZE_PER_GPU}, and/or " \ + f"{GRADIENT_ACCUMULATION_STEPS}). These parameters *will not be used* since " \ + "elastic training is enabled, which takes control of these parameters. " \ + "If you want to supress this error (the parameters will be silently ignored) " \ + f"please set {IGNORE_NON_ELASTIC_BATCH_INFO}':true in your elasticity config.") + + # micro_bsz * world_size * gas = total_batch_size + # gas = total_batch_size // (micro_bsz * world_size) + gradient_accu_steps = final_batch_size // (micro_batch_size * + self.world_size) + + if TRAIN_BATCH_SIZE in self._param_dict: + logger.warning("[Elasticity] overriding training_batch_size: " \ + f"{self._param_dict[TRAIN_BATCH_SIZE]} -> {final_batch_size}") + if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self._param_dict: + logger.warning("[Elasticity] overriding train_micro_batch_size_per_gpu: " \ + f"{self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU]} -> {micro_batch_size}") + if GRADIENT_ACCUMULATION_STEPS in self._param_dict: + logger.warning("[Elasticity] overriding gradient_accumulation_steps: "\ + f"{self._param_dict[GRADIENT_ACCUMULATION_STEPS]} -> {gradient_accu_steps}") + + logger.info(f"[Elasticity] valid GPU counts: {valid_gpus}") + + self._param_dict[TRAIN_BATCH_SIZE] = final_batch_size + self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = micro_batch_size + self._param_dict[GRADIENT_ACCUMULATION_STEPS] = gradient_accu_steps + self._initialize_params(self._param_dict) self._configure_train_batch_size() self._do_sanity_check() @@ -321,8 +605,6 @@ def _initialize_params(self, param_dict): self.gradient_predivide_factor = get_gradient_predivide_factor(param_dict) self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict) - self.allgather_size = get_allgather_size(param_dict) - self.zero_config = DeepSpeedZeroConfig(param_dict) self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 @@ -353,11 +635,23 @@ def _initialize_params(self, param_dict): self.scheduler_params = get_scheduler_params(param_dict) self.wall_clock_breakdown = get_wall_clock_breakdown(param_dict) + self.flops_profiler_config = DeepSpeedFlopsProfilerConfig(param_dict) self.memory_breakdown = get_memory_breakdown(param_dict) self.tensorboard_enabled = 
get_tensorboard_enabled(param_dict) self.tensorboard_output_path = get_tensorboard_output_path(param_dict) self.tensorboard_job_name = get_tensorboard_job_name(param_dict) + self.sparse_attention = get_sparse_attention(param_dict) + self.pipeline = get_pipeline_config(param_dict) + + self.pld_enabled = get_pld_enabled(param_dict) + self.pld_params = get_pld_params(param_dict) + + checkpoint_params = get_checkpoint_params(param_dict) + validation_mode = get_checkpoint_tag_validation_mode(checkpoint_params) + self.checkpoint_tag_validation_enabled = validation_mode != ValidationMode.IGNORE + self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL + def _batch_assertion(self): train_batch = self.train_batch_size @@ -426,10 +720,6 @@ def _set_batch_related_parameters(self): assert False, \ 'Either train_batch_size or micro_batch_per_gpu needs to be provided' - logger.info( - f' After Train batch {self.train_batch_size} micro_batch {self.train_micro_batch_size_per_gpu} and grad_acc {self.gradient_accumulation_steps}' - ) - def _configure_train_batch_size(self): self._set_batch_related_parameters() self._batch_assertion() @@ -454,15 +744,18 @@ def print(self, name): ':')))) def _do_error_check(self): - if self.zero_enabled: - assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" - assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) - assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) - assert self.gradient_accumulation_steps, 'DeepSpeedConfig: {} is not defined'.format( + assert self.gradient_accumulation_steps, "DeepSpeedConfig: {} is not defined".format( GRADIENT_ACCUMULATION_STEPS) + if self.zero_enabled: + assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled" + assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION) + if self.zero_config.cpu_offload is True: + assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS) + #assert self.gradient_accumulation_steps == 1, "DeepSpeedConfig: {}is not supported for {}".format(GRADIENT_ACCUMULATION_STEPS, ZERO_OPTIMIZATION_CPU_OFFLOAD) + def _do_warning_check(self): fp16_enabled = self.fp16_enabled or self.zero_enabled @@ -477,12 +770,14 @@ def _do_warning_check(self): MAX_GRAD_NORM in self.optimizer_params.keys() and \ self.optimizer_params[MAX_GRAD_NORM] > 0: if fp16_enabled: - logger.warning( - 'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper' - .format(MAX_GRAD_NORM, - self.optimizer_params[MAX_GRAD_NORM])) + if self.global_rank == 0: + logger.warning( + 'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper' + .format(MAX_GRAD_NORM, + self.optimizer_params[MAX_GRAD_NORM])) else: - logger.warning( - 'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero' - .format(self.optimizer_params[MAX_GRAD_NORM])) + if self.global_rank == 0: + logger.warning( + 'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero' + .format(self.optimizer_params[MAX_GRAD_NORM])) self.optimizer_params[MAX_GRAD_NORM] = 0.0 diff --git a/deepspeed/pt/deepspeed_config_utils.py 
b/deepspeed/runtime/config_utils.py similarity index 81% rename from deepspeed/pt/deepspeed_config_utils.py rename to deepspeed/runtime/config_utils.py index 38fdb647f61d..37f35692369b 100755 --- a/deepspeed/pt/deepspeed_config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -13,6 +13,10 @@ def get_scalar_param(param_dict, param_name, param_default_value): return param_dict.get(param_name, param_default_value) +def get_list_param(param_dict, param_name, param_default_value): + return param_dict.get(param_name, param_default_value) + + def dict_raise_error_on_duplicate_keys(ordered_pairs): """Reject duplicate keys.""" d = dict((k, v) for k, v in ordered_pairs) diff --git a/deepspeed/pt/deepspeed_constants.py b/deepspeed/runtime/constants.py similarity index 77% rename from deepspeed/pt/deepspeed_constants.py rename to deepspeed/runtime/constants.py index 1ba3dd482e25..2f5916df753a 100755 --- a/deepspeed/pt/deepspeed_constants.py +++ b/deepspeed/runtime/constants.py @@ -17,6 +17,42 @@ TRAIN_BATCH_SIZE = "train_batch_size" TRAIN_BATCH_SIZE_DEFAULT = None +############################################# +# Sparse attention +############################################# +SPARSE_ATTENTION = "sparse_attention" +SPARSE_DENSE_MODE = "dense" +SPARSE_FIXED_MODE = "fixed" +SPARSE_VARIABLE_MODE = "variable" +SPARSE_BIGBIRD_MODE = "bigbird" +SPARSE_BSLONGFORMER_MODE = "bslongformer" +SPARSE_MODE = "mode" +SPARSE_MODE_DEFAULT = SPARSE_FIXED_MODE +SPARSE_BLOCK = "block" +SPARSE_BLOCK_DEFAULT = 16 +SPARSE_DIFFERENT_LAYOUT_PER_HEAD = "different_layout_per_head" +SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT = False +SPARSE_NUM_LOCAL_BLOCKS = "num_local_blocks" +SPARSE_NUM_LOCAL_BLOCKS_DEFAULT = 4 +SPARSE_NUM_GLOBAL_BLOCKS = "num_global_blocks" +SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT = 1 +SPARSE_ATTENTION_TYPE = "attention" +SPARSE_ATTENTION_TYPE_DEFAULT = "bidirectional" +SPARSE_HORIZONTAL_GLOBAL_ATTENTION = "horizontal_global_attention" +SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT = False +SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_different_global_patterns" +SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT = 1 +SPARSE_NUM_RANDOM_BLOCKS = "num_random_blocks" +SPARSE_NUM_RANDOM_BLOCKS_DEFAULT = 0 +SPARSE_LOCAL_WINDOW_BLOCKS = "local_window_blocks" +SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT = [4] +SPARSE_GLOBAL_BLOCK_INDICES = "global_block_indices" +SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT = [0] +SPARSE_GLOBAL_BLOCK_END_INDICES = "global_block_end_indices" +SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT = None +SPARSE_NUM_SLIDING_WINDOW_BLOCKS = "num_sliding_window_blocks" +SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT = 3 + ############################################# # Optimizer and lr scheduler ############################################# @@ -37,11 +73,6 @@ ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer" ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False -############################################# -# Torch distributed constants -############################################# -TORCH_DISTRIBUTED_DEFAULT_PORT = "29500" - # Steps STEPS_PER_PRINT = "steps_per_print" STEPS_PER_PRINT_DEFAULT = 10 @@ -147,35 +178,6 @@ GRADIENT_CLIPPING = 'gradient_clipping' GRADIENT_CLIPPING_DEFAULT = 0. -######################################### -# ZeRO optimization -######################################### -# ZeRO optimization. By default, this optimization is not enabled. 
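# --- Illustrative sketch, not part of this patch ------------------------------
# A minimal config fragment assuming the sparse-attention key names and defaults
# defined in deepspeed/runtime/constants.py above; get_sparse_attention() in
# deepspeed/runtime/config.py parses it into a fixed-mode sparsity dict.
example_sparse_config = {
    "sparse_attention": {
        "mode": "fixed",              # SPARSE_MODE_DEFAULT
        "block": 16,                  # SPARSE_BLOCK_DEFAULT
        "num_local_blocks": 4,        # SPARSE_NUM_LOCAL_BLOCKS_DEFAULT
        "num_global_blocks": 1,       # SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT
        "attention": "bidirectional"  # SPARSE_ATTENTION_TYPE_DEFAULT
    }
}
# get_sparse_attention(example_sparse_config) would return a dict carrying
# SPARSE_MODE="fixed" plus the remaining fixed-mode keys, with any keys omitted
# here filled in from their *_DEFAULT values via get_scalar_param().
# ------------------------------------------------------------------------------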
-# Users have to configure the desired optimization (0 means disabled) in params.json as below example: -ZERO_FORMAT = ''' -ZeRO optimization should be enabled as: -"session_params": { - "zero_optimization": [0|1|2], - "zero_all_gather_size": 200 -} -''' - -ZERO_OPTIMIZATION = 'zero_optimization' -ZERO_OPTIMIZATION_DEFAULT = 0 -ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 -ZERO_OPTIMIZATION_GRADIENTS = 2 -ZERO_OPTIMIZATION_WEIGHTS = 3 -MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS - -ZERO_REDUCE_SCATTER = "zero_reduce_scatter" -ZERO_REDUCE_SCATTER_DEFAULT = True - -ZERO_MAX_ELEMENTS_PER_COMM = "zero_max_elements_per_comm" -ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT = 5e8 - -ALLGATHER_SIZE = 'allgather_size' -ALLGATHER_SIZE_DEFAULT = 500000000 - ######################################### # FP32 AllReduce ######################################### @@ -284,3 +286,41 @@ # Tensorboard job name TENSORBOARD_JOB_NAME = "job_name" TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName" + +######################################### +# Progressive Layer Drop (PLD) +######################################### +PROGRESSIVE_LAYER_DROP = "progressive_layer_drop" + +# PLD enable signal +PLD_ENABLED = "enabled" +PLD_ENABLED_DEFAULT = False + +PLD_THETA = "theta" +PLD_THETA_DEFAULT = 1.0 + +PLD_GAMMA = "gamma" +PLD_GAMMA_DEFAULT = 0.001 + + +######################################### +# Validation modes +######################################### +class ValidationMode: + WARN = "WARN" + IGNORE = "IGNORE" + FAIL = "FAIL" + + +######################################### +# Checkpoint config params +######################################### +# "checkpoint": {tag_validation=["Ignore"|"Warn"|"Fail"]} +CHECKPOINT = "checkpoint" +CHECKPOINT_TAG_VALIDATION = "tag_validation" +CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN +CHECKPOINT_TAG_VALIDATION_MODES = [ + ValidationMode.WARN, + ValidationMode.IGNORE, + ValidationMode.FAIL +] diff --git a/deepspeed/pt/deepspeed_csr_tensor.py b/deepspeed/runtime/csr_tensor.py similarity index 100% rename from deepspeed/pt/deepspeed_csr_tensor.py rename to deepspeed/runtime/csr_tensor.py diff --git a/deepspeed/runtime/custom_collectives.py b/deepspeed/runtime/custom_collectives.py new file mode 100644 index 000000000000..cb77edcaf60d --- /dev/null +++ b/deepspeed/runtime/custom_collectives.py @@ -0,0 +1,154 @@ +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +from mpi4py import MPI +import numpy as np +import cupy + + +def my_igather(rank, size, comm, sendbuf, recbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(comm.Irecv(recbuf[idx], source=idx)) + else: + recbuf[rank] = sendbuf + else: + req.append(comm.Isend(sendbuf, dest=root)) + return req + + +def gather_cuda(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + # We do in-place operations on cupy buffers so we do not return any buffers + requests = [] + for idx in range(world_size): + req_sign = my_igather(rank, + world_size, + comm, + cupy_sign_list_packed[idx], + cupy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = my_igather(rank, + world_size, + comm, + cupy_worker_scale, + cupy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + +def gather_host(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale): + # In-place operations are not possible for newly created 
cupy arrays + # so we need to return the new buffers + numpy_recvbuf_sign = np.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # 1. convert from cupy to numpy + numpy_sign_list_packed = cupy_sign_list_packed + + for idx in range(world_size): + numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx]) + + numpy_worker_scale = cupy.asnumpy(cupy_worker_scale) + numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale) + + cupy.cuda.get_current_stream().synchronize() + + # 2. use numpy buffers for communication + requests = [] + + for idx in range(world_size): + req_sign = my_igather(rank, + world_size, + comm, + numpy_sign_list_packed[idx], + numpy_recvbuf_sign, + root=idx) + requests += req_sign + + for idx in range(world_size): + req_scale = my_igather(rank, + world_size, + comm, + numpy_worker_scale, + numpy_recvbuf_scale, + root=idx) + requests += req_scale + + MPI.Request.Waitall(requests) + + # 3. Convert back from numpy to cupy + cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign) + for idx in range(world_size): + cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx]) + + cupy_worker_scale = cupy.asarray(numpy_worker_scale) + cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale) + cupy.cuda.get_current_stream().synchronize() + + return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale + + +def allgather_cuda(comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) + comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) + + +def allgather_host(comm, + cupy_server_sign_packed, + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server): + + # 1. Convert cupy to numpy + numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), + cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), + 1], + dtype=cupy_server_scale.dtype) + + numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) + numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) + numpy_server_scale = cupy.asnumpy(cupy_server_scale) + numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + # 2. Communicate numpy buffers + comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server) + comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server) + comm.Barrier() + + # 3. 
Convert numpy back to cupy + cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed) + cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server) + cupy_server_scale = cupy.asarray(numpy_server_scale) + cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server) + cupy.cuda.get_current_stream().synchronize() + + return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server diff --git a/deepspeed/pt/deepspeed_dataloader.py b/deepspeed/runtime/dataloader.py similarity index 81% rename from deepspeed/pt/deepspeed_dataloader.py rename to deepspeed/runtime/dataloader.py index c009969a0850..23b71d32fbbf 100644 --- a/deepspeed/pt/deepspeed_dataloader.py +++ b/deepspeed/runtime/dataloader.py @@ -7,6 +7,29 @@ from torch.utils.data.distributed import DistributedSampler +class RepeatingLoader: + def __init__(self, loader): + """Wraps an iterator to allow for infinite iteration. This is especially useful + for DataLoader types that we wish to automatically restart upon completion. + + Args: + loader (iterator): The data loader to repeat. + """ + self.loader = loader + self.data_iter = iter(self.loader) + + def __iter__(self): + return self + + def __next__(self): + try: + batch = next(self.data_iter) + except StopIteration: + self.data_iter = iter(self.loader) + batch = next(self.data_iter) + return batch + + class DeepSpeedDataLoader(object): def __init__(self, dataset, diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/runtime/engine.py similarity index 64% rename from deepspeed/pt/deepspeed_light.py rename to deepspeed/runtime/engine.py index ce27d7d68f49..5036c7de0ee6 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/runtime/engine.py @@ -2,53 +2,53 @@ Copyright 2019 The Microsoft DeepSpeed Team ''' -import torch import os +import torch import warnings +import hashlib import torch.distributed as dist + from torch.nn.modules import Module from torch.distributed.distributed_c10d import _get_global_rank -from apex import amp - from tensorboardX import SummaryWriter -from deepspeed.pt.deepspeed_timer import ThroughputTimer, SynchronizedWallClockTimer -from deepspeed.pt.deepspeed_zero_optimizer import FP16_DeepSpeedZeroOptimizer -from deepspeed.pt.zero_optimizer_stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 -from deepspeed.pt.log_utils import logger -import deepspeed.pt.deepspeed_checkpointing as deepspeed_activation_checkpointing - -from deepspeed.pt.fp16_optimizer import FP16_Optimizer -from deepspeed.pt.fp16_unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.pt.deepspeed_fused_lamb import FusedLamb -from deepspeed.pt.deepspeed_config import DeepSpeedConfig, \ - ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_OPTIMIZERS - -from deepspeed.pt.deepspeed_dataloader import DeepSpeedDataLoader -from deepspeed.pt.deepspeed_constants import \ +from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer +from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.utils import is_zero_supported_optimizer +from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ + ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ + TORCH_ADAM_PARAM + +from deepspeed.runtime.dataloader import 
DeepSpeedDataLoader +from deepspeed.runtime.constants import \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ - TORCH_DISTRIBUTED_DEFAULT_PORT, \ + PLD_THETA, PLD_GAMMA +from deepspeed.runtime.zero.constants import \ ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.runtime.csr_tensor import CSRTensor +import deepspeed.runtime.lr_schedules as lr_schedules +from deepspeed.utils import logger, log_dist, init_distributed +from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer +from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop + +from .pipe.module import PipelineModule +from .utils import ensure_directory_exists +from ..ops.op_builder import UtilsBuilder +from ..ops.adam import DeepSpeedCPUAdam +from ..ops.adam import FusedAdam -import deepspeed.pt.deepspeed_lr_schedules as lr_schedules -from deepspeed.pt.deepspeed_csr_tensor import CSRTensor +from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler MEMORY_OPT_ALLREDUCE_SIZE = 500000000 -SUMMARY_WRITER_DIR_NAME = "JobId" try: - from apex_C import flatten - from apex_C import unflatten + from apex import amp except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." - ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten + # Fail silently so we don't spam logs unnecessarily if user isn't using amp + pass def split_half_float_double_csr(tensors): @@ -92,7 +92,7 @@ def print_configuration(args, name): logger.info(' {} {} {}'.format(arg, dots, getattr(args, arg))) -class DeepSpeedLight(Module): +class DeepSpeedEngine(Module): r"""DeepSpeed engine for training. """ def __init__(self, @@ -106,8 +106,7 @@ def __init__(self, dist_init_required=None, collate_fn=None, config_params=None): - super(DeepSpeedLight, self).__init__() - + super(DeepSpeedEngine, self).__init__() self.client_optimizer = optimizer self.client_model_parameters = model_parameters self.client_lr_scheduler = lr_scheduler @@ -116,6 +115,7 @@ def __init__(self, self.mpu = mpu self.data_parallel_group = None self.global_steps = 0 + self.global_samples = 0 self.micro_steps = 0 self.skipped_steps = 0 self.gradient_average = True @@ -123,32 +123,31 @@ def __init__(self, self.config_params = config_params self.loaded_checkpoint_mp_world_size = None self.loaded_checkpoint_dp_world_size = None + self.enable_backward_allreduce = True + self.progressive_layer_drop = None + self.dist_backend = "nccl" if dist_init_required is None: dist_init_required = not dist.is_initialized() - self._mpi_check(args, dist_init_required) + if dist_init_required is False: + assert (dist.is_initialized()==True), "Torch distributed not initialized. 
Please set dist_init_required to True or initialize before calling deepspeed.initialize()" - self.dist_backend = "nccl" - if dist_init_required: - if not dist.is_initialized(): - logger.info("Initializing torch distributed with backend: {}".format( - self.dist_backend)) - dist.init_process_group(backend=self.dist_backend) - else: - logger.warning( - "Was given dist_init_required=True but detected that torch" - "distributed was already initialized, cannot initialize twice.") + # Initialize torch distributed if needed + init_distributed(dist_backend=self.dist_backend) self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() - self.sample_count = 0 - if self.tensorboard_enabled(): - self.summary_writer = self.get_summary_writer() + if mpu is not None: + assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ + " with model parallelism." - self._init_distributed(dist_init_required) + self._set_distributed_vars() + + if self.tensorboard_enabled() and self.global_rank == 0: + self.summary_writer = self.get_summary_writer() # Configure distributed model self._configure_distributed_model(model) @@ -160,10 +159,13 @@ def __init__(self, self.tput_timer = ThroughputTimer( batch_size=self.train_micro_batch_size_per_gpu(), num_workers=self.dp_world_size, + steps_per_output=self.steps_per_print(), monitor_memory=False) - self.training_dataloader = self.deepspeed_io( - training_data) if training_data else None + if training_data: + self.training_dataloader = self.deepspeed_io(training_data) + else: + self.training_dataloader = None # Configure optimizer and scheduler self.optimizer = None @@ -186,48 +188,52 @@ def __init__(self, self.save_zero_checkpoint = False self._configure_checkpointing(dist_init_required) + if self.pld_enabled(): + self.progressive_layer_drop = self._configure_progressive_layer_drop() + if self.global_rank == 0: - self._config.print('DeepSpeedLight configuration') + self._config.print('DeepSpeedEngine configuration') if self.dump_state(): - print_configuration(self, 'DeepSpeedLight') - - def _mpi_check(self, args, dist_init_required): - if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi: - from mpi4py import MPI - import subprocess - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - world_size = comm.Get_size() - - master_addr = None - if rank == 0: - hostname_cmd = ["hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) - master_addr = result.decode('utf-8').split()[0] - master_addr = comm.bcast(master_addr, root=0) - - # Determine local rank by assuming hostnames are unique - proc_name = MPI.Get_processor_name() - all_procs = comm.allgather(proc_name) - local_rank = sum([i == proc_name for i in all_procs[:rank]]) - - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - args.local_rank = local_rank - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT - - logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - args.local_rank, - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - if not dist_init_required and dist.is_initialized(): - assert dist.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank()) - assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format(world_size, dist.get_world_size()) + 
print_configuration(self, 'DeepSpeedEngine') + + # Load pre-installed or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + + def get_batch_info(self): + """ Get all training batch related settings. + + Returns: + train_batch_size (int): The effective training batch size. This is the amount of data + samples that leads to one step of model update. + train_micro_batch_size_per_gpu (int): Batch size to be processed by one GPU in one + step (without gradient accumulation). + gradient_accumulation_steps (int): Number of training steps to accumulate gradients + before averaging and applying them. + """ + return self.train_batch_size, self.train_micro_batch_size_per_gpu, self.gradient_accumulation_steps + + def checkpoint_tag_validation_enabled(self): + return self._config.checkpoint_tag_validation_enabled + + def checkpoint_tag_validation_fail(self): + return self._config.checkpoint_tag_validation_fail + + def elasticity_enabled(self): + return self._config.elasticity_enabled + + def pld_enabled(self): + return self._config.pld_enabled + + def pld_params(self): + return self._config.pld_params + + def pld_theta(self): + return self.pld_params()[PLD_THETA] + + def pld_gamma(self): + return self.pld_params()[PLD_GAMMA] def tensorboard_enabled(self): return self._config.tensorboard_enabled @@ -240,18 +246,49 @@ def tensorboard_job_name(self): def get_summary_writer(self, name="DeepSpeedJobName", - base=os.environ["HOME"] + "/tensorboard"): - if self.tensorboard_job_name(): - name = self.tensorboard_job_name() + base=os.path.join(os.environ["HOME"], + "tensorboard")): if self.tensorboard_output_path(): - return SummaryWriter(log_dir=self.tensorboard_output_path()) - if 'DLWS_JOB_ID' in os.environ: - SUMMARY_WRITER_DIR_NAME = os.environ['DLWS_JOB_ID'] + "/logs" - return SummaryWriter(log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name)) + base_dir = self.tensorboard_output_path() + job_name = self.tensorboard_job_name() + log_dir = os.path.join(base_dir, job_name) + else: + if self.tensorboard_job_name(): + name = self.tensorboard_job_name() + + # Infrastructure-specific job-id + if 'DLWS_JOB_ID' in os.environ: + infra_job_id = os.environ['DLWS_JOB_ID'] + elif 'DLTS_JOB_ID' in os.environ: + infra_job_id = os.environ['DLTS_JOB_ID'] + else: + infra_job_id = 'unknown-job-id' + + summary_writer_dir_name = os.path.join(infra_job_id, "logs") + log_dir = os.path.join(base, summary_writer_dir_name, name) + + os.makedirs(log_dir, exist_ok=True) + + return SummaryWriter(log_dir=log_dir) def wall_clock_breakdown(self): return self._config.wall_clock_breakdown + def flops_profiler_enabled(self): + return self._config.flops_profiler_config.enabled + + def flops_profiler_start_step(self): + return self._config.flops_profiler_config.start_step + + def flops_profiler_end_step(self): + return self._config.flops_profiler_config.end_step + + def flops_profiler_module_depth(self): + return self._config.flops_profiler_config.module_depth + + def flops_profiler_top_modules(self): + return self._config.flops_profiler_config.top_modules + def memory_breakdown(self): return self._config.memory_breakdown @@ -265,7 +302,7 @@ def train_micro_batch_size_per_gpu(self): return self._config.train_micro_batch_size_per_gpu def optimizer_name(self): - return self._config.optimizer_name + return self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name def optimizer_params(self): return 
self._config.optimizer_params @@ -291,6 +328,9 @@ def zero_reduce_scatter(self): def zero_overlap_comm(self): return self._config.zero_config.overlap_comm + def zero_cpu_offload(self): + return self._config.zero_config.cpu_offload + def zero_optimization_stage(self): return self._config.zero_optimization_stage @@ -309,8 +349,8 @@ def zero_contiguous_gradients(self): def zero_load_from_fp32_weights(self): return self._config.zero_config.load_from_fp32_weights - def allgather_size(self): - return self._config.allgather_size + def zero_elastic_checkpoint(self): + return self._config.zero_config.elastic_checkpoint def fp16_enabled(self): return self._config.fp16_enabled @@ -361,13 +401,15 @@ def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) if lr_scheduler: - logger.info( - f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}') + if self.global_rank == 0: + logger.info( + f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}') self.lr_scheduler = lr_scheduler else: - logger.warning('DeepSpeed using client LR scheduler') + if self.global_rank == 0: + logger.info('DeepSpeed using client LR scheduler') self.lr_scheduler = client_lr_scheduler - logger.info(f'DeepSpeed LR Scheduler = {self.lr_scheduler}') + log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0]) def _configure_checkpointing(self, dist_init_required): @@ -375,15 +417,16 @@ def _configure_checkpointing(self, dist_init_required): if self.mpu: dp_rank = self.mpu.get_data_parallel_rank() - #only the first data parallel process needs to store the model checkpoint + # only the first data parallel process needs to store the model checkpoint self.save_non_zero_checkpoint = (dp_rank == 0) if self.zero_optimization(): - pp_rank = torch.distributed.get_rank(group=self.optimizer.dp_process_group) + param_rank = torch.distributed.get_rank( + group=self.optimizer.dp_process_group) # Only the first parameter parallel process needs to store the # optimizer state checkpoints for zero - self.save_zero_checkpoint = (pp_rank == dp_rank) + self.save_zero_checkpoint = (param_rank == dp_rank) def _scheduler_from_config(self, optimizer): scheduler_name = self.scheduler_name() @@ -402,14 +445,12 @@ def _scheduler_from_config(self, optimizer): else: return None - def _init_distributed(self, dist_init_required): + def _set_distributed_vars(self): if self.local_rank >= 0: torch.cuda.set_device(self.local_rank) self.device = torch.device("cuda", self.local_rank) self.world_size = dist.get_world_size() self.global_rank = dist.get_rank() - logger.info("Set device to local rank {} within node.".format( - self.local_rank)) else: self.world_size = 1 self.global_rank = 0 @@ -418,9 +459,9 @@ def _init_distributed(self, dist_init_required): # Configure based on command line arguments def _configure_with_arguments(self, args, mpu): self.local_rank = args.local_rank if hasattr(args, 'local_rank') else 0 - self._config = DeepSpeedConfig(args.deepspeed_config, - mpu, - param_dict=self.config_params) + config_file = args.deepspeed_config if hasattr(args, + 'deepspeed_config') else None + self._config = DeepSpeedConfig(config_file, mpu, param_dict=self.config_params) # Validate command line arguments def _do_args_sanity_check(self, args): @@ -449,10 +490,9 @@ def _is_supported_optimizer(self, optimizer_name): # Validate configuration based on command line arguments def _do_sanity_check(self): if not 
self.client_optimizer: - assert self._is_supported_optimizer(self.optimizer_name()), \ - '{} is not a supported DeepSpeed Optimizer'.format(self.optimizer_name()) - assert self.client_model_parameters, \ - 'DeepSpeed {} optimizer requires parameters in initialize() call'.format(self.optimizer_name()) + if self.optimizer_name() is not None: + assert self._is_supported_optimizer(self.optimizer_name()), \ + '{} is not a supported DeepSpeed Optimizer'.format(self.optimizer_name()) if self.optimizer_name() == LAMB_OPTIMIZER: assert self.dynamic_loss_scale(), \ @@ -483,58 +523,94 @@ def _configure_distributed_model(self, model): self.broadcast_src_rank = _get_global_rank( self.mpu.get_data_parallel_group(), 0) - logger.info(f"global src_rank={self.broadcast_src_rank}") if not self.amp_enabled(): self._broadcast_model() # Configure optimizer def _configure_optimizer(self, client_optimizer, model_parameters): + if client_optimizer is not None: basic_optimizer = client_optimizer - logger.info('Using client Optimizer as basic optimizer') + if self.global_rank == 0: + logger.info('Using client Optimizer as basic optimizer') else: basic_optimizer = self._configure_basic_optimizer(model_parameters) - logger.info( - 'Using DeepSpeed Optimizer param name {} as basic optimizer'.format( - self.optimizer_name())) + if self.global_rank == 0: + logger.info( + 'Using DeepSpeed Optimizer param name {} as basic optimizer'.format( + self.optimizer_name())) - logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer)) + if self.global_rank == 0: + logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer)) if self.zero_optimization(): assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" - if self.optimizer_name() != ADAM_OPTIMIZER: + if not is_zero_supported_optimizer(basic_optimizer): assert self.zero_allow_untested_optimizer(), \ - 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' + 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' 
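# --- Illustrative sketch, not part of this patch ------------------------------
# The assertion above means a client optimizer that is_zero_supported_optimizer()
# does not recognize can only be wrapped by ZeRO when the user opts in via the
# config. Values below are hypothetical placeholders.
example_ds_config = {
    "train_batch_size": 8,                  # hypothetical batch size
    "fp16": {"enabled": True},              # ZeRO requires fp16 (see _do_error_check)
    "zero_optimization": {"stage": 2},
    "zero_allow_untested_optimizer": True,  # ZERO_ALLOW_UNTESTED_OPTIMIZER
}
# Passed as config_params to deepspeed.initialize() together with a custom client
# optimizer, this opt-in replaces the assertion failure with the "untested
# optimizer, proceed with caution" warning logged below.
# ------------------------------------------------------------------------------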
- logger.warning( - "**** You are using ZeRO with an untested optimizer, proceed with caution *****" - ) + if self.global_rank == 0: + logger.warning( + "**** You are using ZeRO with an untested optimizer, proceed with caution *****" + ) self.optimizer = self._configure_zero_optimizer(basic_optimizer) elif self.amp_enabled(): assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode" amp_params = self.amp_params() - logger.info(f"Initializing AMP with these params: {amp_params}") + if self.global_rank == 0: + logger.info(f"Initializing AMP with these params: {amp_params}") + try: + logger.info("Initializing Apex amp from: {}".format(amp.__path__)) + except NameError: + # If apex/amp is available it will be imported above + raise RuntimeError( + "Unable to import apex/amp, please make sure it is installed") self.module, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params) self._broadcast_model() elif self.fp16_enabled(): self.optimizer = self._configure_fp16_optimizer(basic_optimizer) else: self.optimizer = basic_optimizer - - # logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) + logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer)) def _configure_basic_optimizer(self, model_parameters): optimizer_parameters = self.optimizer_params() + # print(optimizer_parameters.keys()) if 'max_grad_norm' in optimizer_parameters.keys(): raise ValueError( "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" ) - if self.optimizer_name() == ADAM_OPTIMIZER: - from apex.optimizers.fused_adam import FusedAdam - optimizer = FusedAdam(model_parameters, **optimizer_parameters) + + if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]: + torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False) + adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER + # zero-offload torch-adam adam_w_mode optimizer + # T|F T T torch.optim.AdamW + # T|F T F torch.optim.Adam + # T F T|F DeepSpeedCPUAdam(adam_w_mode) + # F F T|F FusedAdam(adam_w_mode) + if torch_adam: + if adam_w_mode: + optimizer = torch.optim.AdamW(model_parameters, + **optimizer_parameters) + else: + optimizer = torch.optim.Adam(model_parameters, + **optimizer_parameters) + elif self.zero_cpu_offload(): + optimizer = DeepSpeedCPUAdam(model_parameters, + **optimizer_parameters, + adamw_mode=adam_w_mode) + else: + optimizer_parameters['adam_w_mode'] = adam_w_mode + optimizer = FusedAdam(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == LAMB_OPTIMIZER: + from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) + elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: + from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) @@ -544,7 +620,8 @@ def _configure_fp16_optimizer(self, optimizer): initial_dynamic_scale = self.initial_dynamic_scale() dynamic_loss_args = self.dynamic_loss_scale_args() clip_grad = self.gradient_clipping() - if self.optimizer_name() == ADAM_OPTIMIZER: + if isinstance(optimizer, + FusedAdam) or self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: if self.dynamic_loss_scale(): logger.info('Creating fp16 optimizer 
with dynamic loss scale') timers = self.timers if self.wall_clock_breakdown() else None @@ -570,6 +647,7 @@ def _configure_fp16_optimizer(self, optimizer): logger.info('Creating fp16 unfused optimizer with dynamic loss scale') optimizer = FP16_UnfusedOptimizer( optimizer, + static_loss_scale=self.loss_scale(), dynamic_loss_scale=self.dynamic_loss_scale(), dynamic_loss_args=dynamic_loss_args, mpu=self.mpu, @@ -581,7 +659,7 @@ def _configure_fp16_optimizer(self, optimizer): def _configure_zero_optimizer(self, optimizer): zero_stage = self.zero_optimization_stage() logger.info('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage)) - + assert not self.allreduce_always_fp32(), "ZeRO does not support 'fp32_allreduce': true" if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( @@ -594,9 +672,9 @@ def _configure_zero_optimizer(self, optimizer): allgather_size=self.zero_allgather_bucket_size(), max_elements_per_comm=self.zero_reduce_bucket_size(), dp_process_group=self.data_parallel_group, + elastic_checkpoint=self.zero_elastic_checkpoint(), mpu=self.mpu) elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS: - assert self.gradient_accumulation_steps() == 1, "ZeRO stage 2 does not support gradient accumulation, if you need gradient accumulation please use stage 1" optimizer = FP16_DeepSpeedZeroOptimizer( optimizer, timers=self.timers, @@ -610,14 +688,21 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=self.zero_overlap_comm(), + cpu_offload=self.zero_cpu_offload(), mpu=self.mpu, postscale_gradients=self.postscale_gradients(), - gradient_predivide_factor=self.gradient_predivide_factor()) + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_accumulation_steps=self.gradient_accumulation_steps()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) return optimizer + def _configure_progressive_layer_drop(self): + pld = ProgressiveLayerDrop(theta=self.pld_theta(), gamma=self.pld_gamma()) + + return pld + def deepspeed_io(self, dataset, batch_size=None, @@ -647,8 +732,8 @@ def deepspeed_io(self, data_parallel_world_size = None data_parallel_rank = None if self.mpu is not None: - data_parallel_world_size = mpu.get_data_parallel_world_size() - data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = self.mpu.get_data_parallel_world_size() + data_parallel_rank = self.mpu.get_data_parallel_rank() return DeepSpeedDataLoader(dataset=dataset, batch_size=batch_size, @@ -661,12 +746,12 @@ def deepspeed_io(self, data_parallel_world_size=data_parallel_world_size, data_parallel_rank=data_parallel_rank) - def train(self): + def train(self, mode=True): r""" """ self.warn_unscaled_loss = True - self.module.train() + self.module.train(mode) def eval(self): r""" @@ -702,6 +787,33 @@ def forward(self, *inputs, **kwargs): *inputs: Variable length input list **kwargs: variable length keyword arguments """ + if self.flops_profiler_enabled( + ) and self.global_steps == self.flops_profiler_start_step( + ) and self.global_rank == 0: + self.flops_profiler = FlopsProfiler(self.module) + self.flops_profiler.start_profile(ignore_list=None) + + if self.flops_profiler_enabled( + ) and self.global_steps == self.flops_profiler_end_step( + ) and self.global_rank == 0: + print('{:<30} {:<8}'.format( + 'Number of multiply-adds: ', + 
self.flops_profiler.get_total_flops(in_str=False))) + print('{:<30} {:<8}'.format( + 'Number of parameters: ', + self.flops_profiler.get_total_params(in_str=False))) + print('{:<30} {:<8}'.format('Number of steps profiled: ', + self.flops_profiler.get_total_steps())) + self.flops_profiler.print_model_profile() + self.flops_profiler.print_model_aggregated_profile( + module_depth=self.flops_profiler_module_depth(), + top_modules=self.flops_profiler_top_modules()) + self.flops_profiler.flops = self.flops_profiler.get_total_flops() + self.flops_profiler.params = self.flops_profiler.get_total_params() + self.flops_profiler.end_profile() + + if self.module.training and self.progressive_layer_drop: + kwargs.update(self.progressive_layer_drop.get_state()) if self.wall_clock_breakdown(): self.timers('forward_microstep').start() @@ -718,19 +830,22 @@ def forward(self, *inputs, **kwargs): return loss def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - if self.is_gradient_accumulation_boundary(): + #Zero stage 2 communicates during non gradient accumulation boundaries as well + if self.zero_optimization_partition_gradients(): + self.optimizer.overlapping_partition_gradients_reduce_epilogue() + + #Communicate only at gradient accumulation boundaries + elif self.is_gradient_accumulation_boundary(): if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: assert self.zero_reduce_scatter() self.optimizer.reduce_scatter_gradients( postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), gradient_average=self.gradient_average) - elif self.zero_optimization_partition_gradients(): - self.optimizer.overlapping_partition_gradients_reduce_epilogue() else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) - def backward(self, loss, allreduce_gradients=True): + def backward(self, loss, allreduce_gradients=True, release_loss=False): r"""Execute backward pass on the loss Arguments: @@ -738,6 +853,11 @@ def backward(self, loss, allreduce_gradients=True): allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True. """ + if not allreduce_gradients: + logger.warning( + f'Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed' + ) + # scale loss w.r.t. 
gradient accumulation if needed if self.gradient_accumulation_steps() > 1: loss = self._scale_loss(loss.float()) @@ -746,13 +866,10 @@ def backward(self, loss, allreduce_gradients=True): if self.tensorboard_enabled(): if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: - self.sample_count += (self.train_micro_batch_size_per_gpu() * - self.dp_world_size * - self.gradient_accumulation_steps()) self.summary_events = [ (f'Train/Samples/train_loss', loss.mean().item() * self.gradient_accumulation_steps(), - self.sample_count) + self.global_samples) ] for event in self.summary_events: # write_summary_events self.summary_writer.add_scalar(event[0], event[1], event[2]) @@ -770,9 +887,16 @@ def backward(self, loss, allreduce_gradients=True): self.timers('backward_inner').start() if self.zero_optimization(): + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( + ) self.optimizer.backward(loss) elif self.amp_enabled(): - with amp.scale_loss(loss, self.optimizer) as scaled_loss: + # AMP requires delaying unscale when inside gradient accumulation boundaries + # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations + delay_unscale = not self.is_gradient_accumulation_boundary() + with amp.scale_loss(loss, + self.optimizer, + delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() elif self.fp16_enabled(): self.optimizer.backward(loss) @@ -787,7 +911,7 @@ def backward(self, loss, allreduce_gradients=True): self.timers('backward_allreduce_microstep').start() self.timers('backward_allreduce').start() - if allreduce_gradients: + if self.enable_backward_allreduce: self.allreduce_gradients() if self.wall_clock_breakdown(): @@ -796,9 +920,20 @@ def backward(self, loss, allreduce_gradients=True): self.timers('backward').stop() self.timers('backward_microstep').stop() + if release_loss: + # loss.data = None + pass + return loss def is_gradient_accumulation_boundary(self): + """Query whether the current micro-batch is at the boundary of + gradient accumulation, and thus will trigger gradient reductions and + an optimizer step. + + Returns: + bool: if the current step is a gradient accumulation boundary. + """ return (self.micro_steps + 1) % \ self.gradient_accumulation_steps() == 0 @@ -813,8 +948,47 @@ def clip_fp32_gradients(self): torch.nn.utils.clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping()) - def step(self): - r"""Execute the weight update step after forward and backward propagation on effective_train_batch + def _take_model_step(self, lr_kwargs): + if self.gradient_clipping() > 0.0: + if not self.fp16_enabled() and not self.amp_enabled(): + self.clip_fp32_gradients() + elif self.amp_enabled(): + # AMP's recommended way of doing clipping + # https://nvidia.github.io/apex/advanced.html#gradient-clipping + master_params = amp.master_params(self.optimizer) + torch.nn.utils.clip_grad_norm_(parameters=master_params, + max_norm=self.gradient_clipping()) + self.optimizer.step() + + #zero grad in basic optimizer could be unreliable and may not exhibit + #the behaviour that we want + if not self.zero_optimization() and not self.fp16_enabled( + ) and not self.amp_enabled(): + self.zero_grad() + else: + self.optimizer.zero_grad() + + report_progress = self.global_rank == 0 if self.global_rank else True + + # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. 
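# --- Illustrative training-loop sketch, not part of this patch ----------------
# `engine` and `loader` are assumed to come from deepspeed.initialize() and a
# DataLoader; the forward signature is model-specific. backward() scales the loss
# w.r.t. gradient_accumulation_steps() and step() only performs a real weight
# update when is_gradient_accumulation_boundary() is True.
for batch, labels in loader:
    loss = engine(batch, labels)   # forward pass
    engine.backward(loss)          # loss scaling + gradient allreduce handled internally
    engine.step()                  # weights update only at accumulation boundaries
# ------------------------------------------------------------------------------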
+ overflow = False + if hasattr(self.optimizer, 'overflow'): + overflow = self.optimizer.overflow + + if overflow: + self.skipped_steps += 1 + else: + if self.lr_scheduler is not None: + self.lr_scheduler.step(**(lr_kwargs or {})) + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) + + self.global_steps += 1 + self.global_samples += self.train_batch_size() + + def step(self, lr_kwargs=None): + r"""Execute the weight update step after forward and backward propagation + on effective_train_batch. """ if self.wall_clock_breakdown(): self.timers('step_microstep').start() @@ -824,35 +998,12 @@ def step(self): "init in order to use step" report_progress = self.global_rank == 0 if self.global_rank else True + # Update the model when we reach gradient accumulation boundaries if self.is_gradient_accumulation_boundary(): + if self.progressive_layer_drop: + self.progressive_layer_drop.update_state(self.global_steps) - if not self.fp16_enabled() and self.gradient_clipping() > 0.0: - self.clip_fp32_gradients() - - self.optimizer.step() - - #zero grad in basic optimizer could be unreliable and may not exhibit - #the behaviour that we want - if not self.zero_optimization() and not self.fp16_enabled(): - self.zero_grad() - else: - self.optimizer.zero_grad() - - # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. - overflow = False - if hasattr(self.optimizer, 'overflow'): - overflow = self.optimizer.overflow - - if overflow: - self.skipped_steps += 1 - else: - if self.lr_scheduler is not None: - self.lr_scheduler.step() - if report_progress and (self.global_steps + - 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) - - self.global_steps += 1 + self._take_model_step(lr_kwargs) self.tput_timer.stop(report_progress) @@ -862,7 +1013,13 @@ def step(self): if self.global_rank == 0: self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], - self.sample_count)] + self.global_samples)] + for event in self.summary_events: # write_summary_events + self.summary_writer.add_scalar(event[0], event[1], event[2]) + if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): + self.summary_events.append((f'Train/Samples/loss_scale', + self.optimizer.cur_scale, + self.global_samples)) for event in self.summary_events: # write_summary_events self.summary_writer.add_scalar(event[0], event[1], event[2]) self.summary_writer.flush() @@ -883,12 +1040,24 @@ def step(self): if self.is_gradient_accumulation_boundary(): if self.tensorboard_enabled(): if self.global_rank == 0: - self.summary_events = [(f'Train/Samples/elapsed_time_ms_forward', self.timers('forward').elapsed(reset=False) * 1000.0, self.sample_count), \ - (f'Train/Samples/elapsed_time_ms_backward', self.timers('backward').elapsed(reset=False) * 1000.0, self.sample_count), \ - (f'Train/Samples/elapsed_time_ms_backward_inner', self.timers('backward_inner').elapsed(reset=False) * 1000.0, self.sample_count), \ - (f'Train/Samples/elapsed_time_ms_backward_allreduce', self.timers('backward_allreduce').elapsed(reset=False) * 1000.0, self.sample_count), \ - (f'Train/Samples/elapsed_time_ms_step', self.timers('step').elapsed(reset=False) * 1000.0, self.sample_count) - ] + self.summary_events = [ + (f'Train/Samples/elapsed_time_ms_forward', + self.timers('forward').elapsed(reset=False) * 1000.0, + self.global_samples), + (f'Train/Samples/elapsed_time_ms_backward', + self.timers('backward').elapsed(reset=False) * 1000.0, 
+ self.global_samples), + (f'Train/Samples/elapsed_time_ms_backward_inner', + self.timers('backward_inner').elapsed(reset=False) * 1000.0, + self.global_samples), + (f'Train/Samples/elapsed_time_ms_backward_allreduce', + self.timers('backward_allreduce').elapsed(reset=False) * + 1000.0, + self.global_samples), + (f'Train/Samples/elapsed_time_ms_step', + self.timers('step').elapsed(reset=False) * 1000.0, + self.global_samples) + ] for event in self.summary_events: # write_summary_events self.summary_writer.add_scalar(event[0], event[1], event[2]) self.summary_writer.flush() @@ -918,21 +1087,29 @@ def _get_optimizer_param(self, param_name): def get_lr(self): return self._get_optimizer_param('lr') + def get_type(self): + return self._get_optimizer_param('type') + def get_mom(self): - return self._get_optimizer_param('betas') + if self.optimizer_name() in ['SGD', 'RMSprop']: + return self._get_optimizer_param('momentum') + else: + return self._get_optimizer_param('betas') + + def get_pld_theta(self): + if self.progressive_layer_drop: + return self.progressive_layer_drop.get_theta() + else: + return None def _report_progress(self, step): lr = self.get_lr() mom = self.get_mom() - logger.info('rank:{} step={}, skipped={}, lr={}, mom={}'.format( - self.global_rank, - step, - self.skipped_steps, - lr, - mom)) + log_dist(f'step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}', + ranks=[0]) def allreduce_bucket(self, bucket): - tensor = flatten(bucket) + tensor = self.flatten(bucket) tensor_to_allreduce = tensor @@ -960,7 +1137,7 @@ def allreduce_bucket(self, bucket): def allreduce_and_copy(self, small_bucket): allreduced = self.allreduce_bucket(small_bucket) - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) def allreduce_no_retain(self, bucket, numel_per_bucket=500000000): @@ -985,10 +1162,10 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000) # rank is reducing the same size. In some cases it may make # sense in the future to support the ability to average not # w.r.t. world size but with a different value. - grads.append( - torch.zeros(param.size(), - dtype=param.dtype, - device=param.device)) + param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=param.device) + grads.append(param.grad.data) else: grad_data = param.grad.data if self.sparse_gradients_enabled( @@ -1085,37 +1262,44 @@ def _get_zero_ckpt_name(self, checkpoints_path, tag): return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank) def _get_ckpt_name(self, checkpoints_path, tag): - mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() ckpt_name = os.path.join(checkpoints_path, str(tag), 'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt') return ckpt_name - def _ensure_directory_exists(self, filename): - dirname = os.path.dirname(filename) - if not os.path.exists(dirname): - os.makedirs(dirname) - def load_checkpoint(self, load_dir, - tag, + tag=None, load_module_strict=True, load_optimizer_states=True, load_lr_scheduler_states=True): - r"""Load training checkpoint + """Load training checkpoint Arguments: load_dir: Required. Directory to load the checkpoint from - tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. + tag: Checkpoint tag used as a unique identifier for checkpoint, if not provided will attempt to load tag in 'latest' file load_module_strict: Optional. 
Boolean to strictly enforce that the keys in state_dict of module and checkpoint match. load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint. - Return: - load_path: Path of the loaded checkpoint. None if loading the checkpoint failed - client_state: State dictionary used for loading required training states in the client code. + Returns: + A tuple of ``load_path`` and ``client_state``. + + *``load_path``: Path of the loaded checkpoint. ``None`` if loading the checkpoint failed. + + *``client_state``: State dictionary used for loading required training states in the client code. """ + if tag is None: + latest_path = os.path.join(load_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + logger.warning(f"Unable to find latest file at {latest_path}, if trying to load latest " \ + "checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.") + return None, None + load_path, client_states = self._load_checkpoint(load_dir, tag, load_module_strict=load_module_strict, @@ -1144,17 +1328,21 @@ def _load_checkpoint(self, .format(load_path)) return None, None - logger.info('Loading checkpoint: {}'.format(load_path)) + logger.info(f'rank: {self.global_rank} loading checkpoint: {load_path}') checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage) + if isinstance(self.module, PipelineModule): + # Pipeline parallelism uses this to load its own checkpoint files. + self._curr_ckpt_path = os.path.join(load_dir, tag) + self.load_module_state_dict(state_dict=checkpoint['module'], strict=load_module_strict) - if not self.zero_optimization(): + if self.optimizer is not None and not self.zero_optimization(): if self.fp16_enabled(): self.optimizer.load_state_dict( checkpoint['optimizer'], load_optimizer_states=load_optimizer_states) - else: + elif load_optimizer_states: self.optimizer.load_state_dict(checkpoint['optimizer']) if load_lr_scheduler_states and self.lr_scheduler is not None: @@ -1162,6 +1350,8 @@ def _load_checkpoint(self, self.csr_tensor_module_names = checkpoint['csr_tensor_module_names'] self.global_steps = checkpoint['global_steps'] + self.global_samples = checkpoint.get('global_samples', + self.global_steps * self.train_batch_size()) self.skipped_steps = checkpoint['skipped_steps'] self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] self.loaded_checkpoint_dp_world_size = checkpoint['dp_world_size'] @@ -1236,7 +1426,7 @@ def _get_all_zero_checkpoints(self, load_dir, tag): invalid_zero_ckpt_paths.append(ckpt_name) if len(invalid_zero_ckpt_paths) > 0: - logging.warn( + logger.warn( f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist" ) return None @@ -1251,49 +1441,90 @@ def _get_all_zero_checkpoints(self, load_dir, tag): ) return zero_optimizer_sd - def save_checkpoint(self, save_dir, tag, client_state={}): + def _checkpoint_tag_validation(self, tag): + if self.checkpoint_tag_validation_enabled(): + s_hash = hashlib.sha1(tag.encode()) + bhash = torch.ByteTensor([s_hash.digest()]).flatten().to(self.device) + max_bhash = bhash.clone() + min_bhash = bhash.clone() + dist.all_reduce(max_bhash, op=torch.distributed.ReduceOp.MAX) + dist.all_reduce(min_bhash, op=torch.distributed.ReduceOp.MIN) + valid = all(min_bhash == bhash) and all(max_bhash == 
bhash) + msg = f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " \ + "all ranks. Including rank unique information in checkpoint tag could cause issues when " \ + "restoring with different world sizes." + if self.checkpoint_tag_validation_fail(): + assert valid, msg + elif not valid: + logger.warning(msg) + + def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True): r"""Save training checkpoint Arguments: save_dir: Required. Directory for saving the checkpoint - tag: Required. Checkpoint tag used as a unique identifier for the checkpoint. Ex. Global Step. + tag: Optional. Checkpoint tag used as a unique identifier for the checkpoint, global step is + used if not provided. Tag name must be the same across all ranks. client_state: Optional. State dictionary used for saving required training states in the client code. + save_latest: Optional. Save a file 'latest' pointing to the latest saved checkpoint. """ - #This is to make sure the checkpoint names are created without collision - #There seems to be issue creating them in parallel - self._create_checkpoint_files(save_dir, tag) + # This is to make sure the checkpoint names are created without collision + # There seems to be issue creating them in parallel + + # Ensure save_dir directory exists + os.makedirs(save_dir, exist_ok=True) + + if tag is None: + tag = f"global_step{self.global_steps}" + + # Ensure checkpoint tag is consistent across ranks + self._checkpoint_tag_validation(tag) if self.save_non_zero_checkpoint: + self._create_checkpoint_file(save_dir, tag, False) self._save_checkpoint(save_dir, tag, client_state=client_state) if self.save_zero_checkpoint: + self._create_zero_checkpoint_files(save_dir, tag) self._save_zero_checkpoint(save_dir, tag) + # Save latest checkpoint tag + if save_latest: + with open(os.path.join(save_dir, 'latest'), 'w') as fd: + fd.write(tag) + + return True + + def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): + name_function = self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name + try: + checkpoint_name = name_function(save_dir, tag) + ensure_directory_exists(checkpoint_name) + except: + logger.error(f'Failed saving model checkpoint to {save_dir} with tag {tag}') + return False + return True - def _create_checkpoint_files(self, save_dir, tag): - #checkpoint files are created sequentially + def _create_zero_checkpoint_files(self, save_dir, tag): + success = True + # zero checkpoint files are created sequentially for rank in range(self.world_size): if rank == self.global_rank: - try: - if self.save_non_zero_checkpoint: - checkpoint_name = self._get_ckpt_name(save_dir, tag) - self._ensure_directory_exists(checkpoint_name) - - if self.save_zero_checkpoint: - checkpoint_name = self._get_zero_ckpt_name(save_dir, tag) - self._ensure_directory_exists(checkpoint_name) - except: - logger.error( - f'Failed Saving model checkpoint to {save_dir} with tag {tag}') - return False + success = self._create_checkpoint_file(save_dir, tag, True) + dist.barrier() + return success + def _save_checkpoint(self, save_dir, tag, client_state={}): save_path = self._get_ckpt_name(save_dir, tag) - #self._ensure_directory_exists(save_path) + # A hack to save the checkpointing directory. Pipeline parallelism overrides + # module_state_dict() and uses this path to save the model. module_state_dict() + # then instead just returns None. 
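# The tag-consistency check in _checkpoint_tag_validation() above compares a
# SHA-1 hash of the tag across ranks by all-reducing its element-wise MIN and
# MAX; if either reduction differs from the local hash, some rank passed a
# different tag. A minimal standalone sketch of the same idea (assumes
# torch.distributed is already initialized; `tag` and `device` are
# caller-supplied placeholders, not names from this patch):
import hashlib

import torch
import torch.distributed as dist


def tag_is_consistent(tag, device):
    digest = hashlib.sha1(tag.encode()).digest()
    local = torch.tensor(list(digest), dtype=torch.long, device=device)
    lo, hi = local.clone(), local.clone()
    dist.all_reduce(lo, op=dist.ReduceOp.MIN)
    dist.all_reduce(hi, op=dist.ReduceOp.MAX)
    # The tag is identical on every rank only if both reductions equal the local hash.
    return bool(torch.equal(lo, local) and torch.equal(hi, local))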
+ self._curr_ckpt_path = os.path.join(save_dir, tag) state = { 'module': @@ -1309,6 +1540,8 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): self.skipped_steps, 'global_steps': self.global_steps, + 'global_samples': + self.global_samples, 'dp_world_size': self.dp_world_size, 'mp_world_size': @@ -1316,12 +1549,13 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): } state.update(client_state) - logger.info('Saving model checkpoint: {}'.format(save_path)) + log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0]) + #logger.info('Saving model checkpoint: {}'.format(save_path)) torch.save(state, save_path) + self._curr_save_path = None def _save_zero_checkpoint(self, save_path, tag): zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag) - #self._ensure_directory_exists(zero_checkpoint_name) zero_sd = {'optimizer_state_dict': self.optimizer.state_dict()} torch.save(zero_sd, zero_checkpoint_name) logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name)) diff --git a/deepspeed/runtime/fp16/__init__.py b/deepspeed/runtime/fp16/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/pt/fp16_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py similarity index 92% rename from deepspeed/pt/fp16_optimizer.py rename to deepspeed/runtime/fp16/fused_optimizer.py index a03c3738a4db..8c1d2003cb1b 100755 --- a/deepspeed/pt/fp16_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -9,9 +9,9 @@ import math from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.pt.deepspeed_utils import get_grad_norm, CheckOverflow, get_weight_norm -from deepspeed.pt.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE -from deepspeed.pt.log_utils import logger +from deepspeed.runtime.utils import get_grad_norm, CheckOverflow, get_weight_norm +from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE +from deepspeed.utils import logger, log_dist class FP16_Optimizer(object): @@ -97,10 +97,24 @@ def __init__(self, self.clip_grad_norm = torch.nn.utils.clip_grad_norm_ #model parallel object - self.mpu = None + self.mpu = mpu self.overflow = False self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.initialize_optimizer_states() + + def initialize_optimizer_states(self): + for i, group in enumerate(self.fp16_groups): + self.fp32_groups_flat[i].grad = torch.zeros( + self.fp32_groups_flat[i].size(), + device=self.fp32_groups_flat[i].device) + + self.optimizer.step() + + for i, group in enumerate(self.fp16_groups): + self.fp32_groups_flat[i].grad = None + + return def zero_grad(self, set_grads_to_None=True): """ @@ -190,9 +204,30 @@ def step(self, closure=None): UPDATE_FP16 = 'update_fp16' STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16] - # First compute norm for all group so we know if there is overflow - grads_groups_flat = [] + # First determine if there is overflow. + self.start_timers([OVERFLOW_CHECK]) + fp16_params = [] + for i, group in enumerate(self.fp16_groups): + fp16_params.extend([p for p in group if p.grad is not None]) + self.overflow = self.overflow_checker.has_overflow(fp16_params) + self.stop_timers([OVERFLOW_CHECK]) + prev_scale = self.cur_scale + self._update_scale(self.overflow) + if self.overflow: + if self.verbose: + log_dist( + "Overflow detected. Skipping step. 
Attempted loss " + f"scale: {prev_scale}, reducing to {self.cur_scale}", + ranks=[0]) + # Clear gradients + for i, group in enumerate(self.fp16_groups): + for p in group: + p.grad = None + self.log_timers(OVERFLOW_TIMERS) + return self.overflow + + grads_groups_flat = [] for i, group in enumerate(self.fp16_groups): data_type = self.fp32_groups_flat[i].dtype @@ -204,27 +239,15 @@ def step(self, closure=None): if p.grad is None else p.grad.to(data_type) for p in group ])) + for p in group: + p.grad = None + self.fp32_groups_flat[i].grad = grads_groups_flat[i] self.start_timers([COMPUTE_NORM]) all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu) self.stop_timers([COMPUTE_NORM]) - self.start_timers([OVERFLOW_CHECK]) - self.overflow = self.overflow_checker.check_using_norm([all_groups_norm]) - self.stop_timers([OVERFLOW_CHECK]) - - prev_scale = self.cur_scale - self._update_scale(self.overflow) - - if self.overflow: - if self.verbose: - print("[deepspeed] OVERFLOW! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) - self.log_timers(OVERFLOW_TIMERS) - return self.overflow - self.start_timers([UNSCALE_AND_CLIP]) self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm]) self.stop_timers([UNSCALE_AND_CLIP]) diff --git a/deepspeed/pt/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py similarity index 100% rename from deepspeed/pt/loss_scaler.py rename to deepspeed/runtime/fp16/loss_scaler.py diff --git a/deepspeed/runtime/fp16/onebit_adam.py b/deepspeed/runtime/fp16/onebit_adam.py new file mode 100644 index 000000000000..c6566c28777b --- /dev/null +++ b/deepspeed/runtime/fp16/onebit_adam.py @@ -0,0 +1,374 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import types +import torch +import importlib +import numpy as np +import time +import cupy +from torch.utils.dlpack import to_dlpack +from torch.utils.dlpack import from_dlpack +from deepspeed.utils.logging import logger + +from mpi4py import MPI +from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host + + +class OnebitAdam(torch.optim.Optimizer): + """Implements the 1-bit Adam algorithm. Currently GPU-only. + For usage example please see, TODO DeepSpeed Tutorial + It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. (default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Adam! 
+ eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + amsgrad=False, + cuda_aware=False): + + if amsgrad: + raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + + super(OnebitAdam, self).__init__(params, defaults) + from mpi4py import MPI + self.eps_mode = 0 if eps_inside_sqrt else 1 + + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.comm_time = 0.0 + self.step_time = 0.0 + self.ave_step = 1 + self.bk_time = 0.0 + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + self.deepspeed = deepspeed + self.adam_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + + def torch2cupy(self, tensor): + return cupy.fromDlpack(to_dlpack(tensor)) + + def cupy2torch(self, cupy_tensor): + return from_dlpack(cupy_tensor.toDlpack()) + + def compress_by_chunk(self, cupy_bool_tensor, num_chunks): + packed_sign = cupy.packbits(cupy_bool_tensor) + sign_list_packed = cupy.split(packed_sign, num_chunks) + cupy.cuda.get_current_stream().synchronize() + return sign_list_packed + + def Compressed_Allreduce(self, + buffer_m: torch.tensor, + worker_error, + server_error, + rank, + world_size, + comm, + local_rank): + + all_start_time = time.time() + original_size = buffer_m.numel() + cupy.cuda.Device(local_rank).use() + + if torch.numel(buffer_m) != torch.numel(worker_error): + empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m), + device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + sign_buffer_m = buffer_m.sign().add_(1).bool() + sign_buffer_m = sign_buffer_m.float() + sign_buffer_m.add_(-0.5).mul_(2.0) + worker_error.set_((buffer_m - worker_scale * sign_buffer_m)) + sign_buffer_m = None + + compensated_buffer_m = buffer_m + compensated_buffer_m.sign_() + compensated_buffer_m = compensated_buffer_m.add_(1).bool() + cupy_worker_scale = self.torch2cupy(worker_scale) + cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m) + compensated_buffer_m = None + + cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m, + world_size) + cupy_compensated_buffer_m = None + + cupy_recvbuf_sign = cupy.zeros([world_size, + cupy_sign_list_packed[rank].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) + + # Communication Phase 1 + gather_start = time.time() + if self.cuda_aware: + gather_cuda(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + 
cupy_worker_scale, + cupy_recvbuf_scale) + else: + cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank, + world_size, + comm, + cupy_sign_list_packed, + cupy_recvbuf_sign, + cupy_worker_scale, + cupy_recvbuf_scale) + gather_end = time.time() + + cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( + world_size, + -1) + cupy_recvbuf_sign = None + unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float() + cupy_unpacked_sign = None + unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0) + worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size) + compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0) + unpacked_sign = None + + compensated_server_m.add_(server_error) + server_scale = torch.norm(compensated_server_m) / np.sqrt( + compensated_server_m.numel()) + sign_server_m = compensated_server_m.sign().add_(1).bool() + sign_server_m = sign_server_m.float() + sign_server_m.add_(-0.5).mul_(2.0) + server_error.set_(compensated_server_m - server_scale * sign_server_m) + sign_server_m = None + + compensated_server_m.sign_() + compensated_server_m = compensated_server_m.add_(1).bool() + cupy_server_scale = self.torch2cupy(server_scale) + cupy_compensated_server_m = self.torch2cupy(compensated_server_m) + compensated_server_m = None + + cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1) + + cupy_recvbuf_sign_server = cupy.zeros( + [world_size, + cupy_server_sign_packed[0].size], + dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_scale_server = cupy.zeros([world_size, + 1], + dtype=cupy_worker_scale.dtype) + + # Communication Phase 2 + if self.cuda_aware: + allgather_cuda(comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + else: + cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm, + cupy_server_sign_packed[0], + cupy_recvbuf_sign_server, + cupy_server_scale, + cupy_recvbuf_scale_server) + + cupy_server_unpacked_sign = (cupy.unpackbits( + cupy_recvbuf_sign_server.flatten())).reshape(world_size, + -1) + cupy_recvbuf_sign_server = None + + server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign) + cupy_server_unpacked_sign = None + + server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0) + server_scale = self.cupy2torch(cupy_recvbuf_scale_server) + buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size] + + return buffer_m + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + output params (list of tensors, optional): A reduced recision copy + of the updated weights written out in addition to the regular + updated weights. Have to be of same type as gradients. (default: None) + scale (float, optional): factor to divide gradient tensor values + by before applying to weights. 
(default: 1) + """ + loss = None + if closure is not None: + loss = closure() + + gather_time = 0 + allgather_time = 0 + all_time = 0 + + if self.adam_freeze_key is False: + v_diff_buffer = 0.0 + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' + ) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + state['tensor_size'] = torch.numel(p.data) + state['corrected_tensor_size'] = state['tensor_size'] + + if state['tensor_size'] % (self.size * self.divider) != 0: + state['corrected_tensor_size'] += ((self.size * self.divider) - + (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state[ + 'corrected_tensor_size'] // self.size + + if not self.initialize or (self.adam_freeze_key + and 'worker_error' not in state.keys()): + torch.cuda.empty_cache() + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], + device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], + device=p.device) + torch.cuda.empty_cache() + self.adam_freeze_key = True + if not self.initialize and torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + if self.adam_freeze_key is False: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + else: + if 'non_freeze' in group.keys() and group['non_freeze'] is True: + dist.all_reduce(grad) + grad.mul_(1 / dist.get_world_size()) + exp_avg.mul_(beta1).add(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + else: + if self.initialize is True: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + grad = None + + if self.size > 1: + exp_avg.set_( + self.Compressed_Allreduce(exp_avg, + state['worker_error'], + state['server_error'], + self.rank, + self.size, + self.comm, + self.deepspeed.local_rank)) + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + if self.initialize: + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + with torch.no_grad(): + p.add_(-group['lr'] * update) + + if not self.initialize: + print('Pop out errors', flush=True) + state.pop('worker_error') + state.pop('server_error') + + if not self.initialize: + self.adam_freeze_key = False + self.initialize = True + print( + f"Finished the initialization step at rant {torch.distributed.get_rank()}" 
+ ) + return loss + + if self.adam_freeze_key is False: + if state['step'] >= self.freeze_step: + self.adam_freeze_key = True + self.deepspeed.enable_backward_allreduce = False + + return loss diff --git a/deepspeed/pt/fp16_unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py similarity index 92% rename from deepspeed/pt/fp16_unfused_optimizer.py rename to deepspeed/runtime/fp16/unfused_optimizer.py index c21e92a6c5db..37edf9d5002d 100755 --- a/deepspeed/pt/fp16_unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -9,9 +9,9 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors import math -from deepspeed.pt.deepspeed_utils import get_grad_norm, CheckOverflow, get_weight_norm -from deepspeed.pt.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE -from deepspeed.pt.log_utils import logger +from deepspeed.runtime.utils import get_grad_norm, CheckOverflow, get_weight_norm +from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE +from deepspeed.utils import logger class FP16_UnfusedOptimizer(object): @@ -93,11 +93,13 @@ def __init__(self, else: self.clip_grad_norm = torch.nn.utils.clip_grad_norm_ - self.mpu = None + self.mpu = mpu self.overflow = False self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.initialize_optimizer_states() + def zero_grad(self, set_grads_to_None=True): """ Zero FP16 parameter grads. @@ -349,3 +351,26 @@ def load_state_dict(self, state_dict, load_optimizer_states=True): def __repr__(self): return repr(self.optimizer) + + def initialize_optimizer_states(self): + for i, group in enumerate(self.fp16_groups): + for param in group: + param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=torch.cuda.current_device()) + + for i, group in enumerate(self.fp32_groups): + for param in group: + param.grad = torch.zeros(param.size(), + dtype=param.dtype, + device=torch.cuda.current_device()) + + self.optimizer.step() + + for i, group in enumerate(self.fp16_groups): + for param in group: + param.grad = None + + for i, group in enumerate(self.fp32_groups): + for param in group: + param.grad = None diff --git a/deepspeed/pt/deepspeed_lr_schedules.py b/deepspeed/runtime/lr_schedules.py similarity index 83% rename from deepspeed/pt/deepspeed_lr_schedules.py rename to deepspeed/runtime/lr_schedules.py index 97c18ffec0a2..515233851a1d 100755 --- a/deepspeed/pt/deepspeed_lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -12,14 +12,15 @@ from torch.optim import Optimizer from typing import Union, List import math -from deepspeed.pt.deepspeed_constants import * -from deepspeed.pt.log_utils import logger +from deepspeed.runtime.constants import * +from deepspeed.utils import logger LR_SCHEDULE = 'lr_schedule' LR_RANGE_TEST = 'LRRangeTest' ONE_CYCLE = 'OneCycle' WARMUP_LR = 'WarmupLR' -VALID_LR_SCHEDULES = [LR_RANGE_TEST, ONE_CYCLE, WARMUP_LR] +WARMUP_DECAY_LR = 'WarmupDecayLR' +VALID_LR_SCHEDULES = [LR_RANGE_TEST, ONE_CYCLE, WARMUP_LR, WARMUP_DECAY_LR] LR_RANGE_TEST_MIN_LR = 'lr_range_test_min_lr' LR_RANGE_TEST_STEP_RATE = 'lr_range_test_step_rate' @@ -47,6 +48,8 @@ WARMUP_MAX_LR = 'warmup_max_lr' WARMUP_NUM_STEPS = 'warmup_num_steps' +TOTAL_NUM_STEPS = 'total_num_steps' + def add_tuning_arguments(parser): group = parser.add_argument_group('Convergence Tuning', @@ -364,10 +367,10 @@ def __init__(self, self._update_optimizer(self.min_lr) def _staircase_interval(self): - return math.floor(float(self.last_batch_iteration) / 
self.step_size) + return math.floor(float(self.last_batch_iteration + 1) / self.step_size) def _continous_interval(self): - return float(self.last_batch_iteration) / self.step_size + return float(self.last_batch_iteration + 1) / self.step_size def _get_increase(self): return (1 + self.step_rate * self.interval_fn()) @@ -378,6 +381,12 @@ def get_lr(self): lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr ] + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. + """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr + def _update_optimizer(self, group_lrs): for param_group, lr in zip(self.optimizer.param_groups, group_lrs): param_group['lr'] = lr @@ -387,6 +396,7 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration self._update_optimizer(self.get_lr()) + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -564,73 +574,98 @@ def _initialize_momentum(self, for momentum, group in zip(self.min_moms, optimizer.param_groups): group['betas'] = momentum - def _get_cycle_lr(self): - cycle = math.floor(1 + self.last_batch_iteration / self.total_size) - x = 1. + self.last_batch_iteration / self.total_size - cycle + def _get_scale_factor(self): + batch_iteration = (self.last_batch_iteration + 1) + cycle = math.floor(1 + batch_iteration / self.total_size) + x = 1. + batch_iteration / self.total_size - cycle if x <= self.step_ratio: scale_factor = x / self.step_ratio else: scale_factor = (x - 1) / (self.step_ratio - 1) + return scale_factor + + def _get_cycle_mom(self): + scale_factor = self._get_scale_factor() + momentums = [] + for base_betas, max_betas in zip(self.min_moms, self.max_moms): + cycle_min_mom = base_betas[0] + cycle_max_mom = max_betas[0] + base_height = (cycle_max_mom - cycle_min_mom) * scale_factor + momentum = cycle_max_mom - base_height + momentums.append((momentum, base_betas[1])) + return momentums + + def _get_cycle_lr(self): + scale_factor = self._get_scale_factor() lrs = [] for cycle_min_lr, cycle_max_lr in zip(self.min_lrs, self.max_lrs): base_height = (cycle_max_lr - cycle_min_lr) * scale_factor lr = cycle_min_lr + base_height lrs.append(lr) - if self.cycle_momentum: - momentums = [] - for base_betas, max_betas in zip(self.min_moms, self.max_moms): - cycle_min_mom = base_betas[0] - cycle_max_mom = max_betas[0] - base_height = (cycle_max_mom - cycle_min_mom) * scale_factor - momentum = cycle_max_mom - base_height - momentums.append((momentum, base_betas[1])) - for param_group, momentum in zip(self.optimizer.param_groups, momentums): - param_group['betas'] = momentum - return lrs + def _get_decay_mom(self, decay_batch_iteration): + decay_interval = decay_batch_iteration / self.decay_step_size + mom_decay_factor = (1 + self.decay_mom_rate * decay_interval) + momentums = [(beta0 * mom_decay_factor, beta1) for beta0, beta1 in self.max_moms] + return momentums + def _get_decay_lr(self, decay_batch_iteration): """Calculates the learning rate at batch index. This function is used after the cycle completes and post cycle decaying of lr/mom is enabled. This function treats `self.last_batch_iteration` as the last batch index. - - If `self.cycle_momentum` is ``True``, this function has a side effect of - updating the optimizer's momentum. 
""" decay_interval = decay_batch_iteration / self.decay_step_size - lr_decay_factor = (1 + self.decay_lr_rate * decay_interval) - lrs = [cycle_min_lr * lr_decay_factor for cycle_min_lr in self.min_lrs] - - if self.cycle_momentum: - mom_decay_factor = (1 + self.decay_mom_rate * decay_interval) - momentums = [(beta0 * mom_decay_factor, - beta1) for beta0, - beta1 in self.max_moms] - for param_group, momentum in zip(self.optimizer.param_groups, momentums): - param_group['betas'] = momentum + lrs = [cycle_min_lr / lr_decay_factor for cycle_min_lr in self.min_lrs] return lrs def get_lr(self): """Calculates the learning rate at batch index. This function treats `self.last_batch_iteration` as the last batch index. - - If `self.cycle_momentum` is ``True``, this function has a side effect of - updating the optimizer's momentum. """ - if self.last_batch_iteration <= self.total_size: + if self.last_batch_iteration < self.total_size: return self._get_cycle_lr() - return self._get_decay_lr(self.last_batch_iteration - self.total_size) + return self._get_decay_lr(self.last_batch_iteration - self.total_size + 1) + + def get_mom(self): + """Calculates the momentum at batch index. This function treats + `self.last_batch_iteration` as the last batch index. + """ + if not self.cycle_momentum: + return None + + if self.last_batch_iteration < self.total_size: + return self._get_cycle_mom() + return self._get_decay_mom(self.last_batch_iteration - self.total_size + 1) + + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. + """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr def step(self, batch_iteration=None): + """ Updates the optimizer with the learning rate for the last batch index. + `self.last_batch_iteration` is treated as the last batch index. + + If self.cycle_momentum is true, also updates optimizer momentum. + """ if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 + self.last_batch_iteration = batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + + if self.cycle_momentum: + momentums = self.get_mom() + for param_group, momentum in zip(self.optimizer.param_groups, momentums): + param_group['betas'] = momentum def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -676,6 +711,10 @@ def __init__(self, self.last_batch_iteration = last_batch_iteration def get_lr(self): + if self.last_batch_iteration < 0: + logger.warning( + "Attempting to get learning rate from scheduler before it has started") + return [0.0] gamma = self._get_gamma() return [ min_lr + (delta_lr * gamma) for min_lr, @@ -683,12 +722,19 @@ def get_lr(self): self.delta_lrs) ] + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. 
+ """ + assert getattr(self, '_last_lr', None) is not None, "need to call step() first" + return self._last_lr + def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -710,3 +756,54 @@ def _format_param(self, optimizer, param_value, param_name): FileNotFoundError(param_value))) return list(param_value) return [param_value] * len(optimizer.param_groups) + + +class WarmupDecayLR(WarmupLR): + """Increase the learning rate of each parameter group from min lr to max lr + over warmup_num_steps steps, and then decay at linear rate over the remaining training steps. + + Args: + optimizer (Optimizer): Wrapped optimizer. + total_num_steps (int): total number of training steps + warmup_min_lr (float or list): minimum learning rate. Default: 0 + warmup_max_lr (float or list): maximum learning rate. Default: 0.001 + warmup_num_steps (int): number of steps to warm up from min_lr to max_lr. Default: 1000 + last_batch_iteration (int): The index of the last batch. Default: -1. + Example: + >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + >>> scheduler = WarmupDecayLR(optimizer, 1000000) + >>> data_loader = torch.utils.data.DataLoader(...) + >>> for epoch in range(10): + >>> for batch in data_loader: + >>> train_batch(...) + >>> scheduler.step() + + """ + def __init__(self, + optimizer: Optimizer, + total_num_steps: int, + warmup_min_lr: float = 0.0, + warmup_max_lr: float = 0.001, + warmup_num_steps: int = 1000, + last_batch_iteration: int = -1): + + self.total_num_steps = total_num_steps + super(WarmupDecayLR, + self).__init__(optimizer, + warmup_min_lr, + warmup_max_lr, + warmup_num_steps, + last_batch_iteration) + if self.total_num_steps < self.warmup_num_steps: + logger.warning('total_num_steps {} is less than warmup_num_steps {}'.format( + total_num_steps, + warmup_num_steps)) + + def _get_gamma(self): + if self.last_batch_iteration < self.warmup_num_steps: + return self.inverse_log_warm_up * math.log(self.last_batch_iteration + 1) + return max( + 0.0, + float(self.total_num_steps - self.last_batch_iteration) / + float(max(1.0, + self.total_num_steps - self.warmup_num_steps))) diff --git a/deepspeed/runtime/pipe/__init__.py b/deepspeed/runtime/pipe/__init__.py new file mode 100644 index 000000000000..3171d3e254c8 --- /dev/null +++ b/deepspeed/runtime/pipe/__init__.py @@ -0,0 +1 @@ +from .module import PipelineModule, LayerSpec, TiedLayerSpec diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py new file mode 100644 index 000000000000..a7b3827c01b7 --- /dev/null +++ b/deepspeed/runtime/pipe/engine.py @@ -0,0 +1,1172 @@ +# Copyright 2019 The Microsoft DeepSpeed Team + +import time +import logging +import copy +import os + +from types import MethodType + +from numpy import prod + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.distributed as dist + +from deepspeed.utils.logging import logger +from deepspeed.utils.timer import SynchronizedWallClockTimer, ThroughputTimer + +from ..engine import DeepSpeedEngine, MEMORY_OPT_ALLREDUCE_SIZE +from ..utils import PartitionedTensor, ensure_directory_exists +from ..dataloader import RepeatingLoader + +from 
.module import PipelineModule, PipelineError, TiedLayerSpec +from . import p2p +from . import schedule + +TARGET_ID = -2 +LOG_STAGE = -2 +DATA_PARALLEL_ID = -2 + + +def is_even(number): + return number % 2 == 0 + + +mem_alloced = 0 +mem_cached = 0 + + +def _tensor_bytes(tensor): + return tensor.numel() * tensor.element_size() + + +class PipelineEngine(DeepSpeedEngine): + """ A training engine hybrid pipeline, data, and model parallel training. + + This engine is created by ``deepspeed.initialize()`` when a :class:`PipelineModule` + is provided. + """ + def __init__(self, *super_args, **super_kwargs): + super().__init__(*super_args, **super_kwargs) + assert isinstance(self.module, PipelineModule), "model must base PipelineModule" + + # We schedule the all-reduces, so disable it in super().backward() + self.enable_backward_allreduce = False + assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ + " with pipeline parallelism." + + # pipeline step for logging + self.log_batch_step_id = -1 + + self.micro_batch_size = self.train_micro_batch_size_per_gpu() + self.micro_batches = self.gradient_accumulation_steps() + + # Set Grid and Communication Groups + self.grid = self.module._grid + if self.grid.get_global_rank() == 0: + logger.info(f'CONFIG: micro_batches={self.micro_batches} ' + f'micro_batch_size={self.micro_batch_size}') + + self.global_rank = self.grid.get_global_rank() + + assert self.dp_world_size == self.grid.data_parallel_size + assert self.train_batch_size() == \ + self.micro_batch_size * self.micro_batches * self.grid.data_parallel_size + + # Set Stage Inf + self.num_stages = self.grid.pipe_parallel_size + self.stage_id = self.grid.get_stage_id() + self.prev_stage = self.stage_id - 1 + self.next_stage = self.stage_id + 1 + + self.data_iterator = None + self.batch_fn = None + + self._force_grad_boundary = False + + self.batch_timer = ThroughputTimer(batch_size=self.micro_batch_size * + self.micro_batches, + num_workers=self.dp_world_size, + logging_fn=self.tput_log, + monitor_memory=False, + steps_per_output=self.steps_per_print()) + + # PipelineEngine needs to handle data loading specially due to only the first + # and last stages loading inputs/labels. 
We construct a sampler that uses + if self.training_data: + self._build_data_iter(self.training_data) + + self.is_pipe_parallel = self.grid.pipe_parallel_size > 1 + self.is_data_parallel = self.grid.data_parallel_size > 1 + self.is_model_parallel = self.grid.model_parallel_size > 1 + + # Partition input/output buffers + self.is_pipe_partitioned = self.is_model_parallel + self.is_grad_partitioned = False + + model_parameters = filter(lambda p: p.requires_grad, self.module.parameters()) + num_params = sum([p.numel() for p in model_parameters]) + unique_params = num_params + # Subtract tied parameters if we don't own them + if self.module.tied_comms: + tied_params = 0 + for key, d in self.module.tied_comms.items(): + if self.global_rank != min(d['ranks']): + tied_params += sum(p.numel() for p in d['module'].parameters()) + unique_params -= tied_params + params_tensor = torch.LongTensor(data=[num_params, + unique_params]).to(self.device) + dist.all_reduce(params_tensor, group=self.grid.get_model_parallel_group()) + params_tensor = params_tensor.tolist() + total_params = params_tensor[0] + unique_params = params_tensor[1] + if self.grid.data_parallel_id == 0: + logger.info(f'RANK={self.global_rank} ' + f'STAGE={self.stage_id} ' + f'LAYERS={self.module._local_stop - self.module._local_start} ' + f'[{self.module._local_start}, {self.module._local_stop}) ' + f'STAGE_PARAMS={num_params} ({num_params/1e6:0.3f}M) ' + f'TOTAL_PARAMS={total_params} ({total_params/1e6:0.3f}M) ' + f'UNIQUE_PARAMS={unique_params} ({unique_params/1e6:0.3f}M)') + + #intialize peer-2-peer communication and allreduce groups + if self.is_pipe_parallel: + p2p.init_process_groups(self.grid) + + # Pipeline buffers + self.num_pipe_buffers = 0 + self.pipe_buffers = { + 'inputs' : [], # batch input and received activations + 'labels' : [], # labels from batch input + 'outputs' : [], # activations + 'output_tensors' : [], # tensor object to preserve backward graph + } + self.pipe_recv_buf = None + self.grad_layer = None + + self.meta_buffer = None + + self.first_output_send = True + self.first_gradient_send = True + + #stores the loss for the current micro batch being processed + self.loss = torch.tensor(0.0).to(self.device) + + #stores the loss for the entire batch + self.total_loss = None + self.agg_loss = torch.tensor(0.0, requires_grad=False).to(self.device) + self.dp_group_loss = torch.tensor(0.0, requires_grad=False).to(self.device) + + if self._config.pipeline['activation_checkpoint_interval'] > 0: + self.module.activation_checkpoint_interval = self._config.pipeline[ + 'activation_checkpoint_interval'] + + if self.is_last_stage(): + self.loss_model = self.module.loss_fn + + # Initialize pipeline communicators. Just send a 0. + if is_even(self.stage_id): + if not self.is_last_stage(): + p2p.send(self.loss, self.next_stage) + if not self.is_first_stage(): + p2p.recv(self.loss, self.prev_stage) + else: + if not self.is_first_stage(): + p2p.recv(self.loss, self.prev_stage) + if not self.is_last_stage(): + p2p.send(self.loss, self.next_stage) + + # XXX look into timer reporting timing + # Initialize some timers because of early weirdness. 
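# The even/odd ordering used just above to initialize the pipeline
# communicators is the usual way to avoid deadlock when every stage both sends
# to and receives from its neighbors with blocking p2p calls: even stages send
# before receiving, odd stages receive before sending. A minimal sketch of the
# pattern with plain torch.distributed (assumes one pipeline stage per rank in
# a linear chain; illustrative only, not the p2p module used here):
import torch
import torch.distributed as dist


def handshake_with_neighbors(rank, world_size, device):
    dummy = torch.zeros(1, device=device)
    prev_rank, next_rank = rank - 1, rank + 1
    if rank % 2 == 0:
        if next_rank < world_size:
            dist.send(dummy, dst=next_rank)
        if prev_rank >= 0:
            dist.recv(dummy, src=prev_rank)
    else:
        if prev_rank >= 0:
            dist.recv(dummy, src=prev_rank)
        if next_rank < world_size:
            dist.send(dummy, dst=next_rank)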
+ if self.wall_clock_breakdown(): + self.timers('forward_microstep').start() + self.timers('forward_microstep').stop() + self.timers('backward_microstep').start() + self.timers('backward_microstep').stop() + self.timers('backward_inner_microstep').start() + self.timers('backward_inner_microstep').stop() + self.timers('backward_allreduce_microstep').start() + self.timers('backward_allreduce_microstep').stop() + self.timers('backward_allreduce').start() + self.timers('backward_allreduce').stop() + self.timers('step_microstep').start() + self.timers('step_microstep').stop() + + def _build_data_iter(self, dataset): + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, + num_replicas=self.dp_world_size, + rank=self.mpu.get_data_parallel_rank(), + shuffle=False) + # Build a loader and make it repeating. + pipe_dataloader = self.deepspeed_io(dataset, data_sampler=sampler) + pipe_dataloader = RepeatingLoader(pipe_dataloader) + self.set_dataloader(pipe_dataloader) + + def _exec_reduce_tied_grads(self): + # We need to run this first to write to self.averaged_gradients; + # since this class turns `enable_backward_allreduce` off, + # `self.overlapping_partition_gradients_reduce_epilogue()` defined in the DeepSpeedEngine + # never actually runs. I suspect this is because of efficiency problems; get_flat_partition in + # stage2.py might do something expensive; someone will have to look into that later. But + # in the meantime, this fixes ZeRO2 + Pipelining enough to run a demo. Further profiling + # needed to decide if it actually breaks everything. + # (see https://github.com/EleutherAI/gpt-neox/issues/62#issuecomment-761471944) + if self.zero_optimization_partition_gradients(): + self.optimizer.overlapping_partition_gradients_reduce_epilogue() + self.module.allreduce_tied_weight_gradients() + + def _exec_reduce_grads(self): + self._force_grad_boundary = True + if self.is_data_parallel: + self.buffered_allreduce_fallback( + elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) + self._force_grad_boundary = False + + def _reserve_pipe_buffers(self, num_buffers): + """Ensure that each pipeline buffer has at least ``num_buffers`` slots. + + This method only reserves slots and does not allocate tensors. + + Args: + num_buffers (int): The number of buffers to reserve. + """ + if self.num_pipe_buffers >= num_buffers: + return + + num_added = num_buffers - self.num_pipe_buffers + for key in self.pipe_buffers: + self.pipe_buffers[key].extend([None] * num_added) + self.num_pipe_buffers = num_buffers + + def train_batch(self, data_iter=None): + """Progress the pipeline to train the next batch of data. The engine will ingest + ``self.train_batch_size()`` total samples collectively across all workers. + + + An iterator that over training data should be provided as an argument + unless ``deepspeed.initialize()`` was provided a training set. In that event, + the training data will automatically be read. + + + .. warning:: + A total of ``self.gradient_accumulation_steps()`` entries will be pulled + from ``data_iter`` by each pipeline. There must be sufficient + data left in ``data_iter`` or else a ``StopIteration`` will halt training. + + DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader` + that wraps data loaders to automatically restart upon a ``StopIteration``. + + Args: + data_iter (Iterator, optional): Iterator of training data. + + Returns: + The arithmetic mean of the losses computed this batch. 
+ """ + if not torch._C.is_grad_enabled(): + raise RuntimeError( + f'train_batch() requires gradients enabled. Use eval_batch() instead.') + + if data_iter: + self.set_dataiterator(data_iter) + + self.module.train() + self.total_loss = None + + # Do the work + self.timers('train_batch').start() + sched = schedule.TrainSchedule(micro_batches=self.micro_batches, + stages=self.num_stages, + stage_id=self.stage_id) + self._exec_schedule(sched) + self.agg_train_loss = self._aggregate_total_loss() + self.timers('train_batch').stop() + + if self.global_steps % self.steps_per_print() == 0: + if self.global_rank == 0: + elapsed = self.timers('train_batch').elapsed(reset=True) + iter_time = elapsed / self.steps_per_print() + tput = self.train_batch_size() / iter_time + print(f'steps: {self.global_steps} ' + f'loss: {self.agg_train_loss:0.4f} ' + f'iter time (s): {iter_time:0.3f} ' + f'samples/sec: {tput:0.3f}') + + # Tensorboard + if self.tensorboard_enabled(): + if self.global_rank == 0: + self.summary_events = [(f'Train/Samples/train_loss', + self.agg_train_loss.mean().item(), + self.global_samples)] + for event in self.summary_events: # write_summary_events + self.summary_writer.add_scalar(event[0], event[1], event[2]) + if self.global_steps % self.steps_per_print() == 0: + self.summary_writer.flush() + + if self.wall_clock_breakdown( + ) and self.global_steps % self.steps_per_print() == 0: + self.timers.log([ + 'pipe_send_output', + 'pipe_send_grad', + 'pipe_recv_input', + 'pipe_recv_grad' + ]) + + # TODO: should return precisely what loss returned and allow others to be queried? + return self.agg_train_loss + + def eval_batch(self, data_iter): + """Evaluate the pipeline on a batch of data from ``data_iter``. The + engine will evaluate ``self.train_batch_size()`` total samples + collectively across all workers. + + This method is equivalent to: + + .. code-block:: python + + module.eval() + with torch.no_grad(): + output = module(batch) + + .. warning:: + A total of ``self.gradient_accumulation_steps()`` entries will be pulled + from ``data_iter`` by each pipeline. There must be sufficient + data left in ``data_iter`` or else a ``StopIteration`` will halt training. + + DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader` + that wraps data loaders to automatically restart upon a ``StopIteration``. + + Args: + data_iter (Iterator): Iterator of data to evaluate. + + Returns: + The arithmetic mean of the losses computed this batch. + """ + + self.module.eval() + self.total_loss = None + + # Use the provided data iterator + train_iterator = self.data_iterator + self.set_dataiterator(data_iter) + + # Do the work + sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, + stages=self.num_stages, + stage_id=self.stage_id) + with torch.no_grad(): + self._exec_schedule(sched) + + self.agg_eval_loss = self._aggregate_total_loss() + if self.tensorboard_enabled(): + if self.global_rank == 0: + self.summary_events = [(f'Train/Samples/eval_loss', + self.agg_eval_loss.mean().item(), + self.global_samples)] + for event in self.summary_events: # write_summary_events + self.summary_writer.add_scalar(event[0], event[1], event[2]) + self.summary_writer.flush() + + # Restore the training iterator + self.set_dataiterator(train_iterator) + + # Reset any buffers that may have been populated during the forward passes. 
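# As the train_batch()/eval_batch() docstrings above note, each call pulls
# gradient_accumulation_steps() entries from the data iterator, so finite
# loaders should be wrapped to restart instead of raising StopIteration
# mid-schedule. A minimal sketch of that wrapper pattern (illustrative only;
# the engine imports DeepSpeed's own RepeatingLoader from ..dataloader):
class _RepeatingLoader:
    def __init__(self, loader):
        self.loader = loader
        self.data_iter = iter(loader)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.data_iter)
        except StopIteration:
            # Restart from the beginning instead of ending iteration.
            self.data_iter = iter(self.loader)
            return next(self.data_iter)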
+ #ds_checkpointing.reset() + + return self.agg_eval_loss + + def is_first_stage(self): + """True if this process is in the first stage in the pipeline.""" + return self.stage_id == 0 + + def is_last_stage(self): + """True if this process is in the last stage in the pipeline.""" + return self.stage_id == self.num_stages - 1 + + def _aggregate_total_loss(self): + # Scale loss, average among DP ranks, and bcast loss to the rest of my DP group + if self.is_last_stage(): + loss = self._scale_loss(self.total_loss) + self.dp_group_loss = loss.clone().detach() + + ## Average loss across all data-parallel groups + agg_loss = self.dp_group_loss.clone().detach() + #print(f'RANK={self.global_rank} bcast SENDER src={self.global_rank} group={self.grid.pp_group}', flush=True) + if self.is_data_parallel: + dist.all_reduce(agg_loss, group=self.mpu.get_data_parallel_group()) + agg_loss /= self.dp_world_size + + assert self.global_rank in self.grid.pp_group + losses = torch.Tensor([self.dp_group_loss, agg_loss]).to(self.device) + dist.broadcast(tensor=losses, + src=self.global_rank, + group=self.mpu.get_pipe_parallel_group()) + + else: + # Get loss from last stage + src_rank = self.grid.stage_to_global(self.num_stages - 1) + assert src_rank in self.grid.pp_group + losses = torch.Tensor([0., 0.]).to(self.device) + dist.broadcast(tensor=losses, + src=src_rank, + group=self.grid.get_pipe_parallel_group()) + self.dp_group_loss = losses[0].clone().detach() + agg_loss = losses[1].clone().detach() + + return agg_loss + + def set_dataloader(self, loader): + """""" + if self.is_first_stage() or self.is_last_stage(): + self.training_dataloader = loader + self.data_iterator = iter(self.training_dataloader) + + def set_dataiterator(self, iterator): + """ Store an iterator to sample for training data. """ + if self.is_first_stage() or self.is_last_stage(): + self.training_dataloader = None + self.data_iterator = iterator + + def set_batch_fn(self, fn): + self.batch_fn = fn + + def is_gradient_accumulation_boundary(self): + """True if the engine is executing a gradient reduction or optimizer step instruction. + + This is overridden from :class:`DeepSpeedEngine` to force reductions + and steps when the pipeline engine is instructed to do so. + + Returns: + bool: whether reductions and optimizer steps should occur. + """ + return self._force_grad_boundary + + def log_for_device(self, *msg): + if LOG_STAGE == self.stage_id or LOG_STAGE == -1: + if DATA_PARALLEL_ID == self.grid.data_parallel_id or DATA_PARALLEL_ID == -1: + print( + f'RANK={dist.get_rank()} ' + f'PIPE-ID={self.stage_id} ' + f'DATA-ID={self.grid.data_parallel_id} ' + f'MBATCH-ID={self.microbatch_id} ' + f'STEP-ID={self.log_batch_step_id} ' + '::', + *msg, + flush=True) + + def tput_log(self, *msg): + if self.global_rank == 0 and self.global_steps % self.steps_per_print() == 0: + print(*msg) + + def _next_batch(self): + if self.is_model_parallel: + mp_rank = self.grid.get_slice_parallel_rank() + else: + mp_rank = 0 + + batch = None + + # Only MP rank 0 loads the data. + if mp_rank == 0: + if self.data_iterator is None: + raise ValueError(f"RANK={self.global_rank} no data iterator provided.") + batch = next(self.data_iterator) + + # All MP ranks participate in batch_fn, where they might broadcast the data. + if self.batch_fn: + batch = self.batch_fn(batch) + + # Sanity check dimensions. 
+ # XXX: the last minibatch with size < micro_batch_size kills us + if torch.is_tensor(batch[0]): + if batch[0].size(0) != self.micro_batch_size: + print(f'size mismatch: {batch[0].size(0)} mb: {self.micro_batch_size}') + return self._next_batch() + else: + assert torch.is_tensor(batch[0][0]) + if batch[0][0].size(0) != self.micro_batch_size: + return self._next_batch() + + return batch + + def _exec_forward_pass(self, buffer_id): + self.tput_timer.start() + self.mem_status('BEFORE FWD', reset_max=True) + + if isinstance(self.pipe_buffers['inputs'][buffer_id], tuple): + inputs = tuple(t.clone() for t in self.pipe_buffers['inputs'][buffer_id]) + else: + inputs = self.pipe_buffers['inputs'][buffer_id].clone() + + # collect the partitioned input from the previous stage + if self.is_pipe_partitioned and not self.is_first_stage(): + part_input = PartitionedTensor.from_meta( + meta=inputs[0], + local_part=inputs[1], + group=self.grid.get_slice_parallel_group()) + + inputs = tuple([part_input.full(), inputs[2]]) + inputs[0].requires_grad = True + # skip mask + #inputs[1].requires_grad = True + part_input = None + self.pipe_buffers['inputs'][buffer_id] = inputs + + # Zero out the gradients each time we use the tensor because only the data in + # tensor changes across batches + self._zero_grads(inputs) + + outputs = super().forward(inputs) + + # Partition the outputs if we are not the last stage + if self.is_pipe_partitioned and not self.is_last_stage(): + part = PartitionedTensor(tensor=outputs[0], + group=self.grid.get_slice_parallel_group()) + # Clear the large output data, but save the computation graph + outputs[0].data = torch.zeros(1) + self.pipe_buffers['output_tensors'][buffer_id] = outputs[0] + # Inject the partitioned tensor into the output before sending + outputs = tuple([part.to_meta(), part.data(), outputs[1]]) + part = None + + self.pipe_buffers['outputs'][buffer_id] = outputs + + # Optionally compute loss on the last device + if self.is_last_stage(): + if self.loss_model is not None: + labels = self.pipe_buffers['labels'][buffer_id] + self.loss = self.loss_model(outputs, labels) + else: + # Some models just return loss from forward() + self.loss = outputs + + if isinstance(self.loss, torch.Tensor): + if self.total_loss is None: + self.total_loss = torch.zeros_like(self.loss) + self.total_loss += self.loss.detach() + else: + if self.total_loss is None: + self.total_loss = [torch.zeros_like(l) for l in self.loss] + for idx, l in enumerate(self.loss): + self.total_loss[idx] += l.detach() + + def _exec_backward_pass(self, buffer_id): + assert self.optimizer is not None, "must provide optimizer during " \ + "init in order to use backward" + + self.mem_status('BEFORE BWD', reset_max=True) + + # The last stage just runs backward on the loss using DeepSpeed's typical + # mechanisms. + if self.is_last_stage(): + super().backward(self.loss) + self.mem_status('AFTER BWD') + return + + outputs = self.pipe_buffers['outputs'][buffer_id] + + if self.wall_clock_breakdown(): + self.timers('backward_microstep').start() + self.timers('backward').start() + self.timers('backward_inner_microstep').start() + self.timers('backward_inner').start() + + # Reconstruct if we previously partitioned the output. We must be + # careful to also restore the computational graph of the tensors we partitioned. 
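# The "clear the data but keep the graph" trick used on both sides of the
# partitioned-activation path: swapping a tensor's `.data` releases the large
# activation storage without touching its grad_fn, and re-assigning `.data`
# before backward makes the saved graph usable again. Toy illustration with
# made-up shapes and no communication:
import torch

x = torch.randn(2, 3, requires_grad=True)
activation = x * 2                 # carries the autograd graph
payload = activation.data.clone()  # values that would be sent downstream
activation.data = torch.zeros(1)   # free the big storage; grad_fn survives
# ... later, just before backward, restore the real values:
activation.data = payload
activation.sum().backward()
assert torch.allclose(x.grad, torch.full_like(x, 2.0))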
+ if self.is_pipe_partitioned: + if self.is_grad_partitioned: + part_output = PartitionedTensor.from_meta( + meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) + self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() + outputs = tuple( + [self.pipe_buffers['output_tensors'][buffer_id], + outputs[2]]) + else: + # Already restored from partition + self.pipe_buffers['output_tensors'][buffer_id].data = outputs[0] + outputs = tuple( + [self.pipe_buffers['output_tensors'][buffer_id], + outputs[1]]) + + grad_tensors = self.grad_layer + if self.is_grad_partitioned: + #print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') + part_grad = PartitionedTensor.from_meta( + meta=self.grad_layer[0], + local_part=self.grad_layer[1], + group=self.grid.get_slice_parallel_group()) + grad_tensors = tuple([part_grad.full(), self.grad_layer[2]]) + part_grad = None + #print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') + + # This handles either a single tensor or tuple of tensors. + if isinstance(outputs, tuple): + out_tensors = [t for t in outputs if t.is_floating_point()] + assert len(out_tensors) == len(grad_tensors) + torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors) + else: + torch.autograd.backward(tensors=(outputs, ), grad_tensors=(grad_tensors, )) + + # Free up the memory from the output of forward() + self.pipe_buffers['output_tensors'][buffer_id] = None + self.pipe_buffers['outputs'][buffer_id] = None + grad_tensors = None + + if self.wall_clock_breakdown(): + self.timers('backward_inner').stop() + self.timers('backward_inner_microstep').stop() + self.timers('backward').stop() + self.timers('backward_microstep').stop() + + self.mem_status('AFTER BWD') + + def _exec_load_micro_batch(self, buffer_id): + if self.wall_clock_breakdown(): + self.timers('batch_input').start() + + batch = self._next_batch() + + if self.is_first_stage(): + loaded = None + if torch.is_tensor(batch[0]): + loaded = batch[0].clone().to(self.device).detach() + loaded.requires_grad = loaded.is_floating_point() + else: + assert isinstance(batch[0], tuple) + # Assume list or tuple + loaded = [] + for x in batch[0]: + assert torch.is_tensor(x) + mine = x.clone().detach().to(self.device) + mine.requires_grad = mine.is_floating_point() + loaded.append(mine) + loaded = tuple(loaded) + + self.pipe_buffers['inputs'][buffer_id] = loaded + + if self.is_last_stage(): + loaded = batch[1] + if torch.is_tensor(batch[1]): + loaded = batch[1].to(self.device) + elif isinstance(batch[1], tuple): + loaded = [] + for x in batch[1]: + assert torch.is_tensor(x) + x = x.to(self.device).detach() + loaded.append(x) + loaded = tuple(loaded) + + self.pipe_buffers['labels'][buffer_id] = loaded + + if self.wall_clock_breakdown(): + self.timers('batch_input').stop() + + def _send_tensor_meta(self, buffer, recv_stage): + """ Communicate metadata about upcoming p2p transfers. 
+ + Metadata is communicated in this order: + * type (0: tensor, 1: list) + * num_tensors if type=list + foreach tensor in buffer: + * ndims + * shape + """ + send_bytes = 0 + if isinstance(buffer, torch.Tensor): + type_tensor = torch.LongTensor(data=[0]).to(self.device) + p2p.send(type_tensor, recv_stage) + send_shape = torch.LongTensor(data=buffer.size()).to(self.device) + send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(self.device) + p2p.send(send_ndims, recv_stage) + p2p.send(send_shape, recv_stage) + send_bytes += _tensor_bytes(buffer) + elif isinstance(buffer, list): + assert (False) + type_tensor = torch.LongTensor(data=[1]).to(self.device) + p2p.send(type_tensor, recv_stage) + count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device) + p2p.send(count_tensor, recv_stage) + for tensor in buffer: + assert isinstance(tensor, torch.Tensor) + send_shape = torch.LongTensor(data=tensor.size()).to(self.device) + send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) + p2p.send(send_ndims, recv_stage) + p2p.send(send_shape, recv_stage) + send_bytes += _tensor_bytes(tensor) + elif isinstance(buffer, tuple): + type_tensor = torch.LongTensor(data=[2]).to(self.device) + p2p.send(type_tensor, recv_stage) + count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device) + p2p.send(count_tensor, recv_stage) + for idx, tensor in enumerate(buffer): + assert isinstance(tensor, torch.Tensor) + send_shape = torch.LongTensor(data=tensor.size()).to(self.device) + send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) + p2p.send(send_ndims, recv_stage) + p2p.send(send_shape, recv_stage) + # Useful for performance debugging. + ''' + new_bytes = _tensor_bytes(tensor) + send_bytes += _tensor_bytes(tensor) + # Useful for performance debugging. + if self.grid.data_parallel_id == 0: + print( + f'STAGE={self.stage_id} pipe-send-volume[{idx}]: shape={send_shape} {new_bytes/1024**2:0.2f}MB' + ) + ''' + else: + raise NotImplementedError(f'Could not send meta type {type(buffer)}') + + # Useful for performance debugging. + ''' + if self.grid.data_parallel_id == 0: + print(f'STAGE={self.stage_id} pipe-send-volume: {send_bytes/1024**2:0.2f}MB') + ''' + + def _recv_tensor_meta(self, send_stage): + """Receive metadata about upcoming p2p transfers and return allocated buffers. + + Metadata is communicated in this order: + * type (0: tensor, 1: list) + * num_tensors if type=list + foreach tensor in buffer: + * ndims + * shape + + Returns: + Allocated buffer for receiving from send_stage. + """ + + type_tensor = torch.LongTensor(data=[0]).to(self.device) + p2p.recv(type_tensor, send_stage) + recv_type = type_tensor.item() + + # A single tensor will be sent. 
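# Illustrative sketch of the shape-negotiation protocol described in the two
# docstrings above, with an in-memory deque standing in for p2p.send/p2p.recv
# (hypothetical helper names): the sender transmits a type code, then ndims,
# then the shape, so the receiver can allocate a buffer before the payload.
import torch
from collections import deque

channel = deque()                                  # toy stand-in for a p2p link
send, recv = channel.append, channel.popleft

def send_meta(tensor):
    send(torch.LongTensor([0]))                    # type code 0: single tensor
    send(torch.LongTensor([tensor.dim()]))         # ndims
    send(torch.LongTensor(list(tensor.size())))    # shape

def recv_meta():
    assert recv().item() == 0                      # expect a single tensor
    ndims = recv().item()
    shape = recv().tolist()
    assert len(shape) == ndims
    return torch.zeros(shape)                      # pre-allocated recv buffer

send_meta(torch.randn(8, 1024))
print(recv_meta().shape)                           # torch.Size([8, 1024])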
+ if recv_type == 0: + recv_ndims = torch.LongTensor(data=[0]).to(self.device) + p2p.recv(recv_ndims, send_stage) + recv_ndims = recv_ndims.item() + recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device) + p2p.recv(recv_shape, send_stage) + recv_shape = recv_shape.tolist() + return self._allocate_buffer(recv_shape, num_buffers=1)[0] + + # List or tuple of tensors + elif recv_type == 1 or recv_type == 2: + count_tensor = torch.LongTensor(data=[0]).to(self.device) + p2p.recv(count_tensor, send_stage) + num_tensors = count_tensor.item() + recv_shapes = [] + for idx in range(num_tensors): + recv_ndims = torch.LongTensor(data=[0]).to(self.device) + p2p.recv(recv_ndims, send_stage) + recv_ndims = recv_ndims.item() + recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device) + p2p.recv(recv_shape, send_stage) + recv_shapes.append(recv_shape.tolist()) + + buffers = self._allocate_buffers(recv_shapes, num_buffers=1)[0] + # Convert to tuples if requested. + if recv_type == 2: + buffers = tuple(buffers) + return buffers + + else: + raise NotImplementedError(f'Could not receive type {type(recv_type)}') + + def _exec_send_activations(self, buffer_id): + if self.wall_clock_breakdown(): + self.timers('pipe_send_output').start() + + outputs = self.pipe_buffers['outputs'][buffer_id] + + # NCCL does not like to send torch.BoolTensor types, so cast the mask to half(). + # We could do char, but with half() we can eventually flatten with other fp16 + # messages (TODO) + if self.module.__class__.__name__ == 'GPT2ModelPipe': + outputs = list(outputs) + outputs[-1] = outputs[-1].half() + outputs = tuple(outputs) + + if self.first_output_send: + self.first_output_send = False + self._send_tensor_meta(outputs, self.next_stage) + + if isinstance(outputs, torch.Tensor): + p2p.send(outputs, self.next_stage) + elif isinstance(outputs, tuple): + for idx, buffer in enumerate(outputs): + p2p.send(buffer, self.next_stage) + else: + raise NotImplementedError('Could not send output of type ' + f'{type(outputs)}') + + # Restore the boolean tensor + if self.module.__class__.__name__ == 'GPT2ModelPipe': + outputs = list(outputs) + outputs[-1] = outputs[-1].bool() + outputs = tuple(outputs) + + if self.wall_clock_breakdown(): + self.timers('pipe_send_output').stop() + + def _exec_send_grads(self, buffer_id): + if self.wall_clock_breakdown(): + self.timers('pipe_send_grad').start() + + inputs = self.pipe_buffers['inputs'][buffer_id] + + # Partition the gradient + if self.is_grad_partitioned: + part = PartitionedTensor(tensor=inputs[0].grad, + group=self.grid.get_slice_parallel_group()) + # Clear the large output data, but save the computation graph + # Inject the partitoned tensor into the output before sending + + # XXX Hack + inputs = tuple([part.to_meta(), part.data(), inputs[1]]) + + # XXX Terrible hack + # Drop the attention mask from the input buffer here. It does not have + # a grad that needs to be communicated. We free the buffer immediately + # after, so no need to restore it. The receiver also has a hack that skips + # the recv. This is because NCCL does not let us send torch.BoolTensor :-(. 
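# Illustrative sketch of the workaround referenced in the comments above:
# NCCL collectives do not accept torch.bool tensors, so a boolean attention
# mask is cast to fp16 for the send and re-cast on arrival (CPU-only toy
# example, no communication performed).
import torch

mask = torch.tensor([[True, False], [False, True]])
on_the_wire = mask.half()          # cast before p2p.send(...)
restored = on_the_wire.bool()      # un-cast after p2p.recv(...)
assert torch.equal(mask, restored)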
+ if self.module.__class__.__name__ == 'GPT2ModelPipe': + inputs = list(inputs) + inputs.pop() + inputs = tuple(inputs) + + if isinstance(inputs, torch.Tensor): + assert inputs.grad is not None + p2p.send(inputs.grad, self.prev_stage) + else: + # XXX terrible hacky branch + if self.is_grad_partitioned: + # First two sends are partitioned gradient + p2p.send(inputs[0], self.prev_stage) + p2p.send(inputs[1], self.prev_stage) + # XXX hack hack hack + #p2p.send(inputs[2].grad, self.prev_stage) + else: + for idx, buffer in enumerate(inputs): + # Skip tensors that will not produce a grad + if not buffer.is_floating_point(): + assert buffer.grad is None + continue + assert buffer.grad is not None + p2p.send(buffer.grad, self.prev_stage) + + # We can free up the input buffer now + self.pipe_buffers['inputs'][buffer_id] = None + + if self.wall_clock_breakdown(): + self.timers('pipe_send_grad').stop() + + def _exec_recv_activations(self, buffer_id): + if self.wall_clock_breakdown(): + self.timers('pipe_recv_input').start() + + recvd = None + + # Allocate the buffer if necessary + if self.pipe_recv_buf is None: + self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage) + + if isinstance(self.pipe_recv_buf, torch.Tensor): + p2p.recv(self.pipe_recv_buf, self.prev_stage) + recvd = self.pipe_recv_buf.clone().detach() + recvd.requires_grad = recvd.is_floating_point() + else: + assert isinstance(self.pipe_recv_buf, tuple) + recvd = [None] * len(self.pipe_recv_buf) + for idx, buffer in enumerate(self.pipe_recv_buf): + assert torch.is_tensor(buffer) + # XXX hardcode meta type + if self.is_pipe_partitioned and idx == 0 and buffer.dtype != torch.long: + if self.meta_buffer is None: + self.meta_buffer = torch.zeros(buffer.size(), + dtype=torch.long, + device=self.device) + buffer = self.meta_buffer + + p2p.recv(buffer, self.prev_stage) + recvd[idx] = buffer.clone().detach() + + # NCCL does not like to send torch.BoolTensor types, so un-cast the + # attention mask + if self.module.__class__.__name__ == 'GPT2ModelPipe': + recvd[-1] = recvd[-1].bool() + + recvd = tuple(recvd) + + for buffer in recvd: + buffer.requires_grad = buffer.is_floating_point() + + self.pipe_buffers['inputs'][buffer_id] = recvd + + if self.wall_clock_breakdown(): + self.timers('pipe_recv_input').stop() + + def _exec_recv_grads(self, buffer_id): + if self.wall_clock_breakdown(): + self.timers('pipe_recv_grad').start() + + outputs = self.pipe_buffers['outputs'][buffer_id] + # XXX these shapes are hardcoded for Megatron + # Restore partitioned output if it was partitioned and we are sending full gradients + if self.is_pipe_partitioned and not self.is_grad_partitioned: + part_output = PartitionedTensor.from_meta( + meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) + outputs[0].data = part_output.full() + outputs = tuple([outputs[0], outputs[2]]) + # save for backward + self.pipe_buffers['outputs'][buffer_id] = outputs + + # Allocate gradient if necessary + if self.grad_layer is None: + if isinstance(outputs, torch.Tensor): + s = list(outputs.size()) + self.grad_layer = self._allocate_buffer(s, num_buffers=1)[0] + else: + sizes = [list(t.size()) for t in outputs if t.is_floating_point()] + self.grad_layer = self._allocate_buffers(sizes, num_buffers=1)[0] + + if isinstance(self.grad_layer, torch.Tensor): + p2p.recv(self.grad_layer, self.next_stage) + else: + assert isinstance(outputs, tuple) + for idx, buffer in enumerate(self.grad_layer): + # XXX GPT-2 hack + if self.is_grad_partitioned and idx == 0 and 
buffer.dtype != torch.long: + buffer.data = torch.zeros(buffer.size(), + dtype=torch.long, + device=self.device) + p2p.recv(buffer, self.next_stage) + + if self.wall_clock_breakdown(): + self.timers('pipe_recv_grad').stop() + + def _exec_optimizer_step(self, lr_kwargs=None): + if self.wall_clock_breakdown(): + self.timers('step_microstep').start() + self.timers('step').start() + self.mem_status('BEFORE STEP', reset_max=True) + + self._force_grad_boundary = True + self._take_model_step(lr_kwargs) + self._force_grad_boundary = False + + self.mem_status('AFTER STEP') + + if self.tensorboard_enabled(): + if self.global_rank == 0: + self.summary_events = [(f'Train/Samples/lr', + self.get_lr()[0], + self.global_samples)] + if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): + self.summary_events.append((f'Train/Samples/loss_scale', + self.optimizer.cur_scale, + self.global_samples)) + for event in self.summary_events: # write_summary_events + self.summary_writer.add_scalar(event[0], event[1], event[2]) + + if self.wall_clock_breakdown(): + self.timers('step_microstep').stop() + self.timers('step').stop() + if self.global_steps % self.steps_per_print() == 0: + self.timers.log([ + 'batch_input', + 'forward_microstep', + 'backward_microstep', + 'backward_inner_microstep', + 'backward_allreduce_microstep', + 'backward_tied_allreduce_microstep', + 'step_microstep' + ]) + if self.global_steps % self.steps_per_print() == 0: + self.timers.log([ + 'forward', + 'backward', + 'backward_inner', + 'backward_allreduce', + 'step' + ]) + + def _zero_grads(self, inputs): + if isinstance(inputs, torch.Tensor): + if inputs.grad is not None: + inputs.grad.data.zero_() + else: + for t in inputs: + if t.grad is not None: + t.grad.data.zero_() + + def _allocate_zeros(self, shape, fp16=None, **kwargs): + """ Allocate a tensor of zeros on the engine's device. + + Arguments: + shape: the shape of the tensor to allocate + fp16 (bool): whether to use FP16. default: defer to self.fp16_enabled() + kwargs: passed to torch.zeros() + + Returns: + A tensor from torch.zeros() allocated on self.device. + """ + + if fp16 is None: + fp16 = self.fp16_enabled() + + if fp16: + return torch.zeros(shape, dtype=torch.half, device=self.device, **kwargs) + else: + return torch.zeros(shape, device=self.device, **kwargs) + + def _allocate_buffer(self, shape, num_buffers=-1, **kwargs): + buffers = [] + if num_buffers == -1: + num_buffers = self.num_pipe_buffers + for count in range(num_buffers): + buffers.append(self._allocate_zeros(shape, **kwargs)) + return buffers + + def _allocate_buffers(self, shapes, requires_grad=False, num_buffers=-1): + buffers = [] + if num_buffers == -1: + num_buffers = self.num_pipe_buffers + for count in range(num_buffers): + buffer = [] + for shape in shapes: + buffer.append(self._allocate_zeros(shape, requires_grad=requires_grad)) + buffers.append(buffer) + return buffers + + def forward(self, *args, **kwargs): + """Disabled for pipeline parallel training. See ``train_batch()``. """ + raise PipelineError("Only train_batch() is accessible in pipeline mode.") + + def backward(self, *args, **kwargs): + """Disabled for pipeline parallel training. See ``train_batch()``. """ + raise PipelineError("Only train_batch() is accessible in pipeline mode.") + + def step(self, *args, **kwargs): + """Disabled for pipeline parallel training. See ``train_batch()``. 
""" + raise PipelineError("Only train_batch() is accessible in pipeline mode.") + + def mem_status(self, msg, print_rank=-1, reset_max=False): + return + global mem_alloced, mem_cached + if not self.global_steps == 0 or not self.global_steps == 9: + #return + pass + if self.mpu.get_data_parallel_rank() != 0: + return + + if self.global_rank != 0: + return + + rank = self.global_rank + if print_rank != -1 and rank != print_rank: + return + + torch.cuda.synchronize() + + if reset_max: + torch.cuda.reset_max_memory_cached() + torch.cuda.reset_max_memory_allocated() + + new_alloced = torch.cuda.memory_allocated() + new_cached = torch.cuda.memory_cached() + + delta_alloced = new_alloced - mem_alloced + delta_cached = new_cached - mem_cached + + mem_cached = new_cached + mem_alloced = new_alloced + + max_alloced = torch.cuda.max_memory_allocated() + max_cached = torch.cuda.max_memory_cached() + + # convert to GB for printing + new_alloced /= 1024**3 + new_cached /= 1024**3 + delta_alloced /= 1024**3 + delta_cached /= 1024**3 + max_alloced /= 1024**3 + max_cached /= 1024**3 + + print( + f'RANK={rank} STAGE={self.stage_id} STEP={self.global_steps} MEMSTATS', + msg, + f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' + f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' + ) + + def module_state_dict(self): + """Override hack to save a pipe model and return the directory path of the save. + + This method should only be called by DeepSpeed's ``save_checkpoint()``. The + recommended way of saving a ``PipelineModule`` outside of ``save_checkpoint()`` + is ``save_state_dict()``. + + Returns: + None + """ + assert isinstance(self.module, PipelineModule) + assert self._curr_ckpt_path is not None, \ + "PipelineEngine expects module_state_dict() to be called from save_checkpoint()" + + self.module.save_state_dict(self._curr_ckpt_path) + return None + + def load_module_state_dict(self, state_dict, strict=True): + """Override hack to instead use a directory path. + + This is important because pipeline models checkpoint by layer instead of rank. + + If ``state_dict`` is not ``None`` or a ``str``, we revert to ``super()`` expecting a ``dict``. + + Args: + state_dict (str, None): unused + strict (bool, optional): Strict state loading. Defaults to True. + """ + if (state_dict is not None) and (not isinstance(state_dict, str)): + super().load_module_state_dict(state_dict, strict) + return + + self.module.load_state_dir(load_dir=self._curr_ckpt_path, strict=strict) + + # A map of PipeInstruction types to methods. Each method will be executed with the + # kwargs provided to the PipeInstruction from the scheduler. 
+ _INSTRUCTION_MAP = { + schedule.OptimizerStep: _exec_optimizer_step, + schedule.ReduceGrads: _exec_reduce_grads, + schedule.ReduceTiedGrads: _exec_reduce_tied_grads, + schedule.LoadMicroBatch: _exec_load_micro_batch, + schedule.ForwardPass: _exec_forward_pass, + schedule.BackwardPass: _exec_backward_pass, + schedule.SendActivation: _exec_send_activations, + schedule.RecvActivation: _exec_recv_activations, + schedule.SendGrad: _exec_send_grads, + schedule.RecvGrad: _exec_recv_grads, + } + + def _exec_schedule(self, pipe_schedule): + self._reserve_pipe_buffers(pipe_schedule.num_pipe_buffers()) + # For each step in the schedule + for step_cmds in pipe_schedule: + # For each instruction in the step + for cmd in step_cmds: + if type(cmd) not in self._INSTRUCTION_MAP: + raise RuntimeError( + f'{self.__class__.__name__} does not understand instruction {repr(cmd)}' + ) + + # Equivalent to: self._exec_forward_pass(buffer_id=0) + self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) + self._exec_instr(**cmd.kwargs) diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py new file mode 100644 index 000000000000..6d24ed469f3a --- /dev/null +++ b/deepspeed/runtime/pipe/module.py @@ -0,0 +1,575 @@ +import os +import enum + +import re as regex + +from collections import defaultdict +from functools import partial + +import torch +import torch.nn as nn +import torch.distributed as dist + +from deepspeed.utils import logger +from .. import utils as ds_utils +from ..activation_checkpointing import checkpointing +from .topology import PipeDataParallelTopology, PipelineParallelGrid + + +class PipelineError(Exception): + """Errors related to the use of deepspeed.PipelineModule """ + + +class LayerSpec: + """Building block for specifying pipeline-parallel modules. + + LayerSpec stores the type information and parameters for each stage in a + PipelineModule. For example: + + .. code-block:: python + + nn.Sequence( + torch.nn.Linear(self.in_dim, self.hidden_dim, bias=False), + torch.nn.Linear(self.hidden_hidden, self.out_dim) + ) + + becomes + + .. 
code-block:: python + + layer_specs = [ + LayerSpec(torch.nn.Linear, self.in_dim, self.hidden_dim, bias=False), + LayerSpec(torch.nn.Linear, self.hidden_hidden, self.out_dim)] + ] + """ + def __init__(self, typename, *module_args, **module_kwargs): + self.typename = typename + self.module_args = module_args + self.module_kwargs = module_kwargs + + if not issubclass(typename, nn.Module): + raise RuntimeError('LayerSpec only supports torch.nn.Module types.') + + if dist.is_initialized(): + self.global_rank = dist.get_rank() + else: + self.global_rank = -1 + + def __repr__(self): + return ds_utils.call_to_str(self.typename.__name__, + self.module_args, + self.module_kwargs) + + def build(self, log=False): + """Build the stored specification.""" + if log: + logger.info(f'RANK={self.global_rank} building {repr(self)}') + + return self.typename(*self.module_args, **self.module_kwargs) + + +class TiedLayerSpec(LayerSpec): + def __init__(self, + key, + typename, + *module_args, + forward_fn=None, + tied_weight_attr='weight', + **module_kwargs): + super().__init__(typename, *module_args, **module_kwargs) + self.key = key + self.forward_fn = forward_fn + self.tied_weight_attr = tied_weight_attr + + +class PipelineModule(nn.Module): + def __init__(self, + layers, + num_stages=None, + topology=None, + loss_fn=None, + seed_layers=False, + seed_fn=None, + base_seed=1234, + partition_method='parameters', + activation_checkpoint_interval=0, + activation_checkpoint_func=checkpointing.checkpoint): + """Modules to be parallelized with pipeline parallelism. + + The key constraint that enables pipeline parallelism is the + representation of the forward pass as a sequence of layers + and the enforcement of a simple interface between them. The + forward pass is implicitly defined by the module ``layers``. The key + assumption is that the output of each layer can be directly fed as + input to the next, like a ``torch.nn.Sequence``. The forward pass is + implicitly: + + .. code-block:: python + + def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x + + Args: + layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. + num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. + topology (``deepseed.pipe.ProcessTopology``, optional): Defines the axes of parallelism axes for training. Must be provided if ``num_stages`` is ``None``. + loss_fn (callable, optional): Loss is computed ``loss = loss_fn(outputs, label)`` + base_seed (int, optional): [description]. Defaults to 1234. + partition_method (str, optional): [description]. Defaults to 'parameters'. + activation_checkpoint_interval (int, optional): The granularity activation checkpointing in terms of number of layers. 0 disables activation checkpointing. + activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. 
+ """ + + super().__init__() + + if num_stages is None and topology is None: + raise RuntimeError('must provide num_stages or topology') + + self.micro_offset = 0 + + self.loss_fn = loss_fn + + self.seed_layers = seed_layers + self.seed_fn = seed_fn + self.base_seed = base_seed + if dist.get_rank() == 0: + try: + seed_str = self.seed_fn.__name__ + except AttributeError: + seed_str = None + print( + f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}' + ) + + # Setup world info + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + self.global_rank = dist.get_rank(group=self.world_group) + self.world_size = dist.get_world_size(group=self.world_group) + + if topology: + self._topo = topology + self.num_stages = self._topo.get_dim('pipe') + else: + self.num_stages = num_stages + if topology is None: + if self.world_size % self.num_stages != 0: + raise RuntimeError( + f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})' + ) + dp = self.world_size // num_stages + topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp) + self._topo = topology + + # Contruct communicators for pipeline topology + self._grid = PipelineParallelGrid(process_group=self.world_group, + topology=self._topo) + + self.stage_id = self._topo.get_coord(self.global_rank).pipe + + # Initialize partition information + self._layer_specs = list(layers) + self._num_layers = len(self._layer_specs) + self._local_start = 0 + self._local_stop = None + self._partition_layers(method=partition_method) + + self.forward_funcs = [] + self.tied_modules = nn.ModuleDict() + self.tied_weight_attrs = {} + + # Offset the random seed by the stage ID. + #newseed = torch.cuda.initial_seed() + self._grid.get_stage_id() + #ds_utils.set_random_seed(newseed) + + #with torch.random.fork_rng(devices=[torch.cuda.current_device()]): + self._build() + self.to('cuda') + + self.tied_comms = self._index_tied_modules() + self._synchronize_tied_weights() + + self.activation_checkpoint_interval = activation_checkpoint_interval + self.activation_checkpoint_func = activation_checkpoint_func + + def _build(self): + specs = self._layer_specs + + for local_idx, layer in enumerate(specs[self._local_start:self._local_stop]): + layer_idx = local_idx + self._local_start + if self.seed_layers: + if self.seed_fn: + self.seed_fn(self.base_seed + layer_idx) + else: + ds_utils.set_random_seed(self.base_seed + layer_idx) + + # Recursively build PipelineModule objects + if isinstance(layer, PipelineModule): + raise NotImplementedError('RECURSIVE BUILD NOT YET IMPLEMENTED') + + # LayerSpec objects contain an nn.Module that should be allocated now. + elif isinstance(layer, nn.Module): + name = str(layer_idx) + self.forward_funcs.append(layer) + self.add_module(name, layer) + + # TiedLayerSpec objects contain an nn.Module that should be allocated now. + elif isinstance(layer, TiedLayerSpec): + # Build and register the module if we haven't seen it before. + if layer.key not in self.tied_modules: + self.tied_modules[layer.key] = layer.build() + self.tied_weight_attrs[layer.key] = layer.tied_weight_attr + + if layer.forward_fn is None: + # Just use forward() + self.forward_funcs.append(self.tied_modules[layer.key]) + else: + # User specified fn with args (module, input) + self.forward_funcs.append( + partial(layer.forward_fn, + self.tied_modules[layer.key])) + + # LayerSpec objects contain an nn.Module that should be allocated now. 
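# Illustrative sketch (toy class names) of why LayerSpec defers construction,
# as handled in the branch below: the spec records only the layer class and
# its arguments, so parameters are allocated solely on the stage whose
# _local_start/_local_stop range owns that layer.
import torch.nn as nn

class ToyLayerSpec:
    def __init__(self, typename, *args, **kwargs):
        self.typename, self.args, self.kwargs = typename, args, kwargs

    def build(self):
        return self.typename(*self.args, **self.kwargs)

specs = [ToyLayerSpec(nn.Linear, 1024, 4096), ToyLayerSpec(nn.Linear, 4096, 1024)]
local_start, local_stop = 0, 1          # this stage owns only the first layer
owned = [spec.build() for spec in specs[local_start:local_stop]]
print(sum(p.numel() for layer in owned for p in layer.parameters()))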
+ elif isinstance(layer, LayerSpec): + module = layer.build() + name = str(layer_idx) + self.forward_funcs.append(module) + self.add_module(name, module) + + # Last option: layer may be a functional (e.g., lambda). We do nothing in + # that case and just use it in forward() + else: + self.forward_funcs.append(layer) + + # All pipeline parameters should be considered as model parallel in the context + # of our FP16 optimizer + for p in self.parameters(): + p.model_parallel = True + + def _count_layer_params(self): + """Count the trainable parameters in individual layers. + + This routine will only build one layer at a time. + + Returns: + A list of the number of parameters in each layer. + """ + param_counts = [0] * len(self._layer_specs) + for idx, layer in enumerate(self._layer_specs): + if isinstance(layer, LayerSpec): + l = layer.build() + params = filter(lambda p: p.requires_grad, l.parameters()) + param_counts[idx] = sum(p.numel() for p in params) + elif isinstance(layer, nn.Module): + params = filter(lambda p: p.requires_grad, layer.parameters()) + param_counts[idx] = sum(p.numel() for p in params) + return param_counts + + def _find_layer_type(self, layername): + idxs = [] + typeregex = regex.compile(layername, regex.IGNORECASE) + for idx, layer in enumerate(self._layer_specs): + name = None + if isinstance(layer, LayerSpec): + name = layer.typename.__name__ + elif isinstance(layer, nn.Module): + name = layer.__class__.__name__ + else: + try: + name = layer.__name__ + except AttributeError: + continue + if typeregex.search(name): + idxs.append(idx) + + if len(idxs) == 0: + raise RuntimeError( + f"Partitioning '{layername}' found no valid layers to partition.") + return idxs + + def forward(self, forward_input): + # We need to offset the seed by the microbatch ID. Save it in a local var to + # ensure it is preserved in the closure. Otherwise checkpointed forward funcs + # will see a different offset. + self.micro_offset += 1 + + def exec_range_func(start, end): + ''' Helper function to be used with checkpoint() + Adapted from torch.utils.checkpoint:checkpoint_sequential() + ''' + local_micro_offset = self.micro_offset + 1 + + def exec_func(*inputs): + # Single tensor inputs need to be unwrapped + if len(inputs) == 1: + inputs = inputs[0] + for idx, layer in enumerate(self.forward_funcs[start:end]): + self.curr_layer = idx + self._local_start + if self.seed_layers: + new_seed = (self.base_seed * + local_micro_offset) + self.curr_layer + if self.seed_fn: + self.seed_fn(new_seed) + else: + ds_utils.set_random_seed(new_seed) + + inputs = layer(inputs) + return inputs + + return exec_func + + if self.activation_checkpoint_interval == 0: + func = exec_range_func(0, len(self.forward_funcs)) + x = func(forward_input) + else: + num_layers = len(self.forward_funcs) + x = forward_input + for start_idx in range(0, num_layers, self.activation_checkpoint_interval): + end_idx = min(start_idx + self.activation_checkpoint_interval, + num_layers) + + funcs = self.forward_funcs[start_idx:end_idx] + # Since we either pass tensors or tuples of tensors without unpacking, we + # need to be careful not to double-wrap tensors with tuple. 
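# Illustrative sketch of the chunked execution performed in this loop, using
# the stock torch.utils.checkpoint.checkpoint in place of
# self.activation_checkpoint_func and toy layers: the sequence runs in chunks
# of activation_checkpoint_interval so only chunk-boundary activations are kept.
import torch
from torch.utils.checkpoint import checkpoint

layers = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(8))
interval = 2                       # stand-in for activation_checkpoint_interval

def exec_range(start, end):
    def fn(x):
        for layer in layers[start:end]:
            x = layer(x)
        return x
    return fn

x = torch.randn(4, 64, requires_grad=True)
for start in range(0, len(layers), interval):
    end = min(start + interval, len(layers))
    x = checkpoint(exec_range(start, end), x)
x.sum().backward()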
+ if not isinstance(x, tuple): + x = (x, ) + + if self._is_checkpointable(funcs): + x = self.activation_checkpoint_func( + exec_range_func(start_idx, + end_idx), + *x) + else: + x = exec_range_func(start_idx, end_idx)(*x) + return x + + def _partition_layers(self, method='uniform'): + num_stages = self._topo.get_dim('pipe') + stage_id = self._topo.get_coord(self.global_rank).pipe + + if self.global_rank == 0: + logger.info(f'Partitioning pipeline stages with method {method}') + + method = method.lower() + + # Each stage gets a simple uniform number of layers. + if method == 'uniform': + num_layers = len(self._layer_specs) + self.parts = ds_utils.partition_uniform(num_items=num_layers, + num_parts=num_stages) + elif method == 'parameters': + param_counts = self._count_layer_params() + self.parts = ds_utils.partition_balanced(weights=param_counts, + num_parts=num_stages) + elif method.startswith('type:'): + layertype = method.split(':')[1] + binary_weights = [0] * len(self._layer_specs) + for idx in self._find_layer_type(layertype): + binary_weights[idx] = 1 + else: + self.parts = ds_utils.partition_balanced(weights=binary_weights, + num_parts=num_stages) + elif method == 'profile': + raise NotImplementedError(f'Partitioning method {method} not implemented.') + else: + raise NotImplementedError(f'Partitioning method {method} not implemented.') + + # Print some information on the partitioning. + if self.global_rank == 0: + for stage in range(num_stages): + start = self.parts[stage] + stop = self.parts[stage + 1] + print(f'stage={stage} layers={stop - start}') + for idx, layer in enumerate(self._layer_specs[start:stop]): + name = str(layer) + if isinstance(layer, LayerSpec): + name = layer.typename.__name__ + if isinstance(layer, nn.Module): + name = layer.__class__.__name__ + else: + try: + name = layer.__name__ + except AttributeError: + pass + print(f' {idx+start:2d}: {name}') + if self.loss_fn: + try: + print(f' loss: {self.loss_fn.__name__}') + except AttributeError: + print(f' loss: {self.loss_fn.__class__.__name__}') + + self._set_bounds(start=self.parts[stage_id], stop=self.parts[stage_id + 1]) + + def allreduce_tied_weight_gradients(self): + '''All reduce the gradients of the tied weights between tied stages''' + for key, comm in self.tied_comms.items(): + weight = getattr(self.tied_modules[key], comm['weight_attr']) + dist.all_reduce(weight.grad, group=comm['group']) + + def _synchronize_tied_weights(self): + for key, comm in self.tied_comms.items(): + dist.broadcast( + getattr(comm['module'], + comm['weight_attr']), + src=min(comm['ranks']), + group=comm['group'], + ) + + def _index_tied_modules(self): + ''' Build communication structures for tied modules. ''' + tied_comms = {} + if self._topo.get_dim('pipe') == 1: + return tied_comms + + specs = self._layer_specs + tie_keys = set(s.key for s in specs if isinstance(s, TiedLayerSpec)) + for key in tie_keys: + # Find the layers that the tied module appears in + tied_layers = [] + for idx, layer in enumerate(specs): + if isinstance(layer, TiedLayerSpec) and layer.key == key: + tied_layers.append(idx) + # Find all stages with this tied module + # TODO: Would be nice to remove the nested data/model parallelism loops and + # TODO: instead generalize in some way, since we really just care about the + # TODO: stage that owns the tied layer. Then loop over each (dp, mp, ...) + # TODO: fiber to generate process groups. 
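# Illustrative sketch of what the tied-weight machinery above
# (allreduce_tied_weight_gradients / _synchronize_tied_weights) boils down to,
# shrunk to a single process with the gloo backend so it can run standalone:
# copies of a tied module are broadcast from the lowest owning rank once, and
# their gradients are all-reduced across the tied ranks every step.
import os
import torch
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group('gloo', rank=0, world_size=1)

tied = torch.nn.Embedding(16, 8)
tied_ranks = [0]                                   # ranks sharing this weight
group = dist.new_group(ranks=tied_ranks)

dist.broadcast(tied.weight.data, src=min(tied_ranks), group=group)  # sync once
tied(torch.tensor([1, 2, 3])).sum().backward()
dist.all_reduce(tied.weight.grad, group=group)     # keep gradients identical
dist.destroy_process_group()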
+ tied_stages = set(self.stage_owner(idx) for idx in tied_layers) + for dp in range(self._grid.data_parallel_size): + for mp in range(self._grid.model_parallel_size): + tied_ranks = [] + for s in sorted(tied_stages): + if self._grid.model_parallel_size > 1: + tied_ranks.append( + self._grid.stage_to_global(stage_id=s, + data=dp, + model=mp)) + else: + tied_ranks.append( + self._grid.stage_to_global(stage_id=s, + data=dp)) + group = dist.new_group(ranks=tied_ranks) + + # Record this tied module if we own a local copy of it. + if self.global_rank in tied_ranks: + assert key in self.tied_modules + if key in self.tied_modules: + tied_comms[key] = { + 'ranks': tied_ranks, + 'group': group, + 'weight_attr': self.tied_weight_attrs[key], + 'module': self.tied_modules[key], + } + # Only count the tied module once in the eyes of the FP16 optimizer + if self.global_rank != tied_ranks[0]: + for p in self.tied_modules[key].parameters(): + p.model_parallel = False + ''' + if len(tied_comms) > 0: + print(f'RANK={self.global_rank} tied_comms={tied_comms}') + ''' + + return tied_comms + + def partitions(self): + return self.parts + + def stage_owner(self, layer_idx): + assert 0 <= layer_idx < self._num_layers + for stage in range(self._topo.get_dim('pipe')): + if self.parts[stage] <= layer_idx < self.parts[stage + 1]: + return stage + raise RuntimeError(f'Layer {layer_idx} not owned? parts={self.parts}') + + def _set_bounds(self, start=None, stop=None): + """Manually define the range of layers that will be built on this process. + + These boundaries are treated as list slices and so start is inclusive and stop is + exclusive. The default of None for both results in all layers being built + locally. + """ + self._local_start = start + self._local_stop = stop + + def set_checkpoint_interval(self, interval): + assert interval >= 0 + self.checkpoint_interval = interval + + def topology(self): + """ ProcessTopology object to query process mappings. """ + return self._topo + + def mpu(self): + return self._grid + + def num_pipeline_stages(self): + return self._topo.get_dim('pipe') + + def ckpt_prefix(self, checkpoints_path, tag): + """Build a prefix for all checkpoint files written by this module. """ + # All checkpoint files start with this + rank_name = 'module' + + # Data parallelism is omitted from the naming convention because we are agnostic + # to this in the checkpoint. + omit_dims = frozenset(['data']) + axes = [a for a in self._grid._topo.get_axis_names() if a not in omit_dims] + for dim in axes: + rank = getattr(self._grid._topo.get_coord(rank=self.global_rank), dim) + rank_name += f'-{dim}_{rank:02d}' + + ckpt_name = os.path.join(checkpoints_path, str(tag), rank_name) + return ckpt_name + + def ckpt_layer_path(self, ckpt_dir, local_layer_idx): + """Customize a prefix for a specific pipeline module layer. 
""" + idx = local_layer_idx + self._local_start + layer_ckpt_path = os.path.join(ckpt_dir, f'layer_{idx:02d}') + rank_repr = self._grid._topo.get_rank_repr(rank=self.global_rank) + if rank_repr is not '': + layer_ckpt_path += f'-{rank_repr}' + layer_ckpt_path += '-model_states.pt' + return layer_ckpt_path + + def save_state_dict(self, save_dir): + if self._grid.data_parallel_id != 0: + return + + os.makedirs(save_dir, exist_ok=True) + layer_offset = self._local_start + for idx, layer in enumerate(self.forward_funcs): + model_ckpt_path = self.ckpt_layer_path(save_dir, idx) + if not hasattr(layer, 'state_dict'): + continue + torch.save(layer.state_dict(), model_ckpt_path) + + def load_state_dir(self, load_dir, strict=True): + rank = dist.get_rank() + + layer_offset = self._local_start + for idx, layer in enumerate(self.forward_funcs): + # Functions, etc. will not have state_dicts + if not hasattr(layer, 'load_state_dict'): + continue + + model_ckpt_path = self.ckpt_layer_path(load_dir, idx) + layer.load_state_dict(torch.load(model_ckpt_path, + map_location=lambda storage, + loc: storage), + strict=strict) + if self._grid.data_parallel_id == 0: + logger.info( + f'RANK={self.global_rank} Loaded layer={idx+layer_offset} file={model_ckpt_path}' + ) + + self._synchronize_tied_weights() + + def _is_checkpointable(self, funcs): + if self.__class__.__name__ == 'GPT2ModelPipe': + return all('ParallelTransformerLayerPipe' in f.__class__.__name__ + for f in funcs) + + params = [f.parameters() for f in funcs if isinstance(f, torch.nn.Module)] + return any(len(list(p)) > 0 for p in params) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py new file mode 100644 index 000000000000..24c0f250a4b9 --- /dev/null +++ b/deepspeed/runtime/pipe/p2p.py @@ -0,0 +1,90 @@ +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +import torch.distributed as dist + +_groups = None +_grid = None + + +#initializes adjacent process groups +#run this only after torch.distributed.init_process_group() has been called +def init_process_groups(grid): + global _groups, _grid + _grid = grid + + assert _grid.pipe_parallel_size > 1, "There is no pipeline parallelism" + + _groups = [dist.new_group(ranks=group) for group in _grid.p2p_groups] + + +def _is_valid_send_recv(src_stage, dest_stage): + first_stage = 0 + last_stage = _grid.pipe_parallel_size - 1 + assert abs(src_stage-dest_stage) == 1 or \ + (src_stage == first_stage and dest_stage == last_stage) or \ + (src_stage == last_stage and dest_stage == first_stage), \ + "Functionality currently limited to send and receive between adjacent ranks only" + + +def send(tensor, dest_stage, async_op=False): + global _groups + + async_op = False + src_stage = _grid.get_stage_id() + _is_valid_send_recv(src_stage, dest_stage) + + group = _get_send_recv_group(src_stage, dest_stage) + src_rank = _grid.stage_to_global(stage_id=src_stage) + + return dist.broadcast(tensor, src_rank, group=group, async_op=async_op) + + +def recv(tensor, src_stage, async_op=False): + + global _groups + + async_op = False + dest_stage = _grid.get_stage_id() + _is_valid_send_recv(src_stage, dest_stage) + + group = _get_send_recv_group(src_stage, dest_stage) + src_rank = _grid.stage_to_global(stage_id=src_stage) + + return dist.broadcast(tensor, src_rank, group=group, async_op=async_op) + + +def barrier(stage_id): + global _groups, _grid + group_id = _grid.stage_to_global(stage_id=stage_id) + if (dist.get_rank() >= 0): + print("Barrier Group ID", group_id) + print("Barrier Group", 
_grid.p2p_groups[group_id]) + dist.barrier(group=_groups[group_id]) + if (dist.get_rank() >= 0): + print("Exiting Barrier ", group_id) + + +def _get_send_recv_group(src_stage, dest_stage): + '''the group id is always the smaller rank unless its a wrap around''' + + stage_id = None + + first_stage = 0 + last_stage = _grid.pipe_parallel_size - 1 + + if (src_stage == first_stage and dest_stage == last_stage + or dest_stage == first_stage and src_stage == last_stage): + stage_id = last_stage + elif src_stage > dest_stage: + stage_id = dest_stage + else: + stage_id = src_stage + '''group_id corresponds to group of [group_id, group_id+1] + unless group_id is the rank of the last stage + in which case group_id correspods to group[group_id-num_stages+1, group_id] + ''' + group_id = _grid.stage_to_global(stage_id=stage_id) + + return _groups[group_id] diff --git a/deepspeed/runtime/pipe/schedule.py b/deepspeed/runtime/pipe/schedule.py new file mode 100644 index 000000000000..1f6ac1d8dce8 --- /dev/null +++ b/deepspeed/runtime/pipe/schedule.py @@ -0,0 +1,482 @@ +from ..utils import call_to_str + +from abc import ABC, abstractmethod + + +class PipeSchedule(ABC): + """Directs the execution of a pipeline engine by generating sequences of + :class:`PipeInstruction`. + + Schedules are generators that yield sequences of + :class:`PipeInstruction` to process the micro-batches in one batch. + Each yielded step is atomic in the sense that a barrier + synchronization can be placed between successive steps without + deadlock. + + Below is an example schedule that implements data parallelism with gradient accumulation: + + .. code-block:: python + + class DataParallelSchedule(PipeSchedule): + def steps(self): + for step_id in range(self.micro_batches): + cmds = [ + LoadMicroBatch(buffer_id=0), + ForwardPass(buffer_id=0), + BackwardPass(buffer_id=0), + ] + if step_id == self.micro_batches - 1: + cmds.extend([ + ReduceGrads(), + OptimizerStep(), + ]) + yield cmds + + def num_pipe_buffers(self): + return 1 + + Args: + micro_batches (int): The number of micro-batches that comprise a batch. + stages (int): The number of pipeline stages. + stage_id (int): The pipe stage that will execute the generated schedule. + """ + def __init__(self, micro_batches, stages, stage_id): + super().__init__() + self.micro_batches = micro_batches + self.stages = stages + self.stage_id = stage_id + self.prev_stage = self.stage_id - 1 + self.next_stage = self.stage_id + 1 + + @abstractmethod + def steps(self): + """Yield a list of :class:`PipeInstruction` for each step in the schedule. + + .. note:: + Schedules must implement ``steps()`` to define the schedule. + + Returns: + Instructions to be executed as one step of the pipeline + """ + pass + + def num_pipe_buffers(self): + """The number of pipeline buffers that will be used by this stage. + + .. note:: + Schedules should specialize ``num_pipe_buffers()`` for memory savings at scale. + + Returns: + The number of buffers for the engine to allocate. 
+ """ + return self.micro_batches + + def _valid_micro_batch(self, micro_batch_id): + return 0 <= micro_batch_id < self.micro_batches + + def _valid_stage(self, stage_id): + return 0 <= stage_id < self.stages + + @property + def stage(self): + """Stage index used to configure this schedule.""" + return self.stage_id + + @property + def num_stages(self): + """The number of total pipeline stages used to configure this schedule.""" + return self.stages + + @property + def num_micro_batches(self): + """The number of total micro_batches used to configure this schedule.""" + return self.micro_batches + + @property + def is_first_stage(self): + """True if the configured ``stage_id`` is the first stage in the pipeline.""" + return self.stage_id == 0 + + @property + def is_last_stage(self): + """True if the configured ``stage_id`` is the last stage in the pipeline.""" + return self.stage_id == self.stages - 1 + + def _buffer_idx(self, micro_batch_id): + """Map a micro-batch index to a pipeline buffer index. + + This method uses a cyclic allocation strategy. + + Args: + micro_batch_id (int): The micro-batch index relative to the beginning of the schedule. + + Returns: + int: The index of the buffer that should store data. + """ + assert self._valid_micro_batch(micro_batch_id) + return micro_batch_id % self.num_pipe_buffers() + + def __iter__(self): + self.it = None + return self + + def __next__(self): + if self.it is None: + self.it = self.steps() + return next(self.it) + + +class InferenceSchedule(PipeSchedule): + """A schedule for inferencing batches using pipeline parallelism. + """ + def steps(self): + """""" + prev_micro_batch_id = -1 + total_steps = self.micro_batches + self.stages - 1 + for step_id in range(total_steps): + cmds = [] + micro_batch_id = step_id - self.stage_id + + # Alternate send/recv buffers + if _is_even(self.stage_id): + recv_buf = step_id % 2 + send_buf = (step_id + 1) % 2 + else: + recv_buf = (step_id + 1) % 2 + send_buf = step_id % 2 + + if self.is_first_stage or self.is_last_stage: + if self._valid_micro_batch(micro_batch_id): + cmds.append(LoadMicroBatch(recv_buf)) + + if _is_even(self.stage_id): + if self._valid_stage(self.next_stage): + if self._valid_micro_batch(micro_batch_id - 1): + cmds.append(SendActivation(send_buf)) + if self._valid_stage(self.prev_stage): + if self._valid_micro_batch(micro_batch_id): + cmds.append(RecvActivation(recv_buf)) + else: + if self._valid_stage(self.prev_stage): + if self._valid_micro_batch(micro_batch_id): + cmds.append(RecvActivation(recv_buf)) + + if self._valid_stage(self.next_stage): + if self._valid_micro_batch(micro_batch_id - 1): + cmds.append(SendActivation(send_buf)) + + if self._valid_micro_batch(micro_batch_id): + cmds.append(ForwardPass(recv_buf)) + + yield cmds + + def num_pipe_buffers(self): + """Only two pipeline buffers are required for inferencing. + + Returns: + ``2`` + """ + return 2 + + +class TrainSchedule(PipeSchedule): + """A schedule for training a batch using hybrid parallelism. + + Pipeline parallelism is extracted through gradient accumulation and thus + convergence follows that of a data parallel approach with the same batch + size. + """ + def steps(self): + """""" + prev_micro_batch_id = -1 + total_steps = 2 * (self.micro_batches + self.stages - 1) + for step_id in range(total_steps): + # Map the step of the pipeline to the micro-batch id and also whether it is a + # forward or backward pass step. 
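# Illustrative sketch of the mapping performed by _step_to_micro_batch()
# further below, reproduced as a free function and enumerated for a toy
# 2-stage / 3-micro-batch pipeline; ids outside [0, micro_batches) are the
# idle bubble steps that _valid_micro_batch() filters out.
def step_to_micro_batch(step_id, stage_id, stages):
    if step_id % 2 == 0 and stage_id % 2 == 0:     # even step, even stage: fwd
        return step_id // 2 - stage_id // 2, True
    if step_id % 2 == 1 and stage_id % 2 == 1:     # odd step, odd stage: fwd
        return (step_id - 1) // 2 - stage_id // 2, True
    if step_id % 2 == 0 and stage_id % 2 == 1:     # even step, odd stage: bwd
        return step_id // 2 - stages + (stage_id + 1) // 2, False
    return (step_id - 1) // 2 - stages + 1 + stage_id // 2, False

stages, micro_batches = 2, 3
total_steps = 2 * (micro_batches + stages - 1)
for stage in range(stages):
    print(stage, [step_to_micro_batch(s, stage, stages) for s in range(total_steps)])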
+ micro_batch_id, is_forward = self._step_to_micro_batch(step_id) + + if self._valid_micro_batch(prev_micro_batch_id): + prev_buffer = self._buffer_idx(prev_micro_batch_id) + if self._valid_micro_batch(micro_batch_id): + curr_buffer = self._buffer_idx(micro_batch_id) + + cmds = [] + + # Exchange activations + if is_forward: + if self._valid_micro_batch(micro_batch_id) and self._valid_stage( + self.prev_stage): + cmds.append(RecvActivation(curr_buffer)) + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( + self.prev_stage): + cmds.append(SendGrad(prev_buffer)) + else: + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( + self.next_stage): + cmds.append(SendActivation(prev_buffer)) + if self._valid_micro_batch(micro_batch_id) and self._valid_stage( + self.next_stage): + cmds.append(RecvGrad(curr_buffer)) + + # First/last stage loads + if self.stage_id == 0 or self.stage_id == self.stages - 1: + if is_forward and self._valid_micro_batch(micro_batch_id): + cmds.append(LoadMicroBatch(curr_buffer)) + + # Computation + if self._valid_micro_batch(micro_batch_id): + if is_forward: + cmds.append(ForwardPass(curr_buffer)) + else: + cmds.append(BackwardPass(curr_buffer)) + + # Model step at the end of the batch + if step_id == total_steps - 1: + cmds.append(ReduceTiedGrads()) + cmds.append(ReduceGrads()) + cmds.append(OptimizerStep()) + + # Prepare state for next time + prev_micro_batch_id = micro_batch_id + yield cmds + + def num_pipe_buffers(self): + """As many buffers as the distance from this stage to the last stage. + """ + buffers = min(self.stages - self.stage_id + 1, self.micro_batches) + return max(2, buffers) + + def _step_to_micro_batch(self, step_id): + if _is_even(step_id) and _is_even(self.stage_id): + micro_batch_id = self._even_step_forward_id(step_id) + is_forward = True + + elif _is_odd(step_id) and _is_odd(self.stage_id): + micro_batch_id = self._odd_step_forward_id(step_id) + is_forward = True + + elif _is_even(step_id) and _is_odd(self.stage_id): + micro_batch_id = self._even_step_backward_id(step_id) + is_forward = False + + elif _is_odd(step_id) and _is_even(self.stage_id): + micro_batch_id = self._odd_step_backward_id(step_id) + is_forward = False + + else: + assert False + + return micro_batch_id, is_forward + + def _even_step_forward_id(self, step_id): + base = step_id // 2 + micro_batch_id = int(base - self.stage_id // 2) + return micro_batch_id + + def _odd_step_forward_id(self, step_id): + base = (step_id - 1) // 2 + micro_batch_id = int(base - self.stage_id // 2) + return micro_batch_id + + def _even_step_backward_id(self, step_id): + base = step_id // 2 + micro_batch_id = int(base - self.stages + (self.stage_id + 1) // 2) + return micro_batch_id + + def _odd_step_backward_id(self, step_id): + base = ((step_id - 1) // 2) - self.stages + 1 + micro_batch_id = int(base + self.stage_id // 2) + return micro_batch_id + + +class DataParallelSchedule(PipeSchedule): + """An example schedule that trains using traditional data parallelism with gradient + accumulation. + """ + def steps(self): + """""" + for step_id in range(self.micro_batches): + cmds = [ + LoadMicroBatch(buffer_id=0), + ForwardPass(buffer_id=0), + BackwardPass(buffer_id=0), + ] + if step_id == self.micro_batches - 1: + cmds.extend([ + ReduceGrads(), + OptimizerStep(), + ]) + yield cmds + + def num_pipe_buffers(self): + """Only one pipeline buffer needed. + """ + return 1 + + +class PipeInstruction: + """Base class for all instructions to be executed by the pipeline engine. 
+ + All keyword arguments are stored as members similar to a ``namedtuple``. These are + then accessible to the :class:`PipeEngine` during execution. + + Args: + kwargs (optional): keyword arguments to store as members + """ + def __init__(self, **kwargs): + self.name = self.__class__.__name__ + self.kwargs = kwargs + for key, val in kwargs.items(): + setattr(self, key, val) + + def __repr__(self): + return call_to_str(self.name, **self.kwargs) + + +class OptimizerStep(PipeInstruction): + """Performs one step with the optimizer and zeros gradients. + + .. note:: Should be issued after :class:`ReduceGrads` and :class:`ReduceTiedGrads`. + + .. note:: Can be a synchronization point among data-parallel ranks. + """ + pass + + +class ReduceGrads(PipeInstruction): + """Reduce the computed gradients among data-parallel processes within the stage. + """ + pass + + +class ReduceTiedGrads(PipeInstruction): + """Reduce the computed gradients of tied modules within a pipeline-parallel group. + + .. warning:: + The stages included in this synchronization point are not known until + the model is partitioned among pipeline stages. In the worst case, it + includes all pipeline stages. This instruction should be scheduled + carefully to avoid deadlocks. + """ + pass + + +class BufferOpInstruction(PipeInstruction): + """A pipeline instruction that operates on pipeline buffer(s). + + Args: + buffer_id (int): the index of the pipeline buffer() to modify. + """ + def __init__(self, buffer_id, **kwargs): + super().__init__(buffer_id=buffer_id, **kwargs) + + +# IO +class LoadMicroBatch(BufferOpInstruction): + """Load a micro-batch into a buffer. + + Roughly: + + .. code-block:: python + + buffers['inputs'][buffer_id] = next(data_iter) + """ + pass + + +# Compute +class ForwardPass(BufferOpInstruction): + """Compute a forward pass. + + Roughly: + + .. code-block:: python + + buffers['ouputs'][buffer_id] = forward(buffers['inputs'][buffer_id]) + """ + pass + + +class BackwardPass(BufferOpInstruction): + """Compute a backward pass and accumulate gradients. + + Roughly: + + .. code-block:: python + + outputs = buffers['ouputs'][buffer_id] + gradients = buffers['gradients'][buffer_id] + torch.autograd.backward(tensors=outputs, + grad_tensors=gradients) + """ + pass + + +# Communication +class SendActivation(BufferOpInstruction): + """Send activations to the next stage in the pipeline. + + Roughly: + + .. code-block:: python + + send(buffers['outputs'][buffer_id]) + + .. note:: + The communication is blocking and must be paired with a :class:`RecvActivation` + on the next pipeline stage to avoid deadlock. + """ + pass + + +class RecvActivation(BufferOpInstruction): + """Receive activations from the previous stage in the pipeline. + + Roughly: + + .. code-block:: python + + buffers['inputs'][buffer_id] = recv() + + .. note:: + The communication is blocking and must be paired with a :class:`SendActivation` + on the previous pipeline stage to avoid deadlock. + """ + pass + + +class SendGrad(BufferOpInstruction): + """Send computed gradients to the previous pipeline stage. + with respect to the received activations + + .. note:: + Only received tensors with ``requires_grad==True`` will produce gradients. + Missing gradients will be replaced with ``None`` on the receiving stage. + + .. note:: + The communication is blocking and must be paired with a :class:`RecvGrad` + on the previous pipeline stage to avoid deadlock. 
+ """ + pass + + +class RecvGrad(BufferOpInstruction): + """Receive computed gradients the next pipeline stage. + + .. note:: + Only activations with ``requires_grad==True`` will produce gradients. + Missing gradients will be replaced with ``None``. + + .. note:: + The communication is blocking and must be paired with a :class:`SendGrad` + on the next pipeline stage to avoid deadlock. + """ + pass + + +def _is_even(x): + return x % 2 == 0 + + +def _is_odd(x): + return x % 2 != 0 diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py new file mode 100644 index 000000000000..3862e8469aef --- /dev/null +++ b/deepspeed/runtime/pipe/topology.py @@ -0,0 +1,455 @@ +# Copyright 2019 The Microsoft DeepSpeed Team + +from deepspeed.utils import logger + +import torch.distributed as dist +import sys + +from collections import namedtuple +from itertools import product as cartesian_product + + +class ProcessTopology: + """ Manages the mapping of n-dimensional Cartesian coordinates to linear + indices. This mapping is used to map the rank of processes to the grid + for various forms of parallelism. + + Each axis of the tensor is accessed by its name. The provided ordering + of the axes defines the layout of the topology. ProcessTopology uses a "row-major" + layout of the tensor axes, and so axes=['x', 'y'] would map coordinates (x,y) and + (x,y+1) to adjacent linear indices. If instead axes=['y', 'x'] was used, coordinates + (x,y) and (x+1,y) would be adjacent. + + Some methods return ProcessCoord namedtuples. + """ + def __init__(self, axes, dims): + """Create a mapping of n-dimensional tensor coordinates to linear indices. + + Arguments: + axes (list): the names of the tensor axes + dims (list): the dimension (length) of each axis of the topology tensor + """ + + self.axes = axes # names of each topology axis + self.dims = dims # length of each topology axis + + # This is actually a class that lets us hash {'row':3, 'col':2} mappings + self.ProcessCoord = namedtuple('ProcessCoord', axes) + + self.mapping = {} + ranges = [range(d) for d in dims] + # example: 1, (0,0,1) + for global_rank, coord in enumerate(cartesian_product(*ranges)): + key = {axis: coord[self.axes.index(axis)] for axis in self.axes} + key = self.ProcessCoord(**key) + # for example, {ProcessCoord(row=0, col=1) : 1} + self.mapping[key] = global_rank + + def get_rank(self, **coord_kwargs): + """Return the global rank of a process via its coordinates. + + Coordinates are specified as kwargs. For example: + + >>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3]) + >>> X.get_rank(x=0, y=1) + 1 + """ + if len(coord_kwargs) != len(self.axes): + raise ValueError('get_rank() does not support slices. Use filter_match())') + + key = self.ProcessCoord(**coord_kwargs) + assert key in self.mapping, f'key {kwargs} invalid' + return self.mapping[key] + + def get_axis_names(self): + """Return a list of the axis names in the ordering of the topology. """ + return self.axes + + def get_rank_repr(self, + rank, + omit_axes=['data', + 'pipe'], + inner_sep='_', + outer_sep='-'): + """Return a string representation of a rank. + + This method is primarily used for checkpointing model data. + + For example: + >>> topo = Topo(axes=['a', 'b'], dims=[2, 2]) + >>> topo.get_rank_repr(rank=3) + 'a_01-b_01' + >>> topo.get_rank_repr(rank=3, omit_axes=['a']) + 'b_01' + + Args: + rank (int): A rank in the topology. + omit_axes (list, optional): Axes that should not be in the representation. Defaults to ['data', 'pipe']. 
+ inner_sep (str, optional): [description]. Defaults to '_'. + outer_sep (str, optional): [description]. Defaults to '-'. + + Returns: + str: A string representation of the coordinate owned by ``rank``. + """ + omit_axes = frozenset(omit_axes) + axes = [a for a in self.get_axis_names() if a not in omit_axes] + names = [] + for ax in axes: + ax_rank = getattr(self.get_coord(rank=rank), ax) + names.append(f'{ax}{inner_sep}{ax_rank:02d}') + return outer_sep.join(names) + + def get_dim(self, axis): + """Return the number of processes along the given axis. + + For example: + >>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3]) + >>> X.get_dim('y') + 3 + """ + if axis not in self.axes: + return 0 + return self.dims[self.axes.index(axis)] + + def get_coord(self, rank): + """Return the coordinate owned by a process rank. + + The axes of the returned namedtuple can be directly accessed as members. For + example: + >>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3]) + >>> coord = X.get_coord(rank=1) + >>> coord.x + 0 + >>> coord.y + 1 + """ + for coord, idx in self.mapping.items(): + if idx == rank: + return coord + raise ValueError(f'rank {rank} not found in topology.') + + def get_axis_comm_lists(self, axis): + """ Construct lists suitable for a communicator group along axis ``axis``. + + Example: + >>> topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + >>> topo.get_axis_comm_lists('pipe') + [ + [0, 4], # data=0, model=0 + [1, 5], # data=0, model=1 + [2, 6], # data=1, model=0 + [3, 7], # data=1, model=1 + ] + + Returns: + A list of lists whose coordinates match in all axes *except* ``axis``. + """ + + # We don't want to RuntimeError because it allows us to write more generalized + # code for hybrid parallelisms. + if axis not in self.axes: + return [] + + # Grab all axes but `axis` + other_axes = [a for a in self.axes if a != axis] + + lists = [] + + # Construct all combinations of coords with other_axes + ranges = [range(self.get_dim(a)) for a in other_axes] + for coord in cartesian_product(*ranges): + other_keys = {a: coord[other_axes.index(a)] for a in other_axes} + # now go over all ranks in `axis`. + sub_list = [] + for axis_key in range(self.get_dim(axis)): + key = self.ProcessCoord(**other_keys, **{axis: axis_key}) + sub_list.append(self.mapping[key]) + lists.append(sub_list) + + return lists + + def filter_match(self, **filter_kwargs): + """Return the list of ranks whose coordinates match the provided criteria. + + Example: + >>> X = ProcessTopology(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + >>> X.filter_match(pipe=0, data=1) + [2, 3] + >>> [X.get_coord(rank) for rank in X.filter_match(pipe=0, data=1)] + [ProcessCoord(pipe=0, data=1, model=0), ProcessCoord(pipe=0, data=1, model=1)] + + Arguments: + **filter_kwargs (dict): criteria used to select coordinates. + + Returns: + The list of ranks whose coordinates match filter_kwargs. + """ + def _filter_helper(x): + for key, val in filter_kwargs.items(): + if getattr(x, key) != val: + return False + return True + + coords = filter(_filter_helper, self.mapping.keys()) + return [self.mapping[coo] for coo in coords] + + def get_axis_list(self, axis, idx): + """Returns the list of global ranks whose coordinate in an axis is idx. + + For example: + >>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3]) + >>> X.get_axis_list(axis='x', idx=0) + [0, 1, 2] + >>> X.get_axis_list(axis='y', idx=0) + [0, 3] + """ + + # This could be faster by generating the desired keys directly instead of + # filtering. 
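# Illustrative sketch of the row-major coordinate mapping this class builds,
# mirroring the itertools.product loop in __init__ with a toy pipe x data grid;
# the two queries correspond to get_rank() and get_axis_list() above.
from collections import namedtuple
from itertools import product

axes, dims = ['pipe', 'data'], [2, 3]
Coord = namedtuple('Coord', axes)
mapping = {Coord(*c): rank for rank, c in enumerate(product(*[range(d) for d in dims]))}

print(mapping[Coord(pipe=0, data=1)])                  # get_rank(pipe=0, data=1) -> 1
print([r for c, r in mapping.items() if c.pipe == 1])  # get_axis_list('pipe', 1) -> [3, 4, 5]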
+ axis_num = self.axes.index(axis) + ranks = [self.mapping[k] for k in self.mapping.keys() if k[axis_num] == idx] + return ranks + + def world_size(self): + return len(self.mapping) + + def __str__(self): + return str(self.mapping) + + +def _prime_factors(N): + """ Returns the prime factorization of positive integer N. """ + if N <= 0: + raise ValueError("Values must be strictly positive.") + + primes = [] + while N != 1: + for candidate in range(2, N + 1): + if N % candidate == 0: + primes.append(candidate) + N //= candidate + break + return primes + + +class PipeDataParallelTopology(ProcessTopology): + """ A topology specialiation for hybrid data and pipeline parallelism. + + Uses data parallelism on the last dimension to encourage gradient + reductions to use high-bandwidth intra-node links and lower-volume + pipeline communications to use low-bandwidth inter-node links. + """ + def __init__(self, num_pp, num_dp): + super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp]) + + +class PipeModelDataParallelTopology(ProcessTopology): + """ A topology for hybrid pipeline, model, and data parallelism. """ + def __init__(self, num_pp, num_mp, num_dp): + super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp]) + + +class PipelineParallelGrid: + """Implements a grid object that stores the data parallel ranks + corresponding to each o the model parallel stages + + The grid object organizes the processes in a distributed pytorch job + into a 2D grid, of stage_id and data_parallel_id. + + self.stage_id and self.data_parallel_id stores the stage id + and the data parallel id of current process. + + self.dp_group groups the processes by stage_id. + self.dp_group[i], is a list containing all process ranks whose + stage_id is i. + + self.p2p_groups stores a list of tuple, where each tuple + stores process ranks of adjacent stages for a given data_parallel_id. + For example if num_stage is 5 then a tuple [7,8] represents stages [3, 4], + with data_parallel id = 1. A stage wrap around will appear as non-adjacent ranks, + for example tuple [4,0] with representing wrap-around stage 4 and 0, for + data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0] + for data_parallel_id = 1. + """ + def __init__(self, topology=None, process_group=None): + # TODO use process_group if provided + self.global_rank = dist.get_rank() + self.world_size = dist.get_world_size() + if topology is not None: + if self.global_rank == 0: + print('Using topology:', topology) + self._topo = topology + else: + num_pp = 1 + num_dp = 1 + for idx, prime in enumerate(_prime_factors(self.world_size)): + if idx % 2 == 0: + num_pp *= prime + else: + num_dp *= prime + self._topo = PipeDataParallelTopology(num_dp=num_dp, num_pp=num_pp) + self.data_parallel_size = max(self._topo.get_dim('data'), 1) + self.pipe_parallel_size = max(self._topo.get_dim('pipe'), 1) + self.model_parallel_size = max(self._topo.get_dim('model'), 1) + assert self._is_grid_valid(), "Invalid Grid" + + self.stage_id = self.get_stage_id() + self.data_parallel_id = self.get_data_parallel_id() + + # Create new ProcessGroups for all model parallelism. DeepSpeedLight uses these + # to detect overflow, etc. 
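+ # Note (added comment): torch.distributed.new_group() is a collective call and must
+ # be entered by every rank with the same ranks list, even by ranks that are not
+ # members of the group being created; that is why each loop below builds every
+ # group and a rank only keeps the handle of the group it belongs to.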
+ self.ds_model_proc_group = None + self.ds_model_rank = -1 + for dp in range(self.data_parallel_size): + ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp)) + if self.global_rank == 0: + #print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}') + pass + proc_group = dist.new_group(ranks=ranks) + if self.global_rank in ranks: + self.ds_model_proc_group = proc_group + self.ds_model_world_size = len(ranks) + self.ds_model_rank = ranks.index(self.global_rank) + assert self.ds_model_rank > -1 + assert self.ds_model_proc_group is not None + + # Create new ProcessGroup for gradient all-reduces - these are the data parallel groups + self.dp_group = [] + self.dp_groups = self._topo.get_axis_comm_lists('data') + for g in self.dp_groups: + proc_group = dist.new_group(ranks=g) + if self.global_rank in g: + self.dp_group = g + self.dp_proc_group = proc_group + + self.is_first_stage = (self.stage_id == 0) + self.is_last_stage = (self.stage_id == (self.pipe_parallel_size - 1)) + + self.p2p_groups = self._build_p2p_groups() + + # Create new ProcessGroup for pipeline collectives - these are pipe parallel groups + self.pp_group = [] + self.pp_proc_group = None + self.pipe_groups = self._topo.get_axis_comm_lists('pipe') + for ranks in self.pipe_groups: + if self.global_rank == 0: + #print(f'RANK={self.global_rank} building pipeline group: {ranks}') + pass + proc_group = dist.new_group(ranks=ranks) + if self.global_rank in ranks: + self.pp_group = ranks + self.pp_proc_group = proc_group + assert self.pp_proc_group is not None + + # Create new ProcessGroup for model (tensor-slicing) collectives + + # Short circuit case without model parallelism. + # TODO: it would be nice if topology had bcast semantics to avoid this branching + # case? + if self.model_parallel_size == 1: + for group_rank in range(self.world_size): + group_rank = [group_rank] + group = dist.new_group(ranks=group_rank) + if group_rank[0] == self.global_rank: + self.slice_group = group_rank + self.slice_proc_group = group + return + else: + self.mp_group = [] + self.model_groups = self._topo.get_axis_comm_lists('model') + for g in self.model_groups: + proc_group = dist.new_group(ranks=g) + if self.global_rank in g: + self.slice_group = g + self.slice_proc_group = proc_group + + def get_stage_id(self): + return self._topo.get_coord(rank=self.global_rank).pipe + + def get_data_parallel_id(self): + return self._topo.get_coord(rank=self.global_rank).data + + def _build_p2p_groups(self): + """Groups for sending and receiving activations and gradients across model + parallel stages. 
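+
+ For example (illustrative only), with ``PipeDataParallelTopology(num_pp=2, num_dp=2)``
+ the pipe communicator lists are [[0, 2], [1, 3]], and each rank is paired with the
+ next stage of its own pipeline, wrapping around at the last stage:
+     p2p_lists == [[0, 2], [1, 3], [2, 0], [3, 1]]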
+ """ + comm_lists = self._topo.get_axis_comm_lists('pipe') + p2p_lists = [] + for rank in range(self.world_size): + for l in comm_lists: + assert len(l) == self.pipe_parallel_size + if rank in l: + idx = l.index(rank) + buddy_rank = l[(idx + 1) % self.pipe_parallel_size] + p2p_lists.append([rank, buddy_rank]) + break # next global rank + assert len(p2p_lists) == self.world_size + return p2p_lists + + def _is_grid_valid(self): + ranks = 1 + for ax in self._topo.get_axis_names(): + ranks *= self._topo.get_dim(ax) + return ranks == dist.get_world_size() + + #returns the global rank of the process with the provided stage id + #which has the same data_parallel_id as caller process + def stage_to_global(self, stage_id, **kwargs): + me = self._topo.get_coord(self.global_rank) + transform = me._replace(pipe=stage_id, **kwargs)._asdict() + return self._topo.get_rank(**transform) + + def topology(self): + return self._topo + + # MPU functions for DeepSpeed integration + def get_global_rank(self): + return self.global_rank + + def get_pipe_parallel_rank(self): + """ The stage of the pipeline this rank resides in. """ + return self.get_stage_id() + + def get_pipe_parallel_world_size(self): + """ The number of stages in the pipeline. """ + return self.pipe_parallel_size + + def get_pipe_parallel_group(self): + """ The group of ranks within the same pipeline. """ + return self.pp_proc_group + + def get_data_parallel_rank(self): + """ Which pipeline this rank resides in. """ + return self.data_parallel_id + + def get_data_parallel_world_size(self): + """ The number of pipelines. """ + return self.data_parallel_size + + def get_data_parallel_group(self): + """ The group of ranks within the same stage of all pipelines. """ + return self.dp_proc_group + + # These are model parallel groups across all types of model parallelism. + # Deepspeed uses them to detect overflow, etc. + def get_model_parallel_rank(self): + return self.ds_model_rank + + def get_model_parallel_world_size(self): + return self.ds_model_world_size + + def get_model_parallel_group(self): + return self.ds_model_proc_group + + # For Megatron-style tensor slicing + def get_slice_parallel_rank(self): + if 'model' in self._topo.get_axis_names(): + return self._topo.get_coord(rank=self.global_rank).model + else: + return 0 + + def get_slice_parallel_world_size(self): + self.slice_parallel_size + + def get_slice_parallel_group(self): + return self.slice_proc_group diff --git a/deepspeed/runtime/progressive_layer_drop.py b/deepspeed/runtime/progressive_layer_drop.py new file mode 100755 index 000000000000..770978a940a0 --- /dev/null +++ b/deepspeed/runtime/progressive_layer_drop.py @@ -0,0 +1,33 @@ +import numpy as np +from deepspeed.utils import log_dist + + +class ProgressiveLayerDrop(object): + r""" Progressive Layer Dropping (PLD) for model training. + This implements the PLD technique for compressed model training + from this paper: https://arxiv.org/pdf/2010.13369.pdf + Args: + theta (float): a hyper-parameter that controls the trade-off between training time and robustness. + The lower the theta value, the faster the training speed. Default value: 0.5. + gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001. 
+ """ + def __init__(self, theta=0.5, gamma=0.001): + super().__init__() + + self.theta = theta + self.gamma = gamma + self.current_theta = 1.0 + log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0]) + + def get_state(self): + kwargs = {'progressive_layer_drop': True, 'pld_theta': self.get_theta()} + return kwargs + + def get_theta(self): + return self.current_theta + + def update_state(self, global_step): + def _prob(x, gamma, p): + return (1. - p) * np.exp(-gamma * x) + p + + self.current_theta = _prob(global_step, self.gamma, self.theta) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py new file mode 100755 index 000000000000..05285b328851 --- /dev/null +++ b/deepspeed/runtime/utils.py @@ -0,0 +1,558 @@ +''' +Copyright 2019 The Microsoft DeepSpeed Team + +Copyright NVIDIA/Megatron + +Helper functions and classes from multiple sources. +''' + +import os +from math import ceil +from math import floor +from bisect import bisect_left, bisect_right + +import torch +import torch.distributed as dist +from torch._six import inf +import torch.distributed as dist + +from deepspeed.utils import logger +from numpy import prod + + +def ensure_directory_exists(filename): + """Create the directory path to ``filename`` if it does not already exist. + + Args: + filename (str): A file path. + """ + dirname = os.path.dirname(filename) + os.makedirs(dirname, exist_ok=True) + + +def set_random_seed(seed): + import numpy + import random + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + + +class CheckOverflow(object): + '''Checks for overflow in gradient across parallel process''' + def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False): + self.mpu = mpu + self.params = [] if param_groups else None + self.zero_reduce_scatter = zero_reduce_scatter + if param_groups: + for group in param_groups: + for param in group: + self.params.append(param) + + def check_using_norm(self, norm_group, reduce_overflow=True): + #TODO: I don't think reduce_overflow is needed if mpu is None + overflow = -1 in norm_group + + if self.mpu is not None: + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.mpu.get_model_parallel_group()) + overflow = overflow_gpu[0].item() + elif reduce_overflow: + cuda_overflow = torch.cuda.FloatTensor([overflow]) + dist.all_reduce(cuda_overflow, op=torch.distributed.ReduceOp.MAX) + dist.barrier() + overflow = cuda_overflow[0].item() + + return bool(overflow) + + def check(self, param_groups=None): + params = [] + if param_groups is None: + params = self.params + else: + assert param_groups is not None, \ + "self.params and param_groups both cannot be none" + + for group in param_groups: + for param in group: + params.append(param) + + return self.has_overflow(params) + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for i, p in enumerate(params): + if p.grad is not None and self._has_inf_or_nan(p.grad.data, i): + return True + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + overflow_gpu = torch.cuda.ByteTensor([overflow]) + #torch.distributed.all_reduce(overflow_gpu, + # op=torch.distributed.ReduceOp.MAX, + # group=mpu.get_model_parallel_group()) + if self.zero_reduce_scatter: + 
torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=torch.distributed.group.WORLD) + elif self.mpu is not None: + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.mpu.get_model_parallel_group()) + + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + @staticmethod + def _has_inf_or_nan(x, i): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + +def _handle_overflow(cpu_sum, x, i): + import math + rank = torch.distributed.get_rank() + if rank == 0: + t_i = -1 + for v_i, v in enumerate(x.data.contiguous().view(-1)): + if not math.isfinite(float(v)): + t_i = v_i + break + logger.info( + f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" + ) + + +def get_grad_norm(parameters, norm_type=2, mpu=None): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. Taken from Nvidia Megatron. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in parameters) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all GPUs. + if mpu is not None: + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0. + for p in parameters: + if mpu is not None: + if (mpu.get_model_parallel_rank() == 0 + ) or is_model_parallel_parameter(p): + param_norm = p.grad.data.float().norm(norm_type) + total_norm += param_norm.item()**norm_type + else: + param_norm = p.grad.data.float().norm(norm_type) + total_norm += param_norm.item()**norm_type + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + if mpu is not None: + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + +def get_weight_norm(parameters, norm_type=2, mpu=None): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. Taken from Nvidia Megatron. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(p.data.abs().max() for p in parameters) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all GPUs. + if mpu is not None: + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_model_parallel_group()) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0. + for p in parameters: + if mpu is not None: + if (mpu.get_model_parallel_rank() == 0 + ) or is_model_parallel_parameter(p): + try: + param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) + except TypeError as err: + param_norm = float(torch.norm(p.float(), norm_type)) + + #param_norm = p.data.float().norm(norm_type) + total_norm += param_norm**norm_type + else: + try: + param_norm = float(torch.norm(p, norm_type, dtype=torch.float32)) + except TypeError as err: + param_norm = float(torch.norm(p.float(), norm_type)) + #param_norm = p.data.float().norm(norm_type) + total_norm += param_norm**norm_type + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + if mpu is not None: + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + +def is_model_parallel_parameter(p): + return hasattr(p, 'model_parallel') and p.model_parallel + + +def prefix_sum_inc(weights): + """ Compute an inclusive prefix sum. 
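+ For input ``w`` this returns ``s`` with ``s[i] = w[0] + w[1] + ... + w[i]``; the
+ input list is not modified.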
+ + Example: + >>> prefix_sum_inc([3,4,5]) + [3, 7, 12] + """ + weights_ = [w for w in weights] + for x in range(1, len(weights_)): + weights_[x] += weights_[x - 1] + return weights_ + + +def partition_uniform(num_items, num_parts): + parts = [0] * (num_parts + 1) + # First check for the trivial edge case + if num_items <= num_parts: + for p in range(num_parts + 1): + parts[p] = min(p, num_items) + return parts + + chunksize = floor(num_items / num_parts) + for p in range(num_parts): + parts[p] = min(chunksize * p, num_items) + parts[num_parts] = num_items + return parts + + +def _lprobe(weights, num_parts, bottleneck): + num_items = len(weights) + total_weight = weights[-1] + + # initialize partitioning + parts = [0] * (num_parts + 1) + for p in range(1, num_parts + 1): + parts[p] = num_items + + bsum = bottleneck # running sum of target weight for pth partition + chunksize = num_items // num_parts + step = chunksize + for p in range(1, num_parts): + # Jump to the next bucket + while (step < num_items) and (weights[step] < bsum): + step += chunksize + + # Find the end index of partition p + parts[p] = bisect_left(weights, + bsum, + lo=step - chunksize, + hi=min(step, + num_items)) + # Nothing more to partition, return early + if parts[p] == num_items: + # See if the current partition is overweight. + part_size = weights[-1] - weights[parts[p - 1]] + return parts, part_size < bottleneck + + # Next partition target + bsum = weights[parts[p] - 1] + bottleneck + + return parts, bsum >= total_weight + + +def _rb_partition_balanced(weights, num_parts, eps): + total_weight = weights[-1] + lower = total_weight / num_parts # best case heaviest partition + upper = total_weight # worst case heaviest partition + + # Do a binary search for the best partitioning + while upper > lower + eps: + mid = lower + ((upper - lower) / 2) + parts, success = _lprobe(weights, num_parts, mid) + if success: + upper = mid + else: + lower = mid + eps + return upper + + +def partition_balanced(weights, num_parts, eps=1e-3): + num_items = len(weights) + # First check for the trivial edge case + if num_items <= num_parts: + return partition_uniform(num_items, num_parts) + + weights_ = prefix_sum_inc(weights) + + # Find the smallest bottleneck (weight of heaviest partition) + bottleneck = _rb_partition_balanced(weights_, num_parts, eps=eps) + + # Now compute that partitioning + parts, success = _lprobe(weights_, num_parts, bottleneck) + assert success + + return parts + + +class PartitionedTensor: + def __init__(self, tensor, group, partition_meta=None): + super().__init__() + + self.group = group + self.num_parts = dist.get_world_size(group=self.group) + self.rank = dist.get_rank(group=self.group) + + self.orig_size = list(tensor.size()) + self.orig_device = tensor.device + self.local_data, self.partition = self._partition_tensor(tensor) + + @classmethod + def from_meta(cls, meta, local_part, group, device='cuda'): + assert meta.dtype == torch.long + dummy = torch.ones(dist.get_world_size(group=group)) + part_obj = cls(tensor=dummy, group=group) + + meta = meta.tolist() + + # [N, list0, ..., listN-1] + part_obj.orig_size = meta[1:(1 + meta[0])] + meta = meta[1 + meta[0]:] + + part_obj.orig_device = device + part_obj.local_data = local_part.detach() + + part_obj.group = group + + # Partition is encoded like the rowptr of a CSR matrix: + # [num_parts, rank, 0, part_1, ..., part_num_parts] + # TODO: support shuffle between different partition granularities + assert part_obj.num_parts == meta[0] + assert part_obj.rank == 
meta[1] + part_obj.partition = meta[2:] # length num_parts+1 + + return part_obj + + def _partition_tensor(self, tensor): + partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts) + start = partition[self.rank] + length = partition[self.rank + 1] - start + tensor_part = tensor.detach().contiguous().view(-1).narrow( + 0, + start=start, + length=length).clone() + + return tensor_part, partition + + def full(self, device=None): + if device is None: + device = self.orig_device + + # Allocate the full tensor as a flat buffer. + full_numel = prod(self.full_size()) + flat_tensor = torch.zeros([full_numel], + dtype=self.local_data.dtype, + device=device) + + # Prepare all-gather buffer + partition_tensors = [] + for part_id in range(self.num_parts): + part_size = self.partition[part_id + 1] - self.partition[part_id] + buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size) + if part_id == self.rank: + buf.copy_(self.local_data) + partition_tensors.append(buf) + + # Collect the full tensor + dist.all_gather(partition_tensors, + partition_tensors[self.rank], + group=self.group) + + for i in range(len(partition_tensors)): + partition_tensors[i].data = torch.zeros(1) + partition_tensors[i] = None + + return flat_tensor.view(self.full_size()).clone().detach() + + def to_meta(self): + """Returns a torch.LongTensor that encodes partitioning information. + + Can be used along with ``data()`` to serialize a ``PartitionedTensor`` for + communication. + + Returns: + torch.LongTensor: a tensor encoding the meta-information for the partitioning + """ + meta = [] + meta.append(len(self.orig_size)) + meta += list(self.orig_size) + meta.append(self.num_parts) + meta.append(self.rank) + meta += self.partition + return torch.LongTensor(data=meta).to(self.orig_device) + + def data(self): + return self.local_data + + def local_size(self): + return self.local_data.size() + + def full_size(self): + return self.orig_size + + +mem_alloced = 0 +mem_cached = 0 + + +def memory_status(msg, print_rank=-1, reset_max=False): + global mem_alloced, mem_cached + + rank = dist.get_rank() + if print_rank != -1 and rank != print_rank: + return + + torch.cuda.synchronize() + + if reset_max: + torch.cuda.reset_max_memory_cached() + torch.cuda.reset_max_memory_allocated() + + new_alloced = torch.cuda.memory_allocated() + new_cached = torch.cuda.memory_cached() + + delta_alloced = new_alloced - mem_alloced + delta_cached = new_cached - mem_cached + + mem_cached = new_cached + mem_alloced = new_alloced + + max_alloced = torch.cuda.max_memory_allocated() + max_cached = torch.cuda.max_memory_cached() + + # convert to GB for printing + new_alloced /= 1024**3 + new_cached /= 1024**3 + delta_alloced /= 1024**3 + delta_cached /= 1024**3 + max_alloced /= 1024**3 + max_cached /= 1024**3 + + print( + f'RANK={rank} MEMSTATS', + msg, + f'device={torch.cuda.current_device()} ' + f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' + f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' + ) + + +def see_memory_usage(message): + return + if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: + return + + # Print message except when distributed but not rank 0 + logger.info(message) + logger.info( + f"MA {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ + Max_MA {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ + CA {round(torch.cuda.memory_cached() / (1024 * 
1024 * 1024),2)} GB \ + Max_CA {round(torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))} GB ") + + +def call_to_str(base, *args, **kwargs): + """Construct a string representation of a call. + + Args: + base (str): name of the call + args (tuple, optional): args to ``base`` + kwargs (dict, optional): kwargs supplied to ``base`` + + Returns: + str: A string representation of base(*args, **kwargs) + """ + name = f'{base}(' + if args: + name += ', '.join(repr(arg) for arg in args) + if kwargs: + name += ', ' + if kwargs: + name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items()) + name += ')' + return name diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/deepspeed/pt/deepspeed_zero_config.py b/deepspeed/runtime/zero/config.py similarity index 52% rename from deepspeed/pt/deepspeed_zero_config.py rename to deepspeed/runtime/zero/config.py index 4f654d3b8c30..b784f3ffdd6c 100755 --- a/deepspeed/pt/deepspeed_zero_config.py +++ b/deepspeed/runtime/zero/config.py @@ -3,82 +3,10 @@ Licensed under the MIT license. """ -#from deepspeed.pt.deepspeed_constants import * -from deepspeed.pt.deepspeed_config_utils import get_scalar_param -from deepspeed.pt.log_utils import logger - -######################################### -# ZeRO optimization -######################################### -# ZeRO optimization. By default, this optimization is not enabled. -# Users have to configure the desired optimization (0 means disabled) in params.json as below example: -ZERO_FORMAT = ''' -ZeRO optimization should be enabled as: -"session_params": { - "zero_optimization": { - "stage": [0|1|2], - "allgather_partitions": [true|false], - "allgather_bucket_size": 500000000, - "reduce_scatter": [true|false], - "contiguous_gradients" : [true|false] - "overlap_comm": [true|false], - "reduce_bucket_size": 500000000 - "load_from_fp32_weights": [true|false] - } -} -''' - -ZERO_OPTIMIZATION = 'zero_optimization' -ZERO_OPTIMIZATION_DISABLED = 0 -ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 -ZERO_OPTIMIZATION_GRADIENTS = 2 -ZERO_OPTIMIZATION_WEIGHTS = 3 -MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS - -ZERO_OPTIMIZATION_STAGE = 'stage' -ZERO_OPTIMIZATION_STAGE_1 = 'stage_1' -ZERO_OPTIMIZATION_STAGE_2 = 'stage_2' -ZERO_OPTIMIZATION_STAGE_3 = 'stage_3' - -ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED - -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions' -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True - -ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True - -ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' -ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False - -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients' -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False - -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size' -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000 - -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size' -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000 -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size' -ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' -ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True - -ZERO_OPTIMIZATION_DEFAULT = { - ZERO_OPTIMIZATION_STAGE: - ZERO_OPTIMIZATION_STAGE_DEFAULT, - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, - 
ZERO_OPTIMIZATION_REDUCE_SCATTER: - ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT -} +from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.utils import logger +from deepspeed.runtime.zero.constants import * +import json class DeepSpeedZeroConfig(object): @@ -93,6 +21,8 @@ def __init__(self, param_dict): self.allgather_bucket_size = None self.overlap_comm = None self.load_from_fp32_weights = None + self.cpu_offload = None + self.elastic_checkpoint = None if ZERO_OPTIMIZATION in param_dict.keys(): zero_config_dict = param_dict[ZERO_OPTIMIZATION] @@ -125,6 +55,9 @@ def read_zero_config_deprecated(self, param_dict): def repr(self): return self.__dict__ + def __repr__(self): + return json.dumps(self.__dict__, sort_keys=True, indent=4) + def _initialize(self, zero_config_dict): self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, @@ -157,7 +90,17 @@ def _initialize(self, zero_config_dict): zero_config_dict, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT) + self.load_from_fp32_weights = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) + + self.cpu_offload = get_scalar_param(zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD, + ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) + + self.elastic_checkpoint = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT, + ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT) diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py new file mode 100755 index 000000000000..fd90033dc3f5 --- /dev/null +++ b/deepspeed/runtime/zero/constants.py @@ -0,0 +1,83 @@ +""" +Copyright (c) Microsoft Corporation +Licensed under the MIT license. +""" + +######################################### +# ZeRO optimization +######################################### +# ZeRO optimization. By default, this optimization is not enabled. 
+# Users have to configure the desired optimization (0 means disabled) in params.json as below example: +ZERO_FORMAT = ''' +ZeRO optimization should be enabled as: +"session_params": { + "zero_optimization": { + "stage": [0|1|2], + "allgather_partitions": [true|false], + "allgather_bucket_size": 500000000, + "reduce_scatter": [true|false], + "contiguous_gradients" : [true|false] + "overlap_comm": [true|false], + "reduce_bucket_size": 500000000 + "load_from_fp32_weights": [true|false] + "cpu_offload": [true|false] + } +} +''' + +ZERO_OPTIMIZATION = 'zero_optimization' +ZERO_OPTIMIZATION_DISABLED = 0 +ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 +ZERO_OPTIMIZATION_GRADIENTS = 2 +ZERO_OPTIMIZATION_WEIGHTS = 3 +MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS + +ZERO_OPTIMIZATION_STAGE = 'stage' +ZERO_OPTIMIZATION_STAGE_1 = 'stage_1' +ZERO_OPTIMIZATION_STAGE_2 = 'stage_2' +ZERO_OPTIMIZATION_STAGE_3 = 'stage_3' + +ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED + +ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions' +ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True + +ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' +ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True + +ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' +ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False + +ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients' +ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False + +ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size' +ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000 + +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size' +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000 +ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size' +ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' +ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True + +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + +ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT = 'elastic_checkpoint' +ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT = True + +ZERO_OPTIMIZATION_DEFAULT = { + ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT, + ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: + ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_SCATTER: ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, + ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, + ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: + ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, + ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: + ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, + ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: + ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, + ZERO_OPTIMIZATION_CPU_OFFLOAD: ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, + ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT: ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT +} diff --git a/deepspeed/pt/zero_optimizer_stage1.py b/deepspeed/runtime/zero/stage1.py similarity index 75% rename from deepspeed/pt/zero_optimizer_stage1.py rename to deepspeed/runtime/zero/stage1.py index d527a17123d7..d5c7616ff87e 100755 --- a/deepspeed/pt/zero_optimizer_stage1.py +++ b/deepspeed/runtime/zero/stage1.py @@ -4,11 +4,11 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from collections import defaultdict -from deepspeed.pt.zero_utils import _initialize_parameter_parallel_groups -from deepspeed.pt.log_utils import log_dist, logger -from deepspeed.pt.loss_scaler import LossScaler, DynamicLossScaler -from 
deepspeed.pt.deepspeed_utils import get_grad_norm, CheckOverflow -from deepspeed.pt.deepspeed_zero_config import ZERO_OPTIMIZATION_OPTIMIZER_STATES +from deepspeed.runtime.zero.utils import _initialize_parameter_parallel_groups +from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler +from deepspeed.runtime.utils import get_grad_norm, CheckOverflow +from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_OPTIMIZER_STATES +from deepspeed.utils import logger, log_dist def get_alignment_padding(flattened_lean_size, sub_partition_id, sub_partition_size): @@ -26,13 +26,6 @@ def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_c padding = get_alignment_padding(flattened_size, i, sub_partition_size) group_paddings.append(padding) - logger.info("****Padding information*****") - logger.info(f"tensor_size = {flattened_size}") - logger.info(f"sub_partition_size = {sub_partition_size}") - logger.info(f"sub_partition_count = {sub_partition_count}") - for i, padding in enumerate(group_paddings): - logger.info(f"padding[{i}] = {padding}") - return group_paddings @@ -40,8 +33,7 @@ def flatten_dense_tensors_sub_partition_aligned(tensor_list, dp, max_elements_per_comm, pg): - assert (max_elements_per_comm >= dp, - f"max_elements_per_comm {max_elements_per_comm} < dp {dp}") + assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" num_elements = sum(t.numel() for t in tensor_list) log_dist("Total number of elements in model: {}, max elements per com: {}".format( @@ -81,7 +73,8 @@ def flatten_dense_tensors_sub_partition_aligned(tensor_list, dtype=tensor_list[0].dtype) aligned_tensor_list = tensor_list + [pad_tensor] - return _flatten_dense_tensors(aligned_tensor_list) + flat_tensors = _flatten_dense_tensors(aligned_tensor_list) + return flat_tensors def _single_range_check(current_index, start_index, end_index, tensor_size): @@ -131,7 +124,8 @@ def __init__(self, all_gather_partitions=True, allgather_size=500000000, clip_grad=0.0, - max_elements_per_comm=5e8): + max_elements_per_comm=5e8, + elastic_checkpoint=True): if dp_process_group is not None and partition_size is not None: raise ValueError("Cannot specify both dp_process_group " @@ -151,8 +145,11 @@ def __init__(self, self.all_gather_partitions = all_gather_partitions self.allgather_size = allgather_size - self.max_elements_per_comm = max_elements_per_comm - logger.info("max_elements_per_comm={}".format(max_elements_per_comm)) + # self.max_elements_per_comm = max_elements_per_comm + # logger.info("max_elements_per_comm={}".format(max_elements_per_comm)) + + self.elastic_checkpoint = elastic_checkpoint + logger.info(f'ZeRO Elastic Checkpoint = {elastic_checkpoint}') # param flattened by groups self.fp16_groups = [] @@ -193,19 +190,31 @@ def __init__(self, self.group_paddings = [] self.partition_count = dist.get_world_size(group=self.dp_process_group) + self.default_device = self.optimizer.param_groups[0]['params'][0].device + + # max elems per param group + self.max_elems_per_comm = [] + # loop to deal with groups for i, param_group in enumerate(self.optimizer.param_groups): # push this group to list before modify self.fp16_groups.append(param_group['params']) + # calculate best max elements per comm based to minimize padding + self.max_elems_per_comm.append( + self.best_max_elems_per_comm( + num_elements=sum(t.numel() for t in self.fp16_groups[i]), + max_elements_per_comm=max_elements_per_comm, + dp=dist.get_world_size(group=self.dp_process_group))) + # flattens 
all tensors into single 1d tensor aligned with sub-partition size for later dividing # RS: create aligned sub-partitions - self.fp16_groups_flat.append( - flatten_dense_tensors_sub_partition_aligned( - tensor_list=self.fp16_groups[i], - dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=self.max_elements_per_comm, - pg=self.dp_process_group)) + flat_aligned_params = flatten_dense_tensors_sub_partition_aligned( + tensor_list=self.fp16_groups[i], + dp=dist.get_world_size(group=self.dp_process_group), + max_elements_per_comm=self.max_elems_per_comm[i], + pg=self.dp_process_group) + self.fp16_groups_flat.append(flat_aligned_params) # TODO: I don't think this does anything? # set model fp16 weight to slices of flattened buffer @@ -220,7 +229,7 @@ def __init__(self, comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ self.get_data_parallel_sub_partitions( tensor=self.fp16_groups_flat[i], - max_elements_per_comm=self.max_elements_per_comm, + max_elements_per_comm=self.max_elems_per_comm[i], world_size=dist.get_world_size( group=self.dp_process_group), dp_process_group=self.dp_process_group @@ -257,8 +266,7 @@ def __init__(self, # RS: divide up the sub-partitions and keep track of offsets for each param # partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size(group=self.dp_process_group) - params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, \ - params_not_local = self.get_all_sub_partition_info( + params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local = self.get_all_sub_partition_info( tensor_list=self.fp16_groups[i], all_element_intervals=element_intervals, local_rank=local_rank, @@ -308,6 +316,34 @@ def _initialize_optimizer_states(self): for idx, sub_partition_param in enumerate(group): sub_partition_param.grad = None + @staticmethod + def best_max_elems_per_comm(num_elements, max_elements_per_comm, dp): + # if we use max-elems-per-comm as is, how many comm intervals will there be + max_comm_intervals = math.ceil(num_elements / max_elements_per_comm) + padding_for_max_comm = (max_elements_per_comm * + max_comm_intervals) - num_elements + + # if we use 1 less comm interval how much extra comm padding would be required + min_comm_intervals = num_elements // max_elements_per_comm + if min_comm_intervals == 0: + log_dist(f'Using default max_elements_per_comm {max_elements_per_comm}', + ranks=[0]) + return max_elements_per_comm + + padding_for_min_comm = math.ceil(num_elements / (dp * min_comm_intervals)) + + # choose padding that uses least amount of overhead + if padding_for_max_comm > padding_for_min_comm: + new_max_elements_per_comm = padding_for_min_comm + max_elements_per_comm + log_dist( + f'Updating max_elements_per_comm from {max_elements_per_comm} -> {new_max_elements_per_comm}', + ranks=[0]) + return new_max_elements_per_comm + else: + log_dist(f'Using default max_elements_per_comm {max_elements_per_comm}', + ranks=[0]) + return max_elements_per_comm + @staticmethod def get_data_parallel_sub_partitions(tensor, max_elements_per_comm, @@ -424,9 +460,10 @@ def get_flat_sub_partitions(comm_tensor_list, comm_param_offsets, sub_partition_size, dtype, + default_device, num_comm_intervals=None, - default_device=None, return_partition_params=False): + partition_params = [] final_param_offsets = [] flat_sub_partitions = [] @@ -436,9 +473,6 @@ def get_flat_sub_partitions(comm_tensor_list, my_offsets = [] my_params = [] - if dtype is None: - dtype = 
tensor_list[0].dtype - for i, tensor in enumerate(tensor_list): if tensor.grad is None: tensor.grad = torch.zeros(tensor.size(), @@ -543,84 +577,52 @@ def reduce_scatter_gradients(self, local_rank = dist.get_rank(group=self.dp_process_group) for i, group in enumerate(self.fp16_groups): - partition_param_map = {} - param_partition_map = {} - my_params = set() - - # [rank] -> [comm] -> partition num_comm_intervals = self.num_comm_intervals_per_group[i] all_sub_partitions = [] for rank in range(world_size): # gsp is list of partitions indexed by comm_idx - #FIXME: currently hardcoding fp16, should infer dtype - grad_sub_partitions, partition_params, param_offsets = self.get_flat_sub_partitions( + grad_sub_partitions = self.get_flat_sub_partitions( comm_tensor_list=self.params_in_rank_sub_partitions[i][rank], - comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i][rank], + comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i] + [rank], + dtype=torch.half, + default_device=self.default_device, sub_partition_size=self.sub_partition_sizes[i], - dtype=torch.half, #self.params_in_rank_sub_partitions[i][rank][0][0].dtype, - num_comm_intervals=self.num_comm_intervals_per_group[i], - default_device='cuda', #self.params_in_rank_sub_partitions[i][rank][0][0].device, - return_partition_params=True) + num_comm_intervals=self.num_comm_intervals_per_group[i]) all_sub_partitions.append(grad_sub_partitions) - # create map from partition -> params in that partition - for comm_idx, part in enumerate(grad_sub_partitions): - partition_param_map[part] = (partition_params[comm_idx], - param_offsets[comm_idx]) - - for comm_idx, params in enumerate(partition_params): - for pidx, p in enumerate(params): - # store the parameters we care about locally - if rank == local_rank: - my_params.add(p) - # map from param -> partitions - if p in param_partition_map: - param_partition_map[p].append(grad_sub_partitions[comm_idx]) - else: - param_partition_map[p] = [grad_sub_partitions[comm_idx]] - assert len(grad_sub_partitions) == num_comm_intervals - if not postscale_gradients: - raise NotImplementedError("pre-scale_gradients is not implemented") - - all_comm_partitions = [] + local_comm_partitions = [] for comm_idx in range(num_comm_intervals): single_comm_all_partitions = [] for rank in range(world_size): single_comm_all_partitions.append(all_sub_partitions[rank][comm_idx]) - dist.reduce_scatter(output=single_comm_all_partitions[local_rank], - input_list=single_comm_all_partitions, - group=self.dp_process_group) - if gradient_average: + if postscale_gradients: + if gradient_predivide_factor != 1.0: + for partition in single_comm_all_partitions: + partition.mul_(1. 
/ gradient_predivide_factor) + + dist.reduce_scatter(output=single_comm_all_partitions[local_rank], + input_list=single_comm_all_partitions, + group=self.dp_process_group) + + if gradient_average: + # Only need to average our local grads in post scaling + if gradient_predivide_factor != world_size: + single_comm_all_partitions[local_rank].mul_( + gradient_predivide_factor / world_size) + else: for partition in single_comm_all_partitions: - partition.mul_(gradient_predivide_factor / world_size) - - all_comm_partitions.append(single_comm_all_partitions) - - for p in my_params: - partitions = param_partition_map[p] - parts = [] - for part in partitions: - params, offsets = partition_param_map[part] - found = False - for p_idx, _p in enumerate(params): - if p.__hash__() == _p.__hash__(): - found = True - if offsets[p_idx][0] is not None: - my_part = part.narrow(0, - offsets[p_idx][0], - offsets[p_idx][1]) - parts.append(my_part) - assert found - if p is not None: - updated_grad = _unflatten_dense_tensors(torch.cat(parts), [p]) - p.grad.copy_(updated_grad[0]) + partition.div_(world_size) + + dist.reduce_scatter(output=single_comm_all_partitions[local_rank], + input_list=single_comm_all_partitions, + group=self.dp_process_group) def step(self, closure=None): # First compute norm for all group so we know if there is overflow - self.overflow = self.overflow_checker.check() prev_scale = self.loss_scale @@ -639,7 +641,6 @@ def step(self, closure=None): partition_id = dist.get_rank(group=self.dp_process_group) for i, group in enumerate(self.fp16_groups): - #TODO RS: update get grad norm to support sub partitions norm_groups.append(get_grad_norm(group, mpu=self.mpu)) @@ -647,16 +648,7 @@ def step(self, closure=None): #free gradients for all the parameters that are not updated by this process self.free_grad_in_param_list(self.params_not_local[i]) - #create flat gradients for parameters updated by this process - #tensor_list, first_offset, partition_size, dtype - #single_grad_partition = self.get_flat_partition( - # tensor_list=self.params_in_partition[i], - # first_offset=self.first_offset[i], - # partition_size=self.partition_size[i], - # dtype=self.single_partition_of_fp32_groups[i].dtype - #) - - #TODO RS: can we safely use dtype of the first sub-partition? i think so + # create flat gradient partitions for parameters updated by this process local_grad_sub_partitions = self.get_flat_sub_partitions( comm_tensor_list=self.params_in_rank_sub_partitions[i][partition_id], comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i] @@ -664,13 +656,11 @@ def step(self, closure=None): sub_partition_size=self.sub_partition_sizes[i], dtype=self.local_sub_partitions_of_fp32_groups[i][0].dtype, num_comm_intervals=self.num_comm_intervals_per_group[i], - default_device=self.local_sub_partitions_of_fp32_groups[i][0].device) + default_device=self.default_device) #RS: update all our local params with sub-partition grads - #logger. 
info("self.local_sub_partitions_of_fp32_groups[i]={}, local_grad_sub_partitions={}".format(len(self.local_sub_partitions_of_fp32_groups[i]), len(local_grad_sub_partitions))) for idx, sub_partition_param in enumerate(self.local_sub_partitions_of_fp32_groups[i]): sub_partition_param.grad = local_grad_sub_partitions[idx] - #self.single_partition_of_fp32_groups[i].grad = single_grad_partition #RS: update free grads for sub-partitions #release all the gradient since we have already created a necessary copy in dp_grad_partition @@ -773,18 +763,30 @@ def _set_loss_scale(self, value): loss_scale = property(_get_loss_scale, _set_loss_scale) cur_scale = property(_get_loss_scale, _set_loss_scale) + # Return communication interval paddings for local rank and group + def _get_local_group_paddings(self, group_index): + local_rank = dist.get_rank(group=self.dp_process_group) + sub_partition_indices = [ + local_rank + (comm_idx * self.partition_count) + for comm_idx in range(self.num_comm_intervals_per_group[group_index]) + ] + group_paddings = [ + self.group_paddings[group_index][sub_idx] + for sub_idx in sub_partition_indices + ] + return group_paddings + # Return group tensor after removing paddings that are added for alignment to DP world size. # This method works on the assumption that each group contains sub partitions. def _get_groups_without_padding(self, groups_with_padding): groups_without_padding = [] - local_rank = dist.get_rank(group=self.dp_process_group) - for i, group in enumerate(groups_with_padding): - low_index = local_rank * len(group) - high_index = (local_rank + 1) * len(group) - group_paddings = self.group_paddings[i][low_index:high_index] + + for group_index, group in enumerate(groups_with_padding): + group_paddings = self._get_local_group_paddings(group_index) + lean_sub_partitions = [] - for j, sub_partition in enumerate(group): - lean_length = sub_partition.numel() - group_paddings[j] + for sub_partition, padding in zip(group, group_paddings): + lean_length = sub_partition.numel() - padding lean_sub_partitions.append(sub_partition[:lean_length]) groups_without_padding.append(lean_sub_partitions) @@ -794,8 +796,11 @@ def _get_groups_without_padding(self, groups_with_padding): def _get_state_without_padding(self, state_with_padding, padding): lean_state = {} for key, value in state_with_padding.items(): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] + if torch.is_tensor(value): + lean_length = value.numel() - padding + lean_state[key] = value[:lean_length] + else: + lean_state[key] = value return lean_state @@ -803,12 +808,11 @@ def _get_state_without_padding(self, state_with_padding, padding): # This method assumes that each param group contains a single flattened tensor. 
def _get_base_optimizer_state(self): optimizer_groups_state = [] - local_rank = dist.get_rank(group=self.dp_process_group) - for group_idx, group in enumerate(self.optimizer.param_groups): + + for group_index, group in enumerate(self.optimizer.param_groups): + param_paddings = self._get_local_group_paddings(group_index) + group_lean_state = [] - low_index = local_rank * self.num_comm_intervals_per_group[group_idx] - high_index = (local_rank + 1) * self.num_comm_intervals_per_group[group_idx] - param_paddings = self.group_paddings[group_idx][low_index:high_index] for param_idx, param in enumerate(group['params']): lean_state = self._get_state_without_padding(self.optimizer.state[param], param_paddings[param_idx]) @@ -818,7 +822,10 @@ def _get_base_optimizer_state(self): return optimizer_groups_state - def state_dict(self): + def _rigid_state_dict(self): + """ + Returns a dict that can be loaded for continued training with same DP degree + """ """ Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict @@ -833,6 +840,19 @@ def state_dict(self): state_dict['loss_scaler'] = self.loss_scaler state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale state_dict['overflow'] = self.overflow + state_dict['base_optimizer_state'] = self.optimizer.state_dict() + state_dict[ + 'local_sub_partitions_of_fp32_groups'] = self.local_sub_partitions_of_fp32_groups + return state_dict + + def _elastic_state_dict(self): + """ + Returns a dict that can be loaded for elastic training with different DP degree + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow state_dict['base_optimizer_state'] = self._get_base_optimizer_state() state_dict['zero_stage'] = ZERO_OPTIMIZATION_OPTIMIZER_STATES @@ -846,28 +866,58 @@ def state_dict(self): return state_dict - def _retrieve_group_sub_partition_weights(self, all_partition_fp32_weights): - partition_id = dist.get_rank(group=self.dp_process_group) - - all_sub_partition_weights = [] - for partition_weights in all_partition_fp32_weights: - for sub_partition_weights in partition_weights: - all_sub_partition_weights.append(sub_partition_weights) + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. + Example:: + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + if self.elastic_checkpoint: + return self._elastic_state_dict() + + return self._rigid_state_dict() + + # Extract the fp32 weights of the current rank from checkpoint by merging the + # sub partitions of communication intervals across ranks. + # Let sub_i_j = sub partition of rank i and comm interval j + # For 2 ranks and 2 comm intervals, checkpoints (minus padding) are as follows: + # rank 0 = [sub_0_0, sub_0_1] + # rank 1 = [sub_1_0, sub_1_1] + # Merge to get [sub_0_0, sub_1_0, sub_0_1, sub_1_1] => original un-padded flattened tensor. 
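+    # (Illustration, not part of the original comment: the interleaving below is
+    # sub_partition_idx = comm_idx * num_partitions + rank, so in the 2-rank,
+    # 2-interval case above rank 0 fills slots 0 and 2 and rank 1 fills slots 1 and 3.)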
+ def _retrieve_group_sub_partition_weights(self, + all_partition_fp32_weights, + max_elems_per_comm): + num_partitions = len(all_partition_fp32_weights) + num_comm_intervals = len(all_partition_fp32_weights[0]) + num_sub_partitions = num_partitions * num_comm_intervals + all_sub_partition_weights = [None] * num_sub_partitions + + for rank, partition_weights in enumerate(all_partition_fp32_weights): + for comm_idx, sub_partition_weights in enumerate(partition_weights): + #all_sub_partition_weights.append(sub_partition_weights) + sub_partition_idx = (comm_idx * num_partitions) + rank + all_sub_partition_weights[sub_partition_idx] = sub_partition_weights flat_merged_weights = flatten_dense_tensors_sub_partition_aligned( tensor_list=all_sub_partition_weights, dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=self.max_elements_per_comm, + max_elements_per_comm=max_elems_per_comm, pg=self.dp_process_group) comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ self.get_data_parallel_sub_partitions( tensor=flat_merged_weights, - max_elements_per_comm=self.max_elements_per_comm, + max_elements_per_comm=max_elems_per_comm, world_size=dist.get_world_size(group=self.dp_process_group), dp_process_group=self.dp_process_group ) + partition_id = dist.get_rank(group=self.dp_process_group) return [sub_partition for sub_partition in dp_sub_partitions[partition_id]] # Restore base optimizer fp32 weights from checkpoint by: @@ -881,8 +931,11 @@ def _restore_from_fp32_weights(self, all_state_dict): sd['local_sub_partitions_of_fp32_groups'][group_idx] for sd in all_state_dict ] + max_elems_per_comm = self.max_elems_per_comm[group_idx] + sub_partition_weights = self._retrieve_group_sub_partition_weights( - all_partition_fp32_weights) + all_partition_fp32_weights, + max_elems_per_comm) sub_partition_of_fp32_groups.append(sub_partition_weights) for current_group, saved_group in zip(self.local_sub_partitions_of_fp32_groups, sub_partition_of_fp32_groups): @@ -890,45 +943,56 @@ def _restore_from_fp32_weights(self, all_state_dict): current_sub_part.data.copy_(saved_sub_part.data) # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, state_key, all_partition_states): - partition_id = dist.get_rank(group=self.dp_process_group) - alignment = dist.get_world_size(group=self.dp_process_group) + def _partition_base_optimizer_state(self, + state_key, + all_partition_states, + max_elems_per_comm): + if not torch.is_tensor(all_partition_states[0]): + return all_partition_states[0] + alignment = dist.get_world_size(group=self.dp_process_group) flat_merged_partitions = flatten_dense_tensors_sub_partition_aligned( tensor_list=all_partition_states, dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=self.max_elements_per_comm, + max_elements_per_comm=max_elems_per_comm, pg=self.dp_process_group) comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ self.get_data_parallel_sub_partitions( tensor=flat_merged_partitions, - max_elements_per_comm=self.max_elements_per_comm, + max_elements_per_comm=max_elems_per_comm, world_size=dist.get_world_size(group=self.dp_process_group), dp_process_group=self.dp_process_group ) + partition_id = dist.get_rank(group=self.dp_process_group) return [sub_partition for sub_partition in dp_sub_partitions[partition_id]] # Compute the optimizer state partitions for the group by # 1) Merging state 
values across the previous partitioning. # 2) Repartition state values for the new partitioning # 3) Return state corresponding to local partition - def _retrieve_group_optimizer_states(self, all_partition_states): + def _retrieve_group_optimizer_states(self, all_partition_states, max_elems_per_comm): merged_optimizer_states = {} - for partition_state in all_partition_states: - for sub_partition_state in partition_state: + num_partitions = len(all_partition_states) + num_comm_intervals = len(all_partition_states[0]) + num_sub_partitions = num_partitions * num_comm_intervals + + for rank, partition_state in enumerate(all_partition_states): + for comm_idx, sub_partition_state in enumerate(partition_state): for key, value in sub_partition_state.items(): if not key in merged_optimizer_states.keys(): - merged_optimizer_states[key] = [value] - else: - merged_optimizer_states[key].append(value) + merged_optimizer_states[key] = [None] * num_sub_partitions + + sub_partition_idx = (comm_idx * num_partitions) + rank + merged_optimizer_states[key][sub_partition_idx] = value group_optimizer_states = {} for key, value in merged_optimizer_states.items(): group_optimizer_states[key] = self._partition_base_optimizer_state( key, - value) + value, + max_elems_per_comm) return group_optimizer_states @@ -942,15 +1006,20 @@ def _restore_base_optimizer_state(self, state_dict_list): all_partition_group_states = [ sd['base_optimizer_state'][group_idx] for sd in state_dict_list ] + max_elems_per_comm = self.max_elems_per_comm[group_idx] group_optimizer_states = self._retrieve_group_optimizer_states( - all_partition_group_states) + all_partition_group_states, + max_elems_per_comm) base_optimizer_group_states.append(group_optimizer_states) for group_idx, group in enumerate(self.optimizer.param_groups): for param_idx, param in enumerate(group['params']): for key, saved in base_optimizer_group_states[group_idx].items(): - current = self.optimizer.state[param][key] - current.data.copy_(saved[param_idx].data) + if torch.is_tensor(self.optimizer.state[param][key]): + current = self.optimizer.state[param][key] + current.data.copy_(saved[param_idx].data) + else: + self.optimizer.state[param][key] = saved # Restore base optimizer fp32 weights from ZeRO fp16 weights def _restore_from_fp16_weights(self): @@ -963,10 +1032,23 @@ def _restore_from_fp16_weights(self): def refresh_fp32_params(self): self._restore_from_fp16_weights() - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): + def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): + + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + if load_optimizer_states: + self.optimizer.load_state_dict(state_dict['base_optimizer_state']) + + for curr_group, saved_group in zip(self.local_sub_partitions_of_fp32_groups, state_dict['local_sub_partitions_of_fp32_groups']): + for curr_param, saved_param in zip(curr_group, saved_group): + curr_param.data.copy_(saved_param.data) + + def _elastic_load_state_dict(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): """ Loads a state_dict created by an earlier call to state_dict(). 
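The merge logic above hinges on one fixed mapping: the sub-partition saved by (rank, comm_idx) under the old data-parallel layout lands at flat index (comm_idx * num_partitions) + rank, so all ranks' pieces for a given communication interval sit together before re-flattening. A minimal, self-contained sketch of just that mapping, using invented sizes (2 saved ranks, 3 communication intervals) and labels standing in for real checkpoint tensors:

# Illustrative only: reproduce the (comm_idx * num_partitions) + rank layout
# used when merging per-rank sub-partitions back into one flat list.
num_partitions = 2      # saved data-parallel world size (assumed)
num_comm_intervals = 3  # sub-partitions per rank (assumed)

# all_partition_weights[rank][comm_idx] -> a label standing in for a tensor
all_partition_weights = [[f"rank{r}-comm{c}" for c in range(num_comm_intervals)]
                         for r in range(num_partitions)]

merged = [None] * (num_partitions * num_comm_intervals)
for rank, partition in enumerate(all_partition_weights):
    for comm_idx, sub_partition in enumerate(partition):
        merged[(comm_idx * num_partitions) + rank] = sub_partition

print(merged)
# ['rank0-comm0', 'rank1-comm0', 'rank0-comm1', 'rank1-comm1', 'rank0-comm2', 'rank1-comm2']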
If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, @@ -994,3 +1076,46 @@ def load_state_dict(self, self._restore_from_fp32_weights(state_dict_list) else: self._restore_from_fp16_weights() + + def load_state_dict(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + Example:: + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + if self.elastic_checkpoint: + self._elastic_load_state_dict(state_dict_list, + load_optimizer_states, + load_from_fp32_weights) + else: + self._rigid_load_state_dict( + state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states) + + def _dump_optimizer_state(self, message): + logger.info(f'{message}') + for i, group in enumerate(self.optimizer.param_groups): + for j, param in enumerate(group['params']): + for key, value in self.optimizer.state[param].items(): + t_stats = [ + value.min(), + value.max(), + (value.max() - value.min()), + value.mean() + ] + stats = [float(t) for t in t_stats] + logger.info( + f'group/param/key/min/max/delta/mean = {i}, {j}, {key}: {stats}') diff --git a/deepspeed/pt/deepspeed_zero_optimizer.py b/deepspeed/runtime/zero/stage2.py similarity index 77% rename from deepspeed/pt/deepspeed_zero_optimizer.py rename to deepspeed/runtime/zero/stage2.py index cbfb249b501d..f6fa523fc8c0 100755 --- a/deepspeed/pt/deepspeed_zero_optimizer.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1,1552 +1,1855 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.distributed.distributed_c10d import _get_global_rank -import torch.distributed as dist -import math -from torch._six import inf -from torch.autograd import Variable - -from deepspeed.pt.loss_scaler import LossScaler, DynamicLossScaler -from deepspeed.pt.deepspeed_utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.pt.deepspeed_zero_config import ZERO_OPTIMIZATION_GRADIENTS - -#Toggle this to true to enable correctness test -#with gradient partitioning and without -pg_correctness_test = False - -from deepspeed.pt.log_utils import logger - -try: - from apex_C import flatten - from apex_C import unflatten -except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." 
- ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten - - -def input(msg): - return - - -def split_half_float_double(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor" - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - - -def isclose(a, b, rtol=1e-09, atol=0.0): - return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) - - -def lcm(x, y): - from fractions import gcd # or can import gcd from `math` in Python 3 - return x * y // gcd(x, y) - - -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - -def get_alignment_padding(tensor_list, alignment): - num_elements = sum([tensor.numel() for tensor in tensor_list]) - remainder = num_elements % alignment - return (alignment - remainder) if remainder else remainder - - -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - -def print_rank_msg(msg): - print(f"rank {dist.get_rank()} - {msg}") - - -class FP16_DeepSpeedZeroOptimizer(object): - """ - DeepSpeedZeroOptimizer designed to reduce the memory footprint - required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - For usage examples, refer to TODO: DeepSpeed Tutorial - - """ - def __init__(self, - init_optimizer, - timers, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=True, - contiguous_gradients=True, - reduce_bucket_size=500000000, - allgather_bucket_size=5000000000, - dp_process_group=None, - reduce_scatter=True, - overlap_comm=False, - mpu=None, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0): - - if dist.get_rank() == 0: - logger.info(f"Reduce bucket size {reduce_bucket_size}") - logger.info(f"Allgather bucket size {allgather_bucket_size}") - # The fused optimizer does all the work. We need this layer for two reason: - # 1. maintain same user API from apex.fp16_utils - # 2. keep common stuff here in case we need to add ne552w fused optimizer later - - # differences from apex.fp16_utils: - # - assume all model params in fp16 - # - assume all params requires grad - # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? 
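For reference, the alignment helper defined above (flatten_dense_tensors_aligned) only pads the tensor list so the flattened length divides evenly by the alignment, typically the data-parallel world size. A minimal sketch with made-up shapes:

# Minimal sketch of the alignment-padding idea behind
# flatten_dense_tensors_aligned; shapes here are invented.
import torch
from torch._utils import _flatten_dense_tensors

tensors = [torch.ones(3), torch.ones(4)]   # 7 elements total
alignment = 4                              # e.g. data-parallel world size

remaining = sum(t.numel() for t in tensors) % alignment
if remaining:
    pad = torch.zeros(alignment - remaining,
                      device=tensors[0].device,
                      dtype=tensors[0].dtype)
    tensors = tensors + [pad]

flat = _flatten_dense_tensors(tensors)
print(flat.numel())  # 8, now evenly divisible by the alignment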
- if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - - self.timers = timers - - self.reduce_scatter = reduce_scatter - - self.overlap_comm = overlap_comm - - self.dp_process_group = dp_process_group - - self.partition_count = dist.get_world_size(group=self.dp_process_group) - - if mpu is None: - self.model_parallel_group = None - self.model_parallel_rank = 0 - else: - self.model_parallel_group = mpu.get_model_parallel_group() - self.model_parallel_rank = mpu.get_model_parallel_rank() - - self.overflow = False - self.clip_grad = clip_grad - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_predivide_factor = gradient_predivide_factor - self.postscale_gradients = postscale_gradients - - if self.reduce_scatter: - assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" - - # param flattened by groups - self.fp16_groups = [] - self.fp16_groups_flat = [] - - #param partitioned by data parallel degree - #this will contain a list of equal sized tensors - #each of which will be updated by a different process - self.parallel_partitioned_fp16_groups = [] - - #a single 32-bit partition of the parallel partitioned parameters - #that this process will update - self.single_partition_of_fp32_groups = [] - - #param partition info - - #These are the parameters in each group that will not be updated by this process directly - self.params_not_in_partition = [] - - #These are the parameters that will be updated by this process directly - self.params_in_partition = [] - - #Offset from the first paramter in the the self.params_in_partition - #the parameter boundaries may not align with partition boundaries - #so we need to keep track of the offset - self.first_offset = [] - - #number of elements per partition in each group - self.partition_size = [] - - partition_id = dist.get_rank(group=self.dp_process_group) - - self.all_reduce_print = False - - # padding on each partition for alignment purposes - self.groups_padding = [] - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - # push this group to list before modify - self.fp16_groups.append(param_group['params']) - # Record padding required to align group to world size - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - padding = get_alignment_padding(self.fp16_groups[i], - self.partition_count) - else: - padding = 0 - self.groups_padding.append(padding) - - #not sure why apex was cloning the weights before flattening - #removing cloning here - - see_memory_usage(f"Before moving param group {i} to CPU") - #move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU") - - #create flat buffer in CPU and move to GPU - self.fp16_groups_flat.append( - flatten_dense_tensors_aligned( - self.fp16_groups[i], - dist.get_world_size(group=self.dp_process_group)).cuda( - torch.cuda.current_device())) - see_memory_usage(f"After flattening and moving param group {i} to GPU") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage( - f"After Flattening and after emptying param 
group {i} cache") - - # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - #divide the flat weights into near equal paritition equal to the data parallel degree - #each process will compute on a different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.fp16_groups_flat[i]) - self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) - - # a partition of the fp32 master weights that will be updated by this process - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i] - [partition_id].clone().float().detach()) - - # modify optimizer of have flat master weight - self.single_partition_of_fp32_groups[ - i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = [self.single_partition_of_fp32_groups[i]] - - partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( - group=self.dp_process_group) - params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(self.fp16_groups[i], partition_size, partition_id) - - self.partition_size.append(partition_size) - self.params_in_partition.append(params_in_partition) - self.params_not_in_partition.append(params_not_in_partition) - self.first_offset.append(first_offset) - - self.reduce_bucket_size = int(reduce_bucket_size) - self.allgather_bucket_size = int(allgather_bucket_size) - - self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) - self.reduction_stream = torch.cuda.Stream() - self.callback_queued = False - - self.param_dict = {} - - #map between param_id and bool to specify if a param is in this partition - self.is_param_in_current_partition = {} - - self.contiguous_gradients = contiguous_gradients - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - self.params_already_reduced = [] - self._release_ipg_buffers() - self.previous_reduced_grads = None - - #simplified param id - self.param_id = {} - - count = 0 - for i, params_group in enumerate(self.fp16_groups): - for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - self.params_already_reduced.append(False) - count = count + 1 - - for param_group in self.params_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = True - - for param_group in self.params_not_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = False - - #mapping from parameter to partition that it belongs to - self.param_to_partition_ids = {} - - #stores if a partition has been reduced in this step - self.is_partition_reduced = {} - - #number of grads in partition that still need to be computed - self.remaining_grads_in_partition = {} - - #total number of grads in partition - self.total_grads_in_partition = {} - - #stores if a grad in a partition has been computed or not - self.is_grad_computed = {} - - #stores the offset at which a parameter gradient needs to be inserted in a partition - self.grad_partition_insertion_offset = {} - - #the offset in the gradient at which it must be inserted at the beginning of the paritition - self.grad_start_offset = {} - - #will store the averaged gradients required by this parititon - self.averaged_gradients = {} - - # store 
index of first parameter in each partition - self.first_param_index_in_partition = {} - - #initializes all data structures for implementing gradient partitioning - self.initialize_gradient_partitioning_data_structures() - - #resets the data structure value for the next backward propagation - self.reset_partition_gradient_structures() - - #creates backward hooks for gradient partitioning - self.create_reduce_and_remove_grad_hooks() - - # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - - see_memory_usage("Before initializing optimizer states") - self.initialize_optimizer_states() - see_memory_usage("After initializing optimizer states") - - if dist.get_rank() == 0: - logger.info(f"optimizer state initialized") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage(f"After initializing ZeRO optimizer") - - def _release_ipg_buffers(self): - if self.contiguous_gradients: - self.ipg_buffer = None - self.grads_in_partition = None - self.grads_in_partition_offset = 0 - - def initialize_optimizer_states(self): - - for i, group in enumerate(self.fp16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=torch.cuda.current_device()) - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - - self.optimizer.step() - - for group in self.single_partition_of_fp32_groups: - group.grad = None - - return - - ######################################################################### - #########################ZeRO Partition Gradients######################## - ######################################################################### - - def get_first_param_index(self, group_id, param_group, partition_id): - for index, param in enumerate(param_group): - param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index - return None - - def initialize_gradient_partitioning_data_structures(self): - - total_partitions = dist.get_world_size(group=self.dp_process_group) - - for i, param_group in enumerate(self.fp16_groups): - - self.param_to_partition_ids[i] = {} - self.is_partition_reduced[i] = {} - self.total_grads_in_partition[i] = {} - self.remaining_grads_in_partition[i] = {} - self.is_grad_computed[i] = {} - self.grad_partition_insertion_offset[i] = {} - self.grad_start_offset[i] = {} - self.first_param_index_in_partition[i] = {} - - for partition_id in range(total_partitions): - self.is_grad_computed[i][partition_id] = {} - self.grad_partition_insertion_offset[i][partition_id] = {} - self.grad_start_offset[i][partition_id] = {} - self.total_grads_in_partition[i][partition_id] = 0 - self.initialize_gradient_partition(i, param_group, partition_id) - self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) - - def independent_gradient_partition_epilogue(self): - self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) - self.reduce_ipg_grads() - self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - - #if dist.get_rank() == 0: - # 
logger.info("Params already reduced %s", self.params_already_reduced) - for i in range(len(self.params_already_reduced)): - self.params_already_reduced[i] = False - - if self.overlap_comm: - torch.cuda.synchronize() - - for i, _ in enumerate(self.fp16_groups): - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - - self._release_ipg_buffers() - - see_memory_usage(f"End ipg_epilogue") - - # resets all partition to no reduced - # sets remianing grads to the total number of grads in each partition - # set is grad computed to false for all grads in partition - def reset_partition_gradient_structures(self): - total_partitions = dist.get_world_size(group=self.dp_process_group) - for i, _ in enumerate(self.fp16_groups): - for partition_id in range(total_partitions): - self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] - - for param_id in self.is_grad_computed[i][partition_id]: - self.is_grad_computed[i][partition_id][param_id] = False - - def initialize_gradient_partition(self, i, param_group, partition_id): - def set_key_value_list(dictionary, key, value): - if key in dictionary: - dictionary[key].append(value) - else: - dictionary[key] = [value] - - def increment_value(dictionary, key): - if key in dictionary: - dictionary[key] += 1 - else: - dictionary[key] = 1 - - partition_size = self.partition_size[i] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for param in param_group: - - param_size = param.numel() - param_id = self.get_param_id(param) - - if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index - self.grad_start_offset[i][partition_id][param_id] = 0 - - elif start_index > current_index and start_index < (current_index + - param_size): - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 - self.grad_start_offset[i][partition_id][param_id] = first_offset - - current_index = current_index + param_size - - def overlapping_partition_gradients_reduce_epilogue(self): - self.independent_gradient_partition_epilogue() - - def create_reduce_and_remove_grad_hooks(self): - self.grad_accs = [] - for i, param_group in enumerate(self.fp16_groups): - for param in param_group: - if param.requires_grad: - - def wrapper(param, i): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def reduce_partition_and_remove_grads(*notneeded): - self.reduce_ready_partitions_and_remove_grads(param, i) - - grad_acc.register_hook(reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) - - wrapper(param, 
i) - - def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] - - def report_ipg_memory_usage(self, tag, param_elems): - elem_count = self.elements_in_ipg_bucket + param_elems - percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size - see_memory_usage( - f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" - ) - - ###############Idependent Partition Gradient ######################## - def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) - self.reduce_ipg_grads() - if self.contiguous_gradients and self.overlap_comm: - # Swap ipg_index between 0 and 1 - self.ipg_index = 1 - self.ipg_index - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) - - param_id = self.get_param_id(param) - - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. \ - Multiple gradient reduction is currently not supported" - - #keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - if self.contiguous_gradients: - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - - self.elements_in_ipg_bucket += param.numel() - self.grads_in_ipg_bucket.append(param.grad) - self.params_in_ipg_bucket.append((i, param, param_id)) - - self.report_ipg_memory_usage("End ipg_remove_grads", 0) - - def print_rank_0(self, message): - if dist.get_rank() == 0: - logger.info(message) - - def gradient_reduction_w_predivide(self, tensor): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.gradient_predivide_factor() != dp_world_size: - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / - dp_world_size) - else: - tensor_to_allreduce.div_(dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def average_tensor(self, tensor): - if self.overlap_comm: - torch.cuda.synchronize() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - if not self.reduce_scatter: - self.gradient_reduction_w_predivide(tensor) - return - - # Accumulate destination ranks and bucket offsets for each gradient slice. - # Note: potential future optimization, record access pattern of parameters - # in backward pass and partition gradients w.r.t. access pattern so that our - # bucket is guaranteed to be contiguous w.r.t. 
ranks - rank_and_offsets = [] - curr_size = 0 - prev_id = -1 - for i, param, param_id in self.params_in_ipg_bucket: - partition_ids = self.param_to_partition_ids[i][param_id] - partition_size = self.partition_size[i] - # Get all partition ids + their offsets - partition_ids_w_offsets = [] - for partition_id in partition_ids: - offset = self.grad_start_offset[i][partition_id][param_id] - partition_ids_w_offsets.append((partition_id, offset)) - partition_ids_w_offsets.sort(key=lambda t: t[1]) - - # Calculate rank and offsets for grad slices - for idx in range(len(partition_ids_w_offsets)): - partition_id, offset = partition_ids_w_offsets[idx] - - # Calculate numel for grad slice depending on partition location - if idx == len(partition_ids_w_offsets) - 1: - # Last partition_id uses its own offset - numel = param.numel() - offset - else: - # Set numel to next partition's offset - numel = partition_ids_w_offsets[idx + 1][1] - offset - - # Merge bucket ranges if they belong to the same rank - if partition_id == prev_id: - prev_pid, prev_size, prev_numel = rank_and_offsets[-1] - rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel) - else: - rank_and_offsets.append((partition_id, curr_size, numel)) - - curr_size += numel - prev_id = partition_id - tensor.div_(dist.get_world_size(group=self.dp_process_group)) - - async_handles = [] - for dst, bucket_offset, numel in rank_and_offsets: - grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) - dst_rank = _get_global_rank(self.dp_process_group, dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=self.dp_process_group, - async_op=True) - async_handles.append(async_handle) - - for handle in async_handles: - handle.wait() - - def copy_grads_in_partition(self, param): - if self.grads_in_partition is None: - self.grads_in_partition_offset = 0 - total_size = 0 - for group in self.params_in_partition: - for param_in_partition in group: - total_size += param_in_partition.numel() - - see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, - device=torch.cuda.current_device()) - see_memory_usage(f"after copying {total_size} gradients into partition") - - #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.narrow(0, - self.grads_in_partition_offset, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - self.grads_in_partition_offset += param.numel() - - def reduce_ipg_grads(self): - if self.overlap_comm: - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - if self.contiguous_gradients: - self.average_tensor(self.ipg_buffer[self.ipg_index]) - else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) - - with torch.cuda.stream(stream): - for _, param, param_id in self.params_in_ipg_bucket: - self.params_already_reduced[param_id] = True - - if not self.is_param_in_current_partition[param_id]: - if self.overlap_comm and self.contiguous_gradients is False: - # Clear the previous grads during the next reduction - # to avoid clearing them before the reduction is complete. 
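The reduce-scatter emulation above boils down to two steps: build (rank, bucket offset, numel) ranges over the flat gradient bucket, collapsing neighbouring ranges that target the same rank, then issue one async dist.reduce per range. A standalone sketch of just the range-merging step, using invented (destination rank, numel) pairs:

# Illustrative only: merge consecutive gradient slices that target the same
# rank, mirroring the rank_and_offsets bookkeeping in average_tensor above.
slices = [(0, 100), (0, 50), (1, 30), (1, 70), (0, 20)]  # (dst rank, numel), made up

rank_and_offsets = []
curr_size, prev_id = 0, -1
for dst, numel in slices:
    if dst == prev_id:
        pid, off, n = rank_and_offsets[-1]
        rank_and_offsets[-1] = (pid, off, n + numel)   # grow the previous range
    else:
        rank_and_offsets.append((dst, curr_size, numel))
    curr_size += numel
    prev_id = dst

print(rank_and_offsets)  # [(0, 0, 150), (1, 150, 100), (0, 250, 20)]
# each tuple then becomes one dist.reduce of tensor.narrow(0, offset, numel)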
- if self.previous_reduced_grads is None: - self.previous_reduced_grads = [] - self.previous_reduced_grads.append(param) - else: - param.grad = None - elif self.contiguous_gradients: - self.copy_grads_in_partition(param) - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - ##################################################################### - - def reduce_ready_partitions_and_remove_grads(self, param, i): - self.reduce_independent_p_g_buckets_and_remove_grads(param, i) - - def zero_reduced_gradients(self, partition_id, i): - def are_all_related_partitions_reduced(params_id): - for partition_id in self.param_to_partition_ids[i][params_id]: - if not self.is_partition_reduced[i][partition_id]: - return False - return True - - for params_id in self.is_grad_computed[i][partition_id]: - if are_all_related_partitions_reduced(params_id): - self.param_dict[params_id].grad = None - - def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) - - def print_func(): - logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) - - self.sequential_execution(print_func, message) - - def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): - grad = self.param_dict[key].grad - total_elements = grad.numel() - start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) - if not pg_correctness_test: - if num_elements == total_elements: - return grad - else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) - else: - if num_elements == total_elements: - return grad.clone() - else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) - - grads_to_reduce = [] - for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) - grads_to_reduce.append(grad) - return grads_to_reduce - - def sequential_execution(self, function, message, group=None): - if group is None: - group = self.dp_process_group - if dist.get_rank(group=group) == 0: - logger.info(message) - for id in range(dist.get_world_size(group=group)): - if id == dist.get_rank(group=group): - function() - dist.barrier(group=group) - - def set_none_gradients_to_zero(self, i, partition_id): - for param_id in self.is_grad_computed[i][partition_id]: - param = self.param_dict[param_id] - if param.grad is None: - param.grad = torch.zero_like(param) - - ######################Reduction Related Methods############################## - - def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): - rank = None - tensor = flatten(bucket) - - tensor_to_allreduce = tensor - - if pg_correctness_test: - allreduce_always_fp32 = True - - if allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - tensor_to_allreduce.div_(dist.get_world_size(group=self.dp_process_group)) - - if rank is None: - # "All Reducing" - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - else: - global_rank = _get_global_rank(self.dp_process_group, rank) - dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) - - if allreduce_always_fp32 and tensor is not tensor_to_allreduce: - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - tensor.copy_(tensor_to_allreduce) - - return tensor - - #if rank is specified do a reduction instead of an allreduce - def 
allreduce_and_copy(self, small_bucket, rank=None, log=None): - if self.overlap_comm: - torch.cuda.synchronize() - if self.previous_reduced_grads is not None: - # previous_reduced_grads has the previous reduced grads, - # now it is safe to clear. - for param in self.previous_reduced_grads: - param.grad = None - self.previous_reduced_grads = None - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, rank=rank, log=None) - small_bucket = [] - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, rank=rank, log=log) - - #allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): - split_buckets = split_half_float_double(grads) - - for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) - - ############################################################################# - ############################################################################# - ############################################################################# - - #views the tensor as multiple partitions and returns - #those partitions - def get_data_parallel_partitions(self, tensor): - partitions = [] - - dp = dist.get_world_size(group=self.dp_process_group) - dp_id = dist.get_rank(group=self.dp_process_group) - - total_num_elements = tensor.numel() - - base_size = total_num_elements // dp - remaining = total_num_elements % dp - - start = 0 - for id in range(dp): - partition_size = base_size - if id < remaining: - partition_size = partition_size + 1 - partitions.append(tensor.narrow(0, start, partition_size)) - start = start + partition_size - return partitions - - def get_partition_info(self, tensor_list, partition_size, partition_id): - params_in_partition = [] - params_not_in_partition = [] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for tensor in tensor_list: - - tensor_size = tensor.numel() - - if (current_index >= start_index and current_index < end_index): - params_in_partition.append(tensor) - - elif start_index > current_index and start_index < (current_index + - tensor_size): - params_in_partition.append(tensor) - - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - else: - params_not_in_partition.append(tensor) - - current_index = current_index + tensor_size - - return params_in_partition, params_not_in_partition, first_offset - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. 
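get_data_parallel_partitions above slices the flat buffer into near-equal views, handing the remainder elements to the lowest ranks so that every element lands in exactly one partition. A toy version with a made-up buffer and world size:

# Minimal sketch of the near-equal split performed by get_data_parallel_partitions.
import torch

flat = torch.arange(10.0)   # made-up flat buffer
dp = 4                      # assumed data-parallel world size

base, remaining = flat.numel() // dp, flat.numel() % dp
partitions, start = [], 0
for rank in range(dp):
    size = base + 1 if rank < remaining else base
    partitions.append(flat.narrow(0, start, size))  # views into flat, not copies
    start += size

print([p.numel() for p in partitions])  # [3, 3, 2, 2]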
- # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def _model_parallel_all_reduce(self, tensor, op): - """ Perform all reduce within model parallel group, if any. - """ - if self.model_parallel_group is None: - torch.distributed.all_reduce(tensor=tensor, op=op) - else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) - - def get_grad_norm_direct(self, gradients, params, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0.0 - #if dist.get_rank() == 0: - # logger.info(f"Total Norm begining {total_norm}") - for g, p in zip(gradients, params): - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item()**2 - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - #creates a flat fused tensor from the tensor list starting at the first_offset - #in the first tensor of the list. 
If there are not enough elements in the tensor - #list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): - flat_tensor_list = [] - current_size = 0 - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - continue - - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - - #we dont need all elements of the tensor - if num_elements > (partition_size - current_size): - num_elements = partition_size - current_size - - #we need a narrow view of the tensor based on the tensor offset and number of elements that - #we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) - else: - flat_tensor_list.append(tensor) - - current_size = current_size + num_elements - - #this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) - - if return_tensor_list: - return flat_tensor_list - - return _flatten_dense_tensors(flat_tensor_list) - - def free_grad_in_param_list(self, param_list): - for p in param_list: - p.grad = None - - def step(self, closure=None): - """ - Not supporting closure. - """ - see_memory_usage(f"In step before checking overflow") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - timers = self.timers - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients') - self.zero_grad() - see_memory_usage('After overflow after clearing gradients') - - logger.info( - "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - timers('optimizer_step').start() - timers('optimizer_step').stop() - timers('optimizer_allgather').start() - timers('optimizer_allgather').stop() - return - - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - for i, group in enumerate(self.fp16_groups): - - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) - - #free gradients for all the prameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_in_partition[i]) - - #create a flat gradients for parameters updated by this process - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( - self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) - else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) - assert single_grad_partition.numel() == self.partition_size[i], \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - #release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list(self.params_in_partition[i]) - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - - self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - - timers('optimizer_step').start() - self.optimizer.step() - #get rid of the fp32 gradients. Not needed anymore - for group in self.single_partition_of_fp32_groups: - group.grad = None - - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) - timers('optimizer_step').stop() - - timers('optimizer_allgather').start() - #gather the updated weights from everyone - for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): - - #Sequential AllGather Best of both worlds - dp_world_size = dist.get_world_size(group=self.dp_process_group) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - self.allgather_bucket_size) - - shard_size = partitioned_params[partition_id].numel() // num_shards - num_elements = shard_size - - assert shard_size * num_shards <= partitioned_params[partition_id].numel() - - for shard_id in range(num_shards): - - if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size - - shard_list = [] - for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow( - 0, - shard_id * shard_size, - num_elements).detach() - shard_list.append(curr_shard) - - dist.all_gather(shard_list, - shard_list[partition_id], - group=self.dp_process_group) - timers('optimizer_allgather').stop() - - # TODO: we probably don't need this? 
just to be safe - for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - see_memory_usage('After zero_optimizer step') - return - - def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm**2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. / combined_scale) - - def _check_overflow(self, partition_gradients=True): - self.overflow = self.has_overflow(partition_gradients) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow_partitioned_grads_serial(self): - for i in range(len(self.fp16_groups)): - for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False - - def has_overflow(self, partition_gradients=True): - if partition_gradients: - overflow = self.has_overflow_partitioned_grads_serial() - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - else: - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - - overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) - - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - def backward(self, loss, retain_graph=False): - """ - :attr:`backward` performs the following steps: - - 1. fp32_loss = loss.float() - 2. scaled_loss = fp32_loss*loss_scale - 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves - """ - if self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - - # Use double buffers to avoid data access conflict when overlap_comm is enabled. - if self.overlap_comm: - buf_1 = torch.empty(self.reduce_bucket_size, - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_1) - self.ipg_index = 0 - - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - - def check_overflow(self, partition_gradients=True): - self._check_overflow(partition_gradients) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - # Return group tensor after removing paddings that are added for alignment to DP world size. - # This method works on the assumption that each group contains a single flattened tensor. - def _get_groups_without_padding(self, groups_with_padding): - groups_without_padding = [] - for i, group in enumerate(groups_with_padding): - lean_length = group.numel() - self.groups_padding[i] - groups_without_padding.append(group[:lean_length]) - - return groups_without_padding - - # Return optimizer state after removing paddings that are added for alignment. - def _get_state_without_padding(self, state_with_padding, padding): - lean_state = {} - for key, value in state_with_padding.items(): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - - return lean_state - - # Return base optimizer states. - # This method assumes that each param group contains a single flattened tensor. - def _get_base_optimizer_state(self): - optimizer_groups_state = [] - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) - optimizer_groups_state.append(lean_optimizer_state) - - return optimizer_groups_state - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self._get_base_optimizer_state() - - state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS - state_dict['partition_count'] = self.partition_count - - # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) - state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding - - return state_dict - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. - def _restore_from_fp32_weights(self, all_state_dict): - partition_id = dist.get_rank(group=self.dp_process_group) - merged_single_partition_of_fp32_groups = [] - for i in range(len(self.single_partition_of_fp32_groups)): - merged_partitions = [ - sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict - ] - flat_merged_partitions = flatten_dense_tensors_aligned( - merged_partitions, - dist.get_world_size(group=self.dp_process_group)) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) - - for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): - current.data.copy_(saved.data) - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - partition_id = dist.get_rank(group=self.dp_process_group) - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp32_partition.data.copy_(fp16_partitions[partition_id].data) - - # Refresh the fp32 master params from the fp16 copies. - def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, state_key, all_partition_states): - partition_id = dist.get_rank(group=self.dp_process_group) - alignment = dist.get_world_size(group=self.dp_process_group) - flat_merged_partitions = flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - return dp_partitions[partition_id] - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. 
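The three restore steps listed above (merge the per-rank saved partitions, re-partition for the current data-parallel world size, keep only the local slice) can be seen in miniature below. The sizes and ranks are invented, and the real code additionally handles alignment padding and optimizer state tensors:

# Illustrative sketch of the merge-then-repartition idea behind
# _restore_from_fp32_weights: partitions saved with one DP world size are
# concatenated and re-sliced for the current one.
import torch
from torch._utils import _flatten_dense_tensors

saved_partitions = [torch.full((6,), r, dtype=torch.float) for r in range(2)]  # saved with dp=2
new_dp = 3                                                                     # loading with dp=3

flat = _flatten_dense_tensors(saved_partitions)   # merged fp32 weights, 12 elements
assert flat.numel() % new_dp == 0                 # padding is handled elsewhere in the real code
part_size = flat.numel() // new_dp
local_rank = 1                                    # hypothetical current rank
local_partition = flat.narrow(0, local_rank * part_size, part_size)
print(local_partition)                            # tensor([0., 0., 1., 1.])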
- def _restore_base_optimizer_state(self, all_state_dict): - base_optimizer_group_states = [] - for i in range(len(self.optimizer.param_groups)): - partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] - for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states) - base_optimizer_group_states.append(partition_states) - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - current = self.optimizer.state[p][key] - current.data.copy_(saved.data) - - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - r"""Loading ZeRO checkpoint - - Arguments: - state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. - Note that the number of saved partitions may differ from number of loading partitions to support - changing GPU count, specifically DP world size, between saving and loading checkpoints. - load_optimizer_states: Boolean indicating whether or not to load base optimizer states - load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 - copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). - """ - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict_list[0]['loss_scaler'] - self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] - self.overflow = state_dict_list[0]['overflow'] - - if load_optimizer_states: - self._restore_base_optimizer_state(state_dict_list) - - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 1 if changing DP degree and option 2 otherwise. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. 
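As the comment block above notes, option 1 (refreshing the fp32 master copies from the model's fp16 params) is cheaper but lossy, because the values have already been rounded to half precision. A two-line illustration of that round-trip error; the specific value is arbitrary:

# Why refreshing masters from fp16 loses precision: a float -> half -> float
# round trip drops the low-order bits that the saved fp32 copies would keep.
import torch

master = torch.tensor([0.1000003], dtype=torch.float32)
refreshed = master.half().float()
print(master.item(), refreshed.item())  # the two values differ in the low-order bits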
- - if load_from_fp32_weights: - self._restore_from_fp32_weights(state_dict_list) - else: - self._restore_from_fp16_weights() - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) +''' +Copyright 2019 The Microsoft DeepSpeed Team +''' + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.distributed.distributed_c10d import _get_global_rank +import torch.distributed as dist +import math +from torch._six import inf +from torch.autograd import Variable + +import collections + +from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler +from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter +from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS +from deepspeed.ops.adam import DeepSpeedCPUAdam + +from deepspeed.utils import logger +from ...ops.op_builder import UtilsBuilder + +#Toggle this to true to enable correctness test +#with gradient partitioning and without +pg_correctness_test = False + + +def input(msg): + return + + +def split_half_float_double(tensors): + dtypes = [ + "torch.cuda.HalfTensor", + "torch.cuda.FloatTensor", + "torch.cuda.DoubleTensor" + ] + buckets = [] + for i, dtype in enumerate(dtypes): + bucket = [t for t in tensors if t.type() == dtype] + if bucket: + buckets.append(bucket) + return buckets + + +def isclose(a, b, rtol=1e-09, atol=0.0): + return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) + + +def lcm(x, y): + from fractions import gcd # or can import gcd from `math` in Python 3 + return x * y // gcd(x, y) + + +# create a flat tensor aligned at the alignment boundary +def flatten_dense_tensors_aligned(tensor_list, alignment): + num_elements = 0 + for tensor in tensor_list: + num_elements = num_elements + tensor.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return _flatten_dense_tensors(padded_tensor_list) + + +def get_alignment_padding(tensor_list, alignment): + num_elements = sum([tensor.numel() for tensor in tensor_list]) + remainder = num_elements % alignment + return (alignment - remainder) if remainder else remainder + + +def move_to_cpu(tensor_list): + for tensor in tensor_list: + tensor.data = tensor.data.cpu() + + +def print_rank_msg(msg): + print(f"rank {dist.get_rank()} - {msg}") + + +class FP16_DeepSpeedZeroOptimizer(object): + """ + DeepSpeedZeroOptimizer designed to reduce the memory footprint + required for training large deep learning models. 
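The alignment helpers above (get_alignment_padding and flatten_dense_tensors_aligned) pad a parameter group's combined length to a multiple of the data-parallel world size so the flat buffer can later be split into equal partitions. A minimal sketch of that idea, assuming a hypothetical pad_and_flatten helper and small CPU tensors rather than the patch's real fp16 groups:

import torch
from torch._utils import _flatten_dense_tensors

def pad_and_flatten(tensor_list, alignment):
    # Pad the combined length to a multiple of `alignment` so the flat
    # buffer splits evenly across data-parallel ranks.
    num_elements = sum(t.numel() for t in tensor_list)
    remainder = num_elements % alignment
    if remainder:
        pad = torch.zeros(alignment - remainder, dtype=tensor_list[0].dtype)
        tensor_list = tensor_list + [pad]
    return _flatten_dense_tensors(tensor_list)

params = [torch.randn(3), torch.randn(4)]      # 7 elements in total
flat = pad_and_flatten(params, alignment=4)    # padded up to 8 elements
assert flat.numel() % 4 == 0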
+ + For more details please see ZeRO: Memory Optimizations Toward Training Trillion Parameter Models + https://arxiv.org/abs/1910.02054 + + For usage examples, refer to TODO: DeepSpeed Tutorial + + """ + def __init__(self, + init_optimizer, + timers, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=True, + contiguous_gradients=True, + reduce_bucket_size=500000000, + allgather_bucket_size=5000000000, + dp_process_group=None, + reduce_scatter=True, + overlap_comm=False, + cpu_offload=False, + mpu=None, + clip_grad=0.0, + allreduce_always_fp32=False, + postscale_gradients=True, + gradient_predivide_factor=1.0, + gradient_accumulation_steps=1): + + # Load pre-installed or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + + if dist.get_rank() == 0: + logger.info(f"Reduce bucket size {reduce_bucket_size}") + logger.info(f"Allgather bucket size {allgather_bucket_size}") + logger.info(f"CPU Offload: {cpu_offload}") + # The fused optimizer does all the work. We need this layer for two reasons: + # 1. maintain same user API from apex.fp16_utils + # 2. keep common stuff here in case we need to add new fused optimizer later + + # differences from apex.fp16_utils: + # - assume all model params in fp16 + # - assume all params require grad + # - flat by groups, not keeping state. TODO: remove state explicitly? + # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? + if not torch.cuda.is_available(): + raise SystemError("Cannot use fp16 without CUDA.") + self.optimizer = init_optimizer + + self.timers = timers + + self.reduce_scatter = reduce_scatter + + self.overlap_comm = overlap_comm + + self.cpu_offload = cpu_offload + + self.deepspeed_adam_offload = cpu_offload + + self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + + self.dp_process_group = dp_process_group + + self.partition_count = dist.get_world_size(group=self.dp_process_group) + + if mpu is None: + self.model_parallel_group = None + self.model_parallel_rank = 0 + else: + self.model_parallel_group = mpu.get_model_parallel_group() + self.model_parallel_rank = mpu.get_model_parallel_rank() + + self.overflow = False + self.clip_grad = clip_grad + self.allreduce_always_fp32 = allreduce_always_fp32 + self.gradient_predivide_factor = gradient_predivide_factor + self.postscale_gradients = postscale_gradients + self.gradient_accumulation_steps = gradient_accumulation_steps + self.micro_step_id = 0 + + if self.reduce_scatter: + assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" + assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" + assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" + + # param flattened by groups + self.fp16_groups = [] + self.fp16_groups_flat = [] + + #param partitioned by data parallel degree + #this will contain a list of equal sized tensors + #each of which will be updated by a different process + self.parallel_partitioned_fp16_groups = [] + + #a single 32-bit partition of the parallel partitioned parameters + #that this process will update + self.single_partition_of_fp32_groups = [] + + #param partition info + + #These are the parameters in each group that will not be updated by this process directly +
self.params_not_in_partition = [] + + #These are the parameters that will be updated by this process directly + self.params_in_partition = [] + + #Offset from the first parameter in the self.params_in_partition + #the parameter boundaries may not align with partition boundaries + #so we need to keep track of the offset + self.first_offset = [] + + #number of elements per partition in each group + self.partition_size = [] + + partition_id = dist.get_rank(group=self.dp_process_group) + + self.all_reduce_print = False + + # padding on each partition for alignment purposes + self.groups_padding = [] + # loop to deal with groups + for i, param_group in enumerate(self.optimizer.param_groups): + # push this group to list before modifying it + self.fp16_groups.append(param_group['params']) + # Record padding required to align group to world size + if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: + padding = get_alignment_padding(self.fp16_groups[i], + self.partition_count) + else: + padding = 0 + self.groups_padding.append(padding) + + #not sure why apex was cloning the weights before flattening + #removing cloning here + + see_memory_usage(f"Before moving param group {i} to CPU") + #move all the parameters to cpu to free up GPU space for creating flat buffer + move_to_cpu(self.fp16_groups[i]) + see_memory_usage(f"After moving param group {i} to CPU") + + #create flat buffer in CPU and move to GPU + self.fp16_groups_flat.append( + flatten_dense_tensors_aligned( + self.fp16_groups[i], + dist.get_world_size(group=self.dp_process_group)).cuda( + torch.cuda.current_device())) + see_memory_usage(f"After flattening and moving param group {i} to GPU") + + if dist.get_rank(group=self.dp_process_group) == 0: + see_memory_usage( + f"After Flattening and after emptying param group {i} cache") + + # set model fp16 weight to slices of flattened buffer + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], + self.fp16_groups[i]) + for p, q in zip(self.fp16_groups[i], updated_params): + p.data = q.data + + #divide the flat weights into near-equal partitions, one per data parallel rank + #each process will compute on a different part of the partition + data_parallel_partitions = self.get_data_parallel_partitions( + self.fp16_groups_flat[i]) + self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) + + # a partition of the fp32 master weights that will be updated by this process + self.single_partition_of_fp32_groups.append( + self.parallel_partitioned_fp16_groups[i][partition_id].to( + self.device).clone().float().detach()) + + # modify optimizer to have flat master weight + self.single_partition_of_fp32_groups[ + i].requires_grad = True # keep this in case internal optimizer uses it + param_group['params'] = [self.single_partition_of_fp32_groups[i]] + + partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( + group=self.dp_process_group) + params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(self.fp16_groups[i], partition_size, partition_id) + + self.partition_size.append(partition_size) + self.params_in_partition.append(params_in_partition) + self.params_not_in_partition.append(params_not_in_partition) + self.first_offset.append(first_offset) + + self.reduce_bucket_size = int(reduce_bucket_size) + self.allgather_bucket_size = int(allgather_bucket_size) + + self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) + self.reduction_stream = torch.cuda.Stream() + self.cpu_computation_stream =
torch.cuda.Stream() + self.migration_stream = torch.cuda.Stream() + self.callback_queued = False + + self.param_dict = {} + + #map between param_id and bool to specify if a param is in this partition + self.is_param_in_current_partition = {} + + # CPU-Offload requires contiguous gradients + self.contiguous_gradients = contiguous_gradients or cpu_offload + self.grads_in_ipg_bucket = [] + self.params_in_ipg_bucket = [] + self.elements_in_ipg_bucket = 0 + self.params_already_reduced = [] + self._release_ipg_buffers() + self.previous_reduced_grads = None + + #simplified param id + self.param_id = {} + + largest_param_numel = 0 + count = 0 + for i, params_group in enumerate(self.fp16_groups): + for param in params_group: + unique_id = id(param) + self.param_id[unique_id] = count + self.param_dict[count] = param + self.params_already_reduced.append(False) + if param.numel() > largest_param_numel: + largest_param_numel = param.numel() + count = count + 1 + + for param_group in self.params_in_partition: + for param in param_group: + self.is_param_in_current_partition[self.get_param_id(param)] = True + + for param_group in self.params_not_in_partition: + for param in param_group: + self.is_param_in_current_partition[self.get_param_id(param)] = False + + if self.cpu_offload: + self.accumulated_grads_in_cpu = {} + self.norm_for_param_grads = {} + self.local_overflow = False + self.grad_position = {} + self.temp_grad_buffer_for_cpu_offload = torch.zeros( + largest_param_numel, + device=self.device).half().pin_memory() + self.temp_grad_buffer_for_gpu_offload = torch.zeros( + largest_param_numel, + device=torch.cuda.current_device()).half() + + for i, params_group in enumerate(self.fp16_groups): + self.get_grad_position(i, + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i]) + + #mapping from parameter to partition that it belongs to + self.param_to_partition_ids = {} + + #stores if a partition has been reduced in this step + self.is_partition_reduced = {} + + #number of grads in partition that still need to be computed + self.remaining_grads_in_partition = {} + + #total number of grads in partition + self.total_grads_in_partition = {} + + #stores if a grad in a partition has been computed or not + self.is_grad_computed = {} + + #stores the offset at which a parameter gradient needs to be inserted in a partition + self.grad_partition_insertion_offset = {} + + #the offset in the gradient at which it must be inserted at the beginning of the partition + self.grad_start_offset = {} + + #will store the averaged gradients required by this partition + self.averaged_gradients = {} + + # store index of first parameter in each partition + self.first_param_index_in_partition = {} + + #initializes all data structures for implementing gradient partitioning + self.initialize_gradient_partitioning_data_structures() + + #resets the data structure value for the next backward propagation + self.reset_partition_gradient_structures() + + #creates backward hooks for gradient partitioning + self.create_reduce_and_remove_grad_hooks() + + # we may have a way of fusing dynamic scale. 
Do not support for now + if dynamic_loss_scale: + if dynamic_loss_args is None: + self.loss_scaler = DynamicLossScaler() + else: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + + self.dynamic_loss_scale = True + + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(scale=static_loss_scale) + self.cur_iter = 0 + + see_memory_usage("Before initializing optimizer states") + self.initialize_optimizer_states() + see_memory_usage("After initializing optimizer states") + + if dist.get_rank() == 0: + logger.info(f"optimizer state initialized") + + if dist.get_rank(group=self.dp_process_group) == 0: + see_memory_usage(f"After initializing ZeRO optimizer") + + def _release_ipg_buffers(self): + if self.contiguous_gradients: + self.ipg_buffer = None + self.grads_in_partition = None + self.grads_in_partition_offset = 0 + + def initialize_optimizer_states(self): + + for i, group in enumerate(self.fp16_groups): + single_grad_partition = torch.zeros( + int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=self.device) + self.single_partition_of_fp32_groups[ + i].grad = single_grad_partition.pin_memory( + ) if self.cpu_offload else single_grad_partition + + self.optimizer.step() + + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None + + return + + ######################################################################### + #########################ZeRO Partition Gradients######################## + ######################################################################### + + def get_first_param_index(self, group_id, param_group, partition_id): + for index, param in enumerate(param_group): + param_id = self.get_param_id(param) + if partition_id in self.param_to_partition_ids[group_id][param_id]: + return index + return None + + def initialize_gradient_partitioning_data_structures(self): + + total_partitions = dist.get_world_size(group=self.dp_process_group) + + for i, param_group in enumerate(self.fp16_groups): + + self.param_to_partition_ids[i] = {} + self.is_partition_reduced[i] = {} + self.total_grads_in_partition[i] = {} + self.remaining_grads_in_partition[i] = {} + self.is_grad_computed[i] = {} + self.grad_partition_insertion_offset[i] = {} + self.grad_start_offset[i] = {} + self.first_param_index_in_partition[i] = {} + + for partition_id in range(total_partitions): + self.is_grad_computed[i][partition_id] = {} + self.grad_partition_insertion_offset[i][partition_id] = {} + self.grad_start_offset[i][partition_id] = {} + self.total_grads_in_partition[i][partition_id] = 0 + self.initialize_gradient_partition(i, param_group, partition_id) + self.is_partition_reduced[i][partition_id] = False + self.first_param_index_in_partition[i][ + partition_id] = self.get_first_param_index( + i, + param_group, + partition_id) + + def independent_gradient_partition_epilogue(self): + self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) + self.reduce_ipg_grads() + self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) + + #if dist.get_rank() == 0: + # logger.info("Params already reduced %s", self.params_already_reduced) + for i in range(len(self.params_already_reduced)): + self.params_already_reduced[i] = False + + if self.overlap_comm: + torch.cuda.synchronize() + + if self.cpu_offload is False: + for i, _ in enumerate(self.fp16_groups): + + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: + self.averaged_gradients[i] = 
self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + else: + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + + for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): + accumulated_grad.add_(new_avg_grad) + + self._release_ipg_buffers() + + # No need to keep the gradients anymore. + # All gradients required by the step + # are in self.averaged_gradients + self.zero_grad() + see_memory_usage(f"End ipg_epilogue") + + # resets all partition to no reduced + # sets remaining grads to the total number of grads in each partition + # set is grad computed to false for all grads in partition + def reset_partition_gradient_structures(self): + total_partitions = dist.get_world_size(group=self.dp_process_group) + for i, _ in enumerate(self.fp16_groups): + for partition_id in range(total_partitions): + self.is_partition_reduced[i][partition_id] = False + self.remaining_grads_in_partition[i][ + partition_id] = self.total_grads_in_partition[i][partition_id] + + for param_id in self.is_grad_computed[i][partition_id]: + self.is_grad_computed[i][partition_id][param_id] = False + + def initialize_gradient_partition(self, i, param_group, partition_id): + def set_key_value_list(dictionary, key, value): + if key in dictionary: + dictionary[key].append(value) + else: + dictionary[key] = [value] + + def increment_value(dictionary, key): + if key in dictionary: + dictionary[key] += 1 + else: + dictionary[key] = 1 + + partition_size = self.partition_size[i] + + start_index = partition_size * partition_id + end_index = partition_size * (partition_id + 1) + + current_index = 0 + first_offset = 0 + + for param in param_group: + + param_size = param.numel() + param_id = self.get_param_id(param) + + if (current_index >= start_index and current_index < end_index): + set_key_value_list(self.param_to_partition_ids[i], + param_id, + partition_id) + increment_value(self.total_grads_in_partition[i], partition_id) + + self.is_grad_computed[i][partition_id][param_id] = False + + self.grad_partition_insertion_offset[i][partition_id][ + param_id] = current_index - start_index + self.grad_start_offset[i][partition_id][param_id] = 0 + + elif start_index > current_index and start_index < (current_index + + param_size): + assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset = start_index - current_index + + set_key_value_list(self.param_to_partition_ids[i], + param_id, + partition_id) + increment_value(self.total_grads_in_partition[i], partition_id) + + self.is_grad_computed[i][partition_id][param_id] = False + + self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 + self.grad_start_offset[i][partition_id][param_id] = first_offset + + current_index = current_index + param_size + + def overlapping_partition_gradients_reduce_epilogue(self): + self.independent_gradient_partition_epilogue() + + def create_reduce_and_remove_grad_hooks(self): + self.grad_accs = [] + for i, param_group in enumerate(self.fp16_groups): + for param in param_group: + if param.requires_grad: + + def wrapper(param, i): + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + + def reduce_partition_and_remove_grads(*notneeded): 
+ self.reduce_ready_partitions_and_remove_grads(param, i) + + grad_acc.register_hook(reduce_partition_and_remove_grads) + self.grad_accs.append(grad_acc) + + wrapper(param, i) + + def get_param_id(self, param): + unique_id = id(param) + return self.param_id[unique_id] + + def report_ipg_memory_usage(self, tag, param_elems): + elem_count = self.elements_in_ipg_bucket + param_elems + percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size + see_memory_usage( + f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" + ) + + ############### Independent Partition Gradient ######################## + def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): + if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", + param.numel()) + self.reduce_ipg_grads() + if self.contiguous_gradients and self.overlap_comm: + # Swap ipg_index between 0 and 1 + self.ipg_index = 1 - self.ipg_index + self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", + param.numel()) + + param_id = self.get_param_id(param) + + assert self.params_already_reduced[param_id] == False, \ + f"The parameter {param_id} has already been reduced. \ + Gradient computed twice for this partition. \ + Multiple gradient reduction is currently not supported" + + #keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening + if self.contiguous_gradients: + new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( + 0, + self.elements_in_ipg_bucket, + param.numel()) + new_grad_tensor.copy_(param.grad.view(-1)) + param.grad.data = new_grad_tensor.data.view_as(param.grad) + + self.elements_in_ipg_bucket += param.numel() + self.grads_in_ipg_bucket.append(param.grad) + self.params_in_ipg_bucket.append((i, param, param_id)) + + self.report_ipg_memory_usage("End ipg_remove_grads", 0) + + def print_rank_0(self, message): + if dist.get_rank() == 0: + logger.info(message) + + def gradient_reduction_w_predivide(self, tensor): + dp_world_size = dist.get_world_size(group=self.dp_process_group) + + tensor_to_allreduce = tensor + + if self.allreduce_always_fp32: + tensor_to_allreduce = tensor.float() + + if self.postscale_gradients: + if self.gradient_predivide_factor != 1.0: + tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) + + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + + if self.gradient_predivide_factor != dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor / dp_world_size) + else: + tensor_to_allreduce.div_(dp_world_size) + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + + if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: + tensor.copy_(tensor_to_allreduce) + + return tensor + + def average_tensor(self, tensor): + if self.overlap_comm: + torch.cuda.synchronize() + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + with torch.cuda.stream(stream): + if not self.reduce_scatter: + self.gradient_reduction_w_predivide(tensor) + return + + # Accumulate destination ranks and bucket offsets for each gradient slice. + # Note: potential future optimization, record access pattern of parameters + # in backward pass and partition gradients w.r.t. access pattern so that our + # bucket is guaranteed to be contiguous w.r.t. 
ranks + rank_and_offsets = [] + curr_size = 0 + prev_id = -1 + for i, param, param_id in self.params_in_ipg_bucket: + partition_ids = self.param_to_partition_ids[i][param_id] + partition_size = self.partition_size[i] + # Get all partition ids + their offsets + partition_ids_w_offsets = [] + for partition_id in partition_ids: + offset = self.grad_start_offset[i][partition_id][param_id] + partition_ids_w_offsets.append((partition_id, offset)) + partition_ids_w_offsets.sort(key=lambda t: t[1]) + + # Calculate rank and offsets for grad slices + for idx in range(len(partition_ids_w_offsets)): + partition_id, offset = partition_ids_w_offsets[idx] + + # Calculate numel for grad slice depending on partition location + if idx == len(partition_ids_w_offsets) - 1: + # Last partition_id uses its own offset + numel = param.numel() - offset + else: + # Set numel to next partition's offset + numel = partition_ids_w_offsets[idx + 1][1] - offset + + # Merge bucket ranges if they belong to the same rank + if partition_id == prev_id: + prev_pid, prev_size, prev_numel = rank_and_offsets[-1] + rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel) + else: + rank_and_offsets.append((partition_id, curr_size, numel)) + + curr_size += numel + prev_id = partition_id + tensor.div_(dist.get_world_size(group=self.dp_process_group)) + + async_handles = [] + for dst, bucket_offset, numel in rank_and_offsets: + grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) + dst_rank = _get_global_rank(self.dp_process_group, dst) + async_handle = dist.reduce(grad_slice, + dst=dst_rank, + group=self.dp_process_group, + async_op=True) + async_handles.append(async_handle) + + for handle in async_handles: + handle.wait() + + ############################################################################## + ############################# CPU Offload Methods############################# + ############################################################################## + def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): + current_offset = 0 + + for i, tensor in enumerate(tensor_list): + param_id = self.get_param_id(tensor) + param_start_offset = 0 + + num_elements = tensor.numel() + tensor_offset = 0 + + #we need to offset to get to the right element + if i == 0 and first_offset > 0: + tensor_offset = first_offset + num_elements = num_elements - tensor_offset + param_start_offset = first_offset + + #we dont need all elements of the tensor + if num_elements > (partition_size - current_offset): + num_elements = partition_size - current_offset + + self.grad_position[param_id] = [ + int(group_id), + int(param_start_offset), + int(current_offset), + int(num_elements) + ] + current_offset += num_elements + + def update_overflow_tracker_for_param_grad(self, param): + if param.grad is not None and self._has_inf_or_nan(param.grad.data): + self.local_overflow = True + + def async_accumulate_grad_in_cpu(self, param): + param_id = self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_cpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + dest_buffer.copy_(param.grad.view(-1), non_blocking=True) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype, + device=self.device).pin_memory() + + self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) + + def async_accumulate_grad_in_cpu_via_gpu(self, param): + param_id = 
self.get_param_id(param) + + #copy to a preexisiting buffer to avoid memory allocation penalty + dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( + 0, + 0, + param.numel()) + + if param_id not in self.accumulated_grads_in_cpu: + self.accumulated_grads_in_cpu[param_id] = torch.zeros( + param.numel(), + dtype=param.dtype, + device=self.device).pin_memory() + + if self.micro_step_id > 0: + dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), + non_blocking=True) + param.grad.data.view(-1).add_(dest_buffer) + + #at the boundary we will send 32bit directly + if not self.is_gradient_accumulation_boundary: + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), + non_blocking=True) + + def set_norm_for_param_grad(self, param): + param_id = self.get_param_id(param) + accumulated_grad = self.accumulated_grads_in_cpu[ + param_id] if self.gradient_accumulation_steps > 1 else param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def set_norm_for_param_grad_in_gpu(self, param): + param_id = self.get_param_id(param) + accumulated_grad = param.grad + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + start = source_offset + accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) + + self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) + + def async_inplace_copy_grad_to_fp32_buffer(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + if self.gradient_accumulation_steps > 1: + src_tensor = self.accumulated_grads_in_cpu[param_id].view(-1).narrow( + 0, + source_offset, + num_elements) + else: + src_tensor = param.grad.view(-1).narrow(0, + source_offset, + num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + + def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): + param_id = self.get_param_id(param) + + [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] + + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( + 0, + dest_offset, + num_elements) + + src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() + dest_tensor.copy_(src_tensor, non_blocking=True) + param.grad = None + + def complete_grad_norm_calculation_for_cpu_offload(self, params): + total_norm = 0.0 + norm_type = 2.0 + for p in params: + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_id = self.get_param_id(p) + param_norm = self.norm_for_param_grads[param_id] + total_norm += param_norm.item()**2 + + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + ############################################################################################ + + def copy_grads_in_partition(self, param): + if self.cpu_offload: + + if self.gradient_accumulation_steps > 1: + self.async_accumulate_grad_in_cpu_via_gpu(param) + + if self.is_gradient_accumulation_boundary: + self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) + + return + #print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") + if self.grads_in_partition is None: + self.grads_in_partition_offset = 0 + total_size = 0 + for group in self.params_in_partition: + for param_in_partition in group: + total_size += param_in_partition.numel() + + see_memory_usage(f"before copying {total_size} gradients into partition") + self.grads_in_partition = torch.empty(int(total_size), + dtype=torch.half, + device=torch.cuda.current_device()) + see_memory_usage(f"after copying {total_size} gradients into partition") + + #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + new_grad_tensor = self.grads_in_partition.view(-1).narrow( + 0, + self.grads_in_partition_offset, + param.numel()) + new_grad_tensor.copy_(param.grad.view(-1)) + param.grad.data = new_grad_tensor.data.view_as(param.grad) + #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") + self.grads_in_partition_offset += param.numel() + + def reduce_ipg_grads(self): + if self.overlap_comm: + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + if self.contiguous_gradients: + self.average_tensor(self.ipg_buffer[self.ipg_index]) + else: + self.buffered_reduce_fallback( + None, + self.grads_in_ipg_bucket, + elements_per_buffer=self.elements_in_ipg_bucket) + + with torch.cuda.stream(stream): + for _, param, param_id in self.params_in_ipg_bucket: + + assert self.params_already_reduced[param_id] == False, \ + f"The parameter {param_id} has already been reduced. \ + Gradient computed twice for this partition. \ + Multiple gradient reduction is currently not supported" + + self.params_already_reduced[param_id] = True + + if not self.is_param_in_current_partition[param_id]: + if self.overlap_comm and self.contiguous_gradients is False: + # Clear the previous grads during the next reduction + # to avoid clearing them before the reduction is complete. 
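The contiguous-gradients path above (reduce_independent_p_g_buckets_and_remove_grads and copy_grads_in_partition) copies each incoming gradient into a narrow view of one preallocated flat buffer and re-points param.grad at that view, so per-parameter gradient tensors do not fragment GPU memory. A small standalone sketch of the trick, with toy sizes and a hypothetical flat_buffer standing in for the real ipg/partition buffers:

import torch

flat_buffer = torch.zeros(16)                  # stands in for self.grads_in_partition
offset = 0
params = [torch.nn.Parameter(torch.randn(4)) for _ in range(3)]
for p in params:
    p.grad = torch.randn_like(p)               # pretend backward() produced this grad
    grad_view = flat_buffer.narrow(0, offset, p.numel())
    grad_view.copy_(p.grad.view(-1))           # copy into the preallocated buffer
    p.grad.data = grad_view.view_as(p.grad)    # grad now aliases the flat buffer
    offset += p.numel()
assert params[0].grad.data_ptr() == flat_buffer.data_ptr()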
+ if self.previous_reduced_grads is None: + self.previous_reduced_grads = [] + self.previous_reduced_grads.append(param) + else: + param.grad = None + elif self.contiguous_gradients: + self.copy_grads_in_partition(param) + + self.grads_in_ipg_bucket = [] + self.params_in_ipg_bucket = [] + self.elements_in_ipg_bucket = 0 + ##################################################################### + + def reduce_ready_partitions_and_remove_grads(self, param, i): + self.reduce_independent_p_g_buckets_and_remove_grads(param, i) + + def zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): + for partition_id in self.param_to_partition_ids[i][params_id]: + if not self.is_partition_reduced[i][partition_id]: + return False + return True + + for params_id in self.is_grad_computed[i][partition_id]: + if are_all_related_partitions_reduced(params_id): + self.param_dict[params_id].grad = None + + def flatten_and_print(self, message, tensors, start=0, n=5): + flatten_tensor = _flatten_dense_tensors(tensors) + + def print_func(): + logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) + + self.sequential_execution(print_func, message) + + def get_grads_to_reduce(self, i, partition_id): + def get_reducable_portion(key): + grad = self.param_dict[key].grad + total_elements = grad.numel() + start = self.grad_start_offset[i][partition_id][key] + num_elements = min( + total_elements - start, + self.partition_size[i] - + self.grad_partition_insertion_offset[i][partition_id][key]) + if not pg_correctness_test: + if num_elements == total_elements: + return grad + else: + return grad.contiguous().view(-1).narrow(0, + int(start), + int(num_elements)) + else: + if num_elements == total_elements: + return grad.clone() + else: + return grad.clone().contiguous().view(-1).narrow( + 0, + int(start), + int(num_elements)) + + grads_to_reduce = [] + for key in self.is_grad_computed[i][partition_id]: + grad = get_reducable_portion(key) + grads_to_reduce.append(grad) + return grads_to_reduce + + def sequential_execution(self, function, message, group=None): + if group is None: + group = self.dp_process_group + if dist.get_rank(group=group) == 0: + logger.info(message) + for id in range(dist.get_world_size(group=group)): + if id == dist.get_rank(group=group): + function() + dist.barrier(group=group) + + def set_none_gradients_to_zero(self, i, partition_id): + for param_id in self.is_grad_computed[i][partition_id]: + param = self.param_dict[param_id] + if param.grad is None: + param.grad = torch.zeros_like(param) + + ######################Reduction Related Methods############################## + + def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): + rank = None + tensor = self.flatten(bucket) + + tensor_to_allreduce = tensor + + if pg_correctness_test: + allreduce_always_fp32 = True + + if allreduce_always_fp32: + tensor_to_allreduce = tensor.float() + + tensor_to_allreduce.div_(dist.get_world_size(group=self.dp_process_group)) + + if rank is None: + # "All Reducing" + dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) + else: + global_rank = _get_global_rank(self.dp_process_group, rank) + dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) + + if allreduce_always_fp32 and tensor is not tensor_to_allreduce: + if rank is None or rank == dist.get_rank(group=self.dp_process_group): + tensor.copy_(tensor_to_allreduce) + + return tensor + + #if rank is specified do a reduction instead of an allreduce + def
allreduce_and_copy(self, small_bucket, rank=None, log=None): + if self.overlap_comm: + torch.cuda.synchronize() + if self.previous_reduced_grads is not None: + # previous_reduced_grads has the previous reduced grads, + # now it is safe to clear. + for param in self.previous_reduced_grads: + param.grad = None + self.previous_reduced_grads = None + stream = self.reduction_stream + else: + stream = torch.cuda.current_stream() + + with torch.cuda.stream(stream): + allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) + if rank is None or rank == dist.get_rank(group=self.dp_process_group): + for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): + buf.copy_(synced) + + def allreduce_no_retain(self, + bucket, + numel_per_bucket=500000000, + rank=None, + log=None): + small_bucket = [] + numel = 0 + for tensor in bucket: + small_bucket.append(tensor) + numel = numel + tensor.numel() + if numel > numel_per_bucket: + self.allreduce_and_copy(small_bucket, rank=rank, log=None) + small_bucket = [] + if len(small_bucket) > 0: + self.allreduce_and_copy(small_bucket, rank=rank, log=log) + + #allows using reduction of gradients instead of using all_reduce + def buffered_reduce_fallback(self, + rank, + grads, + elements_per_buffer=500000000, + log=None): + split_buckets = split_half_float_double(grads) + + for i, bucket in enumerate(split_buckets): + self.allreduce_no_retain(bucket, + numel_per_bucket=elements_per_buffer, + rank=rank, + log=log) + + ############################################################################# + ############################################################################# + ############################################################################# + + #views the tensor as multiple partitions and returns + #those partitions + def get_data_parallel_partitions(self, tensor): + partitions = [] + + dp = dist.get_world_size(group=self.dp_process_group) + dp_id = dist.get_rank(group=self.dp_process_group) + + total_num_elements = tensor.numel() + + base_size = total_num_elements // dp + remaining = total_num_elements % dp + + start = 0 + for id in range(dp): + partition_size = base_size + if id < remaining: + partition_size = partition_size + 1 + partitions.append(tensor.narrow(0, start, partition_size)) + start = start + partition_size + return partitions + + def get_partition_info(self, tensor_list, partition_size, partition_id): + params_in_partition = [] + params_not_in_partition = [] + + start_index = partition_size * partition_id + end_index = partition_size * (partition_id + 1) + + current_index = 0 + first_offset = 0 + + for tensor in tensor_list: + + tensor_size = tensor.numel() + + if (current_index >= start_index and current_index < end_index): + params_in_partition.append(tensor) + + elif start_index > current_index and start_index < (current_index + + tensor_size): + params_in_partition.append(tensor) + + assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset = start_index - current_index + + else: + params_not_in_partition.append(tensor) + + current_index = current_index + tensor_size + + return params_in_partition, params_not_in_partition, first_offset + + def zero_grad(self, set_grads_to_None=True): + """ + Zero FP16 parameter grads. + """ + # FP32 grad should never exist. 
+ # For speed, set model fp16 grad to None by default + for group in self.fp16_groups: + for p in group: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + def _model_parallel_all_reduce(self, tensor, op): + """ Perform all reduce within model parallel group, if any. + """ + if self.model_parallel_group is None: + pass + else: + torch.distributed.all_reduce(tensor=tensor, + op=op, + group=self.model_parallel_group) + + def get_grad_norm_direct(self, gradients, params, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(g.data.abs().max() for g in gradients) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.MAX, + group=self.dp_process_group) + + # Take max across all GPUs. + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.MAX) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0.0 + #if dist.get_rank() == 0: + # logger.info(f"Total Norm begining {total_norm}") + for g, p in zip(gradients, params): + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_norm = g.data.double().norm(2) + total_norm += param_norm.item()**2 + # Sum across all model parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + + torch.distributed.all_reduce(total_norm_cuda, + op=torch.distributed.ReduceOp.SUM, + group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, + op=torch.distributed.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + + if total_norm == float( + 'inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + #creates a flat fused tensor from the tensor list starting at the first_offset + #in the first tensor of the list. 
If there are not enough elements in the tensor + #list then the flat tensor will be padded with zeros + def get_flat_partition(self, + tensor_list, + first_offset, + partition_size, + dtype, + device, + return_tensor_list=False): + flat_tensor_list = [] + current_size = 0 + for i, tensor in enumerate(tensor_list): + if tensor.grad is None: + tensor.grad = torch.zeros_like(tensor) + + tensor = tensor.grad + num_elements = tensor.numel() + tensor_offset = 0 + + #we need to offset to get to the right element + if i == 0 and first_offset > 0: + tensor_offset = first_offset + num_elements = num_elements - tensor_offset + + #we dont need all elements of the tensor + if num_elements > (partition_size - current_size): + num_elements = partition_size - current_size + + #we need a narrow view of the tensor based on the tensor offset and number of elements that + #we need from this tensor + if tensor_offset > 0 or num_elements < tensor.numel(): + flat_tensor_list.append(tensor.contiguous().view(-1).narrow( + 0, + int(tensor_offset), + int(num_elements))) + else: + flat_tensor_list.append(tensor) + + current_size = current_size + num_elements + + #this means its the last partition and does not align with the dp boundary. We need to pad before flattening + if current_size < partition_size: + flat_tensor_list.append( + torch.zeros(int(partition_size - current_size), + dtype=dtype, + device=device)) + + if return_tensor_list: + return flat_tensor_list + + return _flatten_dense_tensors(flat_tensor_list) + + def free_grad_in_param_list(self, param_list): + for p in param_list: + p.grad = None + + def reset_cpu_buffers(self): + self.norm_for_param_grads = {} + self.local_overflow = False + + def step(self, closure=None): + """ + Not supporting closure. + """ + self.micro_step_id = -1 + + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + + see_memory_usage(f"In step before checking overflow") + + # First compute norm for all group so we know if there is overflow + self.check_overflow() + + timers = self.timers + + prev_scale = self.loss_scale + self._update_scale(self.overflow) + if self.overflow: + see_memory_usage('After overflow before clearing gradients') + self.zero_grad() + if self.cpu_offload: + self.reset_cpu_buffers() + else: + self.averaged_gradients = {} + + see_memory_usage('After overflow after clearing gradients') + + logger.info( + "[deepscale] OVERFLOW! Rank {} Skipping step. 
Attempted loss scale: {}, " + "reducing to {}".format(dist.get_rank(), + prev_scale, + self.loss_scale)) + timers('optimizer_gradients').start() + timers('optimizer_gradients').stop() + timers('optimizer_step').start() + timers('optimizer_step').stop() + timers('optimizer_allgather').start() + timers('optimizer_allgather').stop() + return + + timers('optimizer_gradients').start() + norm_groups = [] + single_partition_grad_groups = [] + skip = False + partition_id = dist.get_rank(group=self.dp_process_group) + for i, group in enumerate(self.fp16_groups): + if self.cpu_offload: + norm_groups.append( + self.complete_grad_norm_calculation_for_cpu_offload( + self.params_in_partition[i])) + single_grad_partition = self.single_partition_of_fp32_groups[i].grad + else: + norm_groups.append( + self.get_grad_norm_direct(self.averaged_gradients[i], + self.params_in_partition[i])) + + #free gradients for all the parameters that are not updated by this process + self.free_grad_in_param_list(self.params_not_in_partition[i]) + + #create flat gradients for parameters updated by this process + # If we are the last partition, ensure grads and partition have the same size; if not, pad with zero tensors + if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: + single_grad_partition = flatten_dense_tensors_aligned( + self.averaged_gradients[i], + int(self.partition_size[i])).to( + self.single_partition_of_fp32_groups[i].dtype) + else: + single_grad_partition = _flatten_dense_tensors( + self.averaged_gradients[i]).to( + self.single_partition_of_fp32_groups[i].dtype) + assert single_grad_partition.numel() == self.partition_size[i], \ + "averaged gradients have different number of elements than partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) + + self.single_partition_of_fp32_groups[i].grad = single_grad_partition + #release all the gradients since we have already created a necessary copy in dp_grad_partition + self.free_grad_in_param_list(self.params_in_partition[i]) + + self.averaged_gradients[i] = None + + single_partition_grad_groups.append(single_grad_partition) + + self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) + timers('optimizer_gradients').stop() + + #torch.set_num_threads(12) + timers('optimizer_step').start() + if self.deepspeed_adam_offload: + from deepspeed.ops.adam import DeepSpeedCPUAdam + if type(self.optimizer) == DeepSpeedCPUAdam: + fp16_param_groups = [ + fp16_partitions[partition_id] + for fp16_partitions in self.parallel_partitioned_fp16_groups + ] + self.optimizer.step(fp16_param_groups=fp16_param_groups) + else: + self.optimizer.step() + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) + else: + self.optimizer.step() + + #get rid of the fp32 gradients.
Not needed anymore + if not self.cpu_offload: + for group in self.single_partition_of_fp32_groups: + group.grad = None + + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp16_partitions[partition_id].data.copy_(fp32_partition.data) + + timers('optimizer_step').stop() + + if self.cpu_offload: + self.reset_cpu_buffers() + + timers('optimizer_allgather').start() + #gather the updated weights from everyone + for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): + + #Sequential AllGather Best of both worlds + dp_world_size = dist.get_world_size(group=self.dp_process_group) + num_shards = max( + 1, + partitioned_params[partition_id].numel() * dp_world_size // + self.allgather_bucket_size) + + shard_size = partitioned_params[partition_id].numel() // num_shards + num_elements = shard_size + + assert shard_size * num_shards <= partitioned_params[partition_id].numel() + + for shard_id in range(num_shards): + + if shard_id == (num_shards - 1): + num_elements = partitioned_params[partition_id].numel( + ) - shard_id * shard_size + + shard_list = [] + for dp_id in range(dp_world_size): + curr_shard = partitioned_params[dp_id].narrow( + 0, + shard_id * shard_size, + num_elements).detach() + shard_list.append(curr_shard) + + dist.all_gather(shard_list, + shard_list[partition_id], + group=self.dp_process_group) + timers('optimizer_allgather').stop() + + # TODO: we probably don't need this? just to be safe + for i in range(len(norm_groups)): + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], + self.fp16_groups[i]) + for p, q in zip(self.fp16_groups[i], updated_params): + p.data = q.data + + timers.log( + names=['optimizer_gradients', + 'optimizer_step', + 'optimizer_allgather']) + see_memory_usage('After zero_optimizer step') + return + + def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): + total_norm = 0.0 + for norm in norm_groups: + total_norm += norm**2.0 + total_norm = math.sqrt(total_norm) + + # compute combined scale factor for this group + combined_scale = self.loss_scale + if self.clip_grad > 0.: + # norm is in fact norm*scale + clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad + if clip > 1: + combined_scale = clip * self.loss_scale + + for grad in grad_groups_flat: + if isinstance(grad, list): + sub_partitions = grad + for g in sub_partitions: + g.data.mul_(1. / combined_scale) + else: + grad.data.mul_(1. 
/ combined_scale) + + def _check_overflow(self, partition_gradients=True): + self.overflow = self.has_overflow(partition_gradients) + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params, is_grad_list=False): + for p in params: + if p.grad is not None and self._has_inf_or_nan(p.grad.data): + return True + + return False + + def has_overflow_partitioned_grads_serial(self): + for i in range(len(self.fp16_groups)): + for j, grad in enumerate(self.averaged_gradients[i]): + if grad is not None and self._has_inf_or_nan(grad.data, j): + return True + return False + + def has_overflow(self, partition_gradients=True): + if partition_gradients: + overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + ) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.dp_process_group) + + else: + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + + overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + + # Since each model parallel GPU carries only part of the model, + # make sure overflow flag is synced across all the model parallel GPUs + self._model_parallel_all_reduce(tensor=overflow_gpu, + op=torch.distributed.ReduceOp.MAX) + + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + @staticmethod + def _has_inf_or_nan(x, j=None): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if "value cannot be converted" not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + def backward(self, loss, retain_graph=False): + """ + :attr:`backward` performs the following steps: + + 1. fp32_loss = loss.float() + 2. scaled_loss = fp32_loss*loss_scale + 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves + """ + self.micro_step_id += 1 + if self.cpu_offload: + torch.cuda.current_stream().wait_stream(self.migration_stream) + + #TODO: we need to revist this and remove the magic 4.5x multiplier here + if self.contiguous_gradients: + self.ipg_buffer = [] + buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), + dtype=torch.half, + device=torch.cuda.current_device()) + self.ipg_buffer.append(buf_0) + + # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
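The backward() docstring above spells out the loss-scaling contract: the loss is scaled before backward, and step() later checks the gradients for inf/nan and either unscales them or skips the update and shrinks the scale. A rough sketch of that loop under assumed toy components (a plain Linear model, SGD, and a hand-managed scale instead of the LossScaler/DynamicLossScaler classes used here):

import torch

scale = 2.0 ** 16
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(8, 4)).pow(2).mean()
(loss.float() * scale).backward()              # scaled_loss.backward()

overflow = any(not torch.isfinite(p.grad).all() for p in model.parameters())
if overflow:
    scale /= 2.0                               # skip the step, shrink the scale
    optimizer.zero_grad()
else:
    for p in model.parameters():
        p.grad.mul_(1.0 / scale)               # unscale before the real update
    optimizer.step()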
+ if self.overlap_comm: + buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), + dtype=torch.half, + device=torch.cuda.current_device()) + self.ipg_buffer.append(buf_1) + self.ipg_index = 0 + + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + + def check_overflow(self, partition_gradients=True): + self._check_overflow(partition_gradients) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + cur_scale = property(_get_loss_scale, _set_loss_scale) + + # Return group tensor after removing paddings that are added for alignment to DP world size. + # This method works on the assumption that each group contains a single flattened tensor. + def _get_groups_without_padding(self, groups_with_padding): + groups_without_padding = [] + for i, group in enumerate(groups_with_padding): + lean_length = group.numel() - self.groups_padding[i] + groups_without_padding.append(group[:lean_length]) + + return groups_without_padding + + # Return optimizer state after removing paddings that are added for alignment. + def _get_state_without_padding(self, state_with_padding, padding): + lean_state = {} + for key, value in state_with_padding.items(): + if torch.is_tensor(value): + lean_length = value.numel() - padding + lean_state[key] = value[:lean_length] + else: + lean_state[key] = value + + return lean_state + + # Return base optimizer states. + # This method assumes that each param group contains a single flattened tensor. + def _get_base_optimizer_state(self): + optimizer_groups_state = [] + for i, group in enumerate(self.optimizer.param_groups): + p = group['params'][0] + lean_optimizer_state = self._get_state_without_padding( + self.optimizer.state[p], + self.groups_padding[i]) + optimizer_groups_state.append(lean_optimizer_state) + + return optimizer_groups_state + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict['base_optimizer_state'] = self._get_base_optimizer_state() + + state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS + state_dict['partition_count'] = self.partition_count + + # Remove paddings for DP alignment to enable loading for other alignment values + fp32_groups_without_padding = self._get_groups_without_padding( + self.single_partition_of_fp32_groups) + state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding + + # if self.cpu_offload: + # state_dict_tmp = async_copy_to(state_dict, + # 'cpu', + # torch.cuda.current_stream()) + # state_dict = state_dict_tmp + + return state_dict + + # Restore base optimizer fp32 weights from checkpoint by: + # 1) Merging fp32 weights from checkpoints of all partitions + # 2) Extracting fp32 weights for current partition from merged weights + # 3) Using extracted weights to update base optimizer weights directly. + def _restore_from_fp32_weights(self, all_state_dict): + partition_id = dist.get_rank(group=self.dp_process_group) + merged_single_partition_of_fp32_groups = [] + for i in range(len(self.single_partition_of_fp32_groups)): + merged_partitions = [ + sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict + ] + flat_merged_partitions = flatten_dense_tensors_aligned( + merged_partitions, + dist.get_world_size(group=self.dp_process_group)) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) + merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) + + for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): + current.data.copy_(saved.data) + + # Restore base optimizer fp32 weights from ZeRO fp16 weights + def _restore_from_fp16_weights(self): + partition_id = dist.get_rank(group=self.dp_process_group) + for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): + fp32_partition.data.copy_(fp16_partitions[partition_id].data) + + # Refresh the fp32 master params from the fp16 copies. + def refresh_fp32_params(self): + self._restore_from_fp16_weights() + + # Extract optimizer state for current partition from merged states of all partitions + def _partition_base_optimizer_state(self, state_key, all_partition_states): + partition_id = dist.get_rank(group=self.dp_process_group) + alignment = dist.get_world_size(group=self.dp_process_group) + if torch.is_tensor(all_partition_states[0]): + flat_merged_partitions = flatten_dense_tensors_aligned( + all_partition_states, + alignment) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) + return dp_partitions[partition_id] + else: + # Assume non-tensor states are not partitioned and equal across ranks, so return first one + return all_partition_states[0] + + # Restore base optimizer state from checkpoint by + # 1) Merging optimizer state from checkpoints of all partitions + # 2) Extracting optimizer state for current partition from the merged state + # 3) Using the extracted value to directly update the base optimizer. 
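The three-step restore described above (merge the saved per-partition tensors, re-partition for the current data-parallel world size, copy into this rank's state) is what allows the number of saved partitions to differ from the number of loading partitions. A rough sketch under assumed inputs, using a hypothetical repartition helper and an even split rather than the optimizer's exact near-equal partitioning:

import torch

def repartition(saved_slices, new_world_size, new_rank):
    # Merge the per-rank slices from the checkpoint, pad to the new
    # alignment, and return the slice owned by `new_rank`.
    merged = torch.cat([s.view(-1) for s in saved_slices])
    remainder = merged.numel() % new_world_size
    if remainder:
        merged = torch.cat([merged, merged.new_zeros(new_world_size - remainder)])
    return merged.chunk(new_world_size)[new_rank]

# A checkpoint written by 2 ranks, re-loaded on 4 ranks:
saved = [torch.arange(0., 6.), torch.arange(6., 12.)]
print(repartition(saved, new_world_size=4, new_rank=1))   # tensor([3., 4., 5.])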
+ def _restore_base_optimizer_state(self, all_state_dict): + base_optimizer_group_states = [] + for i in range(len(self.optimizer.param_groups)): + partition_states = {} + all_partition_group_states = [ + sd['base_optimizer_state'][i] for sd in all_state_dict + ] + for key in all_partition_group_states[0].keys(): + all_partition_states = [ + all_states[key] for all_states in all_partition_group_states + ] + partition_states[key] = self._partition_base_optimizer_state( + key, + all_partition_states) + base_optimizer_group_states.append(partition_states) + + for i, group in enumerate(self.optimizer.param_groups): + p = group['params'][0] + for key, saved in base_optimizer_group_states[i].items(): + if torch.is_tensor(self.optimizer.state[p][key]): + self.optimizer.state[p][key].data.copy_(saved.data) + else: + self.optimizer.state[p][key] = saved + + def load_state_dict(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False): + r"""Loading ZeRO checkpoint + + Arguments: + state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. + Note that the number of saved partitions may differ from number of loading partitions to support + changing GPU count, specifically DP world size, between saving and loading checkpoints. + load_optimizer_states: Boolean indicating whether or not to load base optimizer states + load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 + copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). + """ + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + Example:: + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict_list[0]['loss_scaler'] + self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] + self.overflow = state_dict_list[0]['overflow'] + + if load_optimizer_states: + self._restore_base_optimizer_state(state_dict_list) + + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 1 if changing DP degree and option 2 otherwise. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. 
In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + + if load_from_fp32_weights: + self._restore_from_fp32_weights(state_dict_list) + else: + self._restore_from_fp16_weights() + + +def _handle_overflow(cpu_sum, x, i): + import math + rank = torch.distributed.get_rank() + if rank == 0: + t_i = -1 + for v_i, v in enumerate(x.data.contiguous().view(-1)): + if not math.isfinite(float(v)): + t_i = v_i + break + logger.info( + f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" + ) diff --git a/deepspeed/pt/zero_utils.py b/deepspeed/runtime/zero/utils.py old mode 100644 new mode 100755 similarity index 57% rename from deepspeed/pt/zero_utils.py rename to deepspeed/runtime/zero/utils.py index e125f82d9b39..2173670c632e --- a/deepspeed/pt/zero_utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -1,7 +1,8 @@ import torch import torch.distributed as dist - -from deepspeed.pt.log_utils import logger +from deepspeed.utils import logger +from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.adam import FusedAdam def _initialize_parameter_parallel_groups(parameter_parallel_size=None): @@ -20,3 +21,25 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None): if rank in ranks: my_group = group return my_group + + +ZERO_SUPPORTED_OPTIMIZERS = [ + torch.optim.Adam, + torch.optim.AdamW, + FusedAdam, + DeepSpeedCPUAdam +] + +# Add apex FusedAdam to supported list if apex is installed +try: + import apex + ZERO_SUPPORTED_OPTIMIZERS.append(apex.optimizers.FusedAdam) +except ImportError: + pass + + +def is_zero_supported_optimizer(optimizer): + print( + f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}' + ) + return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py new file mode 100644 index 000000000000..c231edca4919 --- /dev/null +++ b/deepspeed/utils/__init__.py @@ -0,0 +1,3 @@ +from .logging import logger, log_dist +from .distributed import init_distributed +from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py new file mode 100644 index 000000000000..dba48e6fdac4 --- /dev/null +++ b/deepspeed/utils/distributed.py @@ -0,0 +1,131 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import os +import torch + +from .logging import logger +from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT + + +def init_distributed(dist_backend="nccl", + auto_mpi_discovery=True, + distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, + verbose=True): + """Initialize torch.distributed backend, potentially performing MPI discovery if needed. + + Arguments: + dist_backend: torch distributed backend, e.g., nccl, mpi, gloo + + auto_mpi_discovery: if distributed environment variables are not set, attempt to discover them from MPI + + distributed_port: torch distributed backend port + + verbose: verbose logging + """ + required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] + if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): + if verbose: + logger.info( + "Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment..." 
+ ) + if in_aml() and not in_dlts(): + patch_aml_env_for_torch_nccl_backend(verbose=verbose) + else: + mpi_discovery(distributed_port=distributed_port, verbose=verbose) + + if not torch.distributed.is_initialized(): + if verbose: + logger.info( + "Initializing torch distributed with backend: {}".format(dist_backend)) + torch.distributed.init_process_group(backend=dist_backend) + + +def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True): + """ + Discovery MPI environment via mpi4py and map to relevant torch.distributed state + """ + from mpi4py import MPI + import subprocess + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + world_size = comm.Get_size() + + master_addr = None + if rank == 0: + hostname_cmd = ["hostname -I"] + result = subprocess.check_output(hostname_cmd, shell=True) + master_addr = result.decode('utf-8').split()[0] + master_addr = comm.bcast(master_addr, root=0) + + # Determine local rank by assuming hostnames are unique + proc_name = MPI.Get_processor_name() + all_procs = comm.allgather(proc_name) + local_rank = sum([i == proc_name for i in all_procs[:rank]]) + + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + os.environ['LOCAL_RANK'] = str(local_rank) + os.environ['MASTER_ADDR'] = master_addr + os.environ['MASTER_PORT'] = str(distributed_port) + + if verbose: + logger.info( + "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + os.environ['LOCAL_RANK'], + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) + + if torch.distributed.is_initialized(): + assert torch.distributed.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank()) + assert torch.distributed.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( + world_size, torch.distributed.get_world_size()) + + +def in_aml(): + # Are we running inside an Azure Machine Learning (AML) environment? + return 'AZUREML_EXPERIMENT_ID' in os.environ + + +def in_dlts(): + # Are we running on a DLTS cluster? + return 'DLTS_JOB_ID' in os.environ + + +def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): + """Helper routine to get and set environment variables. 
+ This is adapted from Azure ML's documentation available from: + https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi + """ + os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] + os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] + single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( + os.environ["WORLD_SIZE"]) + + if not single_node: + master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") + os.environ["MASTER_ADDR"] = master_node_params[0] + # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE + if "MASTER_PORT" not in os.environ: + os.environ["MASTER_PORT"] = str(master_port) + else: + os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + os.environ["MASTER_PORT"] = "54965" + + if verbose: + logger.info("NCCL_SOCKET_IFNAME original value = {}".format( + os.environ["NCCL_SOCKET_IFNAME"])) + + os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" + os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] + + if verbose: + logger.info( + "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" + .format(os.environ['RANK'], + os.environ['LOCAL_RANK'], + os.environ['WORLD_SIZE'], + os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) diff --git a/deepspeed/pt/log_utils.py b/deepspeed/utils/logging.py similarity index 100% rename from deepspeed/pt/log_utils.py rename to deepspeed/utils/logging.py diff --git a/deepspeed/pt/deepspeed_timer.py b/deepspeed/utils/timer.py similarity index 87% rename from deepspeed/pt/deepspeed_timer.py rename to deepspeed/utils/timer.py index f78330d60179..39f5c8be8fff 100755 --- a/deepspeed/pt/deepspeed_timer.py +++ b/deepspeed/utils/timer.py @@ -3,18 +3,24 @@ ''' import time -import psutil import torch -from deepspeed.pt.log_utils import logger +from deepspeed.utils import logger + +try: + import psutil + PSUTILS_INSTALLED = True +except ImportError: + PSUTILS_INSTALLED = False + pass def print_rank_0(message): if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: - logger.info(message) + print(message) else: - logger.info(message) + print(message) class SynchronizedWallClockTimer: @@ -85,12 +91,15 @@ def memory_usage(): def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False): """Log a group of timers.""" assert normalizer > 0.0 - string = 'time (ms)' + string = f'rank={torch.distributed.get_rank()} time (ms)' for name in names: - elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer - string += ' | {}: {:.2f}'.format(name, elapsed_time) - if memory_breakdown: - string += self.memory_usage() + if name in self.timers: + elapsed_time = self.timers[name].elapsed( + reset=reset) * 1000.0 / normalizer + string += ' | {}: {:.2f}'.format(name, elapsed_time) + + # TODO: use our logging utilitied to selectively print. Useful for model + # parallelism because rank=0 is too restrictive. 
print_rank_0(string) @@ -100,7 +109,7 @@ def __init__(self, num_workers, start_step=2, steps_per_output=50, - monitor_memory=True, + monitor_memory=False, logging_fn=None): self.start_time = 0 self.end_time = 0 @@ -121,6 +130,9 @@ def __init__(self, self.logging = logger.info self.initialized = False + if self.monitor_memory and not PSUTILS_INSTALLED: + raise ImportError("Unable to import 'psutils', please install package") + def update_epoch_count(self): self.epoch_count += 1 self.local_step_count = 0 diff --git a/Dockerfile b/docker/Dockerfile similarity index 53% rename from Dockerfile rename to docker/Dockerfile index de029f843537..62309c03ea0d 100644 --- a/Dockerfile +++ b/docker/Dockerfile @@ -1,14 +1,23 @@ FROM nvidia/cuda:10.0-devel-ubuntu18.04 +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + ############################################################################## # Installation/Basic Utilities ############################################################################## RUN apt-get update && \ apt-get install -y --no-install-recommends \ - software-properties-common \ - openssh-client openssh-server \ - pdsh curl sudo net-tools \ - vim iputils-ping wget + software-properties-common build-essential autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-9-dev ############################################################################## # Installation Latest Git @@ -18,6 +27,66 @@ RUN add-apt-repository ppa:git-core/ppa -y && \ apt-get install -y git && \ git --version +############################################################################## +# Client Liveness & Uncomment Port 22 for SSH Daemon +############################################################################## +# Keep SSH client alive from server side +RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config +RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# Mellanox OFED +############################################################################## +ENV MLNX_OFED_VERSION=4.6-1.0.1.1 +RUN apt-get install -y libnuma-dev +RUN cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* + +############################################################################## +# nv_peer_mem +############################################################################## +ENV NV_PEER_MEM_VERSION=1.1 +ENV NV_PEER_MEM_TAG=1.1-0 +RUN mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd 
${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + +############################################################################## +# OPENMPI +############################################################################## +ENV OPENMPI_BASEVERSION=4.0 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1 +RUN cd ${STAGE_DIR} && \ + wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + ############################################################################## # Python ############################################################################## @@ -32,6 +101,8 @@ RUN apt-get install -y python3 python3-dev && \ pip install --upgrade pip && \ # Print python an pip version python -V && pip -V +RUN pip install pyyaml +RUN pip install ipython ############################################################################## # TensorFlow @@ -39,6 +110,49 @@ RUN apt-get install -y python3 python3-dev && \ ENV TENSORFLOW_VERSION=1.15.2 RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} +############################################################################## +# Some Packages +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libsndfile-dev \ + libcupti-dev \ + libjpeg-dev \ + libpng-dev \ + screen +RUN pip install psutil \ + yappi \ + cffi \ + ipdb \ + pandas \ + matplotlib \ + py3nvml \ + pyarrow \ + graphviz \ + astor \ + boto3 \ + tqdm \ + sentencepiece \ + msgpack \ + requests \ + pandas \ + sphinx \ + sphinx_rtd_theme \ + scipy \ + numpy \ + sklearn \ + scikit-learn \ + nvidia-ml-py3 \ + mpi4py \ + cupy-cuda100 + +############################################################################## +## SSH daemon port inside container cannot conflict with host OS port +############################################################################### +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + ############################################################################## # PyTorch ############################################################################## @@ -50,43 +164,11 @@ RUN pip install torchvision==${TORCHVISION_VERSION} RUN pip install tensorboardX==${TENSORBOARDX_VERSION} ############################################################################## -# Temporary Installation Directory 
-############################################################################## -ENV STAGE_DIR=/tmp -RUN mkdir -p ${STAGE_DIR} - -############################################################################## -# Mellanox OFED +# PyYAML build issue +# https://stackoverflow.com/a/53926898 ############################################################################## -ENV MLNX_OFED_VERSION=4.6-1.0.1.1 -RUN apt-get install -y libnuma-dev -RUN cd ${STAGE_DIR} && \ - wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ - cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ - ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ - cd ${STAGE_DIR} && \ - rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* - -############################################################################## -# nv_peer_mem -############################################################################## -RUN mkdir -p ${STAGE_DIR} && \ - git clone https://github.com/Mellanox/nv_peer_memory.git ${STAGE_DIR}/nv_peer_memory && \ - cd ${STAGE_DIR}/nv_peer_memory && \ - ./build_module.sh && \ - cd ${STAGE_DIR} && \ - tar xzf ${STAGE_DIR}/nvidia-peer-memory_1.0.orig.tar.gz && \ - cd ${STAGE_DIR}/nvidia-peer-memory-1.0 && \ - apt-get install -y dkms && \ - dpkg-buildpackage -us -uc && \ - dpkg -i ${STAGE_DIR}/nvidia-peer-memory_1.0-9_all.deb - -############################################################################## -## Ucomment and set SSH Daemon port -############################################################################### -ENV SSH_PORT=2222 -RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ - sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ + rm -rf /usr/lib/python3/dist-packages/PyYAML-* ############################################################################## ## Add deepspeed user diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 345bb325dbb0..94dad7c80bc0 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -1,7 +1,7 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.3.1) + activesupport (6.0.3.4) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) @@ -16,38 +16,39 @@ GEM colorator (1.1.0) commonmarker (0.17.13) ruby-enum (~> 0.5) - concurrent-ruby (1.1.6) - dnsruby (1.61.3) - addressable (~> 2.5) - em-websocket (0.5.1) + concurrent-ruby (1.1.7) + dnsruby (1.61.4) + simpleidn (~> 0.1) + em-websocket (0.5.2) eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) ethon (0.12.0) ffi (>= 1.3.0) eventmachine (1.2.7) execjs (2.7.0) - faraday (1.0.1) + faraday (1.1.0) multipart-post (>= 1.2, < 3) - ffi (1.12.2) + ruby2_keywords + ffi (1.13.1) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (206) + github-pages (209) github-pages-health-check (= 1.16.1) - jekyll (= 3.8.7) + jekyll (= 3.9.0) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) jekyll-commonmark-ghpages (= 0.1.6) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.13.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.5.1) + jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.15.0) + jekyll-redirect-from (= 0.16.0) jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.1) + 
jekyll-remote-theme (= 0.4.2) jekyll-sass-converter (= 1.5.2) jekyll-seo-tag (= 2.6.1) jekyll-sitemap (= 1.4.0) @@ -55,7 +56,7 @@ GEM jekyll-theme-architect (= 0.1.1) jekyll-theme-cayman (= 0.1.1) jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) jekyll-theme-leap-day (= 0.1.1) jekyll-theme-merlot (= 0.1.1) jekyll-theme-midnight (= 0.1.1) @@ -66,13 +67,14 @@ GEM jekyll-theme-tactile (= 0.1.1) jekyll-theme-time-machine (= 0.1.1) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.11.1) - kramdown (= 1.17.0) + jemoji (= 0.12.0) + kramdown (= 2.3.0) + kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.3) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.19.0) + rouge (= 3.23.0) terminal-table (~> 1.4) github-pages-health-check (1.16.1) addressable (~> 2.3) @@ -80,20 +82,20 @@ GEM octokit (~> 4.0) public_suffix (~> 3.0) typhoeus (~> 1.3) - html-pipeline (2.12.3) + html-pipeline (2.14.0) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.6.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.7) + jekyll (3.9.0) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) @@ -113,16 +115,16 @@ GEM rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.13.0) + jekyll-feed (0.15.1) jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) jekyll-github-metadata (2.13.0) jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-include-cache (0.2.0) + jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) - jekyll-mentions (1.5.1) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) jekyll (>= 3.7, < 5.0) jekyll-optional-front-matter (0.3.2) @@ -130,14 +132,15 @@ GEM jekyll-paginate (1.1.0) jekyll-readme-index (0.3.0) jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.15.0) + jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) jekyll-relative-links (0.6.1) jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.1) + jekyll-remote-theme (0.4.2) addressable (~> 2.0) jekyll (>= 3.5, < 5.0) - rubyzip (>= 1.3.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) jekyll-seo-tag (2.6.1) @@ -154,8 +157,8 @@ GEM jekyll-theme-dinky (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) jekyll-theme-leap-day (0.1.1) jekyll (~> 3.5) @@ -189,44 +192,51 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.11.1) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (1.17.0) + kramdown (2.3.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) liquid (4.0.3) listen (3.2.1) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.4.0) + mini_portile2 (2.5.0) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minimal-mistakes-jekyll (4.19.2) + minimal-mistakes-jekyll (4.20.2) jekyll (>= 3.7, < 5.0) jekyll-feed (~> 0.1) jekyll-gist (~> 1.5) jekyll-include-cache (~> 0.1) jekyll-paginate (~> 1.1) jekyll-sitemap (~> 1.3) - minitest (5.14.1) + minitest (5.14.2) multipart-post (2.1.1) - nokogiri (1.10.9) - mini_portile2 (~> 2.4.0) + nokogiri (1.11.0) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) octokit (4.18.0) faraday (>= 0.9) sawyer 
(~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (3.1.1) + racc (1.5.2) rb-fsevent (0.10.4) rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.19.0) + rexml (3.2.4) + rouge (3.23.0) ruby-enum (0.8.0) i18n + ruby2_keywords (0.0.2) rubyzip (2.3.0) safe_yaml (1.0.5) sass (3.7.4) @@ -237,6 +247,8 @@ GEM sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) + simpleidn (0.1.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) @@ -244,11 +256,14 @@ GEM ethon (>= 0.9.0) tzinfo (1.2.7) thread_safe (~> 0.1) - tzinfo-data (1.2020.1) + tzinfo-data (1.2020.3) tzinfo (>= 1.0.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.7) unicode-display_width (1.7.0) wdm (0.1.1) - zeitwerk (2.3.0) + zeitwerk (2.4.0) PLATFORMS ruby diff --git a/docs/README.md b/docs/README.md index c0095845b625..0ac7783f3860 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,15 +1,49 @@ # DeepSpeed Documentation -This directory includes the documentation of DeepSpeed. There are three ways to read it: +This directory includes the source code for the website and documentation of DeepSpeed. The `code-docs/` directory is used to build [deepspeed.readthedocs.io](https://deepspeed.readthedocs.io/en/latest/). -## 1. Access [deepspeed.ai](https://www.deepspeed.ai/) +[deepspeed.ai](https://www.deepspeed.ai/) is the recommended way to read all DeepSpeed documentation. Directly viewing the Markdown files in this directory will not include images and other features. -This is the most recommended way to read the documentation. +## Building the documentation locally +You can serve the DeepSpeed website locally. This is especially useful for development. -## 2. Directly read files in this directory +### Prerequisites +The DeepSpeed website relies on [Jekyll](https://jekyllrb.com/). There are several [guides for installation](https://jekyllrb.com/docs/installation/). The instructions below assume you are in an Ubuntu environment and have been tested on WSL. -We do not recommend this way because this directory is organized to build the [deepspeed.ai](https://www.deepspeed.ai/) website using Jekyll. Thus some of the files actually are not DeepSpeed documentation. In addition, some of the url links in the documentation only work through the webpages generated by Jekyll. +First ensure that you have the necessary packages (e.g., `make` and `zlib`). +``` +sudo apt-get install build-essential zlib1g-dev ruby-full +``` -## 3. Build [deepspeed.ai](https://www.deepspeed.ai/) website locally using Jekyll +Add these lines to your `.bashrc` or equivalent to ensure you have permissions to install Ruby packages without `sudo`. +``` +export GEM_HOME="$HOME/gems" +export PATH="$HOME/gems/bin:$PATH" +``` +Don't forget to `source ~/.bashrc` afterwards 😊. -This is recommended for local website development or when you do not have internet access. You can follow the instruction at [here](https://help.github.com/en/github/working-with-github-pages/testing-your-github-pages-site-locally-with-jekyll) to install Ruby, Bundler, and Jekyll. Then run `bundle exec jekyll serve` at this directory so that you can view the website in your web browser at `http://localhost:4000`. + +Now we can install Jekyll and [Bundler](https://bundler.io/): +``` +gem install jekyll bundler +``` + +### Start a local webserver +We now need to install the required Ruby packages for the website. 
+ +**NOTE**: you should change to this folder (i.e., docs) before running the installation command to avoid this [error](https://stackoverflow.com/questions/10012181/bundle-install-returns-could-not-locate-gemfile/35157872): + +> Could not locate Gemfile + +**NOTE**: this step frequently hangs when connected to a VPN (including MSVPN). Simply disconnect for the package installation. + + +``` +bundle install +``` + +You can now start a local webserver via: +``` +bundle exec jekyll serve +``` +The website should now be accessible at [http://localhost:4000](http://localhost:4000) diff --git a/docs/_config.yml b/docs/_config.yml index 4ee2195905a8..4d64e8caf52f 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -30,6 +30,7 @@ collections: output: true permalink: /:collection/:path/ order: + - advanced-install.md - getting-started.md - azure.md - cifar-10.md @@ -39,6 +40,7 @@ collections: - megatron.md - 1Cycle.md - lrrt.md + - zero.md defaults: - scope: @@ -73,3 +75,5 @@ analytics: timezone: America/Los_Angeles breadcrumbs: true + +press_release_v3: https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/ diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml old mode 100644 new mode 100755 index b04769742041..5cfd3d2a5a26 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -37,12 +37,18 @@ lnav: url: /docs/config-json/#communication-options - title: "FP16" url: /docs/config-json/#fp16-training-options + - title: "AMP" + url: /docs/config-json/#automatic-mixed-precision-amp-training-options + - title: "Gradient Clipping" + url: /docs/config-json/#gradient-clipping - title: "ZeRO optimizations" url: /docs/config-json/#zero-optimizations-for-fp16-training - title: "Logging" url: /docs/config-json/#logging - title: "Activation checkpointing" url: /docs/config-json/#activation-checkpointing + - title: "Sparse Attention" + url: /docs/config-json/#sparse-attention - title: "Tutorials" url: /tutorials/ children: @@ -52,6 +58,8 @@ lnav: url: /tutorials/azure/ - title: "CIFAR-10" url: /tutorials/cifar-10/ + - title: "GAN" + url: /tutorials/gan/ - title: "BERT Pre-training" url: /tutorials/bert-pretraining/ - title: "BingBertSQuAD Fine-tuning" @@ -64,5 +72,17 @@ lnav: url: /tutorials/1Cycle/ - title: "Learning Rate Range Test" url: /tutorials/lrrt/ + - title: "DeepSpeed Sparse Attention" + url: /tutorials/sparse-attention/ + - title: "ZeRO-Offload" + url: /tutorials/zero-offload/ + - title: "ZeRO Redundancy Optimizer (ZeRO)" + url: /tutorials/zero/ + - title: "DeepSpeed with 1-bit Adam" + url: /tutorials/onebit-adam/ + - title: "Pipeline Parallelism" + url: /tutorials/pipeline/ + - title: "Progressive Layer Dropping" + url: /tutorials/progressive_layer_dropping/ - title: "Contributing" url: /contributing/ diff --git a/docs/_layouts/news-home.html b/docs/_layouts/news-home.html index 960576b27cd7..8248eed5b551 100644 --- a/docs/_layouts/news-home.html +++ b/docs/_layouts/news-home.html @@ -16,6 +16,9 @@

{{ site.data.ui-text[site.locale].recent_posts | default: "Recent Posts" }}< {% assign news = posts | where: "sneak_preview", "false" %} {% for post in news %} {% include archive-single.html %} + {% if post.image %} + + {% endif %} {% endfor %} {% include paginator.html %} diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index ea5fcb256818..f067ec941323 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -34,10 +34,10 @@ title: "DeepSpeed Configuration JSON" | Fields | Value | Example | | ------ | ------------------------------------------------------------ | ------------------------------ | -| type | The optimizer name. DeepSpeed natively supports Adam and LAMB optimizers and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | +| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | | params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | - Example of ***optimizer*** + Example of ***optimizer*** with Adam ```json "optimizer": { @@ -53,6 +53,31 @@ title: "DeepSpeed Configuration JSON" } } ``` +The Adam optimizer also supports the following two params keys/values in addition to the standard parameters from [torch.optim.Adam](https://pytorch.org/docs/stable/_modules/torch/optim/adam.html#Adam): + +| "params" key | Description | Default | +| ------------- | --------------------------------------------------------------------------- | --------| +| torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false | +| adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true | + + Another example of ***optimizer*** with 1-bit Adam specific parameters is as follows. + +```json +"optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 400, + "cuda_aware": true + } + } +``` ### Scheduler Parameters @@ -60,8 +85,8 @@ title: "DeepSpeed Configuration JSON" | Fields | Value | Example | | ------ | ------------------------------------------------------------ | ------------------------------ | -| type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/deepspeed.pt.html) for list of support schedulers. | `"1Cycle"` | -| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"lr": 0.001, "eps": 1e-8}` | +| type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/deepspeed.pt.html) for list of support schedulers. | `"WarmupLR"` | +| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. 
| `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | Example of ***scheduler*** @@ -73,7 +98,7 @@ Example of ***scheduler*** "warmup_max_lr": 0.001, "warmup_num_steps": 1000 } - } + } ``` ### Communication options @@ -204,16 +229,17 @@ Example of ***scheduler*** ### ZeRO Optimizations for FP16 Training -Enabling and configure ZeRO memory optimizations +Enabling and configuring ZeRO memory optimizations ```json "zero_optimization": { "stage": [0|1|2], "allgather_partitions": [true|false], - "allgather_bucket_size": 500000000, + "allgather_bucket_size": 5e8, "overlap_comm": false, "reduce_scatter": [true|false], - "reduce_bucket_size": 500000000, - "contiguous_gradients" : [true|false] + "reduce_bucket_size": 5e8, + "contiguous_gradients" : [true|false], + "cpu_offload": [true|false] } ``` @@ -239,7 +265,7 @@ Enabling and configure ZeRO memory optimizations | Description | Default | | ------------------------------------------------------------ | ------- | -| Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes | `500000000` | +| Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes | `5e8` | ***overlap_comm***: [boolean] @@ -257,7 +283,7 @@ Enabling and configure ZeRO memory optimizations | Description | Default | | ------------------------------------------------------------ | ------- | -| Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | `500000000` | +| Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | `5e8` | ***contiguous_gradients***: [boolean] @@ -265,6 +291,11 @@ Enabling and configure ZeRO memory optimizations | ------------------------------------------------------------ | ------- | | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. | `False` | +***cpu_offload***: [boolean] + +| Description | Default | +| ------------------------------------------------------------ | ------- | +| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. | `False` | ### Logging @@ -335,3 +366,43 @@ Enabling and configure ZeRO memory optimizations | Description | Default | | ------------------------------------------------------------ | ------- | | Logs the forward and backward time for each checkpoint function | `false` | + +### Sparse Attention + +***sparse\_attention***: [dictionary] + +| Fields | Value | Example | +| ------ | ------------------------------------------------------------ | ------------------------------ | +| mode | A string determining sparsity structure type. Deepspeed currently supports `"dense"`, `"fixed"`, `"bigbird"`, `"bslongformer"`, and `"variable"`. | `"fixed"` | +| block | An integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such blocks, `Block X Block`. | 16 | +| different\_layout\_per\_head | A boolean determining if each head should be assigned a different sparsity layout; this will be satisfied based on availability. | false | +| num\_local\_blocks | An integer determining the number of random blocks in each block row; only used in `"fixed"` mode. 
| 4 | +| num\_global\_blocks | An integer determining how many consecutive blocks in a local window is used as the representative of the window for global attention; used in `"fixed"` and `"bigbird"` modes. | 1 | +| attention | A string determining attention type. Attention can be `"unidirectional"`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty. Or it can be `"bidirectional"`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular; used in `"fixed"` and `"variable"` modes. | `"bidirectional"` | +| horizontal\_global\_attention | A boolean determining if blocks that are global representative of a local window, also attend to all other blocks. This is valid only if attention type is `"bidirectional"`. Looking at the attention matrix, that means global attention not only includes the vertical blocks, but also horizontal blocks; used in `"fixed"` and `"variable"` modes. | false | +| num\_different\_global\_patterns | An integer determining number of different global attentions layouts. While global attention can be fixed by which block/s are representative of any local window, since there are multi-heads, each head can use a different global representative; used only in `"fixed"` mode. | 4 | +| num\_random\_blocks | An integer determining the number of random blocks in each block row; used in `"variable"` and `"bigbird"` modes. | 0 | +| local\_window\_blocks | A list of integers determining the number of blocks in each local attention window. It assumes first number determines # of blocks in the first local window, second the second window, ..., and the last number determines the number of blocks in the remaining local windows; only used in `"variable"` mode. | [4] | +| global\_block\_indices | A list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Notice that if global\_block\_end\_indices parameter is set, this parameter is used as starting index of each global window; used in `"variable"` and `"bslongformer"` modes. | [0] | +| global\_block\_end\_indices | A list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size of global\_block\_indices parameter, and combining this two parameters, for each index i, blocks from global\_block\_indices[i] to global\_block\_end\_indices[i], exclusive, are considered as global attention; used in `"variable"` and `"bslongformer"` modes. | None | +| num\_sliding\_window\_blocks | An integer determining the number of blocks in sliding local attention window; used in `"bigbird"` and `"bslongformer"` modes. 
| 3 | + + Example of ***sparse\_attention*** + +```json + "sparse_attention": { + "mode": "fixed", + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "attention": "bidirectional", + "horizontal_global_attention": false, + "num_different_global_patterns": 4, + "num_random_blocks": 0, + "local_window_blocks": [4], + "global_block_indices": [0], + "global_block_end_indices": None, + "num_sliding_window_blocks": 3 + } +``` diff --git a/docs/_pages/features.md b/docs/_pages/features.md index 451e3b2af534..3ad1c8e91984 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -28,19 +28,25 @@ deepspeed --hostfile= \ \ --deepspeed --deepspeed_config ds_config.json ``` -The script `` will execute on the resources specified in ``. +The script `` will execute on the resources specified in +[``](/getting-started/#resource-configuration-multi-node). +## Pipeline Parallelism +DeepSpeed provides [pipeline parallelism](/tutorials/pipeline/) for memory- +and communication- efficient training. DeepSpeed supports a hybrid +combination of data, model, and pipeline parallelism and has scaled to over +[one trillion parameters using 3D parallelism]({{ site.press_release_v3 }}). +Pipeline parallelism can also improve communication efficiency and has +accelerated training by up to 7x on low-banwdith clusters. -## Model Parallelism +## Model Parallelism ### Support for Custom Model Parallelism -DeepSpeed supports all forms of model parallelism including tensor slicing based -approaches such as the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM), or -pipelined parallelism approaches such as -[PipeDream](https://github.com/msr-fiddle/pipedream) and -[GPipe](https://github.com/kakaobrain/torchgpipe). It does so by only requiring the model -parallelism framework to provide a *model parallelism unit* (`mpu`) that implements a few -bookkeeping functionalities: +DeepSpeed supports all forms of model parallelism including tensor slicing +based approaches such as the +[Megatron-LM](https://github.com/NVIDIA/Megatron-LM). It does so by only +requiring the model parallelism framework to provide a *model parallelism +unit* (`mpu`) that implements a few bookkeeping functionalities: ```python mpu.get_model_parallel_rank() @@ -57,6 +63,8 @@ DeepSpeed is fully compatible with [Megatron](https://github.com/NVIDIA/Megatron Please see the [Megatron-LM tutorial](/tutorials/megatron/) for details. + + ## The Zero Redundancy Optimizer The Zero Redundancy Optimizer ([ZeRO](https://arxiv.org/abs/1910.02054)) is at the heart of DeepSpeed and enables large model training at a scale that is @@ -71,7 +79,7 @@ DeepSpeed. ### Optimizer State and Gradient Partitioning Optimizer State and Gradient Partitioning in ZeRO reduces the memory consumption of the -model states (optimizer states, gradients and parmaeters) by 8x compared to standard +model states (optimizer states, gradients and parameters) by 8x compared to standard data parallelism by partitioning these states across data parallel process instead of replicating them. @@ -103,6 +111,12 @@ during the backward computation, the activation gradients are short lived while gradients are long lived. CMO transfers activation checkpoints and parameter gradients to contiguous buffers preventing memory fragmentation. 
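To make the contiguous-buffer idea concrete, the sketch below (illustrative only, not the DeepSpeed engine code; all names are hypothetical) moves long-lived parameter gradients into a single preallocated flat buffer so they no longer interleave with short-lived activation memory:

```python
import torch

def move_grads_to_contiguous_buffer(params):
    """Illustrative sketch: back every parameter's gradient by one flat buffer."""
    params = [p for p in params if p.requires_grad]
    total = sum(p.numel() for p in params)
    flat = torch.zeros(total, dtype=params[0].dtype, device=params[0].device)
    offset = 0
    for p in params:
        view = flat.narrow(0, offset, p.numel()).view_as(p)
        if p.grad is not None:
            view.copy_(p.grad.data)
        p.grad = view  # the gradient now lives inside the contiguous buffer
        offset += p.numel()
    return flat
```

This is the same effect the `contiguous_gradients` ZeRO configuration option describes: gradients are copied into a contiguous buffer as they are produced, avoiding memory fragmentation during the backward pass.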
+## ZeRO-Offload + +ZeRO-Offload pushes the boundary of the maximum model size that can be trained efficiently using minimal GPU resources, by exploiting computational and memory resources on both GPUs and their host CPUs. It allows training up to 13-billion-parameter models on a single NVIDIA V100 GPU, 10x larger than the state-of-the-art, while retaining high training throughput of over 30 teraflops per GPU. + +For more details see the [ZeRO-Offload release blog]( https://www.microsoft.com/en-us/research/?p=689370&secret=iSlooB), and [tutorial](/tutorials/zero-offload/) on integration with DeepSpeed. + ## Additional Memory and Bandwidth Optimizations ### Smart Gradient Accumulation @@ -136,8 +150,8 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail ### Activation Checkpointing API -DeepSpeed's Activation Checkpoinitng API supports activation checkpoint partitioning, -cpu checkpoiniting, and contiguous memory optimizations, while also allowing layerwise +DeepSpeed's Activation Checkpointing API supports activation checkpoint partitioning, +cpu checkpointing, and contiguous memory optimizations, while also allowing layerwise profiling. Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. @@ -158,10 +172,28 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail ## Training Optimizers +### 1-bit Adam optimizer with up to 5x less communication + +DeepSpeed has an efficient implementation of a novel algorithm called 1-bit Adam. +It offers the same convergence as Adam, incurs up to 5x less communication that enables +up to 3.5x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput +for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, +please refer to the detailed [tutorial](https://www.deepspeed.ai/tutorials/onebit-adam) and +[blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), respectively. + + ### Fused Adam optimizer and arbitrary torch.optim.Optimizer With DeepSpeed, the user can choose to use a high performance implementation of ADAM from NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class. +### CPU-Adam: High-Performance vectorized implementation of Adam +We introduce an efficient implementation of Adam optimizer on CPU that improves the parameter-update +performance by nearly an order of magnitude. We use the AVX SIMD instructions on Intel-x86 architecture +for the CPU-Adam implementation. We support both AVX-512 and AVX-2 instruction sets. DeepSpeed uses +AVX-2 by default which can be switched to AVX-512 by setting the build flag, `DS_BUILD_AVX512` to 1 when +installing DeepSpeed. Using AVX-512, we observe 5.1x to 6.5x speedups considering the model-size between +1 to 10 billion parameters with respect to torch-adam. + ### Memory bandwidth optimized FP16 Optimizer Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not only handles FP16 training but is also highly efficient. The performance of weight update @@ -221,3 +253,22 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail } } ``` +## Sparse Attention +DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse-attention/) tutorial. 
+ +```bash +--deepspeed_sparse_attention +``` + +```json +"sparse_attention": { + "mode": "fixed", + "block": 16, + "different_layout_per_head": true, + "num_local_blocks": 4, + "num_global_blocks": 1, + "attention": "bidirectional", + "horizontal_global_attention": false, + "num_different_global_patterns": 4 +} +``` diff --git a/docs/_posts/2020-05-19-press-release.md b/docs/_posts/2020-05-19-press-release.md index 3deb0686e20a..0a247e253074 100644 --- a/docs/_posts/2020-05-19-press-release.md +++ b/docs/_posts/2020-05-19-press-release.md @@ -4,6 +4,6 @@ title: "ZeRO-2 & DeepSpeed: Shattering Barriers of Deep Learning Speed & Scale" excerpt: "" link: https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/ categories: news -new_post: true +new_post: false date: 2020-05-19 02:00:00 --- diff --git a/docs/_posts/2020-05-19-zero-stage2.md b/docs/_posts/2020-05-19-zero-stage2.md index f52eddde012f..5ce3ad252223 100644 --- a/docs/_posts/2020-05-19-zero-stage2.md +++ b/docs/_posts/2020-05-19-zero-stage2.md @@ -3,7 +3,7 @@ layout: single title: "An Order-of-Magnitude Larger and Faster Training with ZeRO-2" excerpt: "" categories: news -new_post: true +new_post: false date: 2020-05-19 01:00:00 --- diff --git a/docs/_posts/2020-05-28-fastest-bert-training.md b/docs/_posts/2020-05-28-fastest-bert-training.md index 9437752403a0..45ca0618e0be 100644 --- a/docs/_posts/2020-05-28-fastest-bert-training.md +++ b/docs/_posts/2020-05-28-fastest-bert-training.md @@ -3,7 +3,7 @@ layout: single title: "Microsoft DeepSpeed achieves the fastest BERT training time" excerpt: "" categories: news -new_post: true +new_post: false date: 2020-05-28 00:00:00 --- diff --git a/docs/_posts/2020-07-24-deepspeed-webinar.md b/docs/_posts/2020-07-24-deepspeed-webinar.md new file mode 100644 index 000000000000..276b97c9522e --- /dev/null +++ b/docs/_posts/2020-07-24-deepspeed-webinar.md @@ -0,0 +1,10 @@ +--- +layout: single +title: "DeepSpeed Microsoft Research Webinar on August 6th, 2020" +excerpt: "" +categories: news +link: https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html +image: /assets/images/webinar-aug2020.png +new_post: true +date: 2020-07-24 00:00:00 +--- diff --git a/docs/_posts/2020-08-07-webinar-on-demand.md b/docs/_posts/2020-08-07-webinar-on-demand.md new file mode 100644 index 000000000000..6d255520c0df --- /dev/null +++ b/docs/_posts/2020-08-07-webinar-on-demand.md @@ -0,0 +1,9 @@ +--- +layout: single +title: "DeepSpeed Microsoft Research Webinar is now on-demand" +excerpt: "" +categories: news +link: https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html +new_post: true +date: 2020-08-07 00:00:00 +--- diff --git a/docs/_posts/2020-09-08-sparse-attention-news.md b/docs/_posts/2020-09-08-sparse-attention-news.md new file mode 100644 index 000000000000..6f235818c33f --- /dev/null +++ b/docs/_posts/2020-09-08-sparse-attention-news.md @@ -0,0 +1,15 @@ +--- +layout: single +title: "Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention" +excerpt: "" +categories: news +new_post: true +date: 2020-09-09 00:00:00 +--- + +DeepSpeed offers sparse attention kernels, an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution with comparable accuracy. 
It also outperforms state-of-the-art sparse implementations with 1.5-3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. + +* Brief overview, see our [press release]({{ site.press_release_v3 }}). +* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html). +* Tutorial on how to use sparse attention, see our [Sparse attention tutorial](https://www.deepspeed.ai/tutorials/sparse-attention/). +* The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). diff --git a/docs/_posts/2020-09-09-ZeRO-Offload.md b/docs/_posts/2020-09-09-ZeRO-Offload.md new file mode 100755 index 000000000000..9a45ba8f244e --- /dev/null +++ b/docs/_posts/2020-09-09-ZeRO-Offload.md @@ -0,0 +1,14 @@ +--- +layout: single +title: "10x bigger model training on a single GPU with ZeRO-Offload" +excerpt: "" +categories: news +new_post: true +date: 2020-09-09 00:00:00 +--- + +We introduce a new technology called ZeRO-Offload to enable **10X bigger model training on a single GPU**. ZeRO-Offload extends ZeRO-2 to leverage both CPU and GPU memory for training large models. Using a machine with **a single GPU**, our users now can run **models of up to 13 billion parameters** without running out of memory, 10x bigger than the existing approaches, while obtaining competitive throughput. This feature democratizes multi-billion-parameter model training and opens the window for many deep learning practitioners to explore bigger and better models. + +* For more information on ZeRO-Offload, see our [press release]( {{ site.press_release_v3 }} ). +* For more information on how to use ZeRO-Offload, see our [ZeRO-Offload tutorial](https://www.deepspeed.ai/tutorials/zero-offload/). +* The source code for ZeRO-Offload can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). diff --git a/docs/_posts/2020-09-09-onebit-adam-blog-post.md b/docs/_posts/2020-09-09-onebit-adam-blog-post.md new file mode 100644 index 000000000000..b16a101578f0 --- /dev/null +++ b/docs/_posts/2020-09-09-onebit-adam-blog-post.md @@ -0,0 +1,143 @@ +--- +layout: single +title: "DeepSpeed with 1-bit Adam: 5x less communication and 3.4x faster training" +excerpt: "" +categories: news +new_post: false +date: 2020-09-09 00:00:00 +--- + +## 1. Introduction + +Scalable training of large models (like BERT and GPT-3) requires careful optimization rooted in model design, architecture, and system capabilities. From a system standpoint, communication has become a major bottleneck, especially on commodity systems with standard TCP interconnects that offer limited network bandwidth. +Communication compression is an important technique to reduce training time on such systems. One of the most effective ways to compress communication is via error compensation compression, which offers robust convergence speed, even under 1-bit compression. However, state-of-the-art error compensation techniques only work with basic optimizers like Stochastic Gradient Descent (SGD) and momentum SGD, which are linearly dependent on the gradients. 
They do not work with non-linear gradient-based optimizers like Adam, which offers state-of-the-art convergence efficiency and accuracy for many tasks, including training of BERT-like models. +For a powerful optimizer like ADAM, the non-linear dependency on gradient (in the variance term) makes it challenging to develop error compensation-based compression techniques, limiting the practical value of the state-of-the-art communication compression techniques. + + +### 1.1 Background: Classic compression techniques + +One way of communication compression is 1-bit compression, which can be expressed as: + + + +With this compression, we could achieve a 32x reduction of memory size by representing each number using one bit. The problem is that using this straightforward method would significantly degrade the convergence speed, which makes this method inapplicable. To solve this problem, recent studies show that by using error compensation compression, we could expect almost the same convergence rate with communication compression. +The idea of error compensation can be summarized as: 1) doing compression, 2) memorizing the compression error, and then 3) adding the compression error back in during the next iteration. For SGD, doing error compression leads to: + + + +Where C(⋅) is the 1-bit compression operator. The good thing about doing this error compensation is that the history compression error (e_t and e_(t-1)) would be canceled by itself eventually, which can be seen by: + + + + +This strategy has been proven to work for optimization algorithms that are linearly dependent on the gradient, such as SGD and Momentum SGD. + +### 1.2 Challenges in applying error-compensation to Adam +We provide an overview of the Adam algorithm below. The update rules are as follows. + + + + + + + +As shown in the equations above, the variance term v_t is nonlinearly dependent on the gradient g_t. If we apply basic error compensation compression to Adam, we observe that Adam will not converge as shown in Figure 1. + +![Inapplicability of Error-compensation Compression for Adam due to non-linear dependence on the gradient](/assets/images/adam-convergence.png){: .align-center} + +Figure 1: Inapplicability of Error-compensation Compression for Adam due to non-linear dependence on the gradient + +## 2. Compressing communication with 1-bit Adam +To compress communication while using the Adam optimizer, we develop 1-bit Adam, which addresses the non-linearity in gradients via preconditioning. We observe that the magnitude of changes on the non-linear term, variance ( v_t), decrease significantly after a few epochs of training and setting v_t constant afterwards will not change the convergence speed. The proposed 1-bit Adam optimizer, as shown in Figure 2, consists of two parts: the warmup stage, which is essentially the vanilla Adam algorithm; and the compression stage, which keeps the variance term constant and compresses the remaining linear term, that is the momentum, into 1-bit representation. + +The compression stage of the algorithm is controlled by a threshold parameter (as shown in Figure 2). When we detect that the change in “variance” falls below a certain threshold, we switch to the compression stage. Our study shows that only 15-20% of the overall training steps are needed for the warmup stage. 
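For readers who prefer code to prose, the compression stage can be sketched per worker roughly as below. This is a schematic illustration only: the mean-magnitude scaling, the plain `all_reduce` standing in for DeepSpeed's custom compressed allreduce, and all function and variable names are assumptions, not the actual 1-bit Adam kernels.

```python
import torch
import torch.distributed as dist

def compression_stage_step(p, grad, m, err, v_frozen, lr, beta1=0.9, eps=1e-8):
    # Momentum is still updated locally; the variance stays frozen at its
    # post-warmup value, which makes the update linear in the momentum.
    m.mul_(beta1).add_(grad, alpha=1 - beta1)

    # Error-compensated 1-bit compression of the momentum.
    compensated = m + err
    scale = compensated.abs().mean()             # one shared scale per tensor
    compressed = compensated.sign().mul_(scale)  # signs + scale ~ a 1-bit payload
    err.copy_(compensated - compressed)          # carry the compression error forward

    # Exchange the compressed momentum (the real implementation sends packed
    # sign bits through a gather followed by an allgather).
    dist.all_reduce(compressed)
    compressed.div_(dist.get_world_size())

    # Adam-style update with the frozen variance term.
    p.data.addcdiv_(compressed, v_frozen.sqrt().add_(eps), value=-lr)
```

Because the frozen variance makes the update linear in the compressed momentum, the error-compensation argument from Section 1.1 applies to it again.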
+ +![Comparison of distributed training steps in classic Adam and the proposed 1-bit compressed Adam algorithm](/assets/images/onebit-adam-overview.png){: .align-center} + +Figure 2: Comparison of distributed training steps in classic Adam and the proposed 1-bit compressed Adam algorithm + +### 2.1 How 1-bit Adam works under the hood + +The weight update rule for 1-bit Adam is governed by the following equations. + +For the i-th worker, in the compression stage: + + + + + + + + + + + + + +Where x_t is the model after iteration (t-1), m_t^(i), e_t^(i) are the momentum and compression error on worker i after iteration (t-1), and v_warmup is the variance term after the warmup stage. + +### 2.2 Addressing system challenges for 1-bit Adam + +Besides the algorithmic challenge, there are two system challenges in applying 1-bit Adam in training systems. First, we need efficient kernels that convert the momentum to 1-bit representations. Second, we need efficient communication schemes to exchange this compressed momentum across different GPUs. The goal of compression is to reduce the overall training time so that commodity systems with bandwidth-limited interconnects can be used to train large models. We address these challenges in DeepSpeed and introduce a fully optimized 1-bit Adam implementation for training on communication-constrained systems. + +## 3. Benefits of 1-bit Adam on communication-constrained systems + +1-bit Adam offers the same convergence as Adam, incurs up to 5x less communication that enables up to 3.5x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput for SQuAD fine-tuning. This end-to-end throughput improvement is enabled by the 6.6x (Figure 3) and 6.2x (Figure 4) speedup observed during the compression stage. It is worth mentioning that our 1-bit Adam optimizer scales so well on a 40 Gigabit Ethernet system that its performance is comparable to Adam’s scalability on a 40 Gigabit InfiniBand QDR system. We note that the effective bandwidth on 40 Gigabit Ethernet is 4.1 Gbps based on iperf benchmarks whereas InfiniBand provides near-peak bandwidth of 32Gbps based on InfiniBand perftest microbenchmarks. + +![BERT-Large Pretraining](/assets/images/bert-scaling.png){: .align-center} + +Figure 3: Scalability of 1-bit Adam for BERT-Large Pretraining on V100 GPUs with batch size of 16/GPU. + +![SQuAD Finetuning](/assets/images/squad-scaling.png){: .align-center} + +Figure 4: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU. + +## 4. Dive deeper into 1-bit Adam evaluation results + +### Same convergence as Adam + +One major question for using 1-bit Adam is the convergence speed, and we find that 1-bit Adam can achieve the same convergence speed and comparable testing performance using the same number of training samples as shown in Figure 5. + +![1-bit Adam convergence](/assets/images/onebit-convergence.png){: .align-center} + +Figure 5: 1-bit Adam converges like Adam using the same number of training samples. + +Detailed BERT-Base and BERT-Large results are shown in Table 1. We see that the scores are on par with or better than the original model for both the uncompressed and compressed cases. 
+ +![1-bit Adam convergence table](/assets/images/convergence-table.png){: .align-center} + +Table 1: Verifying correctness of 1-bit Adam on various testing tasks + +Up to 5x less communication: 1-bit Adam provides the same convergence as Adam and reduces the communication volume by 16x during the compression stage for 16-bit (FP16) training. For BERT pretraining, this leads to an overall communication reduction of 5x as we observed the warmup stage to be just 15% of the end-to-end training time. + +The formula to calculate the communication volume ratio of the original versus 1-bit Adam is as follows: + + 1 / (warmup + (1 – warmup)/16) + +In the case of warmup equaling 15%, original Adam incurs 5x of the communication as 1-bit Adam. + +### 1-bit Adam is 3.5x faster for training BERT-Large + +We present two main results for training BERT-Large on systems with two different bandwidth-limited interconnects: 1) 40 gigabit Ethernet (Figure 5) and 2) 40 gbps InfiniBand QDR (Figure 6). During the compression phase, we observe up to 6.6x higher throughput on the system with Ethernet and up to 2x higher throughput on the system with InfiniBand, resulting in end-to-end speed up (including both warmup and compression stages) of 3.5x and 2.7x, respectively. The major benefit of 1-bit Adam comes from the communication volume reduction—enabled by our compressed momentum exchange—and from our custom allreduce operation that implements efficient 1-bit communication using non-blocking gather operations followed by an allgather operation. + +It is important to note that one can also increase total batch size to reduce communication using optimizers like LAMB instead of Adam for BERT pretraining. However, 1-bit Adam avoids the need for rigorous hyperparameter tuning, which is often more difficult for large batches from our experience. Furthermore, 1-bit Adam also works very well for workloads that have small critical batch size (cannot converge well with large batch size) like many fine-tuning tasks. + +![Performance of 1-bit Adam for BERT-Large training on 40 gbps Ethernet](/assets/images/bert-tcp.png){: .align-center} + +Figure 5: Performance of 1-bit Adam for BERT-Large training on 40 Gbps Ethernet interconnect during the compression stage. + +![Performance of 1-bit Adam for BERT-Large training on 40 gbps InfiniBand](/assets/images/bert-ib.png){: .align-center} + +Figure 6: Performance of 1-bit Adam for BERT-Large training on 40 Gbps InfiniBand interconnect during the compression stage. + +### 1-bit Adam is 2.7x faster for SQuAD fine-tuning + +1-bit Adam offers scalability not only on large-scale training tasks but also on tasks like SQuAD fine-tuning. As shown in Figures 7 and 8, 1-bit Adam scales well on both Ethernet- and InfiniBand-based systems and offers up to 6.2x higher throughput (during the compression stage) on the Ethernet-based system, resulting in 2.7x end-to-end speedup (25% warmup plus 75% compression stage). For SQuAD fine-tuning, we observed that a total batch size of 96 offers the best F1 score. Batch sizes larger than this value lower the convergence rate and require additional hyperparameter tuning. Therefore, in order to scale to 32 GPUs, we can only apply a small batch size of 3-4 per GPU. This makes fine-tuning tasks communication intensive and hard to scale. 1-bit Adam addresses the scaling challenge well, obtaining 3.4x communication reduction without enlarging batch size, and it results in a 2.7x end-to-end speedup. 
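Plugging the SQuAD setting into the communication-volume formula above is consistent with this: with a 25% warmup stage, 1 / (0.25 + 0.75/16) ≈ 3.4, which matches the 3.4x communication reduction reported here.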
+ +![1-bit Adam convergence](/assets/images/squad-tcp.png){: .align-center} + +Figure 7: Performance of 1-bit Adam for SQuAD fine-tuning on 40 gbps Ethernet during the compression stage. + +![1-bit Adam convergence](/assets/images/squad-ib.png){: .align-center} + +Figure 8: Performance of 1-bit Adam for SQuAD fine-tuning on 40 gbps InfiniBand interconnect during the compression stage. diff --git a/docs/_posts/2020-09-09-onebit-adam-news.md b/docs/_posts/2020-09-09-onebit-adam-news.md new file mode 100644 index 000000000000..5dc0f3bd2004 --- /dev/null +++ b/docs/_posts/2020-09-09-onebit-adam-news.md @@ -0,0 +1,21 @@ +--- +layout: single +title: "Up to 5x less communication and 3.4x faster training through 1-bit Adam" +excerpt: "" +categories: news +new_post: true +date: 2020-09-09 00:00:00 +--- + + +Adam is an effective and probably the most well-utilized optimizer for +training many large-scale deep learning models. However, Adam is generally +not compatible with communication-efficient optimization algorithms, and +therefore the communication cost could become a bottleneck while scaling +across distributed devices. We introduce a new algorithm - 1-bit Adam - and +its efficient implementation in DeepSpeed. 1-bit Adam offers the ***same convergence*** as Adam, incurs up to ***5x less communication*** that enables up to ***3.5x higher throughput for BERT-Large pretraining*** and up to ***2.7x higher throughput for SQuAD fine-tuning*** on bandwidth-limited clusters. + +* Brief overview, see our [press release]({{ site.press_release_v3 }}). +* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). +* Tutorial on how to reproduce our results, see our [1-bit Adam tutorial](/tutorials/onebit-adam/). +* The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). diff --git a/docs/_posts/2020-09-09-pipeline-parallelism.md b/docs/_posts/2020-09-09-pipeline-parallelism.md new file mode 100644 index 000000000000..d8aa20c1cee5 --- /dev/null +++ b/docs/_posts/2020-09-09-pipeline-parallelism.md @@ -0,0 +1,20 @@ +--- +layout: single +title: "Training a Trillion Parameters with Pipeline Parallelism" +excerpt: "" +categories: news +new_post: true +date: 2020-09-09 00:00:00 +--- + +DeepSpeed includes new support for pipeline parallelism! DeepSpeed's training +engine provides hybrid 3D parallelism for training models with over a +trillion parameters. In addition to scaling to the extreme, we have +demonstrated that hybrid parallelism accelerates training on clusters with +low-bandwidth network by up to 7x. + +* For a brief overview and results including trillion-parameter capabilities, + see our [press release]({{ site.press_release_v3 }}). +* To get started with pipeline parallel training in DeepSpeed, we recommend our [tutorial](/tutorials/pipeline/). +* See our AlexNet example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). 
+* Read our API documentation on [readthedocs](https://deepspeed.readthedocs.io/en/latest/pipeline.html). diff --git a/docs/_posts/2020-09-09-sparse-attention.md b/docs/_posts/2020-09-09-sparse-attention.md new file mode 100644 index 000000000000..79032eeaa153 --- /dev/null +++ b/docs/_posts/2020-09-09-sparse-attention.md @@ -0,0 +1,84 @@ +--- +layout: single +title: "DeepSpeed Sparse Attention" +excerpt: "" +categories: news +new_post: true +date: 2020-09-09 01:00:00 +--- + +Attention-based deep learning models such as the transformers are highly effective in capturing relationship between tokens in an input sequence, even across long distances. As a result, they are used with text, image, and sound-based inputs, where the sequence length can be in thousands of tokens. However, despite the effectiveness of attention modules to capture long term dependencies, in practice, their application to long sequence input is limited by compute and memory requirements of the attention computation that grow quadratically, `O(n^2)`, with the sequence length `n`. + +To address this limitation, DeepSpeed offers a suite of sparse attention kernels --an instrumental technology that can reduce the compute and memory requirement of attention computation by orders-of-magnitude via block-sparse computation. The suite not only alleviates the memory bottleneck of attention calculation, but also performs sparse computation efficiently. Its APIs allow convenient integration with any transformer-based models. Along with providing a wide spectrum of sparsity structures, it has the flexibility of handling any user-defined block-sparse structures. More specifically, sparse attention (SA) can be designed to compute local attention between nearby tokens, or global attention via summary tokens computed with local attention. Moreover, SA can also allow random attention, or any combination of local, global, and random attention as shown in the following figure with blue, orange, and green blocks, respectively. As a result, SA decreases the memory footprint to `O(wn)`, in which `1 < w < n` is a parameter, whose value depends on the attention structure. + +![Variable sparsity structure](/assets/images/sa_variable_sparsity_structure.png){: .align-center} + +This library is PyTorch based and develops required kernels through [Triton](https://github.com/ptillet/triton) platform; kernels are not written in CUDA, which leaves the door open for CPU/OpenCL/Vulkan support in the future. The library is an extension to DeepSpeed and can be used through DeepSpeed as well as stand alone. +Block-sparse computations handled by DeepSpeed Sparse Attention kernels are illustrated in following figures for forward and backward passes respectively. In the figures, `S` stands for a `block-sparse matrix` and `D` a `dense matrix`. + +![Sparse attention forward pass](/assets/images/sa_forward_pass.png){: .align-center} + +![Sparse attention backward pass](/assets/images/sa_backward_pass.png){: .align-center} + +To learn more about Sparsity Config, and also how to use this library, please check our [tutorial](/tutorials/sparse-attention/) that provides detailed information about it. + +## Performance Results + +* **Power over 10x longer sequences** +In a pre-training experiment, we ran BERT model under three settings: dense, dense with activation checkpoint, and sparse (SA) with activation checkpoint. SA empowers 10x and 16x longer sequences comparing with dense for BERT base and large, respectively. 
Following figure shows the longest sequence length runnable in BERT base and large model; experiment is performed with batch size 1 on a single NVIDIA V100 GPU-32GB memory. + +![Maximum sequence runnable on BERT](/assets/images/sa_maximum_sequence_runnable_on_bert.png){: .align-center} + +* **up to 6.3x faster computation** +We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. + +![Training time for BERT base with varying sequence length](/assets/images/sa_bert_base_time_result.png){: .align-center} + +![Training time for BERT large with varying sequence length](/assets/images/sa_bert_large_time_result.png){: .align-center} + +![Training time for GPT2 with varying sequence length](/assets/images/sa_gpt2_time_result.png){: .align-center} + +* **higher accuracy** +Related works along the line of sparse attention ([Sparse Transformer](https://arxiv.org/pdf/1904.10509.pdf), [Longformer](https://arxiv.org/pdf/2004.05150.pdf), [BigBird](https://arxiv.org/pdf/2007.14062.pdf)) have shown comparable or higher accuracy than full attention. Our experience is well aligned. In addition to lower memory overhead and faster computation, we also observe cases in production where SA reaches higher accuracy and faster convergence. The following chart illustrates accuracy of training a production model based on BERT for long document comprehension (2,048 sequence length). The experiment is performed in three settings: dense starting from scratch, SA starting from scratch, and SA continued training from a checkpoint of using dense with sequence length of 512. We have observed that, for pre-training from scratch, SA converges faster with higher accuracy comparing with dense. Furthermore, SA continuing training from a pre-trained checkpoint performs even better, with respect to both time and accuracy. + + +![Accuracy of long document comprehension application](/assets/images/sa_long_document_comprehension_result.png){: .align-center} + + +* **comparison with state of the art, Longformer** +We compared SA with Longformer, a state-of-the-art sparse structure and implementation. In our experiment, SA uses `Fixed` sparsity, and two implementations have comparable accuracy. On system performance, SA outperforms Longformer both in training and inference: + * **1.47x** faster execution pre-training MLM on Wikitext103 +We ran an experiment following the [notebook](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) offered by Longformer. In this experiment, we pre-train an MLM model using RoBERTa-base checkpoint. This is done on 8 V100-SXM2 GPU. Following table shows the details of the result in which using DeepSpeed Sparse Attention shows 1.47x speed up. 
+ +|Model |Local Window Size |BPC |Train Step |Time Per Iteration |Time Improvement |Accuracy improvement | +|-------------------|------------------|--------|------------|--------------------|------------------|----------------------| +|RoBERTa Checkpoint | |2.5326 | | +|Longformer |512 |2.6535 |0 | |1.47 |1.01 | +|Sparse Attention | |2.6321 | | | | | +|Longformer | |1.6708 |3k |1.6280 | |1.01 | +|Sparse Attention | |1.6613 | |1.1059 | | | +|Longformer |64 |5.7840 |0 | |1.31 |1.46 | +|Sparse Attention | |3.9737 | | | | | +|Longformer | |2.0466 |3k |1.4855 | |1.09 | +|Sparse Attention | |1.8693 | |1.1372 | | | + + + * **3.13x** faster execution inference on BERT-Base +Through our Long Document Comprehension application we described above, we also checked the inference time for different window sizes testing BERT model on a `2,048` Sequence Length and batch size `1`. In this experiment, we noticed up to `3.13X` speed up replacing Bert Attention with DeepSpeed Sparse Attention instead of Longformer Attention. Following table shows the complete result. + +|Local Window Size |Time Improvement| +|--------------------|----------------| +|512 |3.13 | +|256 |2.29 | +|128 |2.16 | +|64 |1.5 | +|32 |1.24 | +|16 |1.23 | + +* **flexibility to handle any block-sparse structure** +DeepSpeed Sparse Attention suite does not target at any specific sparse structure but enables model scientists to explore any block sparse structure with efficient system support. Currently, we have added popular sparse structure like: + * [Fixed](https://arxiv.org/pdf/1904.10509.pdf) (from OpenAI Sparse Transformer) + * [BigBird](https://arxiv.org/pdf/2007.14062.pdf) (from Google) + * BSLongformer (Block-Sparse implementation of [Longformer](https://arxiv.org/pdf/2004.05150.pdf) from AI2) + +We also define a template to have `variable` structure (top figure), which can be used to simply customize any block-sparse random/local/global attention pattern. In addition to this list, user can add any other sparsity structure as described in [tutorial](https://www.deepspeed.ai/tutorials/sparse-attention/) section. diff --git a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md new file mode 100755 index 000000000000..5659cf818987 --- /dev/null +++ b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md @@ -0,0 +1,14 @@ +--- +layout: single +title: "Progressive Layer Dropping" +excerpt: "" +categories: news +new_post: true +date: 2020-10-29 00:00:00 +--- + +We introduce a new technology called progressive layer dropping (PLD) to speedup the pre-training of Transformer-based networks through efficient and robust compressed training. The pre-training step of Transformer networks often suffer from unbearable overall computational expenses. We analyze the training dynamics and stability of Transformer networks and propose PLD to sparsely update Transformer blocks following a progressive dropping schedule, which smoothly increases the layer dropping rate for each mini-batch as training evolves along both the temporal and the model depth dimension. PLD is able to allow the pre-training to be **2.5X faster** to get similar accuracy on downstream tasks and allows the training to be **24% faster** when training the same number of samples, not at the cost of excessive hardware resources. + + * For detailed technology deep dive, see our [technical report](https://arxiv.org/pdf/2010.13369.pdf). 
+ * For more information on how to use PLD, see our [Progressive layer dropping tutorial](https://www.deepspeed.ai/tutorials/progressive_layer_dropping/). + * The source code for PLD is now available at the [DeepSpeed repo](https://github.com/microsoft/deepspeed). diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md new file mode 100644 index 000000000000..5dd95a672406 --- /dev/null +++ b/docs/_tutorials/advanced-install.md @@ -0,0 +1,123 @@ +--- +title: "Installation Details" +date: 2020-10-28 +--- + +The quickest way to get started with DeepSpeed is via pip, this will install +the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA +versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer +to as our 'ops'. By default, all of these extensions/ops will be built +just-in-time (JIT) using [torch's JIT C++ extension loader that relies on +ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and +dynamically link them at runtime. + +**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ installing +DeepSpeed. +{: .notice--info} + +```bash +pip install deepspeed +``` + +After installation, you can validate your install and see which ops your machine +is compatible with via the DeepSpeed environment report with `ds_report` or +`python -m deepspeed.env_report`. We've found this report useful when debugging +DeepSpeed install or compatibility issues. + +```bash +ds_report +``` + +## Pre-install DeepSpeed Ops + +Sometimes we have found it useful to pre-install either some or all DeepSpeed +C++/CUDA ops instead of using the JIT compiled path. In order to support +pre-installation we introduce build environment flags to turn on/off building +specific ops. + +You can indicate to our installer (either install.sh or pip install) that you +want to attempt to install all of our ops by setting the `DS_BUILD_OPS` +environment variable to 1, for example: + +```bash +DS_BUILD_OPS=1 pip install deepspeed +``` + +DeepSpeed will only install any ops that are compatible with your machine. +For more details on which ops are compatible with your system please try our +`ds_report` tool described above. + +If you want to install only a specific op (e.g., FusedLamb), you can toggle +with `DS_BUILD` environment variables at installation time. For example, to +install DeepSpeed with only the FusedLamb op use: + +```bash +DS_BUILD_FUSED_LAMB=1 pip install deepspeed +``` + +Available `DS_BUILD` options include: +* `DS_BUILD_OPS` toggles all ops +* `DS_BUILD_CPU_ADAM` builds the CPUAdam op +* `DS_BUILD_FUSED_ADAM` builds the FusedAdam op (from [apex](https://github.com/NVIDIA/apex)) +* `DS_BUILD_FUSED_LAMB` builds the FusedLamb op +* `DS_BUILD_SPARSE_ATTN` builds the sparse attention op +* `DS_BUILD_TRANSFORMER` builds the transformer op +* `DS_BUILD_STOCHASTIC_TRANSFORMER` builds the stochastic transformer op +* `DS_BUILD_UTILS` builds various optimized utilities + + +## Install DeepSpeed from source + +After cloning the DeepSpeed repo from GitHub, you can install DeepSpeed in +JIT mode via pip (see below). This install should complete +quickly since it is not compiling any C++/CUDA source files. + +```bash +pip install . +``` + +For installs spanning multiple nodes we find it useful to install DeepSpeed +using the +[install.sh](https://github.com/microsoft/DeepSpeed/blob/master/install.sh) +script in the repo. 
This will build a python wheel locally and copy it to all +the nodes listed in your hostfile (either given via --hostfile, or defaults to +/job/hostfile). + + +## Building for the correct architectures + +If you're getting the following error: + +```python +RuntimeError: CUDA error: no kernel image is available for execution on the device +``` +when running deepspeed that means that the cuda extensions weren't built for the card you're trying to use it for. + +When building from source deepspeed will try to support a wide range of architectures, but under jit-mode it'll only support the archs visible at the time of building. + +You can build specifically for a desired range of architectures by setting a `TORCH_CUDA_ARCH_LIST` env variable, like so: + +```bash +TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... +``` + +It will also make the build faster when you only build for a few architectures. + +This is also recommended to do to ensure your exact architecture is used. Due to a variety of technical reasons a distributed pytorch binary isn't built to fully support all architectures, skipping binary compatible ones, at a potential cost of underutilizing your full card's compute capabilities. To see which archs get included during the deepspeed build from source - save the log and grep for `-gencode` arguments. + +The full list of nvidia gpus and their compute capabilities can be found [here](https://developer.nvidia.com/cuda-gpus). + +## Feature specific dependencies + +Some DeepSpeed features require specific dependencies outside of the general +dependencies of DeepSpeed. + +* Python package dependencies per feature/op please +see our [requirements +directory](https://github.com/microsoft/DeepSpeed/tree/master/requirements). + +* We attempt to keep the system level dependencies to a minimum, however some features do require special system-level packages. Please see our `ds_report` tool output to see if you are missing any system-level packages for a given feature. + +## Pre-compiled DeepSpeed builds from PyPI + +Coming soon diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index e345862d0e16..3644b4621f8f 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -14,7 +14,7 @@ To help with launching Azure instances we suggest using the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created several helper scripts to get you quickly started using DeepSpeed with Azure. * Install Azure CLI on your local box: [https://docs.microsoft.com/en-us/cli/azure/install-azure-cli](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). - * Alternatively you can use the Azure in-browser shell: [https://shell.azure.com/](https://shell.azure.com/). + * Alternatively, you can use the Azure in-browser shell: [https://shell.azure.com/](https://shell.azure.com/). ## Create an SSH key Generate an SSH key that will be used across this tutorial to SSH into your VMs and diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index 03462e893b07..0791fb3308fe 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -284,10 +284,10 @@ transformer layers using DeepSpeed transformer kernel as below. 
gelu_checkpoint=args.gelu_checkpoint, stochastic_mode=True) - self.layer = nn.ModuleList([copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) for i in range(config.num_hidden_layers)]) + layer = DeepSpeedTransformerLayer(cuda_config) else: layer = BertLayer(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) ``` All configuration settings come from the DeepSpeed configuration file and command arguments and thus we must pass the `args` variable to here in this model. diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 91f1b57034db..c7b53e58357a 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -108,7 +108,7 @@ The first step to apply DeepSpeed is adding DeepSpeed arguments to CIFAR-10 mode ### Initialization -We use `deepspeed.initialize` to create `model_engine`, `optimizer` and `trainloader`. Below is its definition. +We create `model_engine`, `optimizer` and `trainloader` with the help of `deepspeed.initialize`, which is defined as following: ```python def initialize(args, @@ -122,7 +122,7 @@ def initialize(args, collate_fn=None): ``` -For CIFAR-10 model, we initialize DeepSpeed its model (net) is created as below, to pass the raw `model`, `optimizer`, `args`, `parametersnd` and `trainset`. +Here we initialize DeepSpeed with CIFAR-10 model (`net`), `args`, `parameters` and `trainset`: ```python parameters = filter(lambda p: p.requires_grad, net.parameters()) @@ -132,11 +132,11 @@ For CIFAR-10 model, we initialize DeepSpeed its model (net) is created as below, # 1) Distributed model # 2) Distributed data loader # 3) DeepSpeed optimizer - model_engine, optimizer, trainloader, __ = deepspeed.initialize(args=args, model=net, model_parameters=parameters, training_data=trainset) + model_engine, optimizer, trainloader, _ = deepspeed.initialize(args=args, model=net, model_parameters=parameters, training_data=trainset) ``` -The original device and optimizer can be removed after initializing DeepSpeed. +After initializing DeepSpeed, the original `device` and `optimizer` are removed: ```python #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/docs/_tutorials/gan.md b/docs/_tutorials/gan.md new file mode 100755 index 000000000000..d880f48db28e --- /dev/null +++ b/docs/_tutorials/gan.md @@ -0,0 +1,110 @@ +--- +title: "DCGAN Tutorial" +excerpt: "Train your first GAN model with DeepSpeed!" +--- + +If you haven't already, we advise you to first read through the [Getting Started](/getting-started/) guide before stepping through this +tutorial. + +In this tutorial, we will port the DCGAN model to DeepSpeed using custom (user-defined) optimizers and a multi-engine setup! + +## Running Original DCGAN + +Please go through the [original tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html) for the Celebrities dataset first using the [original code](https://github.com/pytorch/examples/blob/master/dcgan/main.py). Then run `bash gan_baseline_run.sh`. + + +## Enabling DeepSpeed + +The codes may be obtained [here](https://github.com/microsoft/DeepSpeedExamples/tree/master/gan). + +### Argument Parsing + +The first step to apply DeepSpeed is adding configuration arguments to DCGAN model, using the `deepspeed.add_config_arguments()` function as below. 
+ +```python +import deepspeed + +def main(): + parser = get_argument_parser() + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + train(args) +``` + + + +### Initialization + +We use `deepspeed.initialize` to create two model engines (one for the discriminator network and one for the generator network along with their respective optimizers) as follows: + +```python + model_engineD, optimizerD, _, _ = deepspeed.initialize(args=args, model=netD, model_parameters=netD.parameters(), optimizer=optimizerD) + model_engineG, optimizerG, _, _ = deepspeed.initialize(args=args, model=netG, model_parameters=netG.parameters(), optimizer=optimizerG) + +``` + +Note that DeepSpeed automatically takes care of the distributed training aspect, so we set ngpu=0 to disable the default data parallel mode of pytorch. + +### Discriminator Training + +We modify the backward for discriminator as follows: + +```python +model_engineD.backward(errD_real) +model_engineD.backward(errD_fake) +``` + +which leads to the inclusion of the gradients due to both real and fake mini-batches in the optimizer update. + +### Generator Training + +We modify the backward for generator as follows: + +```python +model_engineG.backward(errG) +``` + +**Note:** In the case where we use gradient accumulation, backward on the generator would result in accumulation of gradients on the discriminator, due to the tensor dependencies as a result of `errG` being computed from a forward pass through the discriminator; so please set `requires_grad=False` for the `netD` parameters before doing the generator backward. + +### Configuration + +The next step to use DeepSpeed is to create a configuration JSON file (gan_deepspeed_config.json). This file provides DeepSpeed specific parameters defined by the user, e.g., batch size, optimizer, scheduler and other parameters. + +```json +{ + "train_batch_size" : 64, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [ + 0.5, + 0.999 + ], + "eps": 1e-8 + } + }, + "steps_per_print" : 10 +} +``` + + + +### Run DCGAN Model with DeepSpeed Enabled + +To start training the DCGAN model with DeepSpeed, we execute the following command which will use all detected GPUs by default. + +```bash +deepspeed gan_deepspeed_train.py --dataset celeba --cuda --deepspeed_config gan_deepspeed_config.json --tensorboard_path './runs/deepspeed' +``` + +## Performance Comparison + +We use a total batch size of 64 and perform the training on 16 GPUs for 1 epoch on a DGX-2 node which leads to 3x speed-up. The summary of the the results is given below: + +- Baseline total wall clock time for 1 epochs is 393 secs + +- Deepspeed total wall clock time for 1 epochs is 128 secs + + +### diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 9c845bc41b39..5c53f2084f3c 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -7,10 +7,9 @@ date: 2020-05-15 ## Installation +* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/). * Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure! * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. -* If you want to install DeepSpeed manually, we provide an install script -* `install.sh` to help install on a local machine or across an entire cluster. 
## Writing DeepSpeed Models DeepSpeed model training is accomplished using the DeepSpeed engine. The engine @@ -25,18 +24,34 @@ model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model_parameters=params) ``` -`deepspeed.inialize` ensures that all of the necessary setup required for +`deepspeed.initialize` ensures that all of the necessary setup required for distributed data parallel or mixed precision training are done appropriately under the hood. In addition to wrapping the model, DeepSpeed can construct and manage the training optimizer, data loader, and the learning rate -scheduler based on the parameters passed to `deepspeed.initialze` and the +scheduler based on the parameters passed to `deepspeed.initialize` and the DeepSpeed [configuration file](#deepspeed-configuration). +If you already have a distributed environment setup, you'd need to replace: + +```python +torch.distributed.init_process_group(...) +``` + +with: + +```python +deepspeed.init_distributed() +``` + +The default is to use the NCCL backend, which DeepSpeed has been thoroughly tested with, but you can also [override the default](https://deepspeed.readthedocs.io/en/latest/initialize.html#distributed-initialization). + +But if you don't need the distributed environment setup until after `deepspeed.initialize()` you don't have to use this function, as DeepSpeed will automatically initialize the distributed environment during its `initialize`. Regardless, you will need to remove `torch.distributed.init_process_group` if you already had it in place. + ### Training Once the DeepSpeed engine has been initialized, it can be used to train the -model using three simple APIs for forward propagation (`()`), backward +model using three simple APIs for forward propagation (callable object), backward propagation (`backward`), and weight updates (`step`). ```python @@ -217,25 +232,27 @@ DeepSpeed will then make sure that these environment variables are set when launching each process on every node across their training job. -### MPI Compatibility +### MPI and AzureML Compatibility As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job using MPI (e.g., mpirun), we provide support for this. It should be noted that DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI -backend. To launch your training job with mpirun + DeepSpeed you simply pass us -an additional flag `--deepspeed_mpi`. DeepSpeed will then use -[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g., -rank, world size) and properly initialize torch distributed for training. In this -case you will explicitly invoke `python` to launch your model script instead of using -the `deepspeed` launcher, here is an example: -```bash -mpirun python \ - \ - --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json -``` +backend. -If you want to use this feature of DeepSpeed, please ensure that mpi4py is -installed via `pip install mpi4py`. +To launch your training job with mpirun + DeepSpeed or with AzureML (which uses +mpirun as a launcher backend) you simply need to install the +[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use +this to discover the MPI environment and pass the necessary state (e.g., world +size, rank) to the torch distributed backend. 
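To give a feel for what this discovery involves, below is a rough sketch, for illustration only and not DeepSpeed's internal implementation, of how MPI state can be mapped onto the environment variables that the torch distributed NCCL backend expects (the port number is an arbitrary choice):

```python
# Illustration only (not DeepSpeed internals): derive torch.distributed state
# from the MPI environment created by mpirun.
import os
from mpi4py import MPI

comm = MPI.COMM_WORLD
os.environ['RANK'] = str(comm.Get_rank())
os.environ['WORLD_SIZE'] = str(comm.Get_size())
# Every rank must agree on a rendezvous address, e.g. rank 0's hostname.
os.environ['MASTER_ADDR'] = comm.bcast(MPI.Get_processor_name(), root=0)
os.environ['MASTER_PORT'] = '29500'
```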
+ +If you are using model parallelism, pipeline parallelism, or otherwise require +torch.distributed calls before calling `deepspeed.initialize(..)` we provide +the same MPI support with an additional DeepSpeed API call. Replace your initial +`torch.distributed.init_process_group(..)` call with: + +```python +deepspeed.init_distributed() +``` ## Resource Configuration (single-node) In the case that we are only running on a single node (with one or more GPUs) diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index 2dac5686dc0f..31a71d57ee92 100644 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -322,7 +322,7 @@ and return the states for the client model. ### DeepSpeed Activation Checkpoints (Optional) -DeepSpeed can reduce the activation memory during model parallel training by partitioning activation checkpoints across model parallel GPUs, or offloading them to CPU. These optimization are optional, and can be skipped unless activation memory becomes a memory bottlenck. To enable partition activation, we use the `deepspeed.checkpointing` API to replace Megatron's activation checkpointing and random state tracker APIs. The replacement should happen before the first invocation of these APIs. +DeepSpeed can reduce the activation memory during model parallel training by partitioning activation checkpoints across model parallel GPUs, or offloading them to CPU. These optimizations are optional, and can be skipped unless activation memory becomes a memory bottleneck. To enable partition activation, we use the `deepspeed.checkpointing` API to replace Megatron's activation checkpointing and random state tracker APIs. The replacement should happen before the first invocation of these APIs. a) Replace in `pretrain_gpt.py` : @@ -405,4 +405,4 @@ Scalability: We observe superlinear speedup (Figure 2, top right), where the per Democratizing large model training: ZeRO-2 empowers model scientists to train models up to 13 billion parameters efficiently without any model parallelism that typically requires model refactoring (Figure 2, bottom right). 13 billion parameters is larger than most of the largest state-of-the-art models (such as Google T5, with 11 billion parameters). Model scientists can therefore experiment freely with large models without worrying about model parallelism. In comparison, the implementations of classic data-parallelism approaches (such as PyTorch Distributed Data Parallel) run out of memory with 1.4-billion-parameter models, while ZeRO-1 supports up to 6 billion parameters for comparison. -Furthermore, in the absence of model parallelism, these models can be trained on low bandwidth clusters while still achieving significantly better throughput compared to using model parallelism. For example, the GPT-2 model can be trained nearly 4x faster with ZeRO powered data parallelism compared to using model parallelism on a four node cluster connected with 40 Gbps Infiniband interconnect, where each node have four NVIDIA 16GB V100 GPUs connected with PCI-E. Therefore, with this performance improvement, large model training is no longer limited to GPU clusters with ultra fast interconnect but also accesible on modest clusters with limited bandwidth. +Furthermore, in the absence of model parallelism, these models can be trained on low bandwidth clusters while still achieving significantly better throughput compared to using model parallelism. 
For example, the GPT-2 model can be trained nearly 4x faster with ZeRO powered data parallelism compared to using model parallelism on a four node cluster connected with 40 Gbps Infiniband interconnect, where each node have four NVIDIA 16GB V100 GPUs connected with PCI-E. Therefore, with this performance improvement, large model training is no longer limited to GPU clusters with ultra fast interconnect but also accessible on modest clusters with limited bandwidth. diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md new file mode 100644 index 000000000000..8871a5dd0e28 --- /dev/null +++ b/docs/_tutorials/onebit-adam.md @@ -0,0 +1,249 @@ +--- +title: "1-bit Adam: Up to 5x less communication volume and up to 2x faster training" +--- + +In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). + +To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples: + +1. BingBertSQuAD Fine-tuning +2. BERT Pre-training + +For more details on these tasks, please refer to the tutorial posts on [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) and [BERT Pre-training](/tutorials/bert-pretraining/). + +## 1. Overview + +### Pre-requisites for installing DeepSpeed + +If you don't already have a copy of the DeepSpeed repository, please clone in +now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. + +```shell +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +git submodule update --init --recursive +cd DeepSpeedExamples/ +``` + +### Pre-requisites for 1-bit Adam + +1-bit Adam uses advanced communication schemes that are not yet supported by PyTorch distributed and NCCL. We rely on Message Passing Interface (MPI) for these advanced communication primitives. + +We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples. + +An example launch command for 1-bit Adam using the `deepspeed` launcher is as follows: + +```shell +deepspeed --launcher=[mvapich|openmpi] script.py +``` + +Alternatively, the standard mpirun launcher can also be used as follows: + +```shell +mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash [training_script.sh] +``` + +### 1-bit Algorithm + +The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). + +### Configuration of 1-bit Adam +The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. 
+ +```json +{ + "train_batch_size": 4096, + "train_micro_batch_size_per_gpu": 64, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 2e-4, + "freeze_step": 400, + "cuda_aware": true + } + }, + "fp16": { + "enabled": true + } +} +``` +Please note two new parameters `freeze_step` and `cuda_aware` that have been added to support the 1-bit Adam feature. + +`cuda_aware` is used to indicate that the underlying MPI library supports CUDA-Aware communication. +This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet-based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. + +`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model. If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In the future, we plan to introduce a threshold that can automatically search for and decide the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. + +## 2. BingBertSQuAD Fine-tuning with 1-bit Adam + +* Download the SQuAD dataset: + * Training set: [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json) + * Validation set: [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) +* Download the HuggingFace checkpoint and config files: + * [bert-large-uncased-whole-word-masking](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin) + * [bert json config](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json) + +You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning. + +### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam + +The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has +already been modified to use DeepSpeed. The `run_squad_deepspeed.sh` script +helps to invoke training and set up several different hyperparameters relevant +to the training process. + +- **DeepSpeed-enabled:** Start training with DeepSpeed by providing the following 4 arguments to this script: + +```shell +bash run_squad_deepspeed.sh <num_gpus> <path_to_checkpoint> <path_to_data_dir> <path_to_output_dir> +``` + +The first argument is the number of GPUs to train with, the second is the path to the pre-training checkpoint, the third is the path to the training and validation sets (e.g., train-v1.1.json), and the fourth is the path to an output folder where the results will be saved. This script will invoke `nvidia_run_squad_deepspeed.py`. + +- **DeepSpeed with 1-bit Adam enabled:** In order to run with the 1-bit Adam feature enabled, the same script (`nvidia_run_squad_deepspeed.py`) can be used, but there are two options for launching this properly: 1) launch using the deepspeed launcher, or 2) launch with mpirun.
+ +To enable the 1-bit compressed training, 1-bit Adam uses an MPI library (E.g. MVAPICH2-GDR, OpenMPI, etc.) as the communication backend, which means that we can use `mpirun` to launchg the training job. However, our user-friendly launcher called `deepspeed` has been enhanced to launch MPI jobs as well. + +### Launch with deepspeed + +The following helper script in the DeepSpeedExamples/BingBertSQuAD will launch the training without the need for setting any `mpirun` parameters. The number of nodes and GPUs will be automatically detected and the job will be launched on all the available resources. + +```shell +bash run_squad_deepspeed_onebitadam.sh +``` + +### Launch with mpirun + +Alternatively, we show how the standard `mpirun` launcher can be used for launching the fine-tuning job. + +```shell +mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash run_squad_mpi_onebitadam.sh +``` + +For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the support of InfiniBand, you can use the `mpirun` launcher packaged with the MVAPICH2 library. Please run the following command: + +```shell +mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash run_squad_mpi_onebitadam.sh +``` + +### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled + +The `deepspeed_onebitadam_bsz96_config.json` file gives the user the ability to specify DeepSpeed +options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. +When running the `nvidia_run_squad_deepspeed.py`, in addition to the +`--deepspeed` flag to enable DeepSpeed, the appropriate DeepSpeed configuration +file must be specified using `--deepspeed_config deepspeed_onebitadam_bsz96_config.json`. + +Table 1 shows the fine-tuning configuration we used in our experiments. + +| Parameters | Value | +| ------------------------------ | ---------------------| +| Total batch size | 96 | +| Train micro batch size per GPU | 3 | +| Optimizer | **OnebitAdam** | +| Learning rate | 3e-5 | +| Sequence-length | 384 | +| Weight-decay | 0.0 | +| Epoch count | 2 | +| **freeze_step** | 400 | +| **cuda_aware** | True | + +Table 1. Fine-tuning configuration + +**Note:** For more details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial. + +### 2.3 Performance Results for BingBertSQuAD Fine-tuning + +***Accuracy:*** +The results are summarized in the table below. The total batch size is set to 96 and training is conducted +on 32 GPUs for 2 epochs. A set of parameters (seeds and learning rates) were tried and the best ones were selected. +We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scores we achieved that are on-par or better than the [HuggingFace results](https://github.com/huggingface/transformers/tree/master/examples/question-answering). + +| Case | Model | Precision | EM | F1 | +| ----------- | ------------------------------------- | --------- | ----- | ----- | +| HuggingFace | [Bert-large-uncased-whole-word-masking](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin) | FP16 | 87.26 | 93.32 | + + +***Training Speed and Scalability:*** + +1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. 
This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1. + +![SQuAD Finetuning](/assets/images/squad-scaling.png){: .align-center} + +Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU. + + +## 3. BERT Pre-training with 1-bit Adam +For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) post. + +### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam + +The main part of training is done in `deepspeed_train.py`, which has +already been modified to use DeepSpeed. The `ds_train_bert_onebit_bsz4k_seq128.sh` and `ds_train_bert_bsz64k_seq128.sh` +are the shell scripts that help to invoke training and setup several different hyperparameters relevant +to the training process. + +- **DeepSpeed-enabled:** Start training with DeepSpeed by running the command below: + +```shell +bash ds_train_bert_bsz64k_seq128.sh +``` + +- **DeepSpeed with 1-bit Adam enabled:** In order to run with 1-bit Adam feature enabled, the same script (`deepspeed_train.py`) can be used but there are two options for launching this properly: + +### Launch with deepspeed + +As discussed for BingBertSQuAD fine-tuning, we can simply use the `deepspeed` launcher to launch our BERT pre-training jobs as follows. + +```shell +bash ds_train_bert_onebit_bsz4k_seq128.sh +``` + +### Launch with mpirun + +Alternatively, use the following command to launch using `mpirun`. + +```shell +mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash mpi_train_bert_onebit_bsz4k_seq128.sh +``` + +For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the support of InfiniBand, you can use MVAPICH2 as the launcher and run the following command: +```shell +mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash ds_train_bert_onebit_bsz4k_seq128.sh +``` + +### 3.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled + +The `deepspeed_bsz4k_onebit_config_seq128.json` file gives the user the ability to specify DeepSpeed +options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. + +Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer. + +```json +{ + "train_batch_size": 4096, + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 100, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 2e-4, + "max_grad_norm": 1.0, + "weight_decay": 0.01, + "bias_correction": false, + "freeze_step": 23000, + "cuda_aware": true + } + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "initial_scale_power": 16 + } +} +``` +The above file is for BERT-large but for BERT-base training (sequence length 128), the suggested freeze_step will need to be changed to 16000. For the rest of the pre-training using sequence 512, we suggest to use a freeze_step of 1500. + +### 3.3 Performance Results for BERT Pre-training + +Performance results of BERT Pre-training can be seen from our detailed [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html). 
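As a small aside on the configuration in Section 3.2: since the suggested `freeze_step` differs between BERT-large (23000), BERT-base (16000), and the sequence-512 phase (1500), a tiny helper like the following illustrative sketch (our own script, not part of DeepSpeed; the output filename is made up) can derive a variant config from the file above:

```python
# Illustrative helper: clone the BERT-large 1-bit Adam config and lower
# freeze_step to the value suggested above for BERT-base (sequence length 128).
import json

with open("deepspeed_bsz4k_onebit_config_seq128.json") as f:
    config = json.load(f)

config["optimizer"]["params"]["freeze_step"] = 16000

with open("deepspeed_bsz4k_onebit_config_seq128_base.json", "w") as f:
    json.dump(config, f, indent=2)
```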
diff --git a/docs/_tutorials/pipeline.md b/docs/_tutorials/pipeline.md new file mode 100644 index 000000000000..e7730ebe2661 --- /dev/null +++ b/docs/_tutorials/pipeline.md @@ -0,0 +1,317 @@ +--- +title: "Pipeline Parallelism" +--- + +DeepSpeed v0.3 includes new support for pipeline parallelism! Pipeline +parallelism improves both the memory and compute efficiency of deep learning +training by partitioning the layers of a model into stages that can be +processed in parallel. +DeepSpeed's training engine provides hybrid data and pipeline parallelism and +can be further combined with model parallelism such as +[Megatron-LM](https://github.com/NVIDIA/Megatron-LM). +An illustration of +3D parallelism is shown below. Our latest [results]({{ site.press_release_v3 }}) +demonstrate that this 3D parallelism enables training models with over a +**trillion** parameters. + +![3D parallelism in DeepSpeed](/assets/images/3d-parallelism.png) + +DeepSpeed uses *gradient accumulation* to extract pipeline parallelism (shown +below). Each batch of training data is divided into micro-batches that can be +processed in parallel by the pipeline stages. Once a stage completes the +forward pass for a micro-batch, the activation memory is communicated to the +next stage in the pipeline. Similarly, as the next stage completes its +backward pass on a micro-batch, the gradient with respect to the activation +is communicated backwards through the pipeline. Each backward pass +accumulates gradients locally. Next, all data parallel groups perform +reductions of the gradients in parallel. Lastly, the optimizer updates the +model weights. + + +Below is an illustration of how DeepSpeed will train a batch with eight +micro-batches using hybrid two-way data parallelism and two-stage pipeline +parallelism. GPUs 0 and 2 are arranged in a pipeline and will alternate +forward (F) and backward (B) passes. They will then all-reduce (AR) gradients +with their data parallel counterparts, GPUs 1 and 3, respectively. Finally, +the two pipeline stages update their model weights. + +![Pipeline Schedule](/assets/images/pipe-schedule.png) + + +## Getting Started with Pipeline Parallelism + +DeepSpeed strives to accelerate *and* simplify the process of pipeline +parallel training. This section provides first steps with hybrid data and +pipeline parallel training by preparing `torchvision`'s +[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html) +model. + +### Expressing Pipeline Models +Pipeline parallelism requires models to be expressed as a sequence of layers. +In the forward pass, each layer consumes the output of the previous +layer. In fact, there is no need to specify a `forward()` for a pipeline +parallel model! The forward pass of a pipeline parallel model implicitly +takes the form: +```python +def forward(self, inputs): + x = inputs + for layer in self.layers: + x = layer(x) + return x +``` + +PyTorch's +[`torch.nn.Sequential`](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html) +is a convenient container for expressing pipeline parallel models and can be +parallelized by DeepSpeed with no modification: +```python +net = nn.Sequential( + nn.Linear(in_features, hidden_dim), + nn.ReLU(inplace=True), + nn.Linear(hidden_dim, out_features) +) +from deepspeed.pipe import PipelineModule +net = PipelineModule(layers=net, num_stages=2) +``` +`PipelineModule` uses its `layers` argument as the sequence of layers that +comprise the model.
After initialization, `net` is divided into two pipeline
+stages and its layers are moved to the corresponding GPUs. If more than two GPUs
+are present, DeepSpeed will also use hybrid data parallelism.
+
+**Note:** The total number of GPUs must be divisible by the number of pipeline
+stages.
+{: .notice--info}
+
+**Note:** For large model training, see [memory-efficient model construction](#memory-efficient-model-construction).
+{: .notice--info}
+
+### AlexNet
+Let's look at an abbreviated implementation of `torchvision`'s
+[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html):
+
+```python
+class AlexNet(nn.Module):
+    def __init__(self, num_classes=1000):
+        super(AlexNet, self).__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+            ...
+            nn.MaxPool2d(kernel_size=3, stride=2),
+        )
+        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
+        self.classifier = nn.Sequential(
+            nn.Dropout(),
+            ...
+            nn.Linear(4096, num_classes),
+        )
+
+    def forward(self, x):
+        x = self.features(x)
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        x = self.classifier(x)
+        return x
+```
+`AlexNet` is mostly a composition of several `Sequential` submodules. We can
+turn this into a `PipelineModule` by flattening its submodules into a single
+sequence of layers:
+```python
+class AlexNetPipe(AlexNet):
+    def to_layers(self):
+        layers = [
+            *self.features,
+            self.avgpool,
+            lambda x: torch.flatten(x, 1),
+            *self.classifier
+        ]
+        return layers
+
+from deepspeed.pipe import PipelineModule
+net = AlexNetPipe()
+net = PipelineModule(layers=net.to_layers(), num_stages=2)
+```
+
+**Note:**
+The `lambda` in the middle of `layers` above is not a `torch.nn.Module`
+type. Any object that implements `__call__()` can be a layer in a
+`PipelineModule`: this allows for convenient data transformations in the
+pipeline.
+{: .notice--info}
+
+
+### Inputs and Outputs
+Following `torch.nn.Sequential`, the inputs and outputs of each layer must be
+either a single `torch.Tensor` or a `tuple` of tensors. In practice, some
+models may need to modify their forward pass to pack and unpack arguments to
+`forward()`. Consider an abbreviated implementation of a stack of Transformer
+blocks:
+```python
+class TransformerBlock(nn.Module):
+    ...
+    def forward(self, hidden, mask):
+        output = self.compute(hidden, mask)
+        return output
+    ...
+
+stack = [ TransformerBlock() for _ in range(num_layers) ]
+```
+Two modifications to `TransformerBlock` are required:
+
+1. The arguments must be collected into a `tuple`.
+2. `mask` must also be returned from `forward()` to pass to the next layer.
+
+These modifications can be accomplished with a short subclass:
+```python
+class TransformerBlockPipe(TransformerBlock):
+    def forward(self, inputs):
+        hidden, mask = inputs
+        outputs = super().forward(hidden, mask)
+        return (outputs, mask)
+
+stack = [ TransformerBlockPipe() for _ in range(num_layers) ]
+```
+
+### Training Loops
+
+Pipeline parallelism interleaves forward and backward passes, and thus the
+training loop cannot be divided into separate stages of `forward()`,
+`backward()` and `step()`.
+Instead, DeepSpeed's pipeline engine provides a `train_batch()` method that
+advances the pipeline engine until the next batch of training data is
+consumed and the model weights are updated.
+```python
+train_iter = iter(train_loader)
+loss = engine.train_batch(data_iter=train_iter)
+```
+
+The above `train_batch()` example is equivalent to the following with
+traditional data parallel DeepSpeed:
+```python
+train_iter = iter(train_loader)
+for _ in range(engine.gradient_accumulation_steps()):
+    batch = next(train_iter)
+    loss = engine(batch)
+    engine.backward(loss)
+    engine.step()
+```
+
+### Dealing with Data
+
+Data parallel training typically has each worker perform IO independently at
+the start of each batch. However, in a pipeline parallel environment, only the
+first stage uses the input data, and only the last stage uses labels for loss
+calculation.
+
+**Note:**
+The pipeline engine expects data loaders to return a `tuple` of two items. The
+first returned item is the input batch data, and the second item is the data
+to be used in the loss calculation. As before, inputs and labels should be
+either `torch.Tensor` type or a `tuple` of tensors.
+{: .notice--info}
+
+For convenience, the DeepSpeed pipeline engine can construct a distributed
+data loader when a dataset is provided to `deepspeed.initialize()`. DeepSpeed
+handles the rest of the complexity of data loading, and so the pipeline
+training loop becomes:
+```python
+engine, _, _, _ = deepspeed.initialize(
+    args=args,
+    model=net,
+    model_parameters=[p for p in net.parameters() if p.requires_grad],
+    training_data=cifar_trainset())
+
+for step in range(args.steps):
+    loss = engine.train_batch()
+```
+
+Of course, DeepSpeed will work with any data loader that you wish to use.
+Data loaders should be constructed by the first and last stages in the
+pipeline. Each worker should load micro-batches of size
+`engine.train_micro_batch_size_per_gpu()` and will be queried
+a total of `engine.gradient_accumulation_steps()` times per `train_batch()`.
+
+**Watch out!**
+The pipeline engine *pulls* data from an iterator instead of iterating over
+it. It's critical that the data stream does not empty in the middle of a
+training batch. Each invocation of `train_batch()` will pull
+a total of `engine.gradient_accumulation_steps()` micro-batches of data from
+the data iterator.
+{: .notice--warning}
+
+DeepSpeed provides a convenience class `deepspeed.utils.RepeatingLoader` that
+simply wraps an iterable such as a data loader and restarts it whenever the
+end is reached:
+```python
+train_loader = deepspeed.utils.RepeatingLoader(train_loader)
+train_iter = iter(train_loader)
+for step in range(args.steps):
+    loss = engine.train_batch(data_iter=train_iter)
+```
+
+
+## Advanced Topics
+
+
+### Load Balancing Pipeline Modules
+The performance of pipeline parallel training strongly relies on load
+balance. DeepSpeed provides several mechanisms for partitioning the model
+across GPUs. These strategies can be set with the `partition_method` keyword
+argument to `PipelineModule`. Here are the partitioning methods currently provided
+by DeepSpeed:
+
+* `partition_method="parameters"` (**default**)
+  balances the number of trainable parameters on each pipeline stage. This is
+  especially useful in memory-constrained environments and when the size of a
+  layer is proportional to the computation time.
+* `partition_method="type:[regex]"`
+  balances layers whose class names match `[regex]`. The regular expression
+  is not case sensitive. For example, `partition_method="type:transformer"`
+  would balance the number of transformer layers per stage.
+* `partition_method="uniform"` balances the number of layers per stage.
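+
+To make these options concrete, here is a brief, hypothetical sketch; the stand-in block, depth, and stage count are assumptions for illustration only, and the script is assumed to be launched with the `deepspeed` launcher so distributed initialization has already happened:
+
+```python
+import torch.nn as nn
+from deepspeed.pipe import PipelineModule
+
+class TransformerBlockPipe(nn.Module):
+    """Stand-in for the tuple-passing Transformer block sketched earlier."""
+    def __init__(self, hidden_dim=512):
+        super().__init__()
+        self.ff = nn.Linear(hidden_dim, hidden_dim)
+
+    def forward(self, inputs):
+        hidden, mask = inputs
+        return (self.ff(hidden), mask)
+
+stack = [TransformerBlockPipe() for _ in range(24)]
+
+# "type:transformer" balances stages by the number of layers whose class name
+# matches the regex "transformer" (case-insensitive) rather than by parameters.
+net = PipelineModule(layers=stack,
+                     num_stages=4,
+                     partition_method="type:transformer")
+```
+
+Since every `TransformerBlockPipe` in this sketch is identical, balancing by matching layer count spreads the compute evenly across the four stages.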
+ +### Memory-Efficient Model Construction +Building a `Sequential` and providing it `PipelineModule` is a convenient way +of specifying a pipeline parallel model. However, this approach encounters +scalability issues for massive models. Starting from a `Sequential` allocates +the model in CPU memory redundantly by every worker. A machine with 16 GPUs +must have as much local CPU memory as 16 times the model size. + +DeepSpeed provides a `LayerSpec` class that delays the construction of +modules until the model layers have been partitioned across workers. Then, +the modules are built on the GPU that owns the layer. + +Here's an example of the abbreviated AlexNet model, but expressed only +with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)` +simply becomes `LayerSpec(nn.ReLU, inplace=True)`. +```python +from deepspeed.pipe import PipelineModule, LayerSpec +class AlexNetPipe(PipelineModule): + def __init__(self, num_classes=10, **kwargs): + self.num_classes = num_classes + specs = [ + LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=2), + LayerSpec(nn.ReLU, inplace=True), + ... + LayerSpec(nn.ReLU, inplace=True), + LayerSpec(nn.Linear, 4096, self.num_classes), + ] + super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) +``` + +### Tied Layers +Some models cannot be entirely expressed as pipeline parallel models because +some layers are reused in the pipeline. For example, Transformer based +language models commonly use an embedding layer early in the pipeline to map +vocabulary to hidden states, and then use the embedding to map hidden states +back to vocabulary at the end of the pipeline. If the model was restricted to +pure pipeline parallelism, this embedding reuse would prohibit pipeline +parallelism. + +DeepSpeed provides a `TiedLayerSpec` that is an extension of +`LayerSpec`. `TiedLayerSpec` requires an additional argument: `key`. +Each reuse of a layer is specified with a `TiedLayerSpec`, and the `key` field +is used to identify where a layer is reused. + +Tied layers are replicated on every pipeline stage that owns an instance of +reuse. Training then proceeds as normal, but an additional all-reduce of the +tied gradients is added after all backward passes complete. The all-reduce +ensures that the weights of the tied layer remain in sync across pipeline stages. diff --git a/docs/_tutorials/progressive_layer_dropping.md b/docs/_tutorials/progressive_layer_dropping.md new file mode 100755 index 000000000000..8a447e97c945 --- /dev/null +++ b/docs/_tutorials/progressive_layer_dropping.md @@ -0,0 +1,155 @@ +--- +title: "Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping" + +--- + +In this tutorial, we are going to introduce the progressive layer dropping (PLD) in DeepSpeed and provide examples on how to use PLD. PLD allows to train Transformer networks such as BERT 24% faster under the same number of samples and 2.5 times faster to get similar accuracy on downstream tasks. Detailed description of PLD and the experimental results are available in our [technical report](https://arxiv.org/pdf/2010.13369.pdf). + +To illustrate how to use PLD in DeepSpeed, we show how to enable PLD to pre-train a BERT model and fine-tune the pre-trained model on the GLUE datasets. + +## Running Pre-training with DeepSpeed and PLD + +To perform pre-training, one needs to first prepare the datasets. 
For this part, please refer our [BERT Pre-training](/tutorials/bert-pretraining/) post, which contains detailed information on how to do data downloading and pre-processing. For the below experiment, we use Wikipedia text and Bookcorpus, similar as [Devlin et. al.](https://arxiv.org/abs/1810.04805). + +The main part of pre-training is done in `deepspeed_train.py`, which has +already been modified to use DeepSpeed. The `ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh` is the shell script that launches the pre-training with DeepSpeed and PLD. + +```shell +bash ds_train_bert_progressive_layer_drop_bsz4k_seq128.sh +``` + +Most of the flags in the above script should be familiar if you have stepped through the BERT pre-training [tutorial](/tutorials/bert-pretraining/). To enable training with PLD, one needs to enable PLD in both the client script and in the DeepSpeed engine. To enable PLD in the client script, one needs to add the following command line flag to enable progressive layer dropping on Transformer blocks. + +```bash +--progressive_layer_drop +``` + +To enable PLD in DeepSpeed, one needs to update the json configuration file with an appropriate PLD configuration dictionary like below: + +```json +{ + ... + "progressive_layer_drop": { + "enabled": true, + "theta": 0.5, + "gamma": 0.001 + } +} +``` + +we recommend a PLD theta value of 0.5 and gamma of 0.001 because these have worked well in our experiments. + +With these configuration changes, the DeepSpeed engine should print a runtime message as below: + + [INFO] [logging.py:60:log_dist] [Rank 0] Enabled progressive layer dropping (theta = 0.5) + +The `deepspeed_bsz4k_progressive_layer_drop_config_seq128.json` file allows users to specify DeepSpeed options in terms of batch size, micro batch size, optimizer, learning rate, sequence length, and other parameters. Below is the DeepSpeed configuration file we use for running BERT and PLD. + +```json +{ + "train_batch_size": 4096, + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 1000, + "prescale_gradients": true, + "gradient_predivide_factor": 8, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3, + "weight_decay": 0.01, + "bias_correction": false + } + }, + "gradient_clipping": 1.0, + + "wall_clock_breakdown": false, + + "fp16": { + "enabled": true, + "loss_scale": 0 + }, + + "progressive_layer_drop": { + "enabled": true, + "theta": 0.5, + "gamma": 0.001 + } +} +``` + +Note that the above configuration assumes training on 64 X 32GB V100 GPUs. Each GPU uses a micro batch size of 16 and accumulates gradients until the effective batch size reaches 4096. If you have GPUs with less memory, you may need to reduce "train_micro_batch_size_per_gpu". Alternatively, if you have more GPUs, you can increase the "train_batch_size" to increase training speed. We use the following hyperparameters for pre-training BERT with PLD enabled. + +| Parameters | Value | +| ------------------------------ | ----------------------- | +| Effective batch size | 4K | +| Train micro batch size per GPU | 16 | +| Optimizer | Adam | +| Peak learning rate | 1e-3 | +| Sequence-length | 128 | +| Learning rate scheduler | Warmup linear decay exp | +| Warmup ratio | 0.02 | +| Decay rate | 0.99 | +| Decay step | 1000 | +| Weight decay | 0.01 | +| Gradient clipping | 1.0 | + +Table 1. 
Pre-training hyperparameters + +**Note:** DeepSpeed now supports PreLayerNorm as the default way for training BERT, because of its ability to avoid vanishing gradient, stabilize optimization, and performance gains, as described in our fastest BERT training [blog post](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). We therefore support the switchable Transformer block directly on the the BERT with PreLayerNorm. The implementation can be found at "example\bing_bert\nvidia\modelingpreln_layerdrop.py". + +## Fine-tuning with DeepSpeed on GLUE Tasks + +We use GLUE for fine-tuning tasks. GLUE (General Language Understanding Evaluation benchmark) (https://gluebenchmark.com/) is a collection of sentence or sentence-pair natural language understanding tasks including question answering, sentiment analysis, and textual entailment. It is designed to favor sample-efficient learning and knowledge-transfer across a range of different linguistic tasks in different domains. + +One can download all GLUE data using the provided helper [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e). Once the data has been downloaded, one can set up the data and move the data to "/data/GlueData", which is the default location for hosting GLUE data. We then can use the PLD pre-trained BERT model checkpoint to run the fine-tuning. + +The main part of fine-tuning is done in `run_glue_classifier_bert_base.py`, which has +already been modified to use DeepSpeed. Before the fine-tuning, one needs to specify the BERT model configuration through the following config in `run_glue_classifier_bert_base.py`. In this case, it has already been modified to be the same as the configuration of the pre-trained model. + +```json + bert_model_config = { + "vocab_size_or_config_json_file": 119547, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02 + } +``` + +Next, one can load a DeepSpeed style checkpoint with the following command, which has also already been added in the script. + +```shell +model.load_state_dict(checkpoint_state_dict['module'], strict=False) +``` + +Finally, the `run_glue_classifier_bert_base.sh` script invokes pre-training and setups several hyperparameters relevant to fine-tuning. + +```shell +bash run_glue_bert_base_finetune.sh [task] [batch size] [learning rate] [number of epochs] [job name] [checkpoint path] +``` + +An example would be: + +```shell +bash run_glue_bert_base_finetune.sh MNLI 32 3e-5 5 "fine_tune_MNLI" deepspeed_checkpoint.pt +``` + + + +### Expected Results + +The fine-tuning results can be found under the "logs" directory, and below are expected results for PLD on GLUE tasks. The "Lr" row indicates the learning rate we use for getting the corresponding accuracy result for each task. + +| | RTE | MRPC | STS-B | CoLA | SST-2 | QNLI | QQP | MNLI-m/mm | GLUE | +| ---------------------- | :--: | --------- | --------- | ---- | ----- | ---- | --------- | --------- | ---- | +| Metrics | Acc. | F1/Acc. | PCC/SCC | Acc. | Acc. | Acc. | F1/Acc. | Acc. 
| | +| Bert_{base} (original) | 66.4 | 88.9/84.8 | 87.1/89.2 | 52.1 | 93.5 | 90.5 | 71.2/89.2 | 84.6/83.4 | 80.7 | +| Bert_{base} (Our impl) | 67.8 | 88.0/86.0 | 89.5/89.2 | 52.5 | 91.2 | 87.1 | 89.0/90.6 | 82.5/83.4 | 82.1 | +| PLD | 69.3 | 86.6/84.3 | 90.0/89.6 | 55.8 | 91.6 | 90.7 | 89.6/91.2 | 84.1/83.8 | 82.9 | +| Lr | 7e-5 | 9e-5 | 7e-5 | 5e-5 | 7e-5 | 9e-5 | 2e-4 | 3e-5 | | diff --git a/docs/_tutorials/sparse-attention.md b/docs/_tutorials/sparse-attention.md new file mode 100644 index 000000000000..915fd524e1fd --- /dev/null +++ b/docs/_tutorials/sparse-attention.md @@ -0,0 +1,195 @@ +--- +title: "DeepSpeed Sparse Attention" +--- + +In this tutorial we describe how to use DeepSpeed Sparse Attention (SA) and its building-block kernels. The easiest way to use SA is through DeepSpeed launcher. We will describe this through an example in [How to use sparse attention with DeepSpeed launcher](#how-to-use-sparse-attention-with-deepspeed-launcher) section. But before that, we introduce modules provided by DeepSpeed SA in the [next](#sparse-attention-modules) section. + +**Note:** Currently DeepSpeed Sparse Attention can be used only on NVIDIA V100 GPU using Torch >= 1.5 and Cuda 10.1 or 10.2. +{: .notice--warning} + +## Sparse attention modules +* **MatMul**: This module handles block-sparse matrix-matrix multiplication. Currently it supports SDD, DSD, and DDS as described in [DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html) section. +* **Softmax**: This module applies block sparse softmax. It handles both forward and backward pass. +* **SparseSelfAttention**: This module uses MatMul and Softmax kernels and generates Context Layer output given Query, Keys and Values. It is a simplified version of common operations in any self-attention layer. It can also apply: + * `Relative position embedding` + * `Attention mask` + * `Key padding mask` +on the intermediate attention scores. For more details about self attention, please check [MultiHeadAttention](https://pytorch.org/docs/master/generated/torch.nn.MultiheadAttention.html#multiheadattention). +* **BertSparseSelfAttention**: This module contains a simplified BertSelfAttention layer that can be used instead of original dense Bert Self-Attention layer. Our implementation is based on [DeepSpeedExample](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373-#L434). +* **SparseAttentionUtils**: This module provides few utility functions to handle adapting pre-trained model with sparse attention: + * `replace_model_self_attention_with_sparse_self_attention`: If you have currently loaded a model and want to replace self-attention module with sparse self-attention, you can simply use this function to handle it for you. It currently handles BERT and RoBERTa based pre-trained models, but you can extend it base on your model type if it is different from these two. You also need to extend the position embedding to handle new sequence length; this can be done using `extend_position_embedding` function. + * `update_tokenizer_model_max_length`: This function simply updates maximum position embedding in your tokenizer with the new value. + * `extend_position_embedding`: This function extends the position embedding based on the current values. For example, if you have a 128 max sequence length model and extending it to a 1k sequence length, it replicates current embeddings 8 times to initialize new embedding. 
Experimentally we have seen such initialization works much better than initializing from scratch; leads to faster convergence. + * `pad_to_block_size`: This function pads input tokens and attention mask on sequence length dimension to be multiple of block size; this is a requirement for SA. + * `unpad_sequence_output`: This function unpads sequence output if inputs of the model were padded. +* **SparsityConfig**: this is an abstract class for sparsity structure. Any sparsity structure needs to extend this class and writes its own sparsity pattern construction; `make_layout` function. DeepSpeed currently provides the following structures that will be described in [How to config sparsity structures](#how-to-config-sparsity-structures) section: + * `FixedSparsityConfig` + * `BSLongformerSparsityConfig` + * `BigBirdSparsityConfig` + * `VariableSparsityConfig` + * `DenseSparsityConfig` + +**Note:** Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels! +{: .notice--warning} + +## How to use sparse attention with DeepSpeed launcher +In this section we describe how to use DeepSpeed Sparse Attention through our [bing_bert](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) code. + +* **Update attention module**: First, you need to update your attention module based on sparse computation. Here, we use [BertSparseSelfAttention](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py) which is the sparse version of `BertSelfAttention` from our [bing_bert](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) code. It rewrites `BertSelfAttention` where it replaces: + +```python +attention_scores = torch.matmul(query_layer, key_layer) +attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + +# Apply the attention mask is (precomputed for all layers in BertModel forward() function) +attention_scores = attention_scores + attention_mask + +pdtype = attention_scores.dtype +# Normalize the attention scores to probabilities. +attention_probs = self.softmax(attention_scores) + +# This is actually dropping out entire tokens to attend to, which might +# seem a bit unusual, but is taken from the original Transformer paper. +attention_probs = self.dropout(attention_probs) + +context_layer = torch.matmul(attention_probs, value_layer) +``` + +with: + +```python +context_layer = + self.sparse_self_attention( + query_layer, + key_layer, + value_layer, + key_padding_mask=attention_mask) +``` + +in which `sparse_self_attention` is an instance of [SparseSelfAttention](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/sparse_self_attention.py). This module computes attention context through sparse attention replacing underlying matrix multiplications and softmax with their equivalent sparse version. You can update any other attention module similarly. + +* **Setup sparse attention config in the model**: You need to setup the sparse attention config. In our example, this is done in the `BertModel`. 
+
+```python
+self.pad_token_id = config.pad_token_id if hasattr(
+    config, 'pad_token_id') and config.pad_token_id is not None else 0
+# set sparse_attention_config if it has been selected
+self.sparse_attention_config = get_sparse_attention_config(
+    args, config.num_attention_heads)
+self.encoder = BertEncoder(
+    config, args, sparse_attention_config=self.sparse_attention_config)
+```
+
+* **Update encoder model**: Further, you need to update your encoder model to use SA for the attention layer when SA is enabled. Please check our [bing_bert example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py), in which we use `BertSparseSelfAttention` instead of `BertSelfAttention`.
+
+```python
+if sparse_attention_config is not None:
+    from deepspeed.ops.sparse_attention import BertSparseSelfAttention
+
+    layer.attention.self = BertSparseSelfAttention(
+        config, sparsity_config=sparse_attention_config)
+```
+
+* **Pad and unpad input data**: Also, you may need to pad the sequence dimension of `input_ids` and `attention_mask` to be a multiple of the sparse block size. As mentioned in the [modules](#sparse-attention-modules) section above, DeepSpeed provides utility functions for padding and unpadding. Please check our [bing_bert example](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py) to see where and how to pad and unpad the inputs or outputs of the model.
+
+```python
+if self.sparse_attention_config is not None:
+    pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = SparseAttentionUtils.pad_to_block_size(
+        block_size=self.sparse_attention_config.block,
+        input_ids=input_ids,
+        attention_mask=extended_attention_mask,
+        token_type_ids=token_type_ids,
+        position_ids=None,
+        inputs_embeds=None,
+        pad_token_id=self.pad_token_id,
+        model_embeddings=self.embeddings)
+.
+.
+.
+# If BertEncoder uses sparse attention, and input_ids were padded, sequence output needs to be unpadded to original length
+if self.sparse_attention_config is not None and pad_len > 0:
+    encoded_layers[-1] = SparseAttentionUtils.unpad_sequence_output(
+        pad_len, encoded_layers[-1])
+```
+
+* **Enable sparse attention**: To use DeepSpeed Sparse Attention, you need to enable it in the launcher script through the `deepspeed_sparse_attention` argument:
+
+```bash
+--deepspeed_sparse_attention
+```
+
+Please check [our bing_bert runner script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/ds_sa_train_bert_bsz64k_seq128.sh) as an example of how to enable SA with the DeepSpeed launcher.
+
+* **Add sparsity config**: The sparsity config can be set through the [DeepSpeed JSON config file](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_bsz64k_lamb_config_seq128.json). In this example, we have used the `fixed` sparsity mode, which is described in the [How to config sparsity structures](#how-to-config-sparsity-structures) section.
+
+```json
+"sparse_attention": {
+  "mode": "fixed",
+  "block": 16,
+  "different_layout_per_head": true,
+  "num_local_blocks": 4,
+  "num_global_blocks": 1,
+  "attention": "bidirectional",
+  "horizontal_global_attention": false,
+  "num_different_global_patterns": 4
+}
+```
+
+## How to use individual kernels
+DeepSpeed Sparse Attention can be used as a feature through DeepSpeed, as described above, or simply integrated with any Transformer model as a self-attention module alone.
Further, the building block kernels, matrix multiplication and softmax, can be used separately. To use sparse attention alone, you can simply install DeepSpeed and import any of the modules described in the [modules](#sparse-attention-modules) section; for example:
+
+```python
+from deepspeed.ops.sparse_attention import SparseSelfAttention
+```
+
+Please refer to the docstrings for details of how to use each module separately.
+
+## How to config sparsity structures
+In the following we describe the supported sparsity structures, their parameter sets, and the flexibility of adding an arbitrary sparsity pattern to the self-attention layer. You can update the DeepSpeed config file using any of the supported sparsity structures and set their parameters accordingly.
+
+* **SparsityConfig**:
+This module is the parent class for all sparsity structures and contains the shared features of all sparsity structures. It takes the following parameters:
+  * `num_heads`: an integer determining the number of attention heads of the layer.
+  * `block`: an integer determining the block size. The current implementation of sparse self-attention is based on blocked sparse matrices, in which this parameter defines the size of such square blocks: `Block X Block`.
+  * `different_layout_per_head`: a boolean determining if each head should be assigned a different sparsity layout; the default is false, and this will be satisfied based on availability.
+
+* **Fixed** (FixedSparsityConfig):
+This structure is based on [Generative Modeling with Sparse Transformers](https://arxiv.org/abs/1904.10509) from OpenAI, in which local and global attention is fixed by the given parameters:
+  * `num_local_blocks`: an integer determining the number of blocks in a local attention window. As illustrated in the figure below (adapted from the original paper), tokens in a local window attend to all tokens local to them. In the case of an autoregressive model, as in the figure, tokens attend to tokens appearing before them in the local window. In the case of a masked model such as BERT, attention is bidirectional.
+  * `num_global_blocks`: an integer determining how many consecutive blocks in a local window are used as the representative of the window for global attention; illustrated in the figure below as well.
+  * `attention`: a string determining the attention type. Attention can be `unidirectional`, as in autoregressive models, in which tokens attend only to tokens appearing before them in the context; in that case, the upper triangular part of the attention matrix is empty, as in the figure below. Or it can be `bidirectional`, as in BERT, in which tokens can attend to any other tokens before or after them; then, the upper triangular part of the attention matrix mirrors the lower triangular part in the figure below.
+  * `horizontal_global_attention`: a boolean determining if blocks that are the global representative of a local window also attend to all other blocks. This is valid only if the attention type is `bidirectional`. Looking at the attention matrix, that means global attention includes not only the vertical blocks, but also the horizontal blocks.
+  * `num_different_global_patterns`: an integer determining the number of different global attention layouts. While global attention can be fixed by which block(s) are representative of any local window, since there are multiple heads, each head can use a different global representative.
For example, with 4 blocks constructing a local window and a global attention size of a single block, we can have 4 different versions in which the first, second, third, or fourth block of each local window can be the global representative of that window. This parameter determines how many such patterns we want. Of course, there is a limitation based on `num_local_blocks` and `num_global_blocks`. Further, if you set this to more than one, you need to set `different_layout_per_head` to `True`.
+
+![Fixed sparsity structure](/assets/images/sa_fixed_sparsity_structure.png)
+
+* **BSLongformer** (BSLongformerSparsityConfig):
+This structure is an edited version of [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf), in which, instead of single token-wise sparsity, we offer block-of-tokens sparsity. The parameters that define this pattern are:
+  * `num_sliding_window_blocks`: an integer determining the number of blocks in the sliding local attention window.
+  * `global_block_indices`: a list of integers determining which blocks are considered as global attention. The given indices determine the blocks that all other token blocks attend to and that attend to all other token blocks. Notice that if the `global_block_end_indices` parameter is set, this parameter is used as the starting index of each global window.
+  * `global_block_end_indices`: a list of integers determining the end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as the `global_block_indices` parameter, and, combining these two parameters, for each index `i`, blocks from `global_block_indices[i]` to `global_block_end_indices[i]` (exclusive) are considered as global attention blocks.
+
+* **BigBird** (BigBirdSparsityConfig):
+This structure is based on [Big Bird: Transformers for Longer Sequences](https://arxiv.org/pdf/2007.14062.pdf). It combines the ideas of the `fixed` and `longformer` patterns along with random attention. The following parameters define this structure:
+  * `num_random_blocks`: an integer determining how many blocks in each row block are attended randomly.
+  * `num_sliding_window_blocks`: an integer determining the number of blocks in the sliding local attention window.
+  * `num_global_blocks`: an integer determining how many consecutive blocks, starting from index 0, are considered as global attention. Global block tokens will be attended by all other block tokens and will attend to all other block tokens as well.
+
+* **Variable** (VariableSparsityConfig):
+This structure also combines the ideas of local, global, and random attention. Further, it has the flexibility of defining variable-size local windows. The following is the list of parameters that define this structure:
+  * `num_random_blocks`: an integer determining how many blocks in each row block are attended randomly.
+  * `local_window_blocks`: a list of integers determining the number of blocks in each local attention window. It assumes the first number determines the number of blocks in the first local window, the second number the second window, ..., and the last number determines the number of blocks in the remaining local windows.
+  * `global_block_indices`: a list of integers determining which blocks are considered as global attention. The given indices determine the blocks that all other token blocks attend to and that attend to all other token blocks. Notice that if the `global_block_end_indices` parameter is set, this parameter is used as the starting index of each global window.
+  * `global_block_end_indices`: a list of integers determining the end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as the `global_block_indices` parameter, and, combining these two parameters, for each index `i`, blocks from `global_block_indices[i]` to `global_block_end_indices[i]` (exclusive) are considered as global attention blocks.
+  * `attention`: a string determining the attention type. Attention can be `unidirectional`, as in autoregressive models, in which tokens attend only to tokens appearing before them in the context; in that case, the upper triangular part of the attention matrix is empty, as in the figure above. Or it can be `bidirectional`, as in BERT, in which tokens can attend to any other tokens before or after them; then, the upper triangular part of the attention matrix mirrors the lower triangular part in the figure above.
+  * `horizontal_global_attention`: a boolean determining if blocks that are the global representative of a local window also attend to all other blocks. This is valid only if the attention type is `bidirectional`. Looking at the attention matrix, that means global attention includes not only the vertical blocks, but also the horizontal blocks.
+The figure below illustrates an example of `variable` sparsity, in which blue, orange, and green blocks illustrate local, global, and random attention blocks, respectively.
+
+![Variable sparsity structure](/assets/images/sa_variable_sparsity_structure.png)
+
+Further, we provide a `dense` pattern (`DenseSparsityConfig`) that represents full attention and can be used for testing purposes.
+
+
+## How to support new user defined sparsity structures
+Our building block kernels, block-based `MatMul` and `Softmax`, can accept any block-based sparsity. This provides the flexibility to apply any block-based sparsity pattern to the attention score. To define and apply a new sparsity pattern, you can simply follow any of the above sparsity structures: add a new class that extends `SparsityConfig` and define its `make_layout` function based on how your sparsity is structured. You can add any extra parameters you may need or just use the default parameters of the parent class. diff --git a/docs/_tutorials/transformer_kernel.md b/docs/_tutorials/transformer_kernel.md index ce5955e0fe6f..9dbcf26e2a12 100755 --- a/docs/_tutorials/transformer_kernel.md +++ b/docs/_tutorials/transformer_kernel.md @@ -19,6 +19,9 @@ training](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). To use transformer kernel for training a model, you should Integrate DeepSpeed into your training script using the [Getting Started](/getting-started/) guide. +**Note:** Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels! +{: .notice--warning} + ### **Integrate Transformer Kernel** First of all, you need to integrate transformer kernel into the top-level model. Here, we show an example of instantiating the transformer kernel using the Pre-LN BERT-Large configuration settings. This configuration has 24 layers with 1024 hidden-dimension and uses the sequence length of 128 and batch size of 64. To add all these layers, we copy the same layer specification `num_hidden_layer` times with different IDs inside a ModuleList.
@@ -40,14 +43,14 @@ config = DeepSpeedTransformerConfig(batch_size = 64, normalize_invertible=False, gelu_checkpoint=False) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) - for i in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(cuda_config)) + for _ in range(config.num_hidden_layers) ]) ``` ### Transformer kernel Parameters The transformer kernel is configured by a number of parameters which allow users to -explore different settings. We partition these parameters into three categories: +explore different settings. We partition these parameters into four categories: 1. General configuration, used by different types of transformer layers 2. Environment parameters, specifying the system's setting diff --git a/docs/_tutorials/zero-offload.md b/docs/_tutorials/zero-offload.md new file mode 100644 index 000000000000..97f83112c7f7 --- /dev/null +++ b/docs/_tutorials/zero-offload.md @@ -0,0 +1,67 @@ +--- +title: "ZeRO-Offload" +--- +We recommend that you read the tutorials on [Getting Started](/getting-started/) and [ZeRO](/tutorials/zero/) before stepping through this tutorial. + +ZeRO-Offload is a ZeRO optimization that offloads the optimizer memory and computation from the GPU to the host CPU. ZeRO-Offload enables large models with up to 13 billion parameters to be efficiently trained on a single GPU. In this tutorial we will use ZeRO-Offload to train a 10-billion parameter GPT-2 model in DeepSpeed. Furthermore, *using ZeRO-Offload in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration json*. No code changes are needed. + +## ZeRO-Offload Overview +For large model training, optimizers such as [Adam](https://arxiv.org/abs/1412.6980), can consume a significant amount of GPU compute and memory. ZeRO-Offload reduces the GPU compute and memory requirements of such models by leveraging compute and memory resources on the host CPU to execute the optimizer. Furthermore, to prevent the optimizer from becoming a bottleneck, ZeRO-Offload uses DeepSpeed's highly optimized CPU implementation of Adam called [DeeSpeedCPUAdam](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/ops/adam). DeepSpeedCPUAdam is 5X--7X faster than the standard PyTorch implementation. To deep dive into the design and performance of ZeRO-Offload, please see our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3). + +## Training Environment +For this tutorial, we will configure a 10 billion parameter GPT-2 model using the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM) GPT-2 code. We advise stepping through the Megatron-LM [tutorial](/tutorials/megatron/) if you have not previously done so. We will use a single [NVIDIA Tesla V100-SXM3 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM for this exercise. + +## Training a 10B parameter GPT-2 on 1 V100 GPU +We need to make changes to the Megatron-LM launch script and to the DeepSpeed configuration json. + +### Megatron-LM GPT-2 launch script changes +We need to apply two changes to the launch script for the DeepSpeed Megatron-LM GPT-2 model. 
The first change is to configure a 10B parameter GPT-2 model with activation checkpointing enabled, which can be achieved by the following set of changes:
+
+```bash
+       --model-parallel-size 1 \
+       --num-layers 50 \
+       --hidden-size 4096 \
+       --num-attention-heads 32 \
+       --batch-size 10 \
+       --deepspeed_config ds_zero_offload.config \
+       --cpu_optimizer \
+       --checkpoint-activations
+```
+
+Most of the flags in the changes above should be familiar if you have stepped through the Megatron-LM [tutorial](/tutorials/megatron/), except for the **_--cpu_optimizer_**. This flag informs the model script to pass a CPU-based Adam optimizer, rather than a GPU-based one, to DeepSpeed as the client optimizer. It is very important that this flag be used when training with ZeRO-Offload to ensure correct operation of the DeepSpeed engine.
+
+Second, we need to apply the following changes to ensure that only one GPU is used for training.
+```bash
+       deepspeed --num_nodes 1 --num_gpus 1 ...
+```
+
+### DeepSpeed Configuration Changes
+ZeRO-Offload leverages much of the ZeRO stage 2 mechanisms, and so the configuration changes needed to enable ZeRO-Offload are an extension of those required to enable ZeRO stage 2. The **zero_optimization** key to enable ZeRO-Offload is shown below:
+
+```json
+{
+  "zero_optimization": {
+    "stage": 2,
+    "cpu_offload": true,
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  }
+}
+```
+
+As seen above, in addition to setting the _stage_ field to **2** (to enable ZeRO stage 2), we also need to set the _cpu_offload_ flag to **true** to enable ZeRO-Offload optimizations. In addition, we can set other ZeRO stage 2 optimization flags, such as _overlap_comm_, to tune ZeRO-Offload performance. With these changes we can now run the model. We share some screenshots of the training below.
+
+Here is a screenshot of the training log:
+
+![ZERO_OFFLOAD_DP1_10B_LOG](/assets/images/zero_offload_dp1_10B_log.png)
+
+Here is a screenshot of nvidia-smi showing that only GPU 0 is active during training:
+
+![ZERO_OFFLOAD_DP1_10B_SMI](/assets/images/zero_offload_dp1_10B_smi.png)
+
+Finally, here is a screenshot of htop showing host CPU and memory activity during optimizer computation:
+
+![ZERO_OFFLOAD_DP1_10B_CPU](/assets/images/zero_offload_dp1_10B_cpu.png)
+
+Congratulations! You have completed the ZeRO-Offload tutorial.
+ diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md new file mode 100644 index 000000000000..7e5aa0d3672a --- /dev/null +++ b/docs/_tutorials/zero.md @@ -0,0 +1,94 @@
+---
+title: "Zero Redundancy Optimizer (ZeRO)"
+---
+If you have not done so already, we advise that you read the DeepSpeed tutorials on [Getting Started](/getting-started/) and [Megatron-LM GPT-2](/tutorials/megatron/) before stepping through this tutorial.
+
+In this tutorial, we will apply the ZeRO optimizer to the [Megatron-LM GPT-2](https://github.com/NVIDIA/Megatron-LM) model. ZeRO is a powerful set of memory optimization techniques that enable effective FP16 training of large models with billions of parameters, such as [GPT-2](https://openai.com/blog/better-language-models/) and [Turing-NLG 17B](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/). Compared to the alternative model parallelism approaches for training large models, a key appeal of ZeRO is that no model code modifications are required.
As this tutorial will demonstrate, *using ZeRO in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration json*. No code changes are needed. + +## ZeRO Overview +ZeRO leverages the aggregate computation and memory resources of data parallelism to reduce the memory and compute requirements of each device (GPU) used for model training. ZeRO reduces the memory consumption of each GPU by partitioning the various model training states (weights, gradients, and optimizer states) across the available devices (GPUs and CPUs) in the distributed training hardware. Concretely, ZeRO is being implemented as incremental stages of optimizations, where optimizations in earlier stages are available in the later stages. To deep dive into ZeRO, please see our [paper](https://arxiv.org/abs/1910.02054v3). + +* **Stage 1**: The optimizer states (e.g., for [Adam optimizer](https://arxiv.org/abs/1412.6980), 32-bit weights, and the first, and second moment estimates) are partitioned across the processes, so that each process updates only its partition. + +* **Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states. + +## Training environment +We use the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM) GPT-2 code for this exercise. You can step through the Megatron-LM [tutorial](/tutorials/megatron/) to familiarize yourself with the code. We will train the models in this tutorial on [NVIDIA Tesla V100-SXM3 Tensor Core GPUs](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM. + +## Enabling ZeRO Optimization +To enable ZeRO optimizations for a DeepSpeed model, we simply add the **_zero_optimization_** key to the DeepSpeed json configuration. A full description of configuration knobs of the **zero_optimization** key is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). + +### Training a 1.5B Parameter GPT-2 model +We demonstrate the benefits of ZeRO stage 1 by showing that it enables data parallel training of a 1.5 billion parameter GPT-2 model on eight V100 GPUs. We configure training to use a batch size of 1 per device to ensure that the memory consumption is primarily due to model parameters and optimizer states. We create this training scenario by applying the following modifications to the deepspeed launch script: + +```bash + --model-parallel-size 1 \ + --num-layers 48 \ + --hidden-size 1600 \ + --num-attention-heads 16 \ + --batch-size 1 \ + --deepspeed_config ds_zero_stage_1.config \ +``` + +Training this model without ZeRO fails with an out-of-memory (OOM) error as shown below: +![OOM_DP8_1.5B_model](/assets/images/oom_dp8_1.5B_log.png) + +A key reason why this model does not fit in GPU memory is that the Adam optimizer states for the model consume 18GB; a significant portion of the 32GB RAM. By using ZeRO stage 1 to partition the optimizer state among eight data parallel ranks, the per-device memory consumption can be reduced to 2.25GB, thus making the model trainable. To enable ZeRO stage 1, we simply update the DeepSpeed json config file as below: + +```json +{ + "zero_optimization": { + "stage":1, + "reduce_bucket_size": 5e8 + } +} +``` +As seen above, we set two fields in the **zero_optimization** key. Specifically we set the _stage_ field to 1, and the optional _reduce_bucket_size_ for gradient reduction to 500M. 
With ZeRO stage 1 enabled, the model can now train smoothly on 8 GPUs without running out of memory. Below we provide some screenshots of the model training: + +![ZERO1_DP8_1.5B_LOG](/assets/images/zero1_dp8_1.5B_log.png) + +![ZERO1_DP8_1.5B_SMI](/assets/images/zero1_dp8_1.5B_smi.png) + +From the nvidia-smi screenshot above we can see that only GPUs 6-7 are being used for training the model. With ZeRO stage 1 we can further reduce the per-device memory consumption by increasing the data parallelism degree. These memory savings can be leveraged to either increase model size and/or batch size. In contrast, such benefits are not possible with data parallelism alone. + +### Training a 10B Parameter GPT-2 model +ZeRO stage 2 optimizations further increases the size of models that can be trained using data parallelism. We show this by training a model with 10B parameters using 32 V100 GPUs. + +First, we need to configure a 10B parameter model with activation checkpointing enabled. This can be done by applying the following GPT-2 model configuration changes to the DeepSpeed launch script. + +```bash + --model-parallel-size 1 \ + --num-layers 50 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --batch-size 1 \ + --deepspeed_config ds_zero_stage_2.config \ + --checkpoint-activations +``` + +Next, we need to update the DeepSpeed json configuration, as shown below, to enable ZeRO stage 2 optimizations: + +```json +{ + "zero_optimization": { + "stage":2, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "allgather_bucket_size": 5e8 + } +} +``` + +In the above changes, we have set the _stage_ field to 2, and configured other optimization knobs that are available in ZeRO stage 2. For example, we have enabled _contiguous_gradients_ to reduce memory fragmentation during backward pass. A full description of these optimization knobs is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). With these changes, we can now launch the training run. + +Here is a screenshot of the training log: + +![ZERO2_DP32_10B_LOG](/assets/images/zero2_dp32_10B_log.png) + +Here is a screenshot of nvidia-smi showing GPU activity during training: + +![ZERO2_DP32_10B_SMI](/assets/images/zero2_dp32_10B_smi.png) + +Congratulations! You have completed the ZeRO tutorial. 
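+
+As a closing, illustrative sketch (not taken from the Megatron-LM scripts; `net` stands in for the model built by your training script), the ZeRO settings above require no model code changes because they flow from the `--deepspeed_config` flag into the usual initialization call:
+
+```python
+import argparse
+import deepspeed
+
+parser = argparse.ArgumentParser(description="ZeRO stage 2 example")
+# Adds DeepSpeed's built-in arguments (e.g. --deepspeed, --deepspeed_config).
+parser = deepspeed.add_config_arguments(parser)
+args = parser.parse_args()
+
+# `net` is assumed to be defined by the training script; all ZeRO stage 1/2
+# behavior is controlled by the JSON file passed via --deepspeed_config.
+model_engine, optimizer, _, _ = deepspeed.initialize(
+    args=args,
+    model=net,
+    model_parameters=[p for p in net.parameters() if p.requires_grad])
+```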
diff --git a/docs/assets/images/3d-parallelism.png b/docs/assets/images/3d-parallelism.png new file mode 100644 index 000000000000..e40238dd32f2 Binary files /dev/null and b/docs/assets/images/3d-parallelism.png differ diff --git a/docs/assets/images/adam-convergence.png b/docs/assets/images/adam-convergence.png new file mode 100755 index 000000000000..a21f4c569678 Binary files /dev/null and b/docs/assets/images/adam-convergence.png differ diff --git a/docs/assets/images/bert-ib.png b/docs/assets/images/bert-ib.png new file mode 100755 index 000000000000..50235ecc7b4f Binary files /dev/null and b/docs/assets/images/bert-ib.png differ diff --git a/docs/assets/images/bert-scaling.png b/docs/assets/images/bert-scaling.png new file mode 100755 index 000000000000..8b6900d516ab Binary files /dev/null and b/docs/assets/images/bert-scaling.png differ diff --git a/docs/assets/images/bert-tcp.png b/docs/assets/images/bert-tcp.png new file mode 100755 index 000000000000..38572622d17d Binary files /dev/null and b/docs/assets/images/bert-tcp.png differ diff --git a/docs/assets/images/convergence-table.png b/docs/assets/images/convergence-table.png new file mode 100755 index 000000000000..8a930f66a598 Binary files /dev/null and b/docs/assets/images/convergence-table.png differ diff --git a/docs/assets/images/onebit-adam-overview.png b/docs/assets/images/onebit-adam-overview.png new file mode 100755 index 000000000000..d656446e751a Binary files /dev/null and b/docs/assets/images/onebit-adam-overview.png differ diff --git a/docs/assets/images/onebit-convergence.png b/docs/assets/images/onebit-convergence.png new file mode 100755 index 000000000000..e8a2e2b6b17e Binary files /dev/null and b/docs/assets/images/onebit-convergence.png differ diff --git a/docs/assets/images/oom_dp8_1.5B_log.png b/docs/assets/images/oom_dp8_1.5B_log.png new file mode 100644 index 000000000000..3dd8c5ae1440 Binary files /dev/null and b/docs/assets/images/oom_dp8_1.5B_log.png differ diff --git a/docs/assets/images/pipe-schedule.png b/docs/assets/images/pipe-schedule.png new file mode 100755 index 000000000000..fa098542487f Binary files /dev/null and b/docs/assets/images/pipe-schedule.png differ diff --git a/docs/assets/images/pp-lowbw-gpt2.png b/docs/assets/images/pp-lowbw-gpt2.png new file mode 100644 index 000000000000..d1742b41dcc8 Binary files /dev/null and b/docs/assets/images/pp-lowbw-gpt2.png differ diff --git a/docs/assets/images/sa_backward_pass.png b/docs/assets/images/sa_backward_pass.png new file mode 100644 index 000000000000..acdf9a26e0a5 Binary files /dev/null and b/docs/assets/images/sa_backward_pass.png differ diff --git a/docs/assets/images/sa_bert_base_time_result.png b/docs/assets/images/sa_bert_base_time_result.png new file mode 100644 index 000000000000..958ed8d28b76 Binary files /dev/null and b/docs/assets/images/sa_bert_base_time_result.png differ diff --git a/docs/assets/images/sa_bert_large_time_result.png b/docs/assets/images/sa_bert_large_time_result.png new file mode 100644 index 000000000000..400a591be1e2 Binary files /dev/null and b/docs/assets/images/sa_bert_large_time_result.png differ diff --git a/docs/assets/images/sa_fixed_sparsity_structure.png b/docs/assets/images/sa_fixed_sparsity_structure.png new file mode 100644 index 000000000000..5352f0d6067b Binary files /dev/null and b/docs/assets/images/sa_fixed_sparsity_structure.png differ diff --git a/docs/assets/images/sa_forward_pass.png b/docs/assets/images/sa_forward_pass.png new file mode 100644 index 000000000000..ca4ae297c13f Binary files 
/dev/null and b/docs/assets/images/sa_forward_pass.png differ diff --git a/docs/assets/images/sa_gpt2_time_result.png b/docs/assets/images/sa_gpt2_time_result.png new file mode 100644 index 000000000000..e51647fa9df6 Binary files /dev/null and b/docs/assets/images/sa_gpt2_time_result.png differ diff --git a/docs/assets/images/sa_long_document_comprehension_result.png b/docs/assets/images/sa_long_document_comprehension_result.png new file mode 100644 index 000000000000..c6a23bdccd1c Binary files /dev/null and b/docs/assets/images/sa_long_document_comprehension_result.png differ diff --git a/docs/assets/images/sa_maximum_sequence_runnable_on_bert.png b/docs/assets/images/sa_maximum_sequence_runnable_on_bert.png new file mode 100644 index 000000000000..3355d2b4b5ad Binary files /dev/null and b/docs/assets/images/sa_maximum_sequence_runnable_on_bert.png differ diff --git a/docs/assets/images/sa_variable_sparsity_structure.png b/docs/assets/images/sa_variable_sparsity_structure.png new file mode 100644 index 000000000000..53ada47b3f09 Binary files /dev/null and b/docs/assets/images/sa_variable_sparsity_structure.png differ diff --git a/docs/assets/images/squad-ib.png b/docs/assets/images/squad-ib.png new file mode 100755 index 000000000000..08ba41fc2990 Binary files /dev/null and b/docs/assets/images/squad-ib.png differ diff --git a/docs/assets/images/squad-scaling.png b/docs/assets/images/squad-scaling.png new file mode 100755 index 000000000000..b14a92895e30 Binary files /dev/null and b/docs/assets/images/squad-scaling.png differ diff --git a/docs/assets/images/squad-tcp.png b/docs/assets/images/squad-tcp.png new file mode 100755 index 000000000000..ba283e51bf62 Binary files /dev/null and b/docs/assets/images/squad-tcp.png differ diff --git a/docs/assets/images/variable_sparsity_pattern.png b/docs/assets/images/variable_sparsity_pattern.png new file mode 100644 index 000000000000..09fb18be7ee0 Binary files /dev/null and b/docs/assets/images/variable_sparsity_pattern.png differ diff --git a/docs/assets/images/webinar-aug2020.png b/docs/assets/images/webinar-aug2020.png new file mode 100644 index 000000000000..cb8577a4aea4 Binary files /dev/null and b/docs/assets/images/webinar-aug2020.png differ diff --git a/docs/assets/images/zero1_dp8_1.5B_log.png b/docs/assets/images/zero1_dp8_1.5B_log.png new file mode 100644 index 000000000000..4aa7df779042 Binary files /dev/null and b/docs/assets/images/zero1_dp8_1.5B_log.png differ diff --git a/docs/assets/images/zero1_dp8_1.5B_smi.png b/docs/assets/images/zero1_dp8_1.5B_smi.png new file mode 100644 index 000000000000..69febe38a3ff Binary files /dev/null and b/docs/assets/images/zero1_dp8_1.5B_smi.png differ diff --git a/docs/assets/images/zero2_dp32_10B_log.png b/docs/assets/images/zero2_dp32_10B_log.png new file mode 100644 index 000000000000..a67ed891caa2 Binary files /dev/null and b/docs/assets/images/zero2_dp32_10B_log.png differ diff --git a/docs/assets/images/zero2_dp32_10B_smi.png b/docs/assets/images/zero2_dp32_10B_smi.png new file mode 100644 index 000000000000..55d773a0d69c Binary files /dev/null and b/docs/assets/images/zero2_dp32_10B_smi.png differ diff --git a/docs/assets/images/zero_offload_dp1_10B_cpu.png b/docs/assets/images/zero_offload_dp1_10B_cpu.png new file mode 100644 index 000000000000..db9a1dc64396 Binary files /dev/null and b/docs/assets/images/zero_offload_dp1_10B_cpu.png differ diff --git a/docs/assets/images/zero_offload_dp1_10B_log.png b/docs/assets/images/zero_offload_dp1_10B_log.png new file mode 100644 index 
000000000000..6fe458dacf60 Binary files /dev/null and b/docs/assets/images/zero_offload_dp1_10B_log.png differ diff --git a/docs/assets/images/zero_offload_dp1_10B_smi.png b/docs/assets/images/zero_offload_dp1_10B_smi.png new file mode 100644 index 000000000000..c94f120f1af0 Binary files /dev/null and b/docs/assets/images/zero_offload_dp1_10B_smi.png differ diff --git a/docs/code-docs/requirements.local.txt b/docs/code-docs/requirements.local.txt deleted file mode 100755 index fdce5922d1b3..000000000000 --- a/docs/code-docs/requirements.local.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx -recommonmark -sphinx-rtd-theme diff --git a/docs/code-docs/requirements.readthedocs.txt b/docs/code-docs/requirements.readthedocs.txt deleted file mode 100644 index c032a8c9fdad..000000000000 --- a/docs/code-docs/requirements.readthedocs.txt +++ /dev/null @@ -1,2 +0,0 @@ -tqdm -psutil diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index 17f66d2a3438..eb9a412d8a4a 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -20,10 +20,12 @@ author = 'Microsoft' # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = '0.3.0' master_doc = 'index' +autodoc_member_order = 'bysource' + # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be @@ -32,6 +34,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', 'recommonmark', 'sphinx_rtd_theme', ] @@ -76,11 +79,4 @@ autoclass_content = 'both' -autodoc_mock_imports = [ - "torch", - "apex", - "mpi4py", - "tensorboardX", - "deepspeed_transformer_cuda", - "deepspeed_stochastic_transformer_cuda", -] +autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy", "cupy"] diff --git a/docs/code-docs/source/deepspeed.pt.rst b/docs/code-docs/source/deepspeed.pt.rst deleted file mode 100644 index 991963e32a43..000000000000 --- a/docs/code-docs/source/deepspeed.pt.rst +++ /dev/null @@ -1,134 +0,0 @@ -deepspeed.pt package -==================== - -Submodules ----------- - -deepspeed.pt.deepspeed\_config module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_config - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_constants module ----------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_constants - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_csr\_tensor module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_csr_tensor - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_dataloader module ------------------------------------------ - -.. automodule:: deepspeed.pt.deepspeed_dataloader - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_fused\_lamb module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_fused_lamb - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_launch module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_launch - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_light module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_light - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_lr\_schedules module --------------------------------------------- - -.. 
automodule:: deepspeed.pt.deepspeed_lr_schedules - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_run module ----------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_run - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_timer module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_timer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_utils module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_utils - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_zero\_optimizer module ----------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_zero_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_optimizer module ------------------------------------ - -.. automodule:: deepspeed.pt.fp16_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_unfused\_optimizer module --------------------------------------------- - -.. automodule:: deepspeed.pt.fp16_unfused_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.loss\_scaler module --------------------------------- - -.. automodule:: deepspeed.pt.loss_scaler - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: deepspeed.pt - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/code-docs/source/deepspeed.rst b/docs/code-docs/source/deepspeed.rst deleted file mode 100644 index 480793bbcaba..000000000000 --- a/docs/code-docs/source/deepspeed.rst +++ /dev/null @@ -1,38 +0,0 @@ -deepspeed package -================= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - deepspeed.pt - -Submodules ----------- - -deepspeed.git\_version\_info module ------------------------------------ - -.. automodule:: deepspeed.git_version_info - :members: - :undoc-members: - :show-inheritance: - -deepspeed.install\_config module --------------------------------- - -.. automodule:: deepspeed.install_config - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: deepspeed - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 59d53b09912f..faf818c696b3 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -34,6 +34,13 @@ Transformer Kernel API kernel +Pipeline Parallelism +-------------------- +.. toctree:: + :maxdepth: 2 + + pipeline + Indices and tables ------------------ diff --git a/docs/code-docs/source/initialize.rst b/docs/code-docs/source/initialize.rst index ee10154515ea..938045de8fc8 100644 --- a/docs/code-docs/source/initialize.rst +++ b/docs/code-docs/source/initialize.rst @@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser. Training Initialization ----------------------- -The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. +The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not intialized already. Example usage: @@ -36,3 +36,9 @@ Example usage: model_parameters=net.parameters()) .. autofunction:: deepspeed.initialize + +Distributed Initialization +----------------------- +Optional distributed backend initializating separate from ``deepspeed.initialize()``. 
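For example, a minimal sketch of this pattern (the tiny ``Linear`` model and the argparse wiring are placeholders for a real application):

```python
import argparse
import torch
import deepspeed

parser = argparse.ArgumentParser()
parser = deepspeed.add_config_arguments(parser)   # adds --deepspeed, --deepspeed_config, ...
cmd_args = parser.parse_args()

# Initialize the torch.distributed backend before the engine is built.
deepspeed.init_distributed()

# torch.distributed calls (e.g., building a model-parallel group) are now safe.
rank = torch.distributed.get_rank()

net = torch.nn.Linear(1024, 1024)   # stand-in for a real model

# deepspeed.initialize() will reuse the already-initialized backend.
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                     model=net,
                                                     model_parameters=net.parameters())
```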
Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios. + +.. autofunction:: deepspeed.init_distributed diff --git a/docs/code-docs/source/model-checkpointing.rst b/docs/code-docs/source/model-checkpointing.rst index eaf349b27ccb..064f228f1e2c 100644 --- a/docs/code-docs/source/model-checkpointing.rst +++ b/docs/code-docs/source/model-checkpointing.rst @@ -5,8 +5,8 @@ DeepSpeed provides routines for checkpointing model state during training. Loading Training Checkpoints ---------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.load_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.load_checkpoint Saving Training Checkpoints --------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.save_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.save_checkpoint diff --git a/docs/code-docs/source/modules.rst b/docs/code-docs/source/modules.rst deleted file mode 100644 index ffb76bdd7102..000000000000 --- a/docs/code-docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -deepspeed -========= - -.. toctree:: - :maxdepth: 4 - - deepspeed diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst new file mode 100755 index 000000000000..65f1ca2bf33f --- /dev/null +++ b/docs/code-docs/source/optimizers.rst @@ -0,0 +1,12 @@ +Optimizers +=================== + +DeepSpeed offers high-performance implementations of Adam and Lamb optimizers on CPU and GPU, respectively. + +DeepSpeed CPU Adam +---------------------------- +.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + +DeepSpeed Fused Lamb +---------------------------- +.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam diff --git a/docs/code-docs/source/pipeline.rst b/docs/code-docs/source/pipeline.rst new file mode 100644 index 000000000000..b82ea05fee16 --- /dev/null +++ b/docs/code-docs/source/pipeline.rst @@ -0,0 +1,24 @@ +Pipeline Parallelism +==================== + +Model Specification +-------------------- +.. autoclass:: deepspeed.pipe.PipelineModule + :members: + +.. autoclass:: deepspeed.pipe.LayerSpec + :members: + +.. autoclass:: deepspeed.pipe.TiedLayerSpec + :members: + + +Training +-------- +.. automodule:: deepspeed.runtime.pipe.engine + :members: + +Extending Pipeline Parallelism +------------------------------ +.. automodule:: deepspeed.runtime.pipe.schedule + :members: diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst index 55ce4642ff70..d88d755f39cb 100644 --- a/docs/code-docs/source/training.rst +++ b/docs/code-docs/source/training.rst @@ -1,8 +1,8 @@ Training API ============ -:func:`deepspeed.initialize` returns a *model engine* in its first argument -of type ``DeepSpeedLight``. This engine is used to progress training: +:func:`deepspeed.initialize` returns a *training engine* in its first argument +of type :class:`DeepSpeedEngine`. This engine is used to progress training: .. code-block:: python @@ -18,12 +18,16 @@ of type ``DeepSpeedLight``. This engine is used to progress training: Forward Propagation ------------------- -.. autofunction:: deepspeed.DeepSpeedLight.forward +.. autofunction:: deepspeed.DeepSpeedEngine.forward Backward Propagation -------------------- -.. autofunction:: deepspeed.DeepSpeedLight.backward +.. autofunction:: deepspeed.DeepSpeedEngine.backward Optimizer Step -------------- -.. autofunction:: deepspeed.DeepSpeedLight.step +.. 
autofunction:: deepspeed.DeepSpeedEngine.step + +Gradient Accumulation +--------------------- +.. autofunction:: deepspeed.DeepSpeedEngine.is_gradient_accumulation_boundary diff --git a/docs/index.md b/docs/index.md index 7b99cb550db0..61df8442344b 100755 --- a/docs/index.md +++ b/docs/index.md @@ -11,9 +11,13 @@ efficient, and effective.
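A brief sketch of how the engine API documented above, including the new gradient accumulation query, might drive a training loop; ``model_engine`` and ``data_loader`` are assumed to come from ``deepspeed.initialize()`` and the user's data pipeline, and the exact boundary semantics are an assumption to verify against the API docs:

```python
for step, batch in enumerate(data_loader):
    loss = model_engine(batch)       # forward pass through the DeepSpeedEngine
    model_engine.backward(loss)      # backward pass; loss scaling handled internally

    # Assumed to be True on micro-steps where the optimizer will actually
    # apply an update, i.e. at gradient accumulation boundaries.
    will_update = model_engine.is_gradient_accumulation_boundary()

    model_engine.step()              # optimizer step only at boundaries

    if will_update:
        print(f"step {step}: loss = {loss.item():.4f}")
```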

10x Faster Training

Minimal Code Change

-DeepSpeed can train DL models with over a hundred billion parameters on current -generation of GPU clusters, while achieving over 10x in system performance -compared to the state-of-art. Early adopters of DeepSpeed have already produced +DeepSpeed delivers extreme-scale model training for everyone, from data scientists training on massive supercomputers to those training on low-end clusters or even on a single GPU: +* Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. +* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of arts, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. +* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. +* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. + +Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called [Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft), establishing a new SOTA in the LM category. @@ -24,15 +28,13 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? -{% assign news = site.posts | where: "sneak_preview", "false" %} -{% for post in news limit:5 %} - {% if post.link %} - * [{{ post.date | date: "%Y/%m/%d" }}] [{{ post.title }}]({{ post.link }}) {% if post.new_post %} **NEW!** {% endif %} - {% else %} - * [{{ post.date | date: "%Y/%m/%d"}}] [{{ post.title }}]({{ post.url }}) {% if post.new_post %} **NEW!** {% endif %} - {% endif %} -{% endfor %} - +* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) +* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) +* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) + * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) + * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) + * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) + * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) # Why DeepSpeed? Training advanced deep learning models is challenging. Beyond model design, @@ -84,7 +86,7 @@ optimizations on advanced hyperparameter tuning and optimizers. 
For example: ## Memory efficiency DeepSpeed provides memory-efficient data parallelism and enables training models without model parallelism. For example, DeepSpeed can train models with up to 13 billion parameters on -NVIDIA V100 GPUs with 32GB of device memory. In comparison, existing frameworks (e.g., +a single GPU. In comparison, existing frameworks (e.g., PyTorch's Distributed Data Parallel) run out of memory with 1.4 billion parameter models. DeepSpeed reduces the training memory footprint through a novel solution called Zero @@ -94,7 +96,7 @@ significant memory. Furthermore, it also reduces activation memory and fragmente The current implementation (ZeRO-2) reduces memory by up to 8x relative to the state-of-art. You can read more about ZeRO in our [paper](https://arxiv.org/abs/1910.02054), and in our blog posts related to -[ZeRO-1](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/). +[ZeRO-1](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) and [ZeRO-2](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). With this impressive memory reduction, early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called @@ -102,15 +104,15 @@ produced a language model (LM) with over 17B parameters called Turing-NLG, establishing a new SOTA in the LM category. +For model scientists with limited GPU resources, ZeRO-Offload leverages both CPU and GPU memory for training large models. Using a machine with **a single GPU**, our users can run **models of up to 13 billion parameters** without running out of memory, 10x bigger than the existing approaches, while obtaining competitive throughput. This feature democratizes multi-billion-parameter model training and opens the window for many deep learning practitioners to explore bigger and better models. ## Scalability -DeepSpeed supports efficient data parallelism, model parallelism, and their -combination. ZeRO boosts the scaling capability and efficiency further. -* DeepSpeed provides system support to run models up to 170 billion parameters, - 10x larger than the state-of-art (8 billion NVIDIA GPT, 11 billion Google T5). +DeepSpeed supports efficient data parallelism, model parallelism, pipeline parallelism and their +combinations, which we call 3D parallelism. +* 3D parallelism of DeepSpeed provides system support to run models with trillions of parameters, read more in our [press-release]({{ site.press_release_v3 }}) and [tutorial](/tutorials/pipeline). * DeepSpeed can run large models more efficiently, up to 10x faster for models with - various sizes spanning 1.5B to 170B. More specifically, the data parallelism powered by ZeRO + various sizes spanning 1.5B to hundred billion. More specifically, the data parallelism powered by ZeRO is complementary and can be combined with different types of model parallelism. It allows DeepSpeed to fit models using lower degree of model parallelism and higher batch size, offering significant performance gains compared to using model parallelism alone. @@ -123,6 +125,15 @@ combination. ZeRO boosts the scaling capability and efficiency further. The figure depicts system throughput improvements of DeepSpeed (combining ZeRO-powered data parallelism with model parallelism of NVIDIA Megatron-LM) over using Megatron-LM alone.
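To make the ZeRO and ZeRO-Offload options above concrete, here is a hedged sketch of a DeepSpeed config written from Python; the key names (e.g. ``cpu_offload``) are taken from the ZeRO-2/ZeRO-Offload tutorials and should be checked against the current configuration docs:

```python
import json

# Sketch: fp16 training with ZeRO stage 2 plus CPU offload of optimizer state.
ds_config = {
    "train_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "fp16": {
        "enabled": True
    },
    "zero_optimization": {
        "stage": 2,                  # partition optimizer state and gradients
        "cpu_offload": True,         # ZeRO-Offload: keep optimizer state in CPU memory
        "contiguous_gradients": True,
        "overlap_comm": True
    }
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)

# Typical launch: deepspeed train.py --deepspeed --deepspeed_config ds_config.json
```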

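The pipeline-parallel API referenced above (``deepspeed.pipe.PipelineModule``, documented in the new pipeline.rst) can be sketched roughly as follows; the layer sizes, stage count, loss function, ``cmd_args``, and ``train_iter`` are illustrative assumptions, not a prescribed setup:

```python
import torch.nn as nn
import deepspeed
from deepspeed.pipe import PipelineModule, LayerSpec

# A toy 4-layer feed-forward stack split across 2 pipeline stages
# (assumes the job is launched with a world size divisible by 2).
layers = [LayerSpec(nn.Linear, 1024, 1024) for _ in range(4)]
net = PipelineModule(layers=layers,
                     num_stages=2,
                     loss_fn=nn.CrossEntropyLoss())

engine, _, _, _ = deepspeed.initialize(args=cmd_args,
                                       model=net,
                                       model_parameters=net.parameters())

# Pipeline engines are driven by train_batch(), which schedules the
# forward, backward, and optimizer steps across stages internally.
loss = engine.train_batch(data_iter=train_iter)
```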
+## Communication efficiency +Pipeline parallelism of DeepSpeed reduces communication volume during distributed training, which allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. +![Low-bandwidth GPT-2 Performance](/assets/images/pp-lowbw-gpt2.png) + +1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [Read more here](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). + +## Supporting long sequence length +DeepSpeed offers sparse attention kernels—an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers **an order-of-magnitude longer input sequence** and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5–3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse formats and empower users to innovate on their custom sparse structures. [Read more here](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html). + +## Fast convergence for effectiveness DeepSpeed supports advanced hyperparameter tuning and large batch size @@ -139,39 +150,54 @@ Only a few lines of code changes are needed to enable a PyTorch model to use Dee ## Features -Below we provide a brief feature list, see our detailed [feature -overview](/features/) for descriptions and usage. - -* [Distributed Training with Mixed Precision](/features/#distributed-training-with-mixed-precision) - * 16-bit mixed precision - * Single-GPU/Multi-GPU/Multi-Node -* [Model Parallelism](/features/#model-parallelism) - * Support for Custom Model Parallelism - * Integration with Megatron-LM -* [The Zero Redundancy Optimizer (ZeRO)](/features/#the-zero-redundancy-optimizer) - * Optimizer State and Gradient Partitioning - * Activation Partitioning - * Constant Buffer Optimization - * Contiguous Memory Optimization -* [Additional Memory and Bandwidth Optimizations](/features/#additional-memory-and-bandwidth-optimizations) - * Smart Gradient Accumulation - * Communication/Computation Overlap -* [Training Features](/features/#training-features) - * Simplified training API - * Activation Checkpointing API - * Gradient Clipping - * Automatic loss scaling with mixed precision -* [Training Optimizers](/features/#training-optimizers) - * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` - * Memory bandwidth optimized FP16 Optimizer - * Large Batch Training with LAMB Optimizer - * Memory efficient Training with ZeRO Optimizer -* [Training Agnostic Checkpointing](/features/#training-agnostic-checkpointing) -* [Advanced Parameter Search](/features/#advanced-parameter-search) - * Learning Rate Range Test - * 1Cycle Learning Rate Schedule -* [Simplified Data Loader](/features/#simplified-data-loader) -* [Performance Analysis and Debugging](/features/#performance-analysis-and-debugging) +Below we provide a brief feature list; see our detailed [feature overview](https://www.deepspeed.ai/features/) for descriptions and usage.
+ +* [Distributed Training with Mixed Precision](https://www.deepspeed.ai/features/#distributed-training-with-mixed-precision) + * 16-bit mixed precision + * Single-GPU/Multi-GPU/Multi-Node +* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism) + * Support for Custom Model Parallelism + * Integration with Megatron-LM +* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/) + * 3D Parallelism +* [The Zero Redundancy Optimizer (ZeRO)](https://www.deepspeed.ai/tutorials/zero/) + * Optimizer State and Gradient Partitioning + * Activation Partitioning + * Constant Buffer Optimization + * Contiguous Memory Optimization +* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/) + * Leverage both CPU/GPU memory for model training + * Support 10B model training on a single GPU +* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html) +* [Sparse attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html) + * Memory- and compute-efficient sparse kernels + * Support 10x long sequences than dense + * Flexible support to different sparse structures +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) + * Custom communication collective + * Up to 5x communication volume saving +* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) + * Smart Gradient Accumulation + * Communication/Computation Overlap +* [Training Features](https://www.deepspeed.ai/features/#training-features) + * Simplified training API + * Gradient Clipping + * Automatic loss scaling with mixed precision +* [Training Optimizers](https://www.deepspeed.ai/features/#training-optimizers) + * Fused Adam optimizer and arbitrary `torch.optim.Optimizer` + * Memory bandwidth optimized FP16 Optimizer + * Large Batch Training with LAMB Optimizer + * Memory efficient Training with ZeRO Optimizer + * CPU-Adam +* [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing) +* [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search) + * Learning Rate Range Test + * 1Cycle Learning Rate Schedule +* [Simplified Data Loader](https://www.deepspeed.ai/features/#simplified-data-loader) +* [Progressive Layer Dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) + * Efficient and robust compressed training + * Up to 2.5x convergence speedup for pre-training +* [Performance Analysis and Debugging](https://www.deepspeed.ai/features/#performance-analysis-and-debugging) # Contributing @@ -198,4 +224,20 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information comments. # Publications -1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. [ArXiv:1910.02054](https://arxiv.org/abs/1910.02054) +1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727). +2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. 
[In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703). +3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). +4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). + +# Videos +1. DeepSpeed KDD 2020 Tutorial + 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29) + 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28) + 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27) + 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26) + 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93) + 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24) +2. Microsoft Research Webinar + * Registration is free and all videos are available on-demand. + * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html). +3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8) diff --git a/install.sh b/install.sh index 433bcd8b0b07..b9f1501d9cad 100755 --- a/install.sh +++ b/install.sh @@ -15,16 +15,13 @@ By default will install deepspeed and all third party dependecies accross all ma hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally [optional] - -d, --deepspeed_only Install only deepspeed and no third party dependencies - -t, --third_party_only Install only third party dependencies and not deepspeed -l, --local_only Install only on local machine -s, --pip_sudo Run pip install with sudo (default: no sudo) -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) - -a, --apex_commit Install a specific commit hash of apex, instead of the one deepspeed points to - -k, --skip_requirements Skip installing DeepSpeed requirements + -v, --verbose Verbose logging -h, --help This help text """ } @@ -42,27 +39,12 @@ apex_commit="" skip_requirements=0 allow_sudo=0 no_clean=0 +verbose=0 while [[ $# -gt 0 ]] do key="$1" case $key in - -d|--deepspeed_only) - deepspeed_install=1; - third_party_install=0; - ds_only=1; - shift - ;; - -t|--third_party_only) - deepspeed_install=0; - third_party_install=1; - tp_only=1; - shift - ;; - -l|--local_only) - local_only=1; - shift - ;; -s|--pip_sudo) pip_sudo=1; shift @@ -72,13 +54,8 @@ case $key in shift shift ;; - -a|--apex_commit) - apex_commit=$2; - shift - shift - ;; - -k|--skip_requirements) - skip_requirements=1; + -v|--verbose) + verbose=1; shift ;; -r|--allow_sudo) @@ -126,32 +103,29 @@ if [ "$ds_only" == "1" ] && [ "$tp_only" == "1" ]; then exit 1 fi +if [ "$verbose" == "1" ]; then + VERBOSE="-v" +else + VERBOSE="" +fi + rm_if_exist() { echo "Attempting to remove $1" if [ -f $1 ]; then - rm -v $1 + rm $VERBOSE $1 elif [ -d $1 ]; then - rm -vr $1 + rm -r $VERBOSE $1 fi } if [ "$no_clean" == "0" ]; then # remove deepspeed build files - rm_if_exist deepspeed/git_version_info.py + rm_if_exist deepspeed/git_version_info_installed.py rm_if_exist dist rm_if_exist build rm_if_exist deepspeed.egg-info - # remove apex build files - rm_if_exist third_party/apex/dist - rm_if_exist third_party/apex/build - rm_if_exist third_party/apex/apex.egg-info fi -echo "Updating git hash/branch info" -echo "git_hash = '$(git rev-parse --short HEAD)'" > deepspeed/git_version_info.py -echo "git_branch = '$(git rev-parse --abbrev-ref HEAD)'" >> deepspeed/git_version_info.py -cat deepspeed/git_version_info.py - if [ "$pip_sudo" == "1" ]; then PIP_SUDO="sudo -H" else @@ -159,59 +133,25 @@ else fi if [ "$pip_mirror" != "" ]; then - PIP_INSTALL="pip install -v -i $pip_mirror" + PIP_INSTALL="pip install $VERBOSE -i $pip_mirror" else - PIP_INSTALL="pip install -v" + PIP_INSTALL="pip install $VERBOSE" fi + if [ ! 
-f $hostfile ]; then echo "No hostfile exists at $hostfile, installing locally" local_only=1 fi -if [ "$skip_requirements" == "0" ]; then - # Ensure dependencies are installed locally - $PIP_SUDO $PIP_INSTALL -r requirements.txt -fi - -# Build wheels -if [ "$third_party_install" == "1" ]; then - echo "Checking out sub-module(s)" - git submodule update --init --recursive - - echo "Building apex wheel" - cd third_party/apex - - if [ "$apex_commit" != "" ]; then - echo "Installing a non-standard version of apex at commit: $apex_commit" - git fetch - git checkout $apex_commit - fi - - python setup.py -v --cpp_ext --cuda_ext bdist_wheel - cd - - - echo "Installing apex locally so that deepspeed will build" - $PIP_SUDO pip uninstall -y apex - $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl -fi -if [ "$deepspeed_install" == "1" ]; then - echo "Building deepspeed wheel" - python setup.py -v bdist_wheel -fi +echo "Building deepspeed wheel" +python setup.py $VERBOSE bdist_wheel if [ "$local_only" == "1" ]; then - if [ "$deepspeed_install" == "1" ]; then - echo "Installing deepspeed" - $PIP_SUDO pip uninstall -y deepspeed - $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl - python basic_install_test.py - if [ $? == 0 ]; then - echo "Installation is successful" - else - echo "Installation failed" - fi - fi + echo "Installing deepspeed" + $PIP_SUDO pip uninstall -y deepspeed + $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl + ds_report else local_path=`pwd` if [ -f $hostfile ]; then @@ -220,28 +160,16 @@ else echo "hostfile not found, cannot proceed" exit 1 fi - export PDSH_RCMD_TYPE=ssh; + export PDSH_RCMD_TYPE=ssh tmp_wheel_path="/tmp/deepspeed_wheels" pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi" - pdcp -w $hosts requirements.txt ${tmp_wheel_path}/ - if [ "$skip_requirements" == "0" ]; then - pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt" - fi - if [ "$third_party_install" == "1" ]; then - pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex" - pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/ - pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl" - pdsh -w $hosts 'python -c "import apex"' - fi - if [ "$deepspeed_install" == "1" ]; then - echo "Installing deepspeed" - pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed" - pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/ - pdcp -w $hosts basic_install_test.py $tmp_wheel_path/ - pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl" - pdsh -w $hosts "python $tmp_wheel_path/basic_install_test.py" - echo "Installation is successful" - fi - pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl $tmp_wheel_path/basic_install_test.py $tmp_wheel_path/requirements.txt; rmdir $tmp_wheel_path; fi" + pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ + + echo "Installing deepspeed" + pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed" + pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/ + pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl" + pdsh -w $hosts "ds_report" + pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rm $tmp_wheel_path/*.txt; rmdir $tmp_wheel_path; fi" fi diff --git a/op_builder/__init__.py b/op_builder/__init__.py new file mode 100644 index 000000000000..aceced8cedef --- /dev/null +++ b/op_builder/__init__.py @@ -0,0 +1,21 @@ +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder +from .fused_lamb import 
FusedLambBuilder +from .sparse_attn import SparseAttnBuilder +from .transformer import TransformerBuilder +from .stochastic_transformer import StochasticTransformerBuilder +from .utils import UtilsBuilder +from .builder import get_default_compute_capatabilities + +# TODO: infer this list instead of hard coded +# List of all available ops +__op_builders__ = [ + CPUAdamBuilder(), + FusedAdamBuilder(), + FusedLambBuilder(), + SparseAttnBuilder(), + TransformerBuilder(), + StochasticTransformerBuilder(), + UtilsBuilder() +] +ALL_OPS = {op.name: op for op in __op_builders__} diff --git a/op_builder/builder.py b/op_builder/builder.py new file mode 100644 index 000000000000..3959bba5ceff --- /dev/null +++ b/op_builder/builder.py @@ -0,0 +1,304 @@ +import os +import time +import torch +import importlib +from pathlib import Path +import subprocess +from abc import ABC, abstractmethod + +YELLOW = '\033[93m' +END = '\033[0m' +WARNING = f"{YELLOW} [WARNING] {END}" + +DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" +DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" + + +def installed_cuda_version(): + import torch.utils.cpp_extension + cuda_home = torch.utils.cpp_extension.CUDA_HOME + assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" + # Ensure there is not a cuda version mismatch between torch and nvcc compiler + output = subprocess.check_output([cuda_home + "/bin/nvcc", + "-V"], + universal_newlines=True) + output_split = output.split() + release_idx = output_split.index("release") + release = output_split[release_idx + 1].replace(',', '').split(".") + # Ignore patch versions, only look at major + minor + cuda_major, cuda_minor = release[:2] + installed_cuda_version = ".".join(release[:2]) + return int(cuda_major), int(cuda_minor) + + +def get_default_compute_capatabilities(): + compute_caps = DEFAULT_COMPUTE_CAPABILITIES + import torch.utils.cpp_extension + if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version( + )[0] >= 11: + if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0: + # Special treatment of CUDA 11.0 because compute_86 is not supported. + compute_caps += ";8.0" + else: + compute_caps += ";8.0;8.6" + return compute_caps + + +def assert_no_cuda_mismatch(): + cuda_major, cuda_minor = installed_cuda_version() + sys_cuda_version = f'{cuda_major}.{cuda_minor}' + torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + # This is a show-stopping error, should probably not proceed past this + if sys_cuda_version != torch_cuda_version: + if sys_cuda_version == "11.1" and torch_cuda_version == "11.0": + # it works to build against installed cuda-11.1 while torch was built with cuda-11.0 + return + raise Exception( + f"Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") + + +def assert_torch_info(torch_info): + install_torch_version = torch_info['version'] + install_cuda_version = torch_info['cuda_version'] + + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + current_torch_version = ".".join(torch.__version__.split('.')[:2]) + + if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version: + raise RuntimeError( + "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. 
Please re-install " + f"DeepSpeed or switch torch versions. DeepSpeed install versions: " + f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:" + f"torch={current_torch_version}, cuda={current_cuda_version}") + + +class OpBuilder(ABC): + def __init__(self, name): + self.name = name + self.jit_mode = False + + @abstractmethod + def absolute_name(self): + ''' + Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam + will be installed as something like: deepspeed/ops/adam/cpu_adam.so + ''' + pass + + @abstractmethod + def sources(self): + ''' + Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + ''' + pass + + def include_paths(self): + ''' + Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) + ''' + return [] + + def nvcc_args(self): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' + return [] + + def cxx_args(self): + ''' + Returns optional list of compiler flags to forward to the build + ''' + return [] + + def is_compatible(self): + ''' + Check if all non-python dependencies are satisfied to build this op + ''' + return True + + def python_requirements(self): + ''' + Override if op wants to define special dependencies, otherwise will + take self.name and load requirements-.txt if it exists. + ''' + path = f'requirements/requirements-{self.name}.txt' + requirements = [] + if os.path.isfile(path): + with open(path, 'r') as fd: + requirements = [r.strip() for r in fd.readlines()] + return requirements + + def command_exists(self, cmd): + if '|' in cmd: + cmds = cmd.split("|") + else: + cmds = [cmd] + valid = False + for cmd in cmds: + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + valid = valid or result.wait() == 0 + + if not valid and len(cmds) > 1: + print( + f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!" + ) + elif not valid and len(cmds) == 1: + print( + f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!" + ) + return valid + + def warning(self, msg): + print(f"{WARNING} {msg}") + + def deepspeed_src_path(self, code_path): + if os.path.isabs(code_path): + return code_path + else: + return os.path.join(Path(__file__).parent.parent.absolute(), code_path) + + def builder(self): + from torch.utils.cpp_extension import CppExtension + return CppExtension(name=self.absolute_name(), + sources=self.sources(), + include_dirs=self.include_paths(), + extra_compile_args={'cxx': self.cxx_args()}) + + def load(self, verbose=True): + from ...git_version_info import installed_ops, torch_info + if installed_ops[self.name]: + # Ensure the op we're about to load was compiled with the same + # torch/cuda versions we are currently using at runtime. + if isinstance(self, CUDAOpBuilder): + assert_torch_info(torch_info) + + return importlib.import_module(self.absolute_name()) + else: + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + if not self.is_compatible(): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue." + ) + try: + import ninja + except ImportError: + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to ninja not being installed." 
+ ) + + if isinstance(self, CUDAOpBuilder): + assert_no_cuda_mismatch() + + self.jit_mode = True + from torch.utils.cpp_extension import load + + # Ensure directory exists to prevent race condition in some cases + ext_path = os.path.join( + os.environ.get('TORCH_EXTENSIONS_DIR', + DEFAULT_TORCH_EXTENSION_PATH), + self.name) + os.makedirs(ext_path, exist_ok=True) + + start_build = time.time() + op_module = load( + name=self.name, + sources=[self.deepspeed_src_path(path) for path in self.sources()], + extra_include_paths=[ + self.deepspeed_src_path(path) for path in self.include_paths() + ], + extra_cflags=self.cxx_args(), + extra_cuda_cflags=self.nvcc_args(), + verbose=verbose) + build_duration = time.time() - start_build + if verbose: + print(f"Time to load {self.name} op: {build_duration} seconds") + return op_module + + +class CUDAOpBuilder(OpBuilder): + def compute_capability_args(self, cross_compile_archs=None): + """ + Returns nvcc compute capability compile flags. + + 1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`. + 2. If neither is set default compute capabilities will be used + 3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX + + Format: + + - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: + + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... + TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... + + - `cross_compile_archs` uses ; separator. + + """ + + ccs = [] + if self.jit_mode: + # Compile for underlying architectures since we know those at runtime + for i in range(torch.cuda.device_count()): + CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability(i) + cc = f"{CC_MAJOR}.{CC_MINOR}" + if cc not in ccs: + ccs.append(cc) + ccs = sorted(ccs) + ccs[-1] += '+PTX' + else: + # Cross-compile mode, compile for various architectures + # env override takes priority + cross_compile_archs_env = os.environ.get('TORCH_CUDA_ARCH_LIST', None) + if cross_compile_archs_env is not None: + if cross_compile_archs is not None: + print( + f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`" + ) + cross_compile_archs = cross_compile_archs_env.replace(' ', ';') + else: + if cross_compile_archs is None: + cross_compile_archs = get_default_compute_capatabilities() + ccs = cross_compile_archs.split(';') + + args = [] + for cc in ccs: + num = cc[0] + cc[2] + args.append(f'-gencode=arch=compute_{num},code=sm_{num}') + if cc.endswith('+PTX'): + args.append(f'-gencode=arch=compute_{num},code=compute_{num}') + + return args + + def version_dependent_macros(self): + # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + version_ge_1_1 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] + version_ge_1_3 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] + version_ge_1_5 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] + return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + def is_compatible(self): + return super().is_compatible() + + def builder(self): + from torch.utils.cpp_extension import CUDAExtension + assert_no_cuda_mismatch() + return CUDAExtension(name=self.absolute_name(), + 
sources=self.sources(), + include_dirs=self.include_paths(), + extra_compile_args={ + 'cxx': self.cxx_args(), + 'nvcc': self.nvcc_args() + }) diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py new file mode 100644 index 000000000000..20f8fe2d8b6f --- /dev/null +++ b/op_builder/cpu_adam.py @@ -0,0 +1,68 @@ +import os +import torch +import subprocess +from .builder import CUDAOpBuilder + + +class CPUAdamBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu'] + + def include_paths(self): + CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include") + return ['csrc/includes', CUDA_INCLUDE] + + def simd_width(self): + if not self.command_exists('lscpu'): + self.warning( + "CPUAdam attempted to query 'lscpu' to detect the existence " + "of AVX instructions. However, 'lscpu' does not appear to exist on " + "your system, will fall back to non-vectorized execution.") + return '' + + result = subprocess.check_output('lscpu', shell=True) + result = result.decode('utf-8').strip().lower() + if 'genuineintel' in result: + if 'avx512' in result: + return '-D__AVX512__' + elif 'avx2' in result: + return '-D__AVX256__' + return '' + + def cxx_args(self): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + SIMD_WIDTH = self.simd_width() + + return [ + '-O3', + '-std=c++14', + f'-L{CUDA_LIB64}', + '-lcudart', + '-lcublas', + '-g', + '-Wno-reorder', + '-march=native', + '-fopenmp', + SIMD_WIDTH + ] + + def nvcc_args(self): + args = [ + '-O3', + '--use_fast_math', + '-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__' + ] + args += self.compute_capability_args() + return args diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py new file mode 100644 index 000000000000..e9dd71a5530e --- /dev/null +++ b/op_builder/fused_adam.py @@ -0,0 +1,28 @@ +import torch +from .builder import CUDAOpBuilder + + +class FusedAdamBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu'] + + def include_paths(self): + return ['csrc/includes'] + + def cxx_args(self): + return ['-O3'] + self.version_dependent_macros() + + def nvcc_args(self): + return ['-lineinfo', + '-O3', + '--use_fast_math' + ] + self.version_dependent_macros() + self.compute_capability_args() diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py new file mode 100644 index 000000000000..33a98387b96c --- /dev/null +++ b/op_builder/fused_lamb.py @@ -0,0 +1,28 @@ +import torch +from .builder import CUDAOpBuilder + + +class FusedLambBuilder(CUDAOpBuilder): + BUILD_VAR = 'DS_BUILD_FUSED_LAMB' + NAME = "fused_lamb" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.lamb.{self.NAME}_op' + + def sources(self): + return ['csrc/lamb/fused_lamb_cuda.cpp', 'csrc/lamb/fused_lamb_cuda_kernel.cu'] + + def include_paths(self): + return ['csrc/includes'] + + def cxx_args(self): + return ['-O3'] + self.version_dependent_macros() + + def nvcc_args(self): + return ['-lineinfo', + 
'-O3', + '--use_fast_math' + ] + self.version_dependent_macros() + self.compute_capability_args() diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py new file mode 100644 index 000000000000..c3fa5624b25e --- /dev/null +++ b/op_builder/sparse_attn.py @@ -0,0 +1,49 @@ +import torch +import warnings +from .builder import OpBuilder + + +class SparseAttnBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_SPARSE_ATTN" + NAME = "sparse_attn" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.sparse_attention.{self.NAME}_op' + + def sources(self): + return ['csrc/sparse_attention/utils.cpp'] + + def cxx_args(self): + return ['-O2', '-fopenmp'] + + def is_compatible(self): + # Check to see if llvm and cmake are installed since they are dependencies + required_commands = ['llvm-config|llvm-config-9', 'cmake'] + command_status = list(map(self.command_exists, required_commands)) + deps_compatible = all(command_status) + + # torch-cpu will not have a cuda version + if torch.version.cuda is None: + cuda_compatible = False + self.warning(f"{self.NAME} cuda is not available from torch") + else: + major, minor = torch.version.cuda.split('.')[:2] + cuda_compatible = int(major) == 10 and int(minor) >= 1 + if not cuda_compatible: + self.warning( + f"{self.NAME} requires CUDA version 10.1+, does not currently support >=11 or <10.1" + ) + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 + if not torch_compatible: + self.warning( + f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' + ) + + return super().is_compatible( + ) and deps_compatible and torch_compatible and cuda_compatible diff --git a/op_builder/stochastic_transformer.py b/op_builder/stochastic_transformer.py new file mode 100644 index 000000000000..50dfea7c6698 --- /dev/null +++ b/op_builder/stochastic_transformer.py @@ -0,0 +1,18 @@ +import torch +from .transformer import TransformerBuilder + + +class StochasticTransformerBuilder(TransformerBuilder): + BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER" + NAME = "stochastic_transformer" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.transformer.{self.NAME}_op' + + def nvcc_args(self): + args = super().nvcc_args() + args.append('-D__STOCHASTIC_MODE__') + return args diff --git a/op_builder/transformer.py b/op_builder/transformer.py new file mode 100644 index 000000000000..2735b078fb98 --- /dev/null +++ b/op_builder/transformer.py @@ -0,0 +1,44 @@ +import torch +from .builder import CUDAOpBuilder + + +class TransformerBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER" + NAME = "transformer" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.transformer.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/transformer/ds_transformer_cuda.cpp', + 'csrc/transformer/cublas_wrappers.cu', + 'csrc/transformer/transform_kernels.cu', + 'csrc/transformer/gelu_kernels.cu', + 'csrc/transformer/dropout_kernels.cu', + 'csrc/transformer/normalize_kernels.cu', + 'csrc/transformer/softmax_kernels.cu', + 'csrc/transformer/general_kernels.cu' + ] + + def include_paths(self): + return ['csrc/includes'] + + def nvcc_args(self): + args = [ + '-O3', + '--use_fast_math', + '-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', + 
'-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__' + ] + + return args + self.compute_capability_args() + + def cxx_args(self): + return ['-O3', '-std=c++14', '-g', '-Wno-reorder'] diff --git a/op_builder/utils.py b/op_builder/utils.py new file mode 100644 index 000000000000..1631a2cf18b2 --- /dev/null +++ b/op_builder/utils.py @@ -0,0 +1,15 @@ +from .builder import OpBuilder + + +class UtilsBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_UTILS" + NAME = "utils" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.{self.NAME}_op' + + def sources(self): + return ['csrc/utils/flatten_unflatten.cpp'] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 066129813851..000000000000 --- a/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -torch>=1.2 -torchvision>=0.4.0 -pillow==6.2.2 -tqdm -psutil -tensorboardX==1.8 -pytest -pytest-forked -pre-commit -clang-format diff --git a/requirements/requirements-1bit-adam.txt b/requirements/requirements-1bit-adam.txt new file mode 100644 index 000000000000..66c5ba0468f8 --- /dev/null +++ b/requirements/requirements-1bit-adam.txt @@ -0,0 +1 @@ +mpi4py diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt new file mode 100644 index 000000000000..0118f6ee760c --- /dev/null +++ b/requirements/requirements-dev.txt @@ -0,0 +1,7 @@ +pytest +pytest-forked +pre-commit +clang-format +sphinx +recommonmark +sphinx-rtd-theme diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt new file mode 100644 index 000000000000..78620c472c9d --- /dev/null +++ b/requirements/requirements-readthedocs.txt @@ -0,0 +1 @@ +tqdm diff --git a/requirements/requirements-sparse_attn.txt b/requirements/requirements-sparse_attn.txt new file mode 100644 index 000000000000..5c3a59af407c --- /dev/null +++ b/requirements/requirements-sparse_attn.txt @@ -0,0 +1 @@ +triton==0.2.3 diff --git a/requirements/requirements.txt b/requirements/requirements.txt new file mode 100644 index 000000000000..9192befdd35c --- /dev/null +++ b/requirements/requirements.txt @@ -0,0 +1,6 @@ +torch>=1.2 +torchvision>=0.4.0 +tqdm +tensorboardX==1.8 +ninja +numpy diff --git a/setup.py b/setup.py index cc1484177b24..19df040dcc88 100755 --- a/setup.py +++ b/setup.py @@ -9,126 +9,189 @@ """ import os -import torch +import shutil +import subprocess +import warnings from setuptools import setup, find_packages -from torch.utils.cpp_extension import CUDAExtension, BuildExtension + +try: + import torch + from torch.utils.cpp_extension import BuildExtension +except ImportError: + raise ImportError('Unable to import torch, please visit https://pytorch.org/ ' + 'to see how to properly install torch on your system.') + +from op_builder import ALL_OPS, get_default_compute_capatabilities + + +def fetch_requirements(path): + with open(path, 'r') as fd: + return [r.strip() for r in fd.readlines()] + + +install_requires = fetch_requirements('requirements/requirements.txt') +extras_require = { + '1bit_adam': fetch_requirements('requirements/requirements-1bit-adam.txt'), + 'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'), + 'dev': fetch_requirements('requirements/requirements-dev.txt'), +} + +# If MPI is available add 1bit-adam requirements +if torch.cuda.is_available(): + if shutil.which('ompi_info') or shutil.which('mpiname'): + cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" + extras_require['1bit_adam'].append(cupy) + +# Make 
an [all] extra that installs all needed dependencies +all_extras = set() +for extra in extras_require.items(): + for req in extra[1]: + all_extras.add(req) +extras_require['all'] = list(all_extras) cmdclass = {} -cmdclass['build_ext'] = BuildExtension -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) +# For any pre-installed ops force disable ninja +cmdclass['build_ext'] = BuildExtension.with_options(use_ninja=False) + +TORCH_MAJOR = torch.__version__.split('.')[0] +TORCH_MINOR = torch.__version__.split('.')[1] if not torch.cuda.is_available(): - # Fix to allow docker buils, similar to https://github.com/NVIDIA/apex/issues/486 + # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486 print( - "[WARNING] Torch did not find cuda available, if cross-compling or running with cpu only " + "[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing " "(compute capabilities 6.0, 6.1, 6.2)") if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" - -# Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ['-DVERSION_GE_1_1'] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ['-DVERSION_GE_1_3'] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ['-DVERSION_GE_1_5'] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - -ext_modules = [ - CUDAExtension( - name='deepspeed_lamb_cuda', - sources=['csrc/lamb/fused_lamb_cuda.cpp', - 'csrc/lamb/fused_lamb_cuda_kernel.cu'], - include_dirs=['csrc/includes'], - extra_compile_args={ - 'cxx': [ - '-O3', - ] + version_dependent_macros, - 'nvcc': ['-O3', - '--use_fast_math'] + version_dependent_macros - }), - CUDAExtension(name='deepspeed_transformer_cuda', - sources=[ - 'csrc/transformer/ds_transformer_cuda.cpp', - 'csrc/transformer/cublas_wrappers.cu', - 'csrc/transformer/transform_kernels.cu', - 'csrc/transformer/gelu_kernels.cu', - 'csrc/transformer/dropout_kernels.cu', - 'csrc/transformer/normalize_kernels.cu', - 'csrc/transformer/softmax_kernels.cu', - 'csrc/transformer/general_kernels.cu' - ], - include_dirs=['csrc/includes'], - extra_compile_args={ - 'cxx': ['-O3', - '-std=c++14', - '-g', - '-Wno-reorder'], - 'nvcc': [ - '-O3', - '--use_fast_math', - '-gencode', - 'arch=compute_61,code=compute_61', - '-gencode', - 'arch=compute_70,code=compute_70', - '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' - ] - }), - CUDAExtension(name='deepspeed_stochastic_transformer_cuda', - sources=[ - 'csrc/transformer/ds_transformer_cuda.cpp', - 'csrc/transformer/cublas_wrappers.cu', - 'csrc/transformer/transform_kernels.cu', - 'csrc/transformer/gelu_kernels.cu', - 'csrc/transformer/dropout_kernels.cu', - 'csrc/transformer/normalize_kernels.cu', - 'csrc/transformer/softmax_kernels.cu', - 'csrc/transformer/general_kernels.cu' - ], - include_dirs=['csrc/includes'], - extra_compile_args={ - 'cxx': ['-O3', - '-std=c++14', - '-g', - '-Wno-reorder'], - 'nvcc': [ - '-O3', - '--use_fast_math', - '-gencode', - 'arch=compute_61,code=compute_61', - 
'-gencode', - 'arch=compute_70,code=compute_70', - '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__', - '-D__STOCHASTIC_MODE__' - ] - }), -] + os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities() + +ext_modules = [] + +# Default to pre-install kernels to false so we rely on JIT +BUILD_OP_DEFAULT = int(os.environ.get('DS_BUILD_OPS', 0)) +print(f"DS_BUILD_OPS={BUILD_OP_DEFAULT}") + + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + + +def op_enabled(op_name): + assert hasattr(ALL_OPS[op_name], 'BUILD_VAR'), \ + f"{op_name} is missing BUILD_VAR field" + env_var = ALL_OPS[op_name].BUILD_VAR + return int(os.environ.get(env_var, BUILD_OP_DEFAULT)) + + +install_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + + # If op is compatible update install reqs so it can potentially build/run later + if op_compatible: + reqs = builder.python_requirements() + install_requires += builder.python_requirements() + + # If op install enabled, add builder to extensions + if op_enabled(op_name) and op_compatible: + install_ops[op_name] = op_enabled(op_name) + ext_modules.append(builder.builder()) + +compatible_ops = {op_name: op.is_compatible() for (op_name, op) in ALL_OPS.items()} + +print(f'Install Ops={install_ops}') + +# Write out version/git info +git_hash_cmd = "git rev-parse --short HEAD" +git_branch_cmd = "git rev-parse --abbrev-ref HEAD" +if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" +else: + git_hash = "unknown" + git_branch = "unknown" + +# Parse the DeepSpeed version string from version.txt +version_str = open('version.txt', 'r').read().strip() + +# Build specifiers like .devX can be added at install time. Otherwise, add the git hash. 
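For reference, the precedence that the version-string hunk below implements can be sketched in a few lines of standalone Python; the base version and git hash here are made-up examples, not values taken from this patch:

import os

def resolve_version(base='0.3.0', git_hash='abc1234', build_txt=None):
    # Mirrors the setup.py logic below: DS_BUILD_STRING (building a wheel for
    # distribution) wins, then a build.txt shipped inside the built distribution,
    # otherwise a source install gets the local-version suffix +git_hash.
    if 'DS_BUILD_STRING' in os.environ:
        return base + os.environ['DS_BUILD_STRING']   # e.g. 0.3.0.dev20201022
    if build_txt:
        return base + build_txt.strip()               # installing from sdist/wheel
    return base + f'+{git_hash}'                      # e.g. 0.3.0+abc1234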
+# example: DS_BUILD_STR=".dev20201022" python setup.py sdist bdist_wheel +#version_str += os.environ.get('DS_BUILD_STRING', f'+{git_hash}') + +# Building wheel for distribution, update version file + +if 'DS_BUILD_STRING' in os.environ: + # Build string env specified, probably building for distribution + with open('build.txt', 'w') as fd: + fd.write(os.environ.get('DS_BUILD_STRING')) + version_str += os.environ.get('DS_BUILD_STRING') +elif os.path.isfile('build.txt'): + # build.txt exists, probably installing from distribution + with open('build.txt', 'r') as fd: + version_str += fd.read().strip() +else: + # None of the above, probably installing from source + version_str += f'+{git_hash}' + +torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) +# Set cuda_version to 0.0 if cpu-only +cuda_version = "0.0" +if torch.version.cuda is not None: + cuda_version = ".".join(torch.version.cuda.split('.')[:2]) +torch_info = {"version": torch_version, "cuda_version": cuda_version} + +print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") +with open('deepspeed/git_version_info_installed.py', 'w') as fd: + fd.write(f"version='{version_str}'\n") + fd.write(f"git_hash='{git_hash}'\n") + fd.write(f"git_branch='{git_branch}'\n") + fd.write(f"installed_ops={install_ops}\n") + fd.write(f"compatible_ops={compatible_ops}\n") + fd.write(f"torch_info={torch_info}\n") + +print(f'install_requires={install_requires}') +print(f'compatible_ops={compatible_ops}') +print(f'ext_modules={ext_modules}') + +# Parse README.md to make long_description for PyPI page. +thisdir = os.path.abspath(os.path.dirname(__file__)) +with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin: + readme_text = fin.read() setup(name='deepspeed', - version='0.2.0', + version=version_str, description='DeepSpeed library', + long_description=readme_text, + long_description_content_type='text/markdown', author='DeepSpeed Team', author_email='deepspeed@microsoft.com', - url='http://aka.ms/deepspeed', + url='http://deepspeed.ai', + install_requires=install_requires, + extras_require=extras_require, packages=find_packages(exclude=["docker", - "third_party", - "csrc"]), - scripts=['bin/deepspeed', - 'bin/deepspeed.pt', - 'bin/ds', - 'bin/ds_ssh'], - classifiers=['Programming Language :: Python :: 3.6'], + "third_party"]), + include_package_data=True, + scripts=[ + 'bin/deepspeed', + 'bin/deepspeed.pt', + 'bin/ds', + 'bin/ds_ssh', + 'bin/ds_report', + 'bin/ds_elastic' + ], + classifiers=[ + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8' + ], + license='MIT', ext_modules=ext_modules, cmdclass=cmdclass) diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json index 4322433ed279..f0e7ca55ac65 100755 --- a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json +++ b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_config.json @@ -1,6 +1,6 @@ { "train_batch_size": 24, - "train_micro_batch_size_per_gpu": 6, + "train_micro_batch_size_per_gpu": 3, "steps_per_print": 1, "optimizer": { "type": "Adam", diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json index 4fb783082801..4a3d8c8f8dbf 100755 --- a/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json +++ b/tests/model/BingBertSquad/deepspeed_bsz24_fp16_zero2_config.json @@ -1,6 +1,6 @@ { "train_batch_size": 24, - 
"train_micro_batch_size_per_gpu": 6, + "train_micro_batch_size_per_gpu": 3, "steps_per_print": 1, "optimizer": { "type": "Adam", diff --git a/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json b/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json index 1b2a07f1c3dc..328be06298db 100755 --- a/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json +++ b/tests/model/BingBertSquad/deepspeed_bsz24_fp32_config.json @@ -1,6 +1,6 @@ { "train_batch_size": 24, - "train_micro_batch_size_per_gpu": 6, + "train_micro_batch_size_per_gpu": 3, "steps_per_print": 1, "optimizer": { "type": "Adam", diff --git a/tests/model/BingBertSquad/run_BingBertSquad.sh b/tests/model/BingBertSquad/run_BingBertSquad.sh index 181009ff2620..7631217619ae 100755 --- a/tests/model/BingBertSquad/run_BingBertSquad.sh +++ b/tests/model/BingBertSquad/run_BingBertSquad.sh @@ -121,7 +121,7 @@ echo "deepspeed: ${enable_deepspeed}" echo "other_args: ${other_args}" EFFECTIVE_BATCH_SIZE=${batch_size} -MAX_GPU_BATCH_SIZE=6 +MAX_GPU_BATCH_SIZE=3 PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus)) if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then GRAD_ACCUM_STEPS=1 diff --git a/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh b/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh index 91dcbdb723bb..1b49a37b783f 100755 --- a/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh +++ b/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh @@ -122,7 +122,7 @@ echo "deepspeed: ${enable_deepspeed}" echo "other_args: ${other_args}" EFFECTIVE_BATCH_SIZE=${batch_size} -MAX_GPU_BATCH_SIZE=6 +MAX_GPU_BATCH_SIZE=3 PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus)) if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then GRAD_ACCUM_STEPS=1 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json index 2a3b9ca5a0be..33562614aa9a 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json @@ -3,13 +3,7 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":1 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 1 }, "gradient_clipping": 1.0, "fp16": { diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json index fde222a3cca2..afc33ee7ea0d 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json @@ -3,17 +3,11 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2, + "stage": 2, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json new file mode 100755 index 000000000000..a436fe9f04c6 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json @@ -0,0 +1,20 @@ +{ + "train_batch_size": 4, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true, + "cpu_offload": true + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 
1000, + "hysteresis": 2, + "min_loss_scale": 1 + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json index 99637973cd60..63b30c225753 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json @@ -3,13 +3,7 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 0 }, "gradient_clipping": 1.0, "fp16": { diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json new file mode 100755 index 000000000000..166db751361a --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json @@ -0,0 +1,23 @@ +{ + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 3, + "steps_per_print": 1, + "zero_optimization": { + "stage": 0, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json index 8d44659a9ee3..ca4d5ed9dfef 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json @@ -2,14 +2,8 @@ "train_batch_size": 8, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization":{ - "stage":1 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "zero_optimization": { + "stage": 1 }, "gradient_clipping": 1.0, "fp16": { diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json index fde90e8274b8..9dd4a0807ba4 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json @@ -3,17 +3,11 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2, + "stage": 2, "reduce_bucket_size": 7000000, "allgather_bucket_size": 7000000, "reduce_scatter": true }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, @@ -26,5 +20,4 @@ "partition_activations": true, "contiguous_memory_optimization": true } - } diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json new file mode 100755 index 000000000000..531975c499f5 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json @@ -0,0 +1,23 @@ +{ + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 3, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json 
b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json new file mode 100755 index 000000000000..48d620c79e33 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json @@ -0,0 +1,24 @@ +{ + "train_batch_size": 8, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true, + "cpu_offload": true + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json index 60c810786bf0..6f8f320b08cb 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json +++ b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json @@ -3,13 +3,7 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":2 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "stage": 2 }, "gradient_clipping": 1.0, "scheduler": { @@ -20,7 +14,6 @@ "warmup_num_steps": 10 } }, - "fp16": { "enabled": true, "loss_scale": 0, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json old mode 100644 new mode 100755 index f160ccd8e610..ae22cf625c69 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json @@ -2,14 +2,10 @@ "train_batch_size": 16, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization": 1, - "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "zero_optimization": { + "stage": 1 }, + "disable_allgather": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json index 6e23fe687bc8..64fb73e9bec8 100755 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json @@ -3,15 +3,9 @@ "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { - "stage":1 + "stage": 1 }, "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json old mode 100644 new mode 100755 index 514496958e14..c274b15becf2 --- a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json +++ b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json @@ -2,14 +2,10 @@ "train_batch_size": 8, "gradient_accumulation_steps": 1, "steps_per_print": 1, - "zero_optimization": 1, - "disable_allgather": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } + "zero_optimization": { + "stage": 1 }, + "disable_allgather": true, "gradient_clipping": 1.0, "fp16": { "enabled": true, diff --git a/tests/model/Megatron_GPT2/ds_gpt2_test.sh b/tests/model/Megatron_GPT2/ds_gpt2_test.sh index 5c901f855a33..ac5d7e379023 100755 --- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh +++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh @@ -3,7 +3,7 @@ helpFunction() { echo "" - echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l 
layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]" + echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]" echo -e "\t-m model parallelism" echo -e "\t-g gpus per node" echo -e "\t-n node count" @@ -17,6 +17,7 @@ helpFunction() echo -e "\t-o other args" echo -e "\t-d DeepSpeed config json file" echo -e "\t-z Enable Zero optimization" + echo -e "\t-p DeepSpeed master port" exit 1 } @@ -27,6 +28,7 @@ ckpt_num_layers=1 other_args="" ds_opt="" zero_opt="" +master_port=29600 script_path=$(realpath $0) script_dir=$(dirname $script_path) @@ -44,6 +46,7 @@ do q ) seq_length="$OPTARG" ;; e ) heads="$OPTARG" ;; c ) ckpt_num_layers="$OPTARG" ;; + p ) master_port="$OPTARG" ;; o ) other_args="$OPTARG" ;; d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;; z ) zero_opt="--zero_optimization" ;; @@ -93,7 +96,7 @@ gpt_options=" \ " work_dir="../../../DeepSpeedExamples/Megatron-LM/" -run_cmd="(cd ${work_dir} && deepspeed --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" +run_cmd="(cd ${work_dir} && deepspeed --master_port ${master_port} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" echo ${run_cmd} eval ${run_cmd} diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index 116e58b98fa2..cf11af6c2ae4 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -97,6 +97,29 @@ def test_mp2_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu8_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp1_gpu2_load_gpu1_node1_with_zero1(self): test_config = { "mp": 1, @@ -110,7 +133,7 @@ def test_mp1_gpu2_load_gpu1_node1_with_zero1(self): "seq_length": 256, "heads": ATTN_HEADS, "deepspeed": True, - "tag": "ds_zero2", + "tag": "ds_zero1", "zero": True, "other_args": "", "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero1", @@ -133,7 +156,7 @@ def test_mp1_gpu2_load_gpu4_node1_with_zero1(self): "seq_length": 256, "heads": ATTN_HEADS, "deepspeed": True, - "tag": "ds_zero2", + "tag": "ds_zero1", "zero": True, "other_args": "", "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero1", @@ -166,6 +189,30 @@ def test_mp1_gpu2_load_gpu1_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp1_gpu2_load_gpu1_node1_with_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "load_gpus": 1, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def 
test_mp1_gpu2_load_gpu4_node1_with_zero2(self): test_config = { "mp": 1, @@ -189,6 +236,30 @@ def test_mp1_gpu2_load_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp1_gpu2_load_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "load_gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu4_load_gpu2_node1_with_zero1(self): test_config = { "mp": 2, @@ -258,6 +329,30 @@ def test_mp2_gpu4_load_gpu2_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_load_gpu2_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "load_gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu2_load_gpu4_node1_with_zero2(self): test_config = { "mp": 2, @@ -281,6 +376,30 @@ def test_mp2_gpu2_load_gpu4_node1_with_zero2(self): succ = self.run_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu2_load_gpu4_node1_with_zero2_offload(self): + test_config = { + "mp": 2, + "gpus": 2, + "load_gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1100, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": 256, + "heads": ATTN_HEADS, + "deepspeed": True, + "tag": "ds_zero2_offload", + "zero": True, + "other_args": "", + "checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero2_offload", + "checkpoint_interval": 1000, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp2_gpu4_node1_without_zero(self): test_config = { "mp": 2, @@ -306,7 +425,8 @@ def test_mp2_gpu4_node1_without_zero(self): def gen_name(self, test_config, prefix): save_dir = "checkpoint_test_logs" tag = test_config["tag"] - file_name = f"_{tag}.log" + checkpoint_name = test_config["checkpoint_name"] + file_name = f"_{tag}_{checkpoint_name}.log" return os.path.join(save_dir, prefix + file_name) def run_test(self, test_config, r_tol): @@ -334,10 +454,15 @@ def run_test(self, test_config, r_tol): except: print("No old checkpoint") + if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]: + cpu_optimizer_flag = " --cpu-optimizer" + else: + cpu_optimizer_flag = "" + #-----------------Saving Checkpoint-----------------# - #building checkpoint arguments + # building checkpoint arguments test_config[ - "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\"" + "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval} {cpu_optimizer_flag}\"" prefix = "gpt2_saving_checkpoint" @@ -356,10 +481,11 @@ def run_test(self, test_config, r_tol): #-----------------Loading 
Checkpoint-----------------# - #building checkpoint arguments - test_config["other_args"] = f"\"--load {checkpoint_folder}\"" + # building checkpoint arguments + test_config[ + "other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" - #set checkpoint load iteration + # set checkpoint load iteration try: cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt" print(f"{self.id()} running cmd: {cmd}") @@ -411,20 +537,32 @@ def check_parity(self, base_file, test_file, r_tol): def checkpoint_suite(): suite = unittest.TestSuite() + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2_offload')) # Shrink DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) # Expand DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2')) + suite.addTest( + GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero')) diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index cf4034e585f0..f8ab5bcb3333 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -14,11 +14,12 @@ HIDDEN_SIZE = 128 ATTN_HEADS = 8 SEQ_LEN = 64 +MASTER_PORT = 29700 def grep_loss_from_file(file_name): loss = 0.0 - + print(f'grepping {file_name}') with open(file_name, 'r') as f: lines = f.readlines() line_filter = "validation loss at the end of training for test data | LM loss:" @@ -48,6 +49,24 @@ def setUp(self): def tearDown(self): os.chdir(self.save_dir) + def test_mp1_gpu2_node1_fp16(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_no_zero.json", + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + def test_mp1_gpu1_node1_zero1(self): test_config = { "mp": 1, @@ -171,10 +190,12 @@ def test_mp2_gpu4_node1_zero2(self): "json": "ds_config_func_bs8_zero2.json", } - succ = self.run_test(test_config, 0.01) + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) self.assertTrue(succ) - succ = self.run_partition_activations_test(test_config, 0.01) + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) self.assertTrue(succ) def test_mp4_gpu4_node1_zero2(self): @@ -192,12 +213,209 @@ def test_mp4_gpu4_node1_zero2(self): "json": 
"ds_config_func_bs8_zero2.json", } + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp1_gpu1_node1_zero2_ds_offload(self): + test_config = { + "mp": 1, + "gpus": 1, + "nodes": 1, + "bs": 4, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs4_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.02) + self.assertTrue(succ) + + def test_mp1_gpu2_node1_zero2_ds_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + succ = self.run_test(test_config, 0.02) + self.assertTrue(succ) + + def test_mp2_gpu4_node1_zero2_gas(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": True, + "json": "ds_config_func_bs8_zero2_gas3.json", + "baseline": "ds_config_func_bs8_zero0_gas3.json", + } + succ = self.run_test(test_config, 0.01) self.assertTrue(succ) succ = self.run_partition_activations_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_node1_zero2_ds_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.02) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.02) + self.assertTrue(succ) + + def test_mp4_gpu4_node1_zero2_ds_offload(self): + test_config = { + "mp": 4, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.02) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.02) + self.assertTrue(succ) + + def test_mp1_gpu1_node1_zero2_torch_offload(self): + test_config = { + "mp": 1, + "gpus": 1, + "nodes": 1, + "bs": 4, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs4_zero2_offload.json", + "cpu_optimizer": True, + "test_torch_offload": True, + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + + def test_mp1_gpu2_node1_zero2_torch_offload(self): + test_config = { + "mp": 1, + "gpus": 2, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + 
"test_torch_offload": True, + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + + def test_mp2_gpu4_node1_zero2_torch_offload(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + "test_torch_offload": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + self.assertTrue(succ) + + def test_mp4_gpu4_node1_zero2_torch_offload(self): + test_config = { + "mp": 4, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": False, + "json": "ds_config_func_bs8_zero2_offload.json", + "cpu_optimizer": True, + "test_torch_offload": True, + } + + basic_run_config = test_config + succ = self.run_test(basic_run_config, 0.01) + self.assertTrue(succ) + + partition_activation_config = test_config + succ = self.run_partition_activations_test(partition_activation_config, 0.01) + def test_optimizer_scheduler(self): test_config = { "mp": 1, @@ -224,9 +442,24 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix = "gpt2_func_" prefix = "gpt2_partition_activation_" + deepspeed_config = test_config["json"] + baseline_deepspeed_config = False + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True) + # baseline run... - test_config["deepspeed"] = False - base_file = self.gen_output_name(test_config, baseline_prefix) + # turnoff deepspeed if baseline deepspeed config + # is not provided + if not "baseline" in test_config: + test_config["deepspeed"] = False + else: + test_config["json"] = test_config["baseline"] + baseline_prefix += test_config["json"][0:-5] + baseline_deepspeed_config = True + + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" + base_file = self.gen_output_name(test_config, + baseline_prefix, + baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -237,7 +470,11 @@ def run_partition_activations_test(self, test_config, r_tol): # DeepSpeed run... test_config["deepspeed"] = True - test_config["other_args"] = "--deepspeed-activation-checkpointing" + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) + test_config[ + "other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" + test_config["json"] = deepspeed_config + print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) self.run_gpt2_test(test_config, test_file) @@ -249,10 +486,28 @@ def run_test(self, test_config, r_tol): print("{0}: starting......".format(self.id())) prefix = "gpt2_func" + baseline_prefix = prefix + + deepspeed_config = test_config["json"] + baseline_deepspeed_config = False + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True) + + # baseline run... 
+ # turn off deepspeed if a baseline deepspeed config + # is not provided + if not "baseline" in test_config: + test_config["deepspeed"] = False + else: + test_config["json"] = test_config["baseline"] + baseline_prefix = prefix + test_config["json"][0:-5] + baseline_deepspeed_config = True + + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" # baseline run... - test_config["deepspeed"] = False - base_file = self.gen_output_name(test_config, prefix) + base_file = self.gen_output_name(test_config, + baseline_prefix, + baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -263,6 +518,9 @@ def run_test(self, test_config, r_tol): # DeepSpeed run... test_config["deepspeed"] = True + cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) + test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" + print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) self.run_gpt2_test(test_config, test_file) @@ -292,9 +550,40 @@ def check_parity(self, base_file, test_file, r_tol): return True + def gen_cpu_optimizer_flag(self, test_config, is_baseline): + if 'cpu_optimizer' in test_config and test_config['cpu_optimizer']: + cpu_optimizer_flag = "--cpu-optimizer" + if is_baseline: + cpu_optimizer_flag += " --cpu_torch_adam" + return cpu_optimizer_flag + if 'test_torch_offload' in test_config and test_config['test_torch_offload']: + cpu_optimizer_flag += " --cpu_torch_adam" + return cpu_optimizer_flag + else: + cpu_optimizer_flag = "" + + return cpu_optimizer_flag + def suite(): suite = unittest.TestSuite() + + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_fp16')) + + # Baseline = Megatron + Torch.Optim.Adam + # Test = Megatron + Torch.Optim.Adam + ZeRO-Offload + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_torch_offload')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_torch_offload')) + + # Baseline = Megatron + Torch.Optim.Adam + # Test = Megatron + DeepSpeedAdam + ZeRO-Offload + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_ds_offload')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) @@ -305,7 +594,10 @@ def suite(): suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2')) suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_gas')) + suite.addTest(GPT2FuncTestCase('test_optimizer_scheduler')) + return suite diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index 7567b5a14f46..ae1dd328de2e 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -16,7 +16,7 @@ def __init__(self, methodName="DeepSpeed performance test"): self.baseline_dir = "./baseline" self.timestr = time.strftime("%Y%m%d-%H%M%S") - def gen_output_name(self, test_config, prefix): + def gen_output_name(self, test_config, prefix, baseline_config=False): other_args = test_config["other_args"] if "other_args" 
in test_config else "" zero_args = "_zero" if "zero" in test_config and test_config["zero"] else "" other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "") @@ -24,7 +24,7 @@ def gen_output_name(self, test_config, prefix): if other_args: other_args = "_" + other_args - if test_config["deepspeed"]: + if test_config["deepspeed"] and not baseline_config: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format( test_config["mp"], test_config["gpus"], diff --git a/tests/onebitadam/test_com_reduce_cuda.py b/tests/onebitadam/test_com_reduce_cuda.py new file mode 100644 index 000000000000..a5a87ce67232 --- /dev/null +++ b/tests/onebitadam/test_com_reduce_cuda.py @@ -0,0 +1,86 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-1:2245', + world_size=size, + rank=rank) + +dummy_model = [torch.nn.Parameter(torch.ones(10))] + +# Set cuda_aware to True to use CUDA buffers for communication +dummy_optim = OnebitAdam(dummy_model, cuda_aware=True) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + + +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + +tensor_size = 100 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) +a_torch, worker_error_torch, server_error_torch = torch_sim(a) +torch.cuda.empty_cache() +local_rank = rank % torch.cuda.device_count() +a_after = dummy_optim.Compressed_Allreduce(a, + worker_error, + server_error, + rank, + size, + comm, + local_rank) +threshold = 1e-6 +magnitude_threshold = 1e-6 +diff_mask = (a_after - a_torch) > threshold +diff_server_mask = torch.chunk(diff_mask, size)[rank] +mpi_server = torch.chunk(a_after, size)[rank] + server_error +torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + +# If the number in the compensated_server_m is too small (e.g 
1e-8), then calling sign() might be problematic +# The test would skip those numbers that are too small in compensated_server_m +if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) +else: + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebitadam/test_com_reduce_host.py b/tests/onebitadam/test_com_reduce_host.py new file mode 100644 index 000000000000..1507abc44f24 --- /dev/null +++ b/tests/onebitadam/test_com_reduce_host.py @@ -0,0 +1,86 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +#TODO: Detect the hostname we are running on automatically +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-1:2245', + world_size=size, + rank=rank) + +dummy_model = [torch.nn.Parameter(torch.ones(10))] + +# Set cuda_aware to False to use host buffers for communication +dummy_optim = OnebitAdam(dummy_model, cuda_aware=False) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + + +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + +tensor_size = 100 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) +a_torch, worker_error_torch, server_error_torch = torch_sim(a) +torch.cuda.empty_cache() +local_rank = rank % torch.cuda.device_count() +a_after = dummy_optim.Compressed_Allreduce(a, + worker_error, + server_error, + rank, + size, + comm, + local_rank) +threshold = 1e-6 +magnitude_threshold = 1e-6 +diff_mask = (a_after - a_torch) > threshold +diff_server_mask = torch.chunk(diff_mask, size)[rank] +mpi_server = torch.chunk(a_after, size)[rank] + server_error +torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + +# If the number in the compensated_server_m is too small (e.g 1e-8), then 
calling sign() might be problematic +# The test would skip those numbers that are too small in compensated_server_m +if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) +else: + check_mag_mask = mpi_server[diff_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for 1bit Adam at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebitadam/test_server_error.py b/tests/onebitadam/test_server_error.py new file mode 100644 index 000000000000..075145f84915 --- /dev/null +++ b/tests/onebitadam/test_server_error.py @@ -0,0 +1,87 @@ +from mpi4py import MPI +import time +import torch +import torch.distributed as dist +import numpy as np +import deepspeed +from deepspeed.runtime.fp16.onebit_adam import OnebitAdam + +comm = MPI.COMM_WORLD +size = comm.Get_size() +rank = comm.Get_rank() + +torch.distributed.init_process_group(backend='nccl', + init_method='tcp://worker-0:2245', + world_size=size, + rank=rank) + +dummy_model = [torch.nn.Parameter(torch.ones(10))] +dummy_optim = OnebitAdam(dummy_model, cuda_aware=False) + +device = torch.device('cuda', rank % torch.cuda.device_count()) + + +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat( + [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + torch.cuda.synchronize() + torch.distributed.barrier() + return a_server_compressed, worker_error, server_error + + +# Input Tensor size +tensor_size = 100 * 2**20 + +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size + +right_server_size = right_tensor_size // size + +# The -0.5 is required for avoiding sign flips/errors +a = torch.rand(tensor_size, device=device) - 0.5 + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) +a_torch, worker_error_torch, server_error_torch = torch_sim(a) +torch.cuda.empty_cache() +local_rank = rank % torch.cuda.device_count() + +# Test the 1-bit Adam optimizer +a_after = dummy_optim.Compressed_Allreduce(a, + worker_error, + server_error, + rank, + size, + comm, + local_rank) + +# If the error is below the threshold, it is acceptable for training +threshold = 1e-6 + +diff_pos = ((a_after - a_torch) > threshold) + +if rank == 0: + before_diff = torch.chunk(a_after - a_torch, + size)[rank] + server_error - server_error_torch + if torch.norm(before_diff) / torch.norm(torch.chunk(a_after, + size)[rank]) < threshold: + print('Successfully passed the test') + else: + print('The difference for the tensor before allgather is {}'.format( + torch.norm(before_diff))) diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test.py new 
file mode 100755 index 000000000000..0f29cab4662e --- /dev/null +++ b/tests/perf/adam_test.py @@ -0,0 +1,24 @@ +import torch +from deepspeed.ops.adam import DeepSpeedCPUAdam +import time + +device = 'cpu' +model_size = 1 * 1024**3 +group_size = [model_size, 274432] + +param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] +optimizer = DeepSpeedCPUAdam(param) +#torch.set_num_threads(128) +for i, p in enumerate(param): + p.grad = torch.ones(group_size[i], device=device) +#param.grad = torch.ones(model_size, device=device) +avg = 0 +for i in range(100): + start = time.time() + optimizer.step() + stop = time.time() + avg += (stop - start) + for i, p in enumerate(param): + p.grad = torch.ones(group_size[i], device=device) * 2 + #param.grad = torch.ones(model_size, device=device) * 2 +print("Elapsed Time is ", avg / 100) diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py new file mode 100755 index 000000000000..b0aba0fcd6b9 --- /dev/null +++ b/tests/perf/adam_test1.py @@ -0,0 +1,22 @@ +import torch +from deepspeed.ops.adam import DeepSpeedCPUAdam +import time + +device = 'cpu' +model_size = 1 * 1024**3 +param = torch.nn.Parameter(torch.ones(model_size, device=device)) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, + dtype=torch.half, + device='cuda:0')) + +optimizer = DeepSpeedCPUAdam([param]) +#torch.set_num_threads(128) +param.grad = torch.ones(model_size, device=device) +avg = 0 +for i in range(100): + start = time.time() + optimizer.step(fp16_param_groups=[param_fp16]) + stop = time.time() + avg += (stop - start) + param.grad = torch.ones(model_size, device=device) * 2 +print("Elapsed Time is ", avg / 100) diff --git a/tests/unit/common.py b/tests/unit/common.py index 5cea6d2f0f76..62b7495a025c 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -5,6 +5,8 @@ import torch.distributed as dist from torch.multiprocessing import Process +import deepspeed + import pytest # Worker timeout *after* the first worker has completed. @@ -32,15 +34,19 @@ def dist_wrap(run_func): def dist_init(local_rank, num_procs, *func_args, **func_kwargs): """Initialize torch.distributed and execute the user function. 
""" os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = '29500' - dist.init_process_group(backend=backend, - init_method='env://', - rank=local_rank, - world_size=num_procs) + os.environ['MASTER_PORT'] = '29503' + os.environ['LOCAL_RANK'] = str(local_rank) + # NOTE: unit tests don't support multi-node so local_rank == global rank + os.environ['RANK'] = str(local_rank) + os.environ['WORLD_SIZE'] = str(num_procs) + + deepspeed.init_distributed(dist_backend=backend) if torch.cuda.is_available(): torch.cuda.set_device(local_rank) + if 'args' in func_kwargs: + func_kwargs['args'].local_rank = local_rank run_func(*func_args, **func_kwargs) def dist_launcher(num_procs, *func_args, **func_kwargs): diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index e9e00727353f..8fcae8bcca18 100755 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -363,10 +363,18 @@ def __init__(self, hidden_size, eps=1e-12): self.variance_epsilon = eps def forward(self, x): + pdtype = x.dtype + x = x.float() u = x.mean(-1, keepdim=True) s = (x - u).pow(2).mean(-1, keepdim=True) x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias + return self.weight * x.to(pdtype) + self.bias + + #def forward(self, x): + # u = x.mean(-1, keepdim=True) + # s = (x - u).pow(2).mean(-1, keepdim=True) + # x = (x - u) / torch.sqrt(s + self.variance_epsilon) + # return self.weight * x + self.bias class BertEmbeddings(nn.Module): diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py old mode 100644 new mode 100755 index 7a2e3357af60..17215cd323bb --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -3,6 +3,8 @@ import argparse import torch +from deepspeed.pipe import PipelineModule, LayerSpec + class SimpleModel(torch.nn.Module): def __init__(self, hidden_dim, empty_grad=False, rank=0): @@ -23,6 +25,59 @@ def forward(self, x, y): return self.cross_entropy_loss(hidden_dim, y) +class LinearStack(torch.nn.Module): + def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.hidden_dim = hidden_dim + + self.input_layer = torch.nn.Linear(in_features=self.input_dim, + out_features=self.hidden_dim) + self.layers = torch.nn.ModuleList([ + torch.nn.Linear(in_features=self.hidden_dim, + out_features=self.hidden_dim, + bias=False) for x in range(num_layers) + ]) + self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, + out_features=self.output_dim) + + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.input_layer(x) + for layer in self.layers: + x = layer(x) + x = self.output_layer(x) + return x + + +class LinearStackPipe(PipelineModule): + def __init__(self, + input_dim=128, + hidden_dim=128, + output_dim=128, + num_layers=4, + **kwargs): + self.input_dim = input_dim + self.output_dim = output_dim + self.hidden_dim = hidden_dim + self.num_layers = num_layers + + layers = [] + layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim)) + for x in range(self.num_layers): + layers.append( + LayerSpec(torch.nn.Linear, + self.hidden_dim, + self.hidden_dim, + bias=False)) + layers.append(lambda x: x) + layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim)) + + super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), **kwargs) + + class SimpleOptimizer(torch.optim.Optimizer): def __init__(self, params, lr=0.11072018): defaults = 
dict(lr=lr) @@ -46,6 +101,48 @@ def step(self, closure=None): return loss +class HybridStateOptimizer(torch.optim.Optimizer): + def __init__(self, params, lr=0.11072018): + defaults = dict(lr=lr) + super(HybridStateOptimizer, self).__init__(params, defaults) + + def __setstate__(self, state): + super(HybridStateOptimizer, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state['integer_step'] = 0 + state['tensor_step'] = torch.zeros(1) + + d_p = p.grad.data + p.data.add_(-group['lr'], d_p) + state['integer_step'] += 1 + state['tensor_step'] += 1 + + return loss + + +class PLD_SimpleModel(SimpleModel): + def __init__(self, hidden_dim, empty_grad=False, rank=0): + super(PLD_SimpleModel, self).__init__(hidden_dim, empty_grad, rank) + + def forward(self, x, y, **kwargs): + pld = kwargs.get('progressive_layer_drop', False) + theta = kwargs.get('pld_theta', 1.0) + hidden_dim = super(PLD_SimpleModel, self).forward(x, y) + return hidden_dim + + def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half): batch_size = model.train_micro_batch_size_per_gpu() train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype) @@ -64,11 +161,21 @@ def create_config_from_dict(tmpdir, config_dict): return config_path -def args_from_dict(tmpdir, config_dict): - config_path = create_config_from_dict(tmpdir, config_dict) +def create_deepspeed_args(): parser = argparse.ArgumentParser() args = parser.parse_args(args='') args.deepspeed = True + if torch.distributed.is_initialized(): + # We assume up to one full node executing unit tests + assert torch.distributed.get_world_size() <= torch.cuda.device_count() + args.local_rank = torch.distributed.get_rank() + else: + args.local_rank = 0 + return args + + +def args_from_dict(tmpdir, config_dict): + args = create_deepspeed_args() + config_path = create_config_from_dict(tmpdir, config_dict) args.deepspeed_config = config_path - args.local_rank = 0 return args diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py new file mode 100644 index 000000000000..35fb665c6a0e --- /dev/null +++ b/tests/unit/test_activation_checkpointing.py @@ -0,0 +1,181 @@ +# TODO: add tests with model parallelism for activation partitioning and other features. + +from copy import deepcopy + +import pytest + +import torch + +import deepspeed +ckpt = deepspeed.checkpointing.checkpoint + +from common import distributed_test + + +def _compute(module, *inputs, do_checkpoint=False): + if do_checkpoint: + outputs = ckpt(module, *inputs) + else: + outputs = module(*inputs) + + if torch.is_tensor(outputs): + outputs = (outputs, ) + + sum(o.sum() for o in outputs if o.requires_grad).backward() + grads = [p.grad for p in module.parameters()] + input_grads = [inp.grad for inp in inputs if torch.is_tensor(inp)] + + return { + 'outputs': outputs, + 'module_grads': grads, + 'input_grads': input_grads, + } + + +def _prep_inputs(*inputs): + _inputs = [] + + for inp in inputs: + inp = deepcopy(inp) + if torch.is_tensor(inp): + inp = inp.cuda() + _inputs.append(inp) + + return tuple(_inputs) + + +# This is distributed because checkpoint() assumes that torch.distributed is initialized. +# torch.distributed is used with activation partitioning, but not for these simple cases. 
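The parity check that the helper below implements (run a module directly, run a fresh copy under activation checkpointing, and require matching outputs and gradients) can be reproduced with stock PyTorch; a minimal sketch using torch.utils.checkpoint instead of deepspeed.checkpointing, which avoids the need to initialize torch.distributed:

import torch
from torch.utils.checkpoint import checkpoint

torch.manual_seed(0)
module = torch.nn.Linear(20, 20)
x = torch.rand(20, requires_grad=True)

# Plain forward/backward.
module(x).sum().backward()
base_grads = [p.grad.clone() for p in module.parameters()]
module.zero_grad()
x.grad = None

# Checkpointed forward/backward: activations are recomputed during backward, so
# gradients should match the plain run for a deterministic module.
checkpoint(module, x).sum().backward()
for g, p in zip(base_grads, module.parameters()):
    assert torch.allclose(g, p.grad)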
+@distributed_test(world_size=1) +def _test_activation_checkpoint(module, *inputs): + # Move to device + module.cuda() + + # Get rid of dropouts until we fork the RNG between tests. + module.eval() + + module_ = deepcopy(module) + inputs_ = _prep_inputs(*inputs) + base = _compute(module_, *inputs_, do_checkpoint=False) + + module_ = deepcopy(module) + inputs_ = _prep_inputs(*inputs) + test = _compute(module_, *inputs_, do_checkpoint=True) + + for group in base.keys(): + for b, t in zip(base[group], test[group]): + # Catch grad `None`s, etc. + if not torch.is_tensor(b): + assert b == t + elif b.is_floating_point(): + assert torch.allclose(b, t) + else: + assert torch.equal(b, t) + + +# +# Helpers +# + + +class MaskedLinear(torch.nn.Linear): + def forward(self, x, mask): + out = super().forward(x) + if mask.is_floating_point(): + out = out * mask + else: + # must cast BoolTensor in older torch versions + out = out * mask.type_as(out) + return out + + +class MaskedLinearSeq(MaskedLinear): + """Tests pipeline modules by also returning the mask.""" + def forward(self, x, mask): + return super().forward(x, mask), mask + + +class MaskedLinearSeqDup(MaskedLinearSeq): + """MaskedLinearSeq, but with more outputs than inputs and in a different order.""" + def forward(self, x, mask): + dup = x.clone().detach() * 1.38 # just an arbitrary scaling + x, mask = super().forward(x, mask) + return dup, x, mask + + +HIDDEN_DIM = 20 + + +def _mixed_mask(size=HIDDEN_DIM): + entries = torch.randn(size) + mask = torch.where(entries > 0, torch.ones(size), torch.zeros(size)) + mask = mask.bool() + return mask + + +def _bool_to_float(btensor, dtype=torch.float32): + """Converts a torch.BoolTensor to an equivalent dtype. """ + ones = torch.ones(size=btensor.size(), dtype=dtype) + zeros = torch.zeros(size=btensor.size(), dtype=dtype) + return torch.where(btensor, ones, zeros) + + +# +# Tests +# + + +def test_ckpt_inputs1_outputs1(): + module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs) + + +# both bool and float are important, as bool is not differentiable +@pytest.mark.parametrize('mask', + [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), + ]) +def test_ckpt_inputs2_outputs1(mask): + module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + +@pytest.mark.parametrize('mask', + [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), + ]) +def test_ckpt_inputs2_outputs2(mask): + module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + +@pytest.mark.parametrize('mask', + [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), + ]) +def test_ckpt_inputs2_outputs3(mask): + module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM) + inputs = torch.rand(HIDDEN_DIM) + inputs.requires_grad = True + _test_activation_checkpoint(module, inputs, mask) + + +class DropMaskLinear(torch.nn.Linear): + def forward(self, x, mask): + return super().forward(x) + + +def test_ckpt_arg_none(): + module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM) + inputs = (torch.rand(HIDDEN_DIM), None) + inputs[0].requires_grad = True + _test_activation_checkpoint(module, *inputs) diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py index d80bd897ee1a..f5ea219fb398 100755 --- a/tests/unit/test_checkpointing.py +++ 
b/tests/unit/test_checkpointing.py @@ -1,10 +1,18 @@ import torch + +import torch.distributed as dist + import deepspeed -from deepspeed.pt.deepspeed_zero_optimizer import FP16_DeepSpeedZeroOptimizer -from deepspeed.pt.zero_optimizer_stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 +from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer +from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 + +from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer +from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.pt.fp16_optimizer import FP16_Optimizer -from deepspeed.pt.fp16_unfused_optimizer import FP16_UnfusedOptimizer +from deepspeed.runtime.pipe.topology import * +PipeTopo = PipeDataParallelTopology + +from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder import argparse import pytest @@ -12,7 +20,7 @@ import os import numbers from common import distributed_test -from simple_model import SimpleModel, random_dataloader, args_from_dict +from simple_model import * def compare_deepspeed_states(saved_model, loaded_model): @@ -24,29 +32,37 @@ def compare_deepspeed_states(saved_model, loaded_model): assert saved_model.global_steps == loaded_model.global_steps -def compare_model_states(saved_model, loaded_model): +def compare_model_states(saved_model, loaded_model, compare_optimizer=True): compare_deepspeed_states(saved_model, loaded_model) for p0, p1 in zip(saved_model.module.parameters(), loaded_model.module.parameters()): - assert torch.allclose(p0,p1,atol=1e-07), f"FP16 model state {p0} is not equal to {p1}" + assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}" + + if not compare_optimizer: + return if isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer): for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): - assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer_Stage1): for partition0, partition1 in zip(saved_model.optimizer.local_sub_partitions_of_fp32_groups, loaded_model.optimizer.local_sub_partitions_of_fp32_groups): for p0, p1 in zip(partition0, partition1): - assert torch.allclose(p0,p1,atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_Optimizer): for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): - assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer): for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups): for p0, p1 in zip(params0, params1): - 
assert torch.allclose(p0,p1,atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" + assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' + assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, torch.optim.Optimizer): pass else: @@ -61,6 +77,7 @@ def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): loaded_optimizer.state.values()): for s0, s1 in zip(state0.values(), state1.values()): if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): + assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' assert torch.equal(s0, s1) else: assert s0 == s1 @@ -89,40 +106,64 @@ def compare_lr_scheduler_states(saved_model, loaded_model): assert state0 == state1 +def create_deepspeed_model(args, model, base_optimizer): + if base_optimizer is None: + ds_model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + else: + ds_model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=base_optimizer) + + return ds_model + + def checkpoint_correctness_verification(args, - model, + models, hidden_dim, tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False, - fp16=True): + fp16=True, + train_batch=False, + base_optimizers=[None, + None], + empty_tag=False): dtype = torch.half if fp16 else torch.float32 - ds_model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + ds_model = create_deepspeed_model(args=args, + model=models[0], + base_optimizer=base_optimizers[0]) + data_loader = random_dataloader(model=ds_model, total_samples=50, hidden_dim=hidden_dim, device=ds_model.device, dtype=dtype) - for n, batch in enumerate(data_loader): - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() + + if train_batch: + ds_model.set_dataloader(data_loader) + for n, batch in enumerate(data_loader): + loss = ds_model.train_batch() + else: + for n, batch in enumerate(data_loader): + loss = ds_model(batch[0], batch[1]) + ds_model.backward(loss) + ds_model.step() trained_model = ds_model save_folder = os.path.join(tmpdir, 'saved_checkpoint') - save_tag = '1' + save_tag = None if empty_tag else '1' - trained_model.save_checkpoint(save_folder, save_tag) + trained_model.save_checkpoint(save_folder, tag=save_tag) - loaded_model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + loaded_model = create_deepspeed_model(args=args, + model=models[1], + base_optimizer=base_optimizers[1]) loaded_model.load_checkpoint(save_folder, - save_tag, + tag=save_tag, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_lr_scheduler_states) @@ -135,6 +176,8 @@ def checkpoint_correctness_verification(args, compare_lr_scheduler_states(trained_model, loaded_model) +@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="lamb is not compatible") def test_checkpoint_unfused_optimizer(tmpdir): config_dict = { "train_batch_size": 2, @@ -170,25 +213,26 @@ def test_checkpoint_unfused_optimizer(tmpdir): args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) def _test_checkpoint_unfused_optimizer(args, - model, + models, hidden_dim, 
load_optimizer_states): checkpoint_correctness_verification(args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states) _test_checkpoint_unfused_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=True) + _test_checkpoint_unfused_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=False) @@ -215,33 +259,48 @@ def test_checkpoint_fused_optimizer(tmpdir): args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) - def _test_checkpoint_fused_optimizer(args, model, hidden_dim, load_optimizer_states): + def _test_checkpoint_fused_optimizer(args, + models, + hidden_dim, + load_optimizer_states): checkpoint_correctness_verification(args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states) _test_checkpoint_fused_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=True) + _test_checkpoint_fused_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=False) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_checkpoint_zero_optimizer(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": 'Adam', "params": { "lr": 0.00015, "betas": [0.8, @@ -254,35 +313,47 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage - }, + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) - def _test_checkpoint_zero_optimizer(args, model, hidden_dim, load_optimizer_states): + def _test_checkpoint_zero_optimizer(args, models, hidden_dim, load_optimizer_states): checkpoint_correctness_verification(args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states) _test_checkpoint_zero_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=True) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": 'Adam', "params": { "lr": 0.00015, "betas": [0.8, @@ -295,38 +366,52 @@ def test_checkpoint_zero_no_optimizer(tmpdir, zero_stage): "enabled": True }, "zero_optimization": 
{ - "stage": zero_stage - }, + "stage": zero_stage, + "cpu_offload": use_cpu_offload + } } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) def _test_checkpoint_zero_no_optimizer(args, - model, + models, hidden_dim, load_optimizer_states): checkpoint_correctness_verification(args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states) _test_checkpoint_zero_no_optimizer(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=False) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_checkpoint_lr_scheduler(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (0, + False), + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": 'Adam', "params": { "lr": 0.00015, "betas": [0.8, @@ -339,7 +424,8 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "scheduler": { "type": "WarmupLR", @@ -353,36 +439,49 @@ def test_checkpoint_lr_scheduler(tmpdir, zero_stage): args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) def _test_checkpoint_lr_scheduler(args, - model, + models, hidden_dim, load_optimizer_states, load_lr_scheduler_states): checkpoint_correctness_verification( args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_lr_scheduler_states) _test_checkpoint_lr_scheduler(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=False, load_lr_scheduler_states=True) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (0, + False), + (1, + False), + (2, + False), + (2, + True), + ]) +def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload): + if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + pytest.skip("cpu-adam is not compatible") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, "optimizer": { - "type": "Adam", + "type": 'Adam', "params": { "lr": 1e-5 } @@ -391,7 +490,8 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "scheduler": { "type": "WarmupLR", @@ -400,29 +500,29 @@ def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage): "warmup_max_lr": 0.001, "warmup_num_steps": 1000 } - } + }, } args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) def 
_test_checkpoint_no_lr_scheduler(args, - model, + models, hidden_dim, load_optimizer_states, load_lr_scheduler_states): checkpoint_correctness_verification( args, - model, - hidden_dim, - tmpdir, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_lr_scheduler_states) _test_checkpoint_no_lr_scheduler(args=args, - model=model, + models=models, hidden_dim=hidden_dim, load_optimizer_states=False, load_lr_scheduler_states=False) @@ -450,10 +550,279 @@ def test_checkpoint_fp32_optimizer(tmpdir): args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) + models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] @distributed_test(world_size=[2]) - def _test_checkpoint_fp32_optimizer(args, model, hidden_dim): - checkpoint_correctness_verification(args, model, hidden_dim, tmpdir, fp16=False) + def _test_checkpoint_fp32_optimizer(args, models, hidden_dim): + checkpoint_correctness_verification(args, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + fp16=False) + + _test_checkpoint_fp32_optimizer(args=args, models=models, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize("zero_stage", [0, 1]) +def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2): + config_dict = { + "train_batch_size": 2, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5 + } + }, + "zero_optimization": { + "stage": zero_stage + }, + "fp16": { + "enabled": zero_stage > 0 + }, + "scheduler": { + "type": "OneCycle", + "params": { + "cycle_first_step_size": 1000, + "cycle_first_stair_count": 500, + "cycle_second_step_size": 1000, + "cycle_second_stair_count": 500, + "decay_step_size": 1000, + "cycle_min_lr": 0.0001, + "cycle_max_lr": 0.0010, + "decay_lr_rate": 0.001, + "cycle_min_mom": 0.85, + "cycle_max_mom": 0.99, + "decay_mom_rate": 0.0 + } + } + } + + @distributed_test(world_size=4) + def _test(save_folder, num_stages): + args = args_from_dict(tmpdir, config_dict) + models = [LinearStackPipe(num_stages=num_stages) for _ in range(2)] + checkpoint_correctness_verification(args=args, + models=models, + hidden_dim=models[0].hidden_dim, + tmpdir=save_folder, + fp16=config_dict['fp16']['enabled'], + load_optimizer_states=True, + load_lr_scheduler_states=True, + train_batch=True) + + _test(tmpdir, num_stages=stages) + + +@pytest.mark.parametrize("base_topo,test_topo", + [ + (PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=4, + num_dp=1)), + (PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=2, + num_dp=2)), + (PipeTopo(num_pp=4, + num_dp=1), + PipeTopo(num_pp=2, + num_dp=2)), + ]) +def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir): + @distributed_test(world_size=4) + def _test(base_topo, test_topo, save_folder): + base_model = LinearStackPipe(topology=base_topo) + base_model.save_state_dict(save_folder) + + dist.barrier() + + test_model = LinearStackPipe(topology=test_topo) + test_model.load_state_dir(save_folder) + + # Base and test can have different lengths, so make sure we map from the + # smaller to larger model + if len(base_model.forward_funcs) < len(test_model.forward_funcs): + A = base_model + B = test_model + else: + A = test_model + B = base_model + + # Compare layers individually since partitions are different + for idx, A_layer in enumerate(A.forward_funcs): + if not hasattr(A_layer, 'parameters'): + # Skip functionals, etc. 
+ continue + + # Find the corresponding layer in B + global_idx = idx + A._local_start + B_local_idx = global_idx - B._local_start + B_layer = B.forward_funcs[B_local_idx] + + # Compare layer parameters + for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()): + assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}" + + _test(base_topo, test_topo, save_folder=tmpdir) + + +@pytest.mark.parametrize('zero_stage', [1, 2]) +def test_checkpoint_zero_hybrid_optimizer_state(tmpdir, zero_stage): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "zero_allow_untested_optimizer": True, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] + optimizers = [HybridStateOptimizer(model.parameters()) for model in models] + + @distributed_test(world_size=[2]) + def _test_checkpoint_zero_hybrid_optimizer_state(args, + models, + optimizers, + hidden_dim): + checkpoint_correctness_verification(args, + models=models, + base_optimizers=optimizers, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True) + + _test_checkpoint_zero_hybrid_optimizer_state(args=args, + models=models, + optimizers=optimizers, + hidden_dim=hidden_dim) + + +def test_checkpoint_latest(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + args = args_from_dict(tmpdir, config_dict) + models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] + + @distributed_test(world_size=[1]) + def _helper(args, models): + checkpoint_correctness_verification(args, + models=models, + hidden_dim=hidden_dim, + tmpdir=tmpdir, + load_optimizer_states=True, + load_lr_scheduler_states=False, + fp16=False, + empty_tag=True) + + _helper(args, models) + + +def test_checkpoint_missing_latest(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + } + } + hidden_dim = 10 + args = args_from_dict(tmpdir, config_dict) + + model = SimpleModel(hidden_dim, rank=args.local_rank) + + @distributed_test(world_size=[1]) + def _helper(args, model, hidden_dim): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + # should be no-op, since latest doesn't exist + model.load_checkpoint(tmpdir) + + _helper(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('valid_mode', ["FAIL", "WARN", "IGNORE"]) +def test_checkpoint_unique_tag(tmpdir, valid_mode): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "checkpoint": { + "tag_validation": valid_mode + } + } + hidden_dim = 10 + args = args_from_dict(tmpdir, config_dict) + + model = SimpleModel(hidden_dim, rank=args.local_rank) + + @distributed_test(world_size=[2]) + def _helper(args, model, hidden_dim): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + if valid_mode == "FAIL": + with pytest.raises(AssertionError): + model.save_checkpoint(save_dir=tmpdir, + tag=f"tag-{torch.distributed.get_rank()}") + else: + model.save_checkpoint(save_dir=tmpdir, + tag=f"tag-{torch.distributed.get_rank()}") + + 
_helper(args=args, model=model, hidden_dim=hidden_dim) + + +def test_checkpoint_unknown_tag_validation(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "checkpoint": { + "tag_validation": "foo" + } + } + hidden_dim = 10 + args = args_from_dict(tmpdir, config_dict) + + model = SimpleModel(hidden_dim, rank=args.local_rank) + + @distributed_test(world_size=[1]) + def _helper(args, model, hidden_dim): + with pytest.raises(deepspeed.DeepSpeedConfigError): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) - _test_checkpoint_fp32_optimizer(args=args, model=model, hidden_dim=hidden_dim) + _helper(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 297658f71e6e..4cabefe71a33 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -9,7 +9,7 @@ # A test on its own import deepspeed -from deepspeed.pt.deepspeed_config import DeepSpeedConfig +from deepspeed.runtime.config import DeepSpeedConfig def test_cuda(): @@ -195,3 +195,34 @@ def _test_dist_init_true(args, model, hidden_dim): model.step() _test_dist_init_true(args=args, model=model, hidden_dim=hidden_dim) + + +def test_init_no_optimizer(tmpdir): + + config_dict = {"train_batch_size": 1, "fp16": {"enabled": True}} + config_path = create_config_from_dict(tmpdir, config_dict) + + @distributed_test(world_size=1) + def _helper(): + parser = argparse.ArgumentParser() + args = parser.parse_args(args='') + args.deepscale_config = config_path + args.local_rank = 0 + + hidden_dim = 10 + + model = SimpleModel(hidden_dim=hidden_dim) + + model, _, _, _ = deepspeed.initialize(args=args, model=model) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + with pytest.raises(AssertionError): + model.backward(loss) + with pytest.raises(AssertionError): + model.step() + + _helper() diff --git a/tests/unit/test_cpu_adam.py b/tests/unit/test_cpu_adam.py new file mode 100755 index 000000000000..2ae1f8201328 --- /dev/null +++ b/tests/unit/test_cpu_adam.py @@ -0,0 +1,62 @@ +import argparse +import torch +import time +import numpy as np +import pytest +import copy + +import deepspeed +from deepspeed.ops.adam import FusedAdam +from deepspeed.ops.op_builder import CPUAdamBuilder + +if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: + # skipping at module scope requires allow_module_level=True + pytest.skip("cpu-adam is not compatible", allow_module_level=True) + + +def check_equal(first, second, atol=1e-2, verbose=False): + x = first.detach().numpy() + y = second.detach().numpy() + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) + +@pytest.mark.parametrize('model_size', + [ + (64), + (22), + (55), + (127), + (1024), + (1048576), + ]) # yapf: disable +def test_cpu_adam_opt(model_size): + from deepspeed.ops.adam import DeepSpeedCPUAdam + device = 'cpu' + rng_state = torch.get_rng_state() + param = torch.nn.Parameter(torch.randn(model_size, device=device)) + torch.set_rng_state(rng_state) + param1 = torch.nn.Parameter(torch.randn(model_size, device=device)) + torch.set_rng_state(rng_state) + param2_data = torch.randn(model_size, device=device).cuda() + param2 = torch.nn.Parameter(param2_data) + + optimizer1 = torch.optim.AdamW([param1]) + 
optimizer2 = FusedAdam([param2]) + optimizer = DeepSpeedCPUAdam([param]) + + for i in range(10): + rng_state = torch.get_rng_state() + param.grad = torch.randn(model_size, device=device) + torch.set_rng_state(rng_state) + param1.grad = torch.randn(model_size, device=device) + torch.set_rng_state(rng_state) + param2.grad = torch.randn(model_size, device=device).cuda() + + optimizer.step() + optimizer2.step() + optimizer1.step() + + check_equal(param, param1, atol=1e-2, verbose=True) + check_equal(param, param2.cpu(), atol=1e-2, verbose=True) diff --git a/tests/unit/test_csr.py b/tests/unit/test_csr.py index 06b43cdeacb9..766be7fb7470 100644 --- a/tests/unit/test_csr.py +++ b/tests/unit/test_csr.py @@ -1,6 +1,6 @@ import torch import random -from deepspeed.pt.deepspeed_csr_tensor import CSRTensor +from deepspeed.runtime.csr_tensor import CSRTensor def test_csr_addition_self(): diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index bf0e5955d62c..eca853abf569 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -12,9 +12,13 @@ from modeling import BertEncoder as BertEncoderPostln from modeling import BertConfig, BertLayerNorm from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +import deepspeed import sys +#if not deepspeed.ops.__installed_ops__['transformer']: +# pytest.skip("transformer kernels are not installed", allow_module_level=True) + def check_equal(first, second, atol=1e-2, verbose=False): diction_x = {} @@ -79,11 +83,10 @@ def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) - for i in range(config.num_hidden_layers) + for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm @@ -118,7 +121,9 @@ def custom_forward(*inputs): # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask, self.grads) + hidden_states = layer_module(hidden_states, + attention_mask, + grads=self.grads) hidden_states.register_hook( lambda x, self=self: self.grads.append([x, @@ -142,11 +147,11 @@ def create_models(ds_config): hidden_size=ds_config.hidden_size, num_hidden_layers=ds_config.num_hidden_layers, num_attention_heads=ds_config.heads, - intermediate_size=4 * ds_config.hidden_size, + intermediate_size=ds_config.intermediate_size, hidden_act="gelu", hidden_dropout_prob=ds_config.hidden_dropout_ratio, attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=ds_config.max_seq_length, + max_position_embeddings=512, type_vocab_size=2, initializer_range=ds_config.initializer_range) @@ -162,12 +167,12 @@ def create_models(ds_config): weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) weights.append( - nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, + nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append( nn.Parameter(torch.Tensor(ds_config.hidden_size, - 4 * ds_config.hidden_size))) + ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -177,7 +182,7 
@@ def create_models(ds_config): for i in range(4): biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) + biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) biases[5].data.zero_() biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) biases[6].data.zero_() @@ -206,25 +211,18 @@ def set_seed(seed): torch.manual_seed(seed) -def run_backward(ds_config, atol=1e-2, verbose=False): +def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(ds_config.batch_size, - ds_config.max_seq_length, + seq_len, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(ds_config.batch_size, - 1, - 1, - ds_config.max_seq_length, - **kwargs) - Y = torch.randn(ds_config.batch_size, - ds_config.max_seq_length, - ds_config.hidden_size, - **kwargs) + input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) + Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) # run baseline base_results = bert_encoder(hidden_states, @@ -250,12 +248,15 @@ def run_backward(ds_config, atol=1e-2, verbose=False): check_equal(base_grads, ds_grads, atol=atol, verbose=verbose) +#test_backward[3-1024-120-16-24-True-True-0.05] @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ - (3,1024,128,16,24,True,False, 0.05), - (3,1024,128,16,24,True,True, 0.05), - (3,1024,128,16,24,False,False, 0.1), - (3,1024,128,16,24,False,True, 0.2), + (3,1024,119,16,24,True,False, 0.05), + (3,1024,115,16,24,True,True, 0.05), + (1024,128,10,2,2,False,False, 0.1), + (3,1024,52,16,24,False,True, 0.2), + (3,128,51,2,24,False,False, 0.1), + (3,128,54,2,24,False,True, 0.2), ]) # yapf: disable def test_backward(batch_size, hidden_size, @@ -274,7 +275,7 @@ def test_backward(batch_size, ds_config.layer_id = None ds_config.batch_size = batch_size ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len + ds_config.intermediate_size = hidden_size ds_config.heads = heads ds_config.attn_dropout_ratio = 0.0 ds_config.hidden_dropout_ratio = 0.0 @@ -283,7 +284,7 @@ def test_backward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_backward(ds_config, atol=atol) + run_backward(ds_config, seq_len, atol=atol) #@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', @@ -310,6 +311,7 @@ def test_backward(batch_size, # ds_config.layer_id = None # ds_config.batch_size = batch_size # ds_config.hidden_size = hidden_size +# ds_config.intermediate_size = 4 * hidden_size # ds_config.max_seq_length = seq_len # ds_config.heads = heads # ds_config.attn_dropout_ratio = 0.0 diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 4e995a34448f..5add5e152a91 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -1,325 +1,328 @@ -import argparse -import numpy as np -import torch -import torch.nn.functional as F -import pytest -import json -import random -import time -import copy -from torch import nn -from modelingpreln import BertEncoder as BertEncoderPreln -from modeling import BertEncoder as BertEncoderPostln -from modeling import BertLayerNorm, BertConfig -from deepspeed import DeepSpeedTransformerLayer, 
DeepSpeedTransformerConfig - -import sys - - -def check_equal(first, second, atol=1e-2, verbose=False): - if verbose: - print() - for i, (x, y) in enumerate(zip(first, second)): - x = x[0].cpu().detach().numpy() - y = y[0].cpu().detach().numpy() - if verbose: - print("x = {}".format(x.flatten())) - print("y = {}".format(y.flatten())) - print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) - - -def zero_grad(variables): - for variable in variables: - variable.grad.zero_() - - -device = torch.device("cuda") -kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} -kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} - - -class DSEncoder(nn.Module): - def __init__(self, config, weights, biases): - super(DSEncoder, self).__init__() - self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(i, - config, - weights, - biases)) - for i in range(config.num_hidden_layers) - ]) - self.grads = [] - self.pre_or_post = config.pre_layer_norm - - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): - all_encoder_layers = [] - - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length - # decoder layers - else: - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) - - if output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not output_all_encoded_layers or checkpoint_activations: - if (self.pre_or_post): - hidden_states = self.FinalLayerNorm(hidden_states) - all_encoder_layers.append(hidden_states) - return all_encoder_layers - - def get_grads(self): - return self.grads - - -def create_models(ds_config): - bert_config = BertConfig(vocab_size_or_config_json_file=119547, - hidden_size=ds_config.hidden_size, - num_hidden_layers=ds_config.num_hidden_layers, - num_attention_heads=ds_config.heads, - batch_size=ds_config.batch_size, - intermediate_size=4 * ds_config.hidden_size, - hidden_act="gelu", - hidden_dropout_prob=ds_config.hidden_dropout_ratio, - attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=ds_config.max_seq_length, - type_vocab_size=2, - initializer_range=ds_config.initializer_range, - fp16=ds_config.fp16) - - weights = [] - biases = [] - - for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) - weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) - - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(4 * ds_config.hidden_size, - ds_config.hidden_size))) - weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - 4 * ds_config.hidden_size))) - weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) - 
weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[7].data.fill_(1.0) - - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[0].data.zero_() - for i in range(4): - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size))) - biases[5].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[6].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[7].data.zero_() - - if (ds_config.pre_layer_norm): - bert_encoder = BertEncoderPreln(bert_config, weights, biases) - else: - bert_encoder = BertEncoderPostln(bert_config, weights, biases) - ds_encoder = DSEncoder(ds_config, weights, biases) - - if ds_config.fp16: - bert_encoder.half() - ds_encoder.half() - - bert_encoder.cuda() - ds_encoder.cuda() - - return bert_encoder, ds_encoder - - -def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -def run_forward(ds_config, atol=1e-2, verbose=False, test_bsz=None): - set_seed(123) - bert_encoder, ds_encoder = create_models(ds_config) - - bsz = ds_config.batch_size if test_bsz is None else test_bsz - - # prepare test data - kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(bsz, - ds_config.max_seq_length, - ds_config.hidden_size, - **kwargs) - input_mask = torch.randn(bsz, 1, 1, ds_config.max_seq_length, **kwargs) - - # run baseline - base_results = bert_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # check grads - check_equal(base_results, ds_results, atol=atol, verbose=verbose) - - -# FP16 test cases can only run on the devices support FP16. -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (8,1024,384,16,3,True,False), - (8,1024,384,16,3,True,True), - (8,1024,512,16,3,True,False), - (8,1024,512,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), - (8,1024,384,16,3,False,False), - (8,1024,384,16,3,False,True), - (8,1024,512,16,3,False,False), - (8,1024,512,16,3,False,True), - (8,1536,128,24,3,False,False), - (8,1536,128,24,3,False,True), - (8,2048,128,32,3,False,False), - (8,2048,128,32,3,False,True), - (8,2560,128,40,3,False,False), - (8,2560,128,40,3,False,True), - ]) # yapf: disable -def test_forward(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. 
- major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, atol=2e-2) - - -@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (8,3,1024,512,16,3,True,False), - (8,7,1024,512,16,3,True,True), - (8,3,1024,512,16,3,False,False), - (8,7,1024,512,16,3,False,True), - ]) # yapf: disable -def test_forward_with_small_bsz(batch_size, - small_bsz, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, atol=2e-2, test_bsz=small_bsz) - -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), - ]) # yapf: disable -def test_forward_stochastic(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. 
- major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.max_seq_length = seq_len - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - ds_config.stochastic_mode = True - - run_forward(ds_config, atol=7e-2) +import argparse +import numpy as np +import torch +import torch.nn.functional as F +import pytest +import json +import random +import time +import copy +from torch import nn +from modelingpreln import BertEncoder as BertEncoderPreln +from modeling import BertEncoder as BertEncoderPostln +from modeling import BertLayerNorm, BertConfig +from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig +import deepspeed + +import sys + +#if not deepspeed.ops.__installed_ops__['transformer']: +# pytest.skip("transformer kernels are not installed", allow_module_level=True) + + +def check_equal(first, second, atol=1e-2, verbose=False): + if verbose: + print() + for i, (x, y) in enumerate(zip(first, second)): + x = x[0].cpu().detach().numpy() + y = y[0].cpu().detach().numpy() + if verbose: + print("x = {}".format(x.flatten())) + print("y = {}".format(y.flatten())) + print('-' * 80) + np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) + + +def zero_grad(variables): + for variable in variables: + variable.grad.zero_() + + +device = torch.device("cuda") +kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} +kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} + + +class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): + super(DSEncoder, self).__init__() + self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.layer = nn.ModuleList([ + copy.deepcopy(DeepSpeedTransformerLayer(config, + weights, + biases)) + for _ in range(config.num_hidden_layers) + ]) + self.grads = [] + self.pre_or_post = config.pre_layer_norm + + def forward(self, + hidden_states, + attention_mask, + output_all_encoded_layers=True, + checkpoint_activations=False): + all_encoder_layers = [] + + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = checkpoint.checkpoint(custom(l, + l + chunk_length), + hidden_states, + attention_mask * 1) + l += chunk_length + # decoder layers + else: + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + if (self.pre_or_post): + hidden_states = self.FinalLayerNorm(hidden_states) + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +def create_models(ds_config): + bert_config = BertConfig(vocab_size_or_config_json_file=119547, + hidden_size=ds_config.hidden_size, + num_hidden_layers=ds_config.num_hidden_layers, + num_attention_heads=ds_config.heads, + 
batch_size=ds_config.batch_size, + intermediate_size=ds_config.intermediate_size, + hidden_act="gelu", + hidden_dropout_prob=ds_config.hidden_dropout_ratio, + attention_probs_dropout_prob=ds_config.attn_dropout_ratio, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=ds_config.initializer_range, + fp16=ds_config.fp16) + + weights = [] + biases = [] + + for i in range(4): + weights.append( + nn.Parameter(torch.Tensor(ds_config.hidden_size, + ds_config.hidden_size))) + weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) + + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + weights[4].data.fill_(1.0) + weights.append( + nn.Parameter(torch.Tensor(ds_config.intermediate_size, + ds_config.hidden_size))) + weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) + weights.append( + nn.Parameter(torch.Tensor(ds_config.hidden_size, + ds_config.intermediate_size))) + weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + weights[7].data.fill_(1.0) + + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[0].data.zero_() + for i in range(4): + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[i + 1].data.zero_() + biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) + biases[5].data.zero_() + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[6].data.zero_() + biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) + biases[7].data.zero_() + + if (ds_config.pre_layer_norm): + bert_encoder = BertEncoderPreln(bert_config, weights, biases) + else: + bert_encoder = BertEncoderPostln(bert_config, weights, biases) + ds_encoder = DSEncoder(ds_config, weights, biases) + + if ds_config.fp16: + bert_encoder.half() + ds_encoder.half() + + bert_encoder.cuda() + ds_encoder.cuda() + + return bert_encoder, ds_encoder + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): + set_seed(123) + bert_encoder, ds_encoder = create_models(ds_config) + + bsz = ds_config.batch_size if test_bsz is None else test_bsz + + # prepare test data + kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 + hidden_states = torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) + input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) + + # run baseline + base_results = bert_encoder(hidden_states, + input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # run ds + ds_results = ds_encoder(hidden_states, + input_mask, + output_all_encoded_layers=False, + checkpoint_activations=False) + + # check forward evaluation + check_equal(base_results, ds_results, atol=atol, verbose=verbose) + + +# FP16 test cases can only run on the devices support FP16. 
+@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (8,256,53,4,3,True,False), + (8,256,52,4,3,True,True), + (3,1024,51,16,3,True,False), + (3,1024,54,16,3,True,True), + (8,1024,381,16,3,True,False), + (8,1024,384,16,3,True,True), + (8,1024,384,16,3,True,True), + (8,1024,119,16,3,True,False), + (8,1024,120,16,3,True,True), + (8,1024,509,16,3,True,False), + (8,1024,512,16,3,True,True), + (64,1024,56,16,3,False,False), + (64,1024,53,16,3,False,True), + (64,1024,24,16,3,False,False), + (64,1024,21,16,3,False,True), + (8,1024,384,16,3,False,False), + (8,1024,384,16,3,False,True), + (8,1024,512,16,3,False,False), + (8,1024,511,16,3,False,True), + (8,1536,128,24,3,False,False), + (8,1536,128,24,3,False,True), + (8,2048,128,32,3,False,False), + (8,2048,128,32,3,False,True), + (8,2560,128,40,3,False,False), + (8,2560,128,40,3,False,True), + (8,128,128,2,3,True,False), + (8,128,128,2,3,True,True), + (8,4096,128,64,3,True,True), + (8,8192,128,64,3,False,True), + ]) # yapf: disable +def test_forward(batch_size, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. + major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.intermediate_size = 4 * hidden_size + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + + run_forward(ds_config, seq_len, atol=2e-2) + + +@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (8,3,1024,512,16,3,True,False), + (8,7,1024,512,16,3,True,True), + (8,3,1024,512,16,3,False,False), + (8,7,1024,512,16,3,False,True), + ]) # yapf: disable +def test_forward_with_small_bsz(batch_size, + small_bsz, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. + major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.intermediate_size = 4 * hidden_size + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + + run_forward(ds_config, seq_len, atol=2e-2, test_bsz=small_bsz) + +@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', + [ + (64,1024,128,16,3,True,False), + (64,1024,128,16,3,True,True), + (64,1024,128,16,3,False,False), + (64,1024,128,16,3,False,True), + ]) # yapf: disable +def test_forward_stochastic(batch_size, + hidden_size, + seq_len, + heads, + num_layers, + is_preln, + use_fp16): + # Only run fp16 test cases on devices with 7+ capability. 
+ major, _ = torch.cuda.get_device_capability() + if major < 7 and use_fp16 is True: + return + + ds_config = DeepSpeedTransformerConfig() + ds_config.layer_id = None + ds_config.batch_size = batch_size + ds_config.hidden_size = hidden_size + ds_config.intermediate_size = 4 * hidden_size + ds_config.heads = heads + ds_config.attn_dropout_ratio = 0.0 + ds_config.hidden_dropout_ratio = 0.0 + ds_config.num_hidden_layers = num_layers + ds_config.pre_layer_norm = is_preln + ds_config.initializer_range = 0.02 + ds_config.fp16 = use_fp16 + ds_config.stochastic_mode = True + + run_forward(ds_config, seq_len, atol=7e-2) diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py new file mode 100644 index 000000000000..d05b9d232e45 --- /dev/null +++ b/tests/unit/test_data.py @@ -0,0 +1,11 @@ +from deepspeed.utils import RepeatingLoader + + +def test_repeating_loader(): + loader = [1, 2, 3] + loader = RepeatingLoader(loader) + + for idx in range(50): + assert next(loader) == 1 + assert next(loader) == 2 + assert next(loader) == 3 diff --git a/tests/unit/test_ds_config.py b/tests/unit/test_ds_config.py index 9f19fd226326..728a46bbbb1b 100755 --- a/tests/unit/test_ds_config.py +++ b/tests/unit/test_ds_config.py @@ -1,7 +1,7 @@ import pytest import os import json -from deepspeed.pt import deepspeed_config as ds_config +from deepspeed.runtime import config as ds_config def test_only_required_fields(tmpdir): diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py index e12386271450..7575d6b49454 100755 --- a/tests/unit/test_dynamic_loss_scale.py +++ b/tests/unit/test_dynamic_loss_scale.py @@ -191,7 +191,6 @@ def _test_unfused_no_overflow(args): model, optim, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) - expected_loss_scale = 2**8 expected_scale_window = 2 # Ensure the dynamic loss scaler is correctly configured. 
diff --git a/tests/unit/test_elastic.py b/tests/unit/test_elastic.py new file mode 100644 index 000000000000..62d948d599b0 --- /dev/null +++ b/tests/unit/test_elastic.py @@ -0,0 +1,270 @@ +import pytest +import deepspeed +from common import distributed_test +from deepspeed.git_version_info import version as ds_version +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict + +base_ds_config = { + "elasticity": { + "enabled": True, + "max_train_batch_size": 10000, + "micro_batch_sizes": [8, + 12, + 16, + 17], + "min_gpus": 32, + "max_gpus": 1500, + "min_time": 20, + "version": 0.1 + } +} + + +def test_basic_10k(): + ds_config = base_ds_config.copy() + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version) + + for gpu_num in valid_gpus: + assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}" + batch_per_gpu = final_batch_size // gpu_num + found_valid_mb = False + + for mb in ds_config['elasticity']['micro_batch_sizes']: + if batch_per_gpu % mb == 0: + found_valid_mb = True + break + assert found_valid_mb, "No valid mb found" + + assert len(valid_gpus) == 23 + assert final_batch_size == 9792 + + +def test_old_version(): + ds_config = base_ds_config.copy() + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version="0.2") + + +def test_disabled(): + ds_config = base_ds_config.copy() + ds_config['elasticity']['enabled'] = False + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_valid_world_size(): + ds_config = base_ds_config.copy() + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=64) + assert mbsize == 17 + + +def test_invalid_world_size(): + ds_config = base_ds_config.copy() + with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize): + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=128) + + +def test_future_elastic_version(): + ds_config = base_ds_config.copy() + ds_config['elasticity']['version'] = '0.2' + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_missing_max_batch(): + ds_config = base_ds_config.copy() + del ds_config['elasticity']['max_train_batch_size'] + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_missing_micro_batch(): + ds_config = base_ds_config.copy() + del ds_config['elasticity']['micro_batch_sizes'] + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_empty_config(): + ds_config = {"elasticity": {"enabled": True}} + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + 
target_deepspeed_version=ds_version) + + +@pytest.mark.parametrize('key, value', + [('micro_batch_sizes', + [1, + 4, + -1, + 2, + -10]), + ('min_gpus', + -1), + ('max_gpus', + -1), + ('micro_batch_sizes', + 5), + ('micro_batch_sizes', + ['a', + None, + 0.5]), + ('micro_batch_sizes', + [2, + 0.5, + 4])]) +def test_invalid_config_values(key, value): + ds_config = base_ds_config.copy() + ds_config['elasticity'][key] = value + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) + + +def test_proper_mbsz(): + ds_config = base_ds_config.copy() + ds_config["elasticity"]["max_train_batch_size"] = 32 + ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7] + ds_config["elasticity"]["min_gpus"] = 1 + final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( + ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=7) + assert mbsize == 3 + + +def test_non_elastic_batch_params(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1, 2]) + def _test_elastic(args, model, hidden_dim): + with pytest.raises(deepspeed.elasticity.config.ElasticityError): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + _test_elastic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_non_elastic_batch_params_w_override(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1, + "ignore_non_elastic_batch_info": True + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1, 2]) + def _test_elastic(args, model, hidden_dim): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + _test_elastic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_elastic_config_changed(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Lamb", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "elasticity": { + "enabled": True, + "max_train_batch_size": 4, + "micro_batch_sizes": [1, + 2, + 3, + 4], + "min_gpus": 1, + "max_gpus": 4, + "min_time": 20, + "version": 0.1, + "ignore_non_elastic_batch_info": True + } + } + import json, os + scheduler_elastic_config = config_dict.copy() + scheduler_elastic_config["elasticity"]["max_train_batch_size"] = 27 + os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = json.dumps(scheduler_elastic_config) + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1, 2]) + def _test_elastic(args, model, hidden_dim): + with 
pytest.raises(deepspeed.elasticity.config.ElasticityError): + model, _, _,_ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + _test_elastic(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_flops_profiler.py b/tests/unit/test_flops_profiler.py new file mode 100644 index 000000000000..fc741707646f --- /dev/null +++ b/tests/unit/test_flops_profiler.py @@ -0,0 +1,117 @@ +import torch +import deepspeed +import deepspeed.runtime.utils as ds_utils +from deepspeed.profiling.flops_profiler import FlopsProfiler, get_model_profile +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict +from common import distributed_test + + +def test_flops_profiler_in_ds_trainning(tmpdir): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + } + }, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + }, + "flops_profiler": { + "enabled": True, + "start_step": 2, + "end_step": 3, + "module_depth": -1, + "top_modules": 3, + }, + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.half) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + if n == 3: break + assert model.flops_profiler.flops == 100 + assert model.flops_profiler.params == 110 + + _test_flops_profiler_in_ds_trainning(args, model, hidden_dim) + + +class LeNet5(torch.nn.Module): + def __init__(self, n_classes): + super(LeNet5, self).__init__() + + self.feature_extractor = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=1, + out_channels=6, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + torch.nn.AvgPool2d(kernel_size=2), + torch.nn.Conv2d(in_channels=6, + out_channels=16, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + torch.nn.AvgPool2d(kernel_size=2), + torch.nn.Conv2d(in_channels=16, + out_channels=120, + kernel_size=5, + stride=1), + torch.nn.Tanh(), + ) + + self.classifier = torch.nn.Sequential( + torch.nn.Linear(in_features=120, + out_features=84), + torch.nn.Tanh(), + torch.nn.Linear(in_features=84, + out_features=n_classes), + ) + + def forward(self, x): + x = self.feature_extractor(x) + x = torch.flatten(x, 1) + logits = self.classifier(x) + probs = torch.nn.functional.softmax(logits, dim=1) + return logits, probs + + +def test_flops_profiler_in_inference(): + mod = LeNet5(10) + batch_size = 1024 + input = torch.randn(batch_size, 1, 32, 32) + macs, params, steps = get_model_profile( + mod, + tuple(input.shape), + print_profile=True, + print_aggregated_profile=True, + module_depth=-1, + top_modules=3, + warm_up=5, + num_steps=10, + as_strings=True, + ignore_modules=None, + ) + print(macs, params, steps) + assert macs == "439.55 MMACs" + assert params == "61.71 k" diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py index 320d026bdd83..ae6041f3ec44 100755 --- a/tests/unit/test_fp16.py +++ b/tests/unit/test_fp16.py @@ -4,8 +4,16 @@ import pytest import json import os +from deepspeed.ops.adam import FusedAdam from common import distributed_test -from simple_model 
import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args + +try: + from apex import amp + _amp_available = True +except ImportError: + _amp_available = False +amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed") def test_lamb_fp32_grad_clip(tmpdir): @@ -27,9 +35,9 @@ def test_lamb_fp32_grad_clip(tmpdir): @distributed_test(world_size=[1, 2]) def _test_lamb_fp32_grad_clip(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -65,9 +73,9 @@ def test_lamb_fp16_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_lamb_fp16_basic(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -102,9 +110,9 @@ def test_lamb_fp16_empty_grad(tmpdir): @distributed_test(world_size=[2]) def _test_lamb_fp16_empty_grad(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -139,9 +147,9 @@ def test_adam_fp32_empty_grad(tmpdir): @distributed_test(world_size=[2]) def _test_adam_fp32_empty_grad(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -171,9 +179,9 @@ def test_adamw_fp16_basic(tmpdir): @distributed_test(world_size=[1]) def _test_adamw_fp16_basic(args, model, hidden_dim): optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -186,6 +194,41 @@ def _test_adamw_fp16_basic(args, model, hidden_dim): _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) +def test_dict_config_adamw_fp16_basic(): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True + } + } + args = create_deepspeed_args() + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict): + optimizer = torch.optim.AdamW(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=optimizer, + config_params=config_dict) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + +
_test_adamw_fp16_basic(args=args, + model=model, + hidden_dim=hidden_dim, + config_dict=config_dict) + + def test_adamw_fp16_empty_grad(tmpdir): config_dict = { "train_batch_size": 1, @@ -202,9 +245,9 @@ def test_adamw_fp16_empty_grad(tmpdir): @distributed_test(world_size=[1]) def _test_adamw_fp16_empty_grad(args, model, hidden_dim): optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -217,8 +260,18 @@ def _test_adamw_fp16_empty_grad(args, model, hidden_dim): _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) -@pytest.mark.parametrize("zero_stage", [0, 1, 2]) -def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload): + # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']: + # pytest.skip("cpu-adam is not installed") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -246,7 +299,8 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage): "enabled": True }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload } } @@ -257,9 +311,9 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage): @distributed_test(world_size=[1]) def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -274,8 +328,18 @@ def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim): hidden_dim=hidden_dim) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_static_scale(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): + # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']: + # pytest.skip("cpu-adam is not installed") config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -290,7 +354,8 @@ def test_zero_static_scale(tmpdir, zero_stage): "loss_scale": 138. }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload } } args = args_from_dict(tmpdir, config_dict) @@ -299,9 +364,9 @@ def test_zero_static_scale(tmpdir, zero_stage): def _test_zero_static_scale(args): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, optim, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) # Ensure the static scaler is configured. 
assert optim.dynamic_loss_scale == False @@ -342,9 +407,9 @@ def test_zero_static_scale_deprecated_format(tmpdir): def _test_zero_static_scale(args): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, optim, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) # Ensure the static scaler is configured. assert optim.dynamic_loss_scale == False @@ -363,8 +428,18 @@ def _test_zero_static_scale(args): _test_zero_static_scale(args) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_allow_untested_optimizer(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload): + # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']: + # pytest.skip("cpu-adam is not installed") config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -372,7 +447,8 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage): "enabled": True, }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload }, "zero_allow_untested_optimizer": False } @@ -384,16 +460,26 @@ def _test_zero_allow_untested_optimizer(args): model = SimpleModel(hidden_dim, empty_grad=True) optimizer = SimpleOptimizer(model.parameters()) with pytest.raises(AssertionError): - model, optim, _,_ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=optimizer, + model_parameters=model.parameters()) _test_zero_allow_untested_optimizer(args) -@pytest.mark.parametrize("zero_stage", [1, 2]) -def test_zero_empty_partition(tmpdir, zero_stage): +@pytest.mark.parametrize('zero_stage, use_cpu_offload', + [ + (1, + False), + (2, + False), + (2, + True), + ]) +def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): + # if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']: + # pytest.skip("cpu-adam is not installed") config_dict = { "train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, @@ -408,7 +494,10 @@ def test_zero_empty_partition(tmpdir, zero_stage): } }, "zero_optimization": { - "stage": zero_stage + "stage": zero_stage, + "cpu_offload": use_cpu_offload, + "reduce_bucket_size": 100, + "allgather_bucket_size": 100 } } args = args_from_dict(tmpdir, config_dict) @@ -436,6 +525,7 @@ def _test_zero_empty_partition(args): _test_zero_empty_partition(args) +@amp_available def test_adam_amp_basic(tmpdir): config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}} args = args_from_dict(tmpdir, config_dict) @@ -446,9 +536,9 @@ def test_adam_amp_basic(tmpdir): @distributed_test(world_size=[1]) def _test_adam_amp_basic(args, model, hidden_dim): optimizer = torch.optim.Adam(params=model.parameters()) - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -461,6 +551,7 @@ def _test_adam_amp_basic(args, model, hidden_dim): _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim) +@amp_available def test_lamb_amp_basic(tmpdir): config_dict = { 
"train_batch_size": 2, @@ -483,9 +574,9 @@ def test_lamb_amp_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_lamb_amp_basic(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -498,6 +589,7 @@ def _test_lamb_amp_basic(args, model, hidden_dim): _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim) +@amp_available def test_adam_amp_o2(tmpdir): config_dict = { "train_batch_size": 2, @@ -521,9 +613,9 @@ def test_adam_amp_o2(tmpdir): @distributed_test(world_size=[1, 2]) def _test_adam_amp_o2(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -536,6 +628,7 @@ def _test_adam_amp_o2(args, model, hidden_dim): _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim) +@amp_available def test_adam_amp_o2_empty_grad(tmpdir): config_dict = { "train_batch_size": 2, @@ -559,9 +652,9 @@ def test_adam_amp_o2_empty_grad(tmpdir): @distributed_test(world_size=[2]) def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -572,3 +665,133 @@ def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): model.step() _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('zero_stage, optimizer_constructor', + [(1, + FusedAdam), + (2, + torch.optim.Adam), + (2, + FusedAdam)]) +def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_zero_supported_client_optimizer(args, model, optimizer_constructor): + client_optimizer = optimizer_constructor(params=model.parameters()) + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + optimizer=client_optimizer) + + _test_zero_supported_client_optimizer(args=args, + model=model, + optimizer_constructor=optimizer_constructor) + + +def test_zero2_reduce_scatter_off(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "contiguous_gradients": True, + "allgather_bucket_size": 2000000000, + "reduce_bucket_size": 200000000, + "overlap_comm": False, + "reduce_scatter": False + }, + "fp16": { + "enabled": True + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, rank=args.local_rank) + + @distributed_test(world_size=[2]) + def _helper(args, model, hidden_dim): + model, _, _, _ = 
deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _helper(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('adam_type, torch_impl', + [('Adam', + True), + ('Adam', + False), + ('AdamW', + True), + ('AdamW', + False)]) +def test_fp16_adam_types(tmpdir, adam_type, torch_impl): + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "fp16": { + "enabled": True, + "initial_scale_power": 10 + }, + "optimizer": { + "type": adam_type, + "torch_adam": torch_impl, + "params": { + "lr": 0.00015 + } + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_fp16_adam_types(args, model, hidden_dim): + + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device) + + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py new file mode 100755 index 000000000000..d93ac6f171bb --- /dev/null +++ b/tests/unit/test_lr_schedulers.py @@ -0,0 +1,527 @@ +import torch +import deepspeed +import argparse +import pytest +import json +import os +from common import distributed_test +from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict +from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE +from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, WARMUP_NUM_STEPS +from deepspeed.runtime.lr_schedules import ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR, CYCLE_FIRST_STEP_SIZE, DECAY_LR_RATE, DECAY_STEP_SIZE +from deepspeed.runtime.lr_schedules import CYCLE_MIN_MOM, CYCLE_MAX_MOM, DECAY_MOM_RATE +from deepspeed.runtime.lr_schedules import WARMUP_DECAY_LR, TOTAL_NUM_STEPS + + +def _verify_continuous_decrease(values): + for i in range(len(values) - 1): + assert values[i] > values[i + 1] + + +def _verify_continuous_increase(values): + for i in range(len(values) - 1): + assert values[i] < values[i + 1] + + +def _verify_staircase_increase(values, step_size): + num_values = len(values) + for i in range(0, num_values, step_size): + j = min(i + step_size, num_values) + assert all([values[i] == v for v in values[i:j]]) + + +@pytest.mark.parametrize("scheduler_type,params", + [(WARMUP_LR, + {}), + (WARMUP_DECAY_LR, + { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 + }), + (ONE_CYCLE, + { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1 + }), + (LR_RANGE_TEST, + {})]) +def test_get_lr_before_train(tmpdir, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + 
@distributed_test(world_size=[1]) + def _test_get_lr_before_train(args, model, hidden_dim): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + # get lr before training starts + lr_scheduler.get_lr() + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_get_lr_before_train(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33]) +def test_lr_warmup_schedule(tmpdir, warmup_num_steps): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": WARMUP_LR, + "params": { + WARMUP_MIN_LR: 0.1, + WARMUP_MAX_LR: 0.2, + WARMUP_NUM_STEPS: warmup_num_steps + } + }, + "gradient_clipping": 1.0 + } + + total_num_steps = 2 * warmup_num_steps + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_lr_warmup_schedule(args, model, hidden_dim, schedule_params, num_steps): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=num_steps * 2, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + step_lrs = [] + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + step_lrs.append(lr_scheduler.get_lr()) + + # Verify initial lr + assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]] + + # Verify warmup completion + warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] + warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] + assert step_lrs[warmup_num_steps] == warmup_max_lr + + # Verify post-warmup completion + assert all([warmup_max_lr == lr for lr in step_lrs[warmup_num_steps:]]) + + _test_lr_warmup_schedule(args=args, + model=model, + hidden_dim=hidden_dim, + schedule_params=config_dict["scheduler"]["params"], + num_steps=total_num_steps) + + +@pytest.mark.parametrize("warmup_num_steps", [10, 15, 19, 33]) +def test_lr_warmup_decay_schedule(tmpdir, warmup_num_steps): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": WARMUP_DECAY_LR, + "params": { + WARMUP_MIN_LR: 0.1, + WARMUP_MAX_LR: 0.2, + WARMUP_NUM_STEPS: warmup_num_steps, + TOTAL_NUM_STEPS: warmup_num_steps * 2 + } + }, + "gradient_clipping": 1.0 + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_lr_warmup_decay_schedule(args, + model, + hidden_dim, + schedule_params, + num_steps): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=num_steps * 2, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + step_lrs = [] + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + step_lrs.append(lr_scheduler.get_lr()) + + # Verify initial lr + assert step_lrs[0] == 
[schedule_params[WARMUP_MIN_LR]] + + # Verify lr at warmup completion + warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] + warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] + assert step_lrs[warmup_num_steps] == warmup_max_lr + + # Verify decay phase + previous_lr = warmup_max_lr + for lr in step_lrs[warmup_num_steps + 1:]: + assert lr < previous_lr + previous_lr = lr + + schedule_params = config_dict["scheduler"]["params"] + + total_num_steps = schedule_params[TOTAL_NUM_STEPS] + + _test_lr_warmup_decay_schedule(args=args, + model=model, + hidden_dim=hidden_dim, + schedule_params=schedule_params, + num_steps=total_num_steps) + + +@pytest.mark.parametrize("scheduler_type,params", + [(WARMUP_LR, + {}), + (WARMUP_DECAY_LR, + { + WARMUP_NUM_STEPS: 5, + TOTAL_NUM_STEPS: 10 + }), + (ONE_CYCLE, + { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1, + CYCLE_FIRST_STEP_SIZE: 5, + DECAY_STEP_SIZE: 5 + }), + (LR_RANGE_TEST, + { + LR_RANGE_TEST_MIN_LR: 1e-4, + LR_RANGE_TEST_STEP_SIZE: 1 + })]) +def test_scheduler_optimizer_parity(tmpdir, scheduler_type, params): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": scheduler_type, + "params": params + }, + "gradient_clipping": 1.0 + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_scheduler_optimizer_parity(args, model, hidden_dim): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + assert lr_scheduler.get_lr() == model.get_lr() + + _test_scheduler_optimizer_parity(args=args, model=model, hidden_dim=hidden_dim) + + +@pytest.mark.parametrize("min_lr, step_rate, step_size, staircase", + [(1e-4, 1e-5, 1, True), + (1e-5, 1e-5, 1, False), + (1e-4, 1e-3, 10, True), + (1e-3, 1e-3, 10, False), + (1e-2, 1e-2, 19, True), + (1e-2, 1e-2, 19, False) + ])# yapf: disable +def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": LR_RANGE_TEST, + "params": { + LR_RANGE_TEST_MIN_LR: min_lr, + LR_RANGE_TEST_STEP_RATE: step_rate, + LR_RANGE_TEST_STEP_SIZE: step_size, + LR_RANGE_TEST_STAIRCASE: staircase + } + }, + "gradient_clipping": 1.0 + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + step_size * 2), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_lrs = [] + for _, batch in enumerate(data_loader): + step_lrs.append(lr_scheduler.get_lr()) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Verify starting lr + assert step_lrs[0] == min_lr + + if staircase: + # Verify staircase increasing lr + 
_verify_staircase_increase(step_lrs, step_size) + else: + # Verify continuous increasing lr + _verify_continuous_increase(step_lrs) + + _test_lr_range_test(args=args, + model=model, + hidden_dim=hidden_dim, + min_lr=[min_lr], + step_size=step_size, + staircase=staircase) + + +@pytest.mark.parametrize("min_lr, max_lr, decay_rate, step_size", + [ + (1e-5, 1e-2, 1e-3, 10), + (1e-3, 1e-1, 0, 21), + (1e-5, 1e-2, 1e-3, 10), + (1e-3, 1e-1, 0, 21), + ]) # yapf: disable +def test_onecycle_lr(tmpdir, min_lr, max_lr, decay_rate, step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: min_lr, + CYCLE_MAX_LR: max_lr, + DECAY_LR_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: step_size, + DECAY_STEP_SIZE: step_size + } + }, + "gradient_clipping": 1.0 + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_onecycle_lr(args, + model, + hidden_dim, + min_lr, + max_lr, + step_size, + decay_rate): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + step_size * 3), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_lrs = [] + for _, batch in enumerate(data_loader): + step_lrs.append(lr_scheduler.get_lr()) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + # Verify starting lr + assert step_lrs[0] == min_lr + + # Verify peak lr + assert step_lrs[step_size] == max_lr + + # Verify increasing phase + _verify_continuous_increase(step_lrs[:step_size]) + + # Verify decreasing phase + _verify_continuous_decrease(step_lrs[step_size:(step_size * 2)]) + + # Verify decay phase + if decay_rate > 0: + _verify_continuous_decrease(step_lrs[(step_size * 2):]) + + _test_onecycle_lr(args=args, + model=model, + hidden_dim=hidden_dim, + min_lr=[min_lr], + max_lr=[max_lr], + step_size=step_size, + decay_rate=decay_rate) + + +@pytest.mark.parametrize("min_mom, max_mom, decay_rate, step_size", + [ + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + (0.08, 0.09, 1e-3, 10), + (0.08, 0.09, 0, 21), + ]) # yapf: disable +def test_onecycle_mom(tmpdir, min_mom, max_mom, decay_rate, step_size): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + }, + }, + "scheduler": { + "type": ONE_CYCLE, + "params": { + CYCLE_MIN_LR: 1e-3, + CYCLE_MAX_LR: 1e-2, + CYCLE_MIN_MOM: min_mom, + CYCLE_MAX_MOM: max_mom, + DECAY_MOM_RATE: decay_rate, + CYCLE_FIRST_STEP_SIZE: step_size, + DECAY_STEP_SIZE: step_size + } + }, + "gradient_clipping": 1.0 + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_onecycle_mom(args, + model, + hidden_dim, + min_mom, + max_mom, + step_size, + decay_rate): + model, _, _, lr_scheduler = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=max(50, + step_size * 3), + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + + step_moms = [] + for _, batch in enumerate(data_loader): + step_moms.append(lr_scheduler.get_mom()) + loss = model(batch[0], batch[1]) + 
model.backward(loss) + model.step() + + # Verify starting momentum + assert step_moms[0][0][0] == max_mom + + # Verify minimum momentum + assert step_moms[step_size][0][0] == min_mom + + # Verify decreasing phase + _verify_continuous_decrease(step_moms[:step_size]) + + # Verify increasing phase + _verify_continuous_increase(step_moms[step_size:(step_size * 2)]) + + # Verify decay phase + if decay_rate > 0: + _verify_continuous_increase(step_moms[(step_size * 2):]) + + _test_onecycle_mom(args=args, + model=model, + hidden_dim=hidden_dim, + min_mom=min_mom, + max_mom=max_mom, + step_size=step_size, + decay_rate=decay_rate) diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py new file mode 100644 index 000000000000..7cd264752c6f --- /dev/null +++ b/tests/unit/test_partition.py @@ -0,0 +1,190 @@ +import pytest + +import torch +import torch.distributed as dist + +from deepspeed.runtime.utils import partition_uniform +from deepspeed.runtime.utils import partition_balanced +from deepspeed.runtime.utils import prefix_sum_inc +from deepspeed.runtime.utils import PartitionedTensor + +from common import distributed_test + + +@distributed_test(world_size=4) +def test_partitioned_tensor(): + world = dist.get_world_size() + rank = dist.get_rank() + + group = dist.new_group(ranks=list(range(world))) + + rows = world * 4 + cols = 3 + + full = torch.rand(rows, cols).cuda() + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + assert len(part.local_size()) == 1 + assert part.local_size()[0] * world == full.numel() + + reconstructed = part.full() + assert torch.equal(full, reconstructed) + + +@distributed_test(world_size=4) +def test_partitioned_tensor_meta(): + world = dist.get_world_size() + rank = dist.get_rank() + + group = dist.new_group(ranks=list(range(world))) + + rows = world * 7 + cols = 3 + + full = torch.rand(rows, cols).cuda() + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group) + assert torch.equal(full, my_meta.full()) + + +def assert_valid_partition(weights, parts, P): + N = len(weights) + assert len(parts) == P + 1 + assert parts[0] == 0 + assert parts[P] == N + for idx in range(P): + assert parts[idx] <= parts[idx + 1] + + +def get_partition_weights(weights, parts): + """ Return the amount of weight in each partition.
""" + costs = [0] * (len(parts) - 1) + P = len(parts) - 1 + for p in range(P): + start = parts[p] + stop = parts[p + 1] + costs[p] = sum(weights[start:stop]) + return costs + + +def test_prefix_sum(): + x = [3, 4, 5] + psum = prefix_sum_inc(x) + assert psum == [3, 7, 12] + + +def test_valid_partition(): + N = 10 + P = 1 + weights = [1] * N + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + + +def test_short_partition_uniform(): + N = 2 + P = 4 + weights = [1] * N + parts = partition_uniform(len(weights), P) + assert_valid_partition(weights, parts, P) + + +def test_short_partition(): + N = 2 + P = 4 + weights = [1] * N + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + + +def test_easy_balance_uniform(): + weights = [1] * 8 + P = 4 + parts = partition_uniform(len(weights), P) + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 2 for c in costs) + + +def test_easy_balance_balanced(): + weights = [1] * 8 + P = 4 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 2 for c in costs), costs + + +def test_int_balanced(): + weights = [0, 1, 2, 3, 3, 3] + P = 4 + parts = partition_balanced(weights, P) + assert parts == [0, 3, 4, 5, 6] + + assert_valid_partition(weights, parts, P) + costs = get_partition_weights(weights, parts) + assert all(c == 3 for c in costs) + + +def test_float_balanced(): + weights = [0., 1.1, 1.9, 3., 3., 3.] + P = 4 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 3, 4, 5, 6] + + +@pytest.mark.skip(reason="Variance-minimizing partitioning returns different result.") +def test_float_lastheavy(): + weights = [0., 1.1, 1.9, 3., 30.] + P = 2 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 4, 5] + + +def test_float_midheavy(): + weights = [0., 1.1, 30, 3.] 
+ P = 3 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) + assert parts == [0, 2, 3, 4] + + +def test_balance_bert(): + # Parameters per layer for a transformer model with 24 transformers and hidden dim 1024 + weights = [ + 52559872, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 12596224, + 0, + 52559872 + ] + P = 8 + parts = partition_balanced(weights, P) + assert_valid_partition(weights, parts, P) diff --git a/tests/unit/test_pipe.py b/tests/unit/test_pipe.py new file mode 100755 index 000000000000..30d4314a8441 --- /dev/null +++ b/tests/unit/test_pipe.py @@ -0,0 +1,267 @@ +import os +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +import pytest + +import deepspeed +import deepspeed.runtime.utils as ds_utils + + +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology +PipeTopo = PipeDataParallelTopology +from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec + +from common import distributed_test + + +def rel_diff(A, B): + return abs(A - B) / abs(A) + + +# All models +from simple_model import args_from_dict + + +class AlexNet(nn.Module): + def __init__(self, num_classes=10): + super(AlexNet, self).__init__() + self.features = nn.Sequential( + nn.Conv2d(3, + 64, + kernel_size=11, + stride=4, + padding=5), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + nn.Conv2d(64, + 192, + kernel_size=5, + padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + nn.Conv2d(192, + 384, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, + 256, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, + 256, + kernel_size=3, + padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=2, + stride=2), + ) + self.classifier = nn.Linear(256, num_classes) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.features(x) + x = x.view(x.size(0), -1) + x = self.classifier(x) + return self.loss_fn(x, y) + + +class AlexNetPipe(AlexNet): + def to_layers(self): + layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier] + return layers + + +class AlexNetPipeSpec(PipelineModule): + def __init__(self, num_classes=10, **kwargs): + self.num_classes = num_classes + specs = [ + LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=5), + LayerSpec(nn.ReLU, inplace=True), + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + LayerSpec(nn.Conv2d, 64, 192, kernel_size=5, padding=2), + F.relu, + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + LayerSpec(nn.Conv2d, 192, 384, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.Conv2d, 384, 256, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1), + F.relu, + LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), + + lambda x: x.view(x.size(0), -1), + LayerSpec(nn.Linear, 256, self.num_classes), # classifier + ] + super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) + + +def cifar_trainset(fp16=False): + import torchvision + import torchvision.transforms as transforms + + transform_list = [ + transforms.ToTensor(), + transforms.Normalize((0.5, + 0.5, + 0.5), + (0.5, + 0.5, + 0.5)), + ] + if 
fp16: + transform_list.append(torchvision.transforms.Lambda(lambda x: x.half())) + + transform = transforms.Compose(transform_list) + + local_rank = torch.cuda.current_device() + + # Only one rank per machine downloads. + dist.barrier() + if local_rank != 0: + dist.barrier() + trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10-data', + train=True, + download=True, + transform=transform) + if local_rank == 0: + dist.barrier() + return trainset + + +def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, seed=123): + with torch.random.fork_rng(devices=[torch.cuda.current_device()]): + ds_utils.set_random_seed(seed) + + # disable dropout + model.eval() + + trainset = cifar_trainset(fp16=fp16) + args.local_rank = dist.get_rank() + + engine, _, _, _ = deepspeed.initialize( + args=args, + model=model, + model_parameters=[p for p in model.parameters()], + training_data=trainset) + + losses = [] + for step in range(num_steps): + loss = engine.train_batch() + losses.append(loss.item()) + if step % 50 == 0 and dist.get_rank() == 0: + print(f'STEP={step} LOSS={loss.item()}') + + if average_dp_losses: + loss_tensor = torch.tensor(losses).cuda() + dist.all_reduce(loss_tensor) + loss_tensor /= dist.get_world_size() + losses = loss_tensor.tolist() + + return losses + + +@pytest.mark.parametrize('topo', + [ + PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=4, + num_dp=1), + ]) +def test_pipe_cifar10(topo, tmpdir): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": False + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + @distributed_test(world_size=4) + def _helper(topo, tmpdir, steps=500): + assert steps >= 100 + + base_net = copy.deepcopy(init_net) + base_model = PipelineModule(layers=base_net.to_layers(), + num_stages=1, + loss_fn=nn.CrossEntropyLoss()) + + # Train with just data parallelism + base_losses = train_cifar(base_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + #test_model = AlexNetPipe(num_classes=10, + # topology=test_topo, + # seed_layers=config_dict['pipeline']['seed_layers']) + test_losses = train_cifar(test_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] + rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)] + if dist.get_rank() == 0: + print( + f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' + ) + print( + f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' + ) + print( + f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' + ) + + for lastX in [1, 10, 100]: + base_avg = sum(base_losses[-lastX:]) / lastX + test_avg = sum(test_losses[-lastX:]) / lastX + print( + f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}' + ) + + lastX = 100 + base = base_losses[-lastX:] + base_avg = sum(base) / len(base) + test = test_losses[-lastX:] + test_avg = sum(test) / len(test) + assert rel_diff(base_avg, test_avg) < 0.03 + + _helper(topo, tmpdir) diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py new file mode 100644 index 000000000000..61f07a196971 --- /dev/null +++ b/tests/unit/test_pipe_module.py @@ -0,0 +1,101 @@ +import copy + +import torch +import torch.nn as nn +import torch.distributed as dist + +import pytest + +import deepspeed + +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology +PipeTopo = PipeDataParallelTopology + +from deepspeed.pipe import PipelineModule, LayerSpec +from deepspeed.utils import RepeatingLoader + +from common import distributed_test +from simple_model import args_from_dict + +HIDDEN_DIM = 32 +LAYERS = 8 + + +@pytest.fixture +def sequential_model(): + model = torch.nn.Sequential( + *[nn.Linear(HIDDEN_DIM, + HIDDEN_DIM) for _ in range(LAYERS)], + nn.Linear(HIDDEN_DIM, + 1), + ) + return model + + +@pytest.fixture +def simple_args(tmpdir): + config_dict = { + "train_batch_size": 1, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "pipeline": { + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + return args + + +def test_pipe_module_sequential(sequential_model, simple_args): + batch_input = torch.randn(1, HIDDEN_DIM) + + @distributed_test(world_size=4) + def _helper(): + base_model = copy.deepcopy(sequential_model) + base_input = batch_input.clone().detach() + base_output = base_model(base_input) + base_output = base_output + base_params = sum(p.numel() for p in base_model.parameters()) + + pipe_model = copy.deepcopy(sequential_model) + pipe_model = PipelineModule(layers=pipe_model, num_stages=4) + + # Ensure all parameters are 
accounted for. + my_params = sum(p.numel() for p in pipe_model.parameters()) + total_pipe_params = torch.LongTensor([my_params]).to('cuda') + dist.all_reduce(total_pipe_params) + total_pipe_params = total_pipe_params.item() + assert total_pipe_params == base_params + + pipe_model, _, _, _ = deepspeed.initialize( + args=simple_args, + model=pipe_model, + model_parameters=[p for p in pipe_model.parameters()]) + + if pipe_model.is_first_stage or pipe_model.is_last_stage: + pipe_input = base_input.clone().detach().to('cuda') + # label 0 is meaningless + dataset = [(pipe_input, 0)] + loader = RepeatingLoader(dataset) + data_iter = iter(loader) + else: + data_iter = None + + pipe_output = pipe_model.eval_batch(data_iter=data_iter) + + base_output = base_output.to('cpu') + pipe_output = pipe_output.to('cpu') + + assert torch.allclose(base_output, pipe_output) + + _helper() diff --git a/tests/unit/test_pipe_schedule.py b/tests/unit/test_pipe_schedule.py new file mode 100644 index 000000000000..8c65f3d7614b --- /dev/null +++ b/tests/unit/test_pipe_schedule.py @@ -0,0 +1,157 @@ +import pytest + +import deepspeed.runtime.pipe.schedule as schedule + + +def _count_type(cmds, classtype): + return len(list(filter(lambda c: type(c) == classtype, cmds))) + + +def test_pipe_inference_schedule_singlestage(): + sched = schedule.InferenceSchedule(micro_batches=4, stages=1, stage_id=0) + assert sched.num_micro_batches == 4 + full = list(iter(sched)) + for idx, cmds in enumerate(full): + assert len(cmds) == 2 + assert type(cmds[0]) == schedule.LoadMicroBatch + assert type(cmds[1]) == schedule.ForwardPass + assert cmds[0].buffer_id == cmds[1].buffer_id + assert len(full) == sched.num_micro_batches + + +def test_pipe_train_schedule_singlestage(): + sched = schedule.TrainSchedule(micro_batches=4, stages=1, stage_id=0) + assert sched.num_micro_batches == 4 + full = list(iter(sched)) + print() + for idx, cmds in enumerate(full): + print(idx, cmds) + #assert len(cmds) == 2 + #assert type(cmds[0]) == schedule.LoadMicroBatch + #assert type(cmds[1]) == schedule.ForwardPass + #assert cmds[0].buffer_id == cmds[1].buffer_id + #assert len(full) == sched.num_micro_batches + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_firststage(micro_batches, stages=3, verbose=False): + sched = schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=0) + assert sched.num_micro_batches == micro_batches + full = list(iter(sched)) + if verbose: + print() + for idx, cmds in enumerate(full): + if verbose: + print(idx, cmds) + # Ensure we don't send an activation the first step + if idx == 0: + assert len(cmds) == 2 + assert type(cmds[0]) == schedule.LoadMicroBatch + assert type(cmds[1]) == schedule.ForwardPass + assert cmds[0].buffer_id == cmds[1].buffer_id + continue + + # the last active step is only a send + if idx == sched.num_micro_batches: + assert len(cmds) == 1 + assert type(cmds[0]) == schedule.SendActivation + continue + + # no work later on + if idx > sched.num_micro_batches: + assert len(cmds) == 0 + continue + + # Normally we need to load/forward/send + assert len(cmds) == 3 + assert _count_type(cmds, schedule.LoadMicroBatch) == 1 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.SendActivation) == 1 + assert len(full) == micro_batches + stages - 1 + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_midstage(micro_batches, stages=3, verbose=False): + sched = 
schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=1) + + full = list(iter(sched)) + if verbose: + print() + for idx, cmds in enumerate(full): + if verbose: + print(idx, cmds) + if idx < sched.stage: + assert len(cmds) == 0 + continue + if idx == sched.stage + sched.num_micro_batches: + assert len(cmds) == 1 + assert type(cmds[0]) == schedule.SendActivation + continue + if idx > sched.stage + sched.num_micro_batches: + assert len(cmds) == 0 + continue + assert _count_type(cmds, schedule.LoadMicroBatch) == 0 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.RecvActivation) == 1 + if idx > sched.stage: + assert _count_type(cmds, schedule.SendActivation) == 1 + assert len(full) == micro_batches + stages - 1 + + +@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) +def test_pipe_inference_schedule_laststage(micro_batches, stages=3, verbose=False): + sched = schedule.InferenceSchedule(micro_batches=micro_batches, + stages=stages, + stage_id=2) + full = list(iter(sched)) + if verbose: + print() + for idx, cmds in enumerate(full): + if verbose: + print(idx, cmds) + if idx < sched.stage or idx > sched.stage + sched.num_micro_batches: + assert len(cmds) == 0 + continue + assert _count_type(cmds, schedule.LoadMicroBatch) == 1 + assert _count_type(cmds, schedule.ForwardPass) == 1 + assert _count_type(cmds, schedule.RecvActivation) == 1 + assert _count_type(cmds, schedule.SendActivation) == 0 + assert len(full) == micro_batches + stages - 1 + + +def test_pipe_schedule_firststage(): + sched = schedule.TrainSchedule(micro_batches=8, stages=3, stage_id=0) + for cmds in sched: + assert all(instr.__class__ != schedule.SendGrad for instr in cmds) + assert all(instr.__class__ != schedule.RecvActivation for instr in cmds) + for instr in cmds: + if isinstance(instr, schedule.BufferOpInstruction): + assert 0 <= instr.buffer_id < sched.num_pipe_buffers() + + +def test_pipe_schedule_laststage(): + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) + #assert len(sched) == 2 * (sched.micro_batches + sched.stages - 1) + print() + for cmds in sched: + print(cmds) + assert all(instr.__class__ != schedule.SendActivation for instr in cmds) + assert all(instr.__class__ != schedule.RecvGrad for instr in cmds) + + +def test_pipe_stagequery(): + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=0) + assert sched.is_first_stage + assert not sched.is_last_stage + + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=1) + assert not sched.is_first_stage + assert not sched.is_last_stage + + sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) + assert not sched.is_first_stage + assert sched.is_last_stage diff --git a/tests/unit/test_pld.py b/tests/unit/test_pld.py new file mode 100755 index 000000000000..784aeff0338f --- /dev/null +++ b/tests/unit/test_pld.py @@ -0,0 +1,117 @@ +import numpy as np +import deepspeed +import pytest +from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop +from common import distributed_test +from simple_model import SimpleModel, PLD_SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict + + +@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) +def test_pld_schedule(tmpdir, theta): + gamma = 0.001 + + pld_scheduler = ProgressiveLayerDrop(theta, gamma) + for i in range(10): + pld_scheduler.update_state(i) + expected_theta = (1. 
- theta) * np.exp(-gamma * i) + theta + actual_theta = pld_scheduler.get_theta() + assert expected_theta == actual_theta + + +@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) +def test_pld_model(tmpdir, theta): + gamma = 0.001 + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.0001 + } + }, + "fp16": { + "enabled": True + }, + "progressive_layer_drop": { + "enabled": True, + "theta": theta, + "gamma": gamma + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = PLD_SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_pld_model(args, model, hidden_dim, theta, gamma): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + expected_theta = (1. - theta) * np.exp(-gamma * i) + theta + actual_theta = model.get_pld_theta() + assert expected_theta == actual_theta + + _test_pld_model(args=args, + model=model, + hidden_dim=hidden_dim, + theta=theta, + gamma=gamma) + + +def test_non_pld_model(tmpdir): + gamma = 0.001 + theta = 0.5 + config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.0001 + } + }, + "fp16": { + "enabled": True + }, + "progressive_layer_drop": { + "enabled": True, + "theta": theta, + "gamma": gamma + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim, empty_grad=False) + + @distributed_test(world_size=[1]) + def _test_non_pld_model(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device) + + for i, batch in enumerate(data_loader): + with pytest.raises(TypeError): + loss = model(batch[0], batch[1]) + + _test_non_pld_model(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py index 4329e7a885bd..ed069c335fe6 100644 --- a/tests/unit/test_run.py +++ b/tests/unit/test_run.py @@ -1,6 +1,6 @@ import pytest -from deepspeed.pt import deepspeed_run as dsrun +from deepspeed.launcher import runner as dsrun def test_parser_mutual_exclusive(): diff --git a/tests/unit/test_runtime_utils.py b/tests/unit/test_runtime_utils.py new file mode 100644 index 000000000000..260b3baa125b --- /dev/null +++ b/tests/unit/test_runtime_utils.py @@ -0,0 +1,14 @@ +import pytest + +import deepspeed.runtime.utils as ds_utils + + +def test_call_to_str(): + c2s = ds_utils.call_to_str + + assert c2s('int') == 'int()' + assert c2s('int', 3) == 'int(3)' + assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')' + + assert c2s('hello', val=3) == 'hello(val=3)' + assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)' diff --git a/tests/unit/test_sparse_attention.py b/tests/unit/test_sparse_attention.py new file mode 100644 index 000000000000..80eb1b31b596 --- /dev/null +++ b/tests/unit/test_sparse_attention.py @@ -0,0 +1,349 @@ +# DeepSpeed note, some parts of code taken & adapted from commit c368a9fd1b2c9dee4cc94de9a6bb0be3d447be41 +# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_softmax.py +# 
https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_matmul.py +# https://github.com/ptillet/torch-blocksparse/blob/master/tests/utils + +import pytest +import torch +import deepspeed +from deepspeed.ops.op_builder import SparseAttnBuilder + +if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: + pytest.skip("sparse attention op is not compatible on this system", + allow_module_level=True) + + +def test_sparse_attention_module_availability(): + try: + from deepspeed.ops import sparse_attention + except ImportError: + print("Sparse Attention Module is not installed!") + return False + return True + + +def test_matmul_module_availability(): + try: + from deepspeed.ops.sparse_attention import MatMul + except ImportError: + print("Sparse MatMul Module is not installed!") + return False + return True + + +def test_softmax_module_availability(): + try: + from deepspeed.ops.sparse_attention import Softmax + except ImportError: + print("Sparse Softmax Module is not installed!") + return False + return True + + +def test_sparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import SparsityConfig + except ImportError: + print("SparsityConfig Module is not installed!") + return False + return True + + +def test_densesparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import DenseSparsityConfig + except ImportError: + print("DenseSparsityConfig Module is not installed!") + return False + return True + + +def test_fixedsparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import FixedSparsityConfig + except ImportError: + print("FixedSparsityConfig Module is not installed!") + return False + return True + + +def test_variablesparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import VariableSparsityConfig + except ImportError: + print("VariableSparsityConfig Module is not installed!") + return False + return True + + +def test_bigbirdsparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import BigBirdSparsityConfig + except ImportError: + print("BigBirdSparsityConfig Module is not installed!") + return False + return True + + +def test_bslongformersparsityconfig_module_availability(): + try: + from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig + except ImportError: + print("BSLongformerSparsityConfig Module is not installed!") + return False + return True + + +def test_sparseselfattention_module_availability(): + try: + from deepspeed.ops.sparse_attention import SparseSelfAttention + except ImportError: + print("SparseSelfAttention Module is not installed!") + return False + return True + + +def test_bertsparseselfattention_module_availability(): + try: + from deepspeed.ops.sparse_attention import BertSparseSelfAttention + except ImportError: + print("BertSparseSelfAttention Module is not installed!") + return False + return True + + +def test_sparseattentionutils_availability(): + try: + from deepspeed.ops.sparse_attention import SparseAttentionUtils + except ImportError: + print("SparseAttentionUtils Module is not installed!") + return False + return True + + +def test_cpp_utils_availability(): + try: + from deepspeed.ops.sparse_attention import cpp_utils + except ImportError: + print("Sparse Attention cpp_utils Module is not installed!") + return False + return True + + +def dense_to_sparse(w, mask, block): + """Converts dense matrix with explicit zeros to sparse matrix + """ + Z = w.size(0) 
+ ret = torch.empty((Z, mask.sum(), block, block), dtype=w.dtype, device=w.device) + nnz = mask.nonzero() + h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2] + for zz in range(Z): + for idx, (hh, ii, jj) in enumerate(zip(h, i, j)): + ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block] + return ret + + +def sparse_to_dense(w, mask, block, zero=0): + """Converts sparse matrix to dense matrix with explicit zeros + """ + maskedw = w.clone() + for bz, wz in enumerate(range(0, w.size(0))): + for bh, wh in enumerate(range(0, w.size(1))): + for bi, wi in enumerate(range(0, w.size(2), block)): + for bj, wj in enumerate(range(0, w.size(3), block)): + if mask[bh, bi, bj] == 0: + maskedw[wz, wh, wi:wi + block, wj:wj + block] = zero + #maskedw[wz, wh, wi : wi+block, wj : wj+block] *= mask[bh, bi, bj] + return maskedw + + +def allclose(x, y): + assert x.dtype == y.dtype + rtol, atol = {torch.float32: (1e-4, 1e-5), torch.float16: (1e-2, 1e-3)}[x.dtype] + return torch.allclose(x, y, rtol=rtol, atol=atol) + + +def make_layout(rho, shape): + probs = torch.Tensor([rho, 1 - rho]) + generator = torch.distributions.categorical.Categorical(probs) + layout = generator.sample(shape) + return layout + + +def run_softmax_reference(x, scale, dx, kp_mask, attn_mask, layout, block): + x = sparse_to_dense(x, layout, block, zero=float('-inf')) + x.retain_grad() + if kp_mask is not None: + bcattn_mask = attn_mask[None, None, :, :] + torch.zeros_like(x) + x[bcattn_mask == 0] = float('-inf') + y = torch.softmax(x * scale + kp_mask[:, None, None, :], -1) + else: + y = torch.softmax(x * scale, -1) + y.backward(dx) + dx = x.grad.clone() + dx = dense_to_sparse(dx, layout, block) + y = dense_to_sparse(y, layout, block) + return y, dx + + +def run_softmax_sparse(x, scale, dx, kp_mask, attn_mask, layout, block): + from deepspeed.ops.sparse_attention import Softmax + sparse_softmax = Softmax(layout, block, bench=False) + dx = dense_to_sparse(dx, layout, block) + x = dense_to_sparse(x, layout, block) + x.retain_grad() + y = sparse_softmax(x, + scale=scale, + key_padding_mask=kp_mask, + key_padding_mask_mode='add', + attn_mask=attn_mask, + attn_mask_mode='mul') + y.backward(dx) + dx = x.grad.clone() + x.grad.zero_() + return x, dx + + +def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layout=None): + if layout is None: + layout = make_layout(rho, (H, M // block, N // block)) + if dense_x: + x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device='cuda') + else: + x = torch.rand((Z, + layout.sum(), + block, + block), + dtype=dtype, + requires_grad=True, + device='cuda') + dx = torch.rand_like(x) + bool_attn_mask = torch.randint(low=0, + high=2, + size=(N, + N), + dtype=torch.bool, + requires_grad=False, + device='cuda') + fp_attn_mask = bool_attn_mask.type(dtype) + kp_mask = torch.randint(low=0, + high=2, + size=(Z, + N), + dtype=dtype, + requires_grad=False, + device='cuda') + kp_mask[kp_mask == 1.] 
= float('-inf') + return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask + + +def _skip_on_cuda_compatability(): + #pytest.skip("Skip these tests for now until we get our docker image fixed.") + if torch.cuda.get_device_capability()[0] != 7: + pytest.skip("needs compute capability 7; v100") + cuda_major = int(torch.version.cuda.split('.')[0]) * 10 + cuda_minor = int(torch.version.cuda.split('.')[1]) + cuda_version = cuda_major + cuda_minor + if cuda_version != 101 and cuda_version != 102: + pytest.skip("requires cuda 10.1 or 10.2") + + +@pytest.mark.parametrize("block", [16, 32]) +@pytest.mark.parametrize("width", [256, 576]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +def test_softmax(block, width, dtype): + _skip_on_cuda_compatability() + Z = 2 + H = 4 + scale = 0.4 + rho = 0.4 + M = N = width + layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None) + ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block) + st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block) + assert allclose(ref_y, st_y) + assert allclose(ref_dx, st_dx) + + +def run_matmul_reference(x, w, mode, trans_a, trans_b, layout, block, dy): + x = sparse_to_dense(x, layout, block) if mode == 'dsd' else x + w = sparse_to_dense(w, layout, block) if mode == 'dds' else w + x.retain_grad() + w.retain_grad() + xx = x.transpose(2, 3) if trans_a else x + ww = w.transpose(2, 3) if trans_b else w + y = torch.matmul(xx, ww) + y = sparse_to_dense(y, layout, block) if mode == 'sdd' else y + y.backward(dy) + dx = x.grad.clone() + dw = w.grad.clone() + x.grad.zero_() + w.grad.zero_() + y = dense_to_sparse(y, layout, block) if mode == 'sdd' else y + dx = dense_to_sparse(dx, layout, block) if mode == 'dsd' else dx + dw = dense_to_sparse(dw, layout, block) if mode == 'dds' else dw + return y, dx, dw + + +def run_matmul_sparse(x, w, mode, trans_a, trans_b, layout, block, dy): + from deepspeed.ops.sparse_attention import MatMul + x = dense_to_sparse(x, layout, block) if mode == 'dsd' else x + w = dense_to_sparse(w, layout, block) if mode == 'dds' else w + dy = dense_to_sparse(dy, layout, block) if mode == 'sdd' else dy + op = MatMul(layout, block, mode, trans_a=trans_a, trans_b=trans_b) + x.retain_grad() + w.retain_grad() + y = op(x, w) + y.backward(dy) + dx = x.grad.clone() + dw = w.grad.clone() + x.grad.zero_() + return y, dx, dw + + +def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout): + torch.manual_seed(1) + AS0 = K if trans_a else M + AS1 = M if trans_a else K + BS0 = N if trans_b else K + BS1 = K if trans_b else N + shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] + x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device='cuda') + w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device='cuda') + dy = torch.rand((Z, H, M, N), dtype=dtype, device='cuda') + if layout is None: + layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) + else: + assert list(layout.shape) == [H, shape[0] // block, shape[1] // block] + x.retain_grad() + w.retain_grad() + return x, w, dy, shape, layout + +testdata = [ + (16, dtype, mode, trans_a, trans_b)\ + for dtype in [torch.float16, torch.float32]\ + for mode in ['sdd', 'dsd', 'dds']\ + for trans_a in [False, True]\ + for trans_b in [False, True]\ + ] + [ + (block, torch.float16, mode, False, False)\ + for block in [16, 32, 64]\ + for mode in 
['sdd', 'dsd', 'dds']\ + ] + + +@pytest.mark.parametrize("block, dtype, mode, trans_a, trans_b", testdata) +def test_matmul(block, dtype, mode, trans_a, trans_b): + _skip_on_cuda_compatability() + Z = 3 + H = 2 + M = 128 + N = 256 + K = 192 + rho = 0.5 + x, w, dy, shape, layout = init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout=None) + ref_y, ref_dx, ref_dw = run_matmul_reference(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) + st_y, st_dx, st_dw = run_matmul_sparse(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) + assert allclose(ref_y, st_y) + assert allclose(ref_dx, st_dx) + assert allclose(ref_dw, st_dw) diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py new file mode 100644 index 000000000000..176363688de4 --- /dev/null +++ b/tests/unit/test_topology.py @@ -0,0 +1,222 @@ +import pytest + +import torch +import torch.distributed as dist + +from deepspeed.runtime.pipe.topology import PipelineParallelGrid as Grid +from deepspeed.runtime.pipe.topology import ProcessTopology as Topo +from deepspeed.runtime.pipe.topology import _prime_factors + +from common import distributed_test + + +def test_topology_2d(): + topo = Topo(axes=['row', 'col'], dims=[2, 2]) + + assert topo.world_size() == 4 + + assert topo.get_rank(row=0, col=0) == 0 + assert topo.get_rank(row=0, col=1) == 1 + assert topo.get_rank(row=1, col=0) == 2 + assert topo.get_rank(row=1, col=1) == 3 + + assert topo.get_axis_list(axis='row', idx=0) == [0, 1] + assert topo.get_axis_list(axis='row', idx=1) == [2, 3] + assert topo.get_axis_list(axis='col', idx=0) == [0, 2] + assert topo.get_axis_list(axis='col', idx=1) == [1, 3] + + +def test_topology_dims(): + topo = Topo(axes=['a', 'b', 'c'], dims=[2, 3, 4]) + assert topo.world_size() == 24 + assert topo.get_dim('a') == 2 + assert topo.get_dim('b') == 3 + assert topo.get_dim('c') == 4 + + +def test_topology_match(): + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + print(topo.filter_match(pipe=0, data=1)) + assert topo.filter_match(pipe=0, data=1) == [2, 3] + print([topo.get_coord(r) for r in topo.filter_match(pipe=0, data=1)]) + + +def test_topology_rank_repr(): + topo = Topo(axes=['a', 'b'], dims=[2, 2]) + assert topo.get_rank_repr(rank=0) == 'a_00-b_00' + assert topo.get_rank_repr(rank=1) == 'a_00-b_01' + assert topo.get_rank_repr(rank=2) == 'a_01-b_00' + assert topo.get_rank_repr(rank=3) == 'a_01-b_01' + + assert topo.get_rank_repr(rank=3, inner_sep='+') == 'a+01-b+01' + assert topo.get_rank_repr(rank=3, + inner_sep='🤗', + outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' + + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + assert topo.get_rank_repr(rank=0) == '' + assert topo.get_rank_repr(rank=1) == '' + assert topo.get_rank_repr(rank=2) == '' + assert topo.get_rank_repr(rank=3) == '' + + assert topo.get_rank_repr(rank=0, omit_axes=['pipe']) == 'data_00' + assert topo.get_rank_repr(rank=1, omit_axes=['pipe']) == 'data_01' + assert topo.get_rank_repr(rank=2, omit_axes=['pipe']) == 'data_00' + assert topo.get_rank_repr(rank=3, omit_axes=['pipe']) == 'data_01' + + assert topo.get_rank_repr(rank=0, omit_axes=[]) == 'pipe_00-data_00' + assert topo.get_rank_repr(rank=1, omit_axes=[]) == 'pipe_00-data_01' + assert topo.get_rank_repr(rank=2, omit_axes=[]) == 'pipe_01-data_00' + assert topo.get_rank_repr(rank=3, omit_axes=[]) == 'pipe_01-data_01' + + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + assert topo.get_rank_repr(rank=0) == 'model_00' + assert topo.get_rank_repr(rank=1) == 
'model_01' + assert topo.get_rank_repr(rank=2) == 'model_00' + assert topo.get_rank_repr(rank=3) == 'model_01' + assert topo.get_rank_repr(rank=4) == 'model_00' + assert topo.get_rank_repr(rank=5) == 'model_01' + assert topo.get_rank_repr(rank=6) == 'model_00' + assert topo.get_rank_repr(rank=7) == 'model_01' + + +def test_topology_3d(): + topo = Topo(axes=['a', 'b', 'c'], dims=[2, 2, 2]) + + assert topo.get_rank(a=0, b=0, c=0) == 0 + assert topo.get_rank(a=0, b=0, c=1) == 1 + assert topo.get_rank(a=0, b=1, c=0) == 2 + assert topo.get_rank(a=0, b=1, c=1) == 3 + assert topo.get_rank(a=1, b=0, c=0) == 4 + assert topo.get_rank(a=1, b=0, c=1) == 5 + assert topo.get_rank(a=1, b=1, c=0) == 6 + assert topo.get_rank(a=1, b=1, c=1) == 7 + + assert topo.get_axis_list('a', 0) == [0, 1, 2, 3] + assert topo.get_axis_list('a', 1) == [4, 5, 6, 7] + assert topo.get_axis_list('b', 0) == [0, 1, 4, 5] + assert topo.get_axis_list('b', 1) == [2, 3, 6, 7] + assert topo.get_axis_list('c', 0) == [0, 2, 4, 6] + assert topo.get_axis_list('c', 1) == [1, 3, 5, 7] + + assert topo.get_coord(0) == topo.ProcessCoord(0, 0, 0) + assert topo.get_coord(1) == topo.ProcessCoord(0, 0, 1) + assert topo.get_coord(2) == topo.ProcessCoord(0, 1, 0) + assert topo.get_coord(3) == topo.ProcessCoord(0, 1, 1) + assert topo.get_coord(4) == topo.ProcessCoord(1, 0, 0) + assert topo.get_coord(5) == topo.ProcessCoord(1, 0, 1) + assert topo.get_coord(6) == topo.ProcessCoord(1, 1, 0) + assert topo.get_coord(7) == topo.ProcessCoord(1, 1, 1) + + assert topo.filter_match(a=0) == [0, 1, 2, 3] + assert topo.filter_match(b=1, c=1) == [3, 7] + assert topo.filter_match(a=1, b=1, c=1) == [7] + + # Easy access method + assert topo.get_coord(0).a == 0 + + +def test_topology_comm_list(): + topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) + + assert topo.get_rank(pipe=0, data=0, model=0) == 0 + assert topo.get_rank(pipe=0, data=0, model=1) == 1 + assert topo.get_rank(pipe=0, data=1, model=0) == 2 + assert topo.get_rank(pipe=0, data=1, model=1) == 3 + assert topo.get_rank(pipe=1, data=0, model=0) == 4 + assert topo.get_rank(pipe=1, data=0, model=1) == 5 + assert topo.get_rank(pipe=1, data=1, model=0) == 6 + assert topo.get_rank(pipe=1, data=1, model=1) == 7 + + pipe_list = [ + [0, 4], # data=0, model=0 + [1, 5], # data=0, model=1 + [2, 6], # data=1, model=0 + [3, 7], # data=1, model=1 + ] + assert topo.get_axis_comm_lists('pipe') == pipe_list + + data_list = [ + [0, 2], # pipe=0, model=0 + [1, 3], # pipe=0, model=1 + [4, 6], # pipe=1, model=0 + [5, 7], # pipe=1, model=1 + ] + assert topo.get_axis_comm_lists('data') == data_list + + model_list = [ + [0, 1], # pipe=0, data=0 + [2, 3], # pipe=0, data=1 + [4, 5], # pipe=1, data=0 + [6, 7], # pipe=1, data=1 + ] + assert topo.get_axis_comm_lists('model') == model_list + + # Handle nonsense. 
We don't want to RuntimeError because it allows us to write more + # generalized code for data/model/pipe parallelism + assert topo.get_axis_comm_lists('jeff') == [] + + +@distributed_test(world_size=4) +def test_grid_pipe_data(): + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + grid = Grid(topology=topo) + + assert grid._is_grid_valid() + + rank = dist.get_rank() + + assert grid.is_first_stage == (grid.get_stage_id() == 0) + assert grid.is_last_stage == ( + grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) + + # Test collectives along the pipeline parallel process groups + rank_tensor = torch.LongTensor(data=[rank]).cuda() + dist.all_reduce(rank_tensor, group=grid.get_pipe_parallel_group()) + pipe_group = grid.pp_group + assert torch.all(rank_tensor == sum(pipe_group)) + + # Test collectives along the data parallel process groups + rank_tensor = torch.LongTensor(data=[rank]).cuda() + dist.all_reduce(rank_tensor, group=grid.get_data_parallel_group()) + data_group = grid.dp_group + assert torch.all(rank_tensor == sum(data_group)) + + +@distributed_test(world_size=4) +def test_stage_to_global(): + topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) + grid = Grid(topology=topo) + + assert grid._is_grid_valid() + + assert grid.stage_to_global(stage_id=0, data=0) == 0 + assert grid.stage_to_global(stage_id=0, data=1) == 1 + assert grid.stage_to_global(stage_id=1, data=0) == 2 + assert grid.stage_to_global(stage_id=1, data=1) == 3 + + me = topo.get_coord(rank=dist.get_rank()) + if me.data == 0: + assert grid.stage_to_global(stage_id=0) == 0 + assert grid.stage_to_global(stage_id=1) == 2 + else: + assert grid.stage_to_global(stage_id=0) == 1 + assert grid.stage_to_global(stage_id=1) == 3 + + +def test_primes(): + """ Test prime factorizations. 
""" + def _product(ps): + p = 1 + for num in ps: + p *= num + return p + + with pytest.raises(ValueError): + _prime_factors(0) + + for x in range(1, 30): + primes = _prime_factors(x) + assert _product(primes) == x + for p in primes: + assert _prime_factors(p) == [p] diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py new file mode 100755 index 000000000000..235b790387ea --- /dev/null +++ b/tests/unit/test_zero.py @@ -0,0 +1,69 @@ +import torch +import pytest +import json +import argparse +import os + +from common import distributed_test +from simple_model import SimpleModel, random_dataloader, args_from_dict + +import deepspeed + + +def run_unbalanced_gradients(model, data_loader): + def drop_some_gradients(model, iter): + odd_iteration = iter % 2 + for i, p in enumerate(model.parameters()): + p.requires_grad = (i % 2) == odd_iteration + + def enable_grads(model): + for p in model.parameters(): + p.requires_grad = True + + for i, batch in enumerate(data_loader): + drop_some_gradients(model, i + 1) + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + enable_grads(model) + + +@pytest.mark.parametrize('zero_stage', [1, 2]) +def test_zero_unbalanced_gradients(tmpdir, zero_stage): + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "zero_optimization": { + "stage": zero_stage + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 4 + + model = SimpleModel(hidden_dim=hidden_dim) + + @distributed_test(world_size=[1]) + def _test_zero_unbalanced_gradients(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=16, + hidden_dim=hidden_dim, + device=model.device) + + run_unbalanced_gradients(model, data_loader) + + _test_zero_unbalanced_gradients(args=args, model=model, hidden_dim=hidden_dim) diff --git a/third_party/apex b/third_party/apex deleted file mode 160000 index 494f8ab3fc1b..000000000000 --- a/third_party/apex +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1 diff --git a/version.txt b/version.txt new file mode 100644 index 000000000000..5503126d59f8 --- /dev/null +++ b/version.txt @@ -0,0 +1 @@ +0.3.10