Another thing to merge. (MY EYES HURT) #1

Merged
merged 199 commits on Jan 24, 2021

Commits
1f97242
only global rank 0 can log tensorboard data; avoid multi gpu/node rac…
arashashari Jul 21, 2020
871f7e6
Update setup.py (#298)
jeffra Jul 22, 2020
3cc96e1
Avoid deadlock for unsynchronized non-zero checkpointing (#297)
tjruwase Jul 22, 2020
eb74c3f
updates to amp to support grad clip and grad accumulation (#290)
jeffra Jul 23, 2020
ec94341
pass steps_per_print to tput timer (#299)
jeffra Jul 24, 2020
0f94f7e
bump DSExamples (#300)
jeffra Jul 24, 2020
7ae8f8b
DeepSpeed webinar announcement (#301)
Jul 25, 2020
67821f9
Update README.md (#302)
jeffra Jul 27, 2020
97c5427
Fixing a typo (#303)
MannyKayy Jul 28, 2020
e50b883
Fix nv_peer_mem version (#304)
jeffra Jul 28, 2020
9d07d75
NameError: name 'mpu' is not defined (#305)
MannyKayy Aug 1, 2020
c35e944
Removing () from assertion. (#307)
Aug 7, 2020
29c5fe2
Add webinar link (#309)
jeffra Aug 7, 2020
903a41a
updates website gems after kramdown alert (#311)
Aug 8, 2020
cd68e6e
Fix+tests for get_lr from lr_scheduler before training starts (#310)
jeffra Aug 10, 2020
892ece6
bumping DSE commit for pillow security fix (#312)
Aug 12, 2020
3437342
Update deepspeed_lr_schedules.py (#314)
jeffra Aug 12, 2020
6855ba1
Update fan out flag for pdsh (#315)
jeffra Aug 13, 2020
e1bea67
attach empty grad to its param to ensure it's copied after reduction …
jeffra Aug 13, 2020
de0523d
bump DSE (#317)
jeffra Aug 14, 2020
e69b1ee
Turn off multi-node launch if only 1 node (#322)
jeffra Aug 18, 2020
21d5f63
Add code owners for DeepSpeed team (#335)
jeffra Aug 27, 2020
6823db3
bump DSE
jeffra Aug 28, 2020
458c0d9
Update deepspeed_checkpointing.py (#336)
samyam Aug 31, 2020
7240abf
Samyamr/grad acc stage2 (#338)
samyam Sep 1, 2020
7a356b2
Rename ds_config_func_bs8_zero2_gas10.json to ds_config_func_bs8_zero…
samyam Sep 1, 2020
6122a74
Rename ds_config_func_bs8_zero0_gas10.json to ds_config_func_bs8_zero…
samyam Sep 1, 2020
f4726b7
Update run_func_test.py
samyam Sep 1, 2020
e8dd47d
Update .gitignore
jeffra Sep 1, 2020
838f53b
Switches BBS example to use mbsize=3 and gas=2 to fit in 16GB of memo…
Sep 1, 2020
e5bbc2e
Sparse attn + ops/runtime refactor + v0.3.0 (#343)
jeffra Sep 2, 2020
8716540
Update Dockerfile
jeffra Sep 2, 2020
5518aae
Update Dockerfile
jeffra Sep 2, 2020
1661e83
update DSE and rename SA tests
jeffra Sep 2, 2020
1ebcd6c
Update test_sparse_attention.py
jeffra Sep 3, 2020
6deac82
Adding link to Sparse Attention in Navigation page (#355)
arashashari Sep 3, 2020
ac12833
Jekyll installation instructions (#351)
Sep 4, 2020
a64b0ab
fixed a typo; this was fixed before but seems like it has been lost i…
arashashari Sep 5, 2020
4d4eafb
Move code quality tests to Azure-hosted agents. (#368)
Sep 5, 2020
9e83ef2
Update installation instructions (#362)
tjruwase Sep 6, 2020
9dadf38
Update Sparse Attention Tutorial (#357)
arashashari Sep 6, 2020
b73894d
adding sparse attention to feature index page (#377)
arashashari Sep 9, 2020
234bba0
temp disable model tests
jeffra Sep 9, 2020
01726ce
Add 1-bit Adam support to DeepSpeed (#380)
awan-10 Sep 9, 2020
161e8e6
fixing a link issue with SA tutorial (#387)
arashashari Sep 9, 2020
79093d7
Update test triggers to exclude docs
jeffra Sep 9, 2020
41db1c2
ZeRO-Offload release (#391)
jeffra Sep 10, 2020
65c2f97
Pipeline parallel training engine. (#392)
Sep 10, 2020
093f09f
Update documentation for 1-bit Adam (#388)
awan-10 Sep 10, 2020
dca0b78
Fix datatype issue with sparse attention softmax (#363)
jeffra Sep 10, 2020
c0d5424
Add openmpi to dockerfile
jeffra Sep 10, 2020
2dea61f
ZeRO tutorials (#384)
tjruwase Sep 10, 2020
b1d4bd7
fix for 16GB v100 nodes (#393)
jeffra Sep 10, 2020
be4b94b
Sparse attention: updating code tag in documentation (#394)
arashashari Sep 10, 2020
59ce90d
Minjiaz/zero offload (#382)
minjiaz Sep 10, 2020
c76769c
Adding sparse attention news index item (#376)
arashashari Sep 10, 2020
a8a8b3d
Landing page updates (#395)
jeffra Sep 10, 2020
7baf3c3
Update README.md
jeffra Sep 10, 2020
6bb5c69
Website edits (#398)
Sep 10, 2020
b29229b
update docker image and bump DSE
jeffra Sep 10, 2020
240ea97
only add 1bit adam reqs if mpi is installed, update cond build for cp…
jeffra Sep 10, 2020
4b1df25
bump DSE and doc tweak
jeffra Sep 10, 2020
9693595
Update README.md
jeffra Sep 10, 2020
ea92ed2
Update _config.yml
jeffra Sep 10, 2020
5dc4d6c
Update news site with press release link
jeffra Sep 10, 2020
d15015e
Update ZeRO-Offload blog post link (#401)
tjruwase Sep 10, 2020
15ca99c
remove old pt file
jeffra Sep 10, 2020
c82756c
readthedocs upgrade (#402)
Sep 10, 2020
e549be6
supporting different intermediate sizes other than 4 * hidden_dim (#389)
RezaYazdaniAminabadi Sep 11, 2020
4ac9bf6
Revert "supporting different intermediate sizes other than 4 * hidden…
jeffra Sep 11, 2020
473ff98
scales throughput by logging freq (#408)
Sep 13, 2020
91b4a93
pytest skips for tests requiring certain ops (#411)
jeffra Sep 15, 2020
55ed105
fix bug related to stitching reduced grads across communication parti…
jeffra Sep 15, 2020
a9e8325
add cpu-adam, reformat, add colors (#413)
jeffra Sep 15, 2020
0e942df
Add Linear warmup+decay lr schedule (#414)
tjruwase Sep 16, 2020
7d91be9
Minor doc fixes (#417)
tjruwase Sep 16, 2020
f5cce75
Overflow fix (#416)
Sep 16, 2020
4fef478
Fix a typo in comments (#415)
eric-haibin-lin Sep 16, 2020
5812e84
readthedocs yaml configuration (#410)
Sep 17, 2020
c66f388
Fix few typos in the docs (#418)
gowthamvadisetti Sep 17, 2020
5bc7d4e
Remove pip --use-feature (#419)
jeffra Sep 17, 2020
01b6e27
Activation checkpointing bugfix and unit tests (#420)
Sep 18, 2020
a74a604
Revert "Activation checkpointing bugfix and unit tests (#420)" (#422)
jeffra Sep 18, 2020
a825f99
Fix activation checkpoint unit tests for GPU systems (#421)
Sep 18, 2020
a148bd3
Add configurable intermediate size to transformer kernels (#423)
RezaYazdaniAminabadi Sep 21, 2020
71f7df3
DSE bump (#427)
Sep 21, 2020
f0f2a70
support dynamic sequence length in transformer kernels (#424)
RezaYazdaniAminabadi Sep 22, 2020
5d40f00
Fix urls in tutorial (#436)
conglongli Sep 24, 2020
192cf7c
Update azure.md (#437)
conglongli Sep 24, 2020
0ca8215
Update pipeline.md (#439)
eric-haibin-lin Sep 25, 2020
6d176c4
link fix part two :-) (#441)
Sep 25, 2020
5412a33
unit test rename (#442)
Sep 25, 2020
6f28ea3
fix typos (#446)
eric-haibin-lin Sep 28, 2020
7b8be2a
Disable default installation of CPU Adam (#450)
tjruwase Sep 29, 2020
9557557
Use parentesis around min and max to enable Windows build (#449)
bratao Oct 1, 2020
6717638
Update engine.py (#458)
Zixxy Oct 5, 2020
11cf47e
temporarily disable lr unit tests
jeffra Oct 7, 2020
679fc13
turning off different tests (temp)
jeffra Oct 7, 2020
2efea69
gan tutorial (#462)
niumanar Oct 7, 2020
c39a76f
Fix printing momentum for non-deepspeed optimizer (#464)
Zixxy Oct 7, 2020
23fc48f
Add DeepSpeed_Adam optimizer (#468)
tjruwase Oct 10, 2020
e25f2a2
fixing typo (#460)
RezaYazdaniAminabadi Oct 12, 2020
b8eb40e
add compute cap of 6.0 to transformer kernels
jeffra Oct 12, 2020
1afca8f
revert previous (accidental) change
jeffra Oct 12, 2020
7ddfda8
Add support for p100 in transformer kernels (#470)
jeffra Oct 14, 2020
d720fdb
updating website dependencies (#475)
Oct 19, 2020
f5aa254
Add CPUAdam optimizer for zero-offload in deepspeed engine (#484)
RezaYazdaniAminabadi Oct 30, 2020
4c37d70
fixing the AVX_256 compatibility (#497)
RezaYazdaniAminabadi Oct 30, 2020
7d4d742
Fixing CPU-Adam convergence issue (#503)
RezaYazdaniAminabadi Nov 5, 2020
e351090
PLD documentation (#514)
tjruwase Nov 9, 2020
41fb24b
Fix PLD news url (#515)
tjruwase Nov 9, 2020
e082d47
updating pld docs (#517)
minjiaz Nov 10, 2020
be1147c
PLD release (#513)
tjruwase Nov 10, 2020
eea1c28
fix bug on non-DLTS infra when no output path set (#523)
jeffra Nov 11, 2020
0ad4fd8
Update zero.md tutorial (#495)
samyam Nov 11, 2020
31f46fe
DeepSpeed JIT op + PyPI support (#496)
jeffra Nov 12, 2020
ca9ab12
ds_report bug fix on cpu and guard torch import in setup.py (#524)
jeffra Nov 12, 2020
d779bd5
Installation documentation updates. (#525)
Nov 12, 2020
0dc8420
Dependency pruning (#528)
jeffra Nov 14, 2020
9941ce7
bump version
jeffra Nov 14, 2020
7752dc5
Fix layout bug in ZeRO Stage 1 checkpoint logic (#531)
tjruwase Nov 18, 2020
5b09be6
append job-name if explicit output dir is given (#539)
jeffra Nov 18, 2020
fdd81c3
more fine-grained manifest file for includes/excludes (#540)
jeffra Nov 19, 2020
08c96a1
ZeRO-1 tune max-elems + bug fix (#532)
jeffra Nov 19, 2020
9de21b7
bump to v0.3.3
jeffra Nov 19, 2020
dce054d
backwards compatability w. v020 ckpts, fix issue with zero-1 ckpts (#…
jeffra Nov 19, 2020
d81cb26
Fix setup.py for cpu-only environment installation (#538)
harrydrippin Nov 19, 2020
1b45917
Discover variables for NCCL backend on AML without mpi4py (#542)
awan-10 Nov 19, 2020
6b28bc5
bump version 0.3.4
jeffra Nov 19, 2020
0178e6c
Fix unbalanced gradients bug in ZeRO-2 gradient accumulation (#545)
tjruwase Nov 20, 2020
6021b70
Support non-tensor state in checkpoint (#548)
tjruwase Nov 21, 2020
bcd56f9
Adding static_loss_scale to unfused optimizer (#546)
samyam Nov 23, 2020
00c3a25
Bug fix for norm calculation in absence of model parallel group (#551)
samyam Nov 23, 2020
c18fb0d
Create main.yml
jeffra Nov 24, 2020
3347460
Switch to CI to GitHub Actions (#556)
jeffra Nov 24, 2020
1ef5cd2
Update badges and CI name (#557)
jeffra Nov 24, 2020
6e65c2c
Deprecate client ability to disable gradient reduction (#552)
tjruwase Nov 24, 2020
0e831e2
Simplify dist init and only init if needed. (#553)
awan-10 Nov 25, 2020
eec44af
Turn back on PP tests (#558)
jeffra Nov 25, 2020
16313a9
bump to 0.3.5
jeffra Nov 23, 2020
6009713
Adds long_description to setup.py (#560)
Nov 25, 2020
73c3262
bump to 0.3.6 and fix manifest to include reqs (#561)
jeffra Nov 25, 2020
e4e2066
update manifest
jeffra Nov 25, 2020
c51fa65
bump to 0.3.7
jeffra Nov 25, 2020
17f36f1
[doc] typo fix and clarification (#563)
stas00 Nov 28, 2020
c78c29f
supporting different hidden dimensions (#559)
RezaYazdaniAminabadi Dec 1, 2020
9f52a36
tracking optimizer step in cpu-adam when loading checkpoint (#564)
RezaYazdaniAminabadi Dec 1, 2020
7a75f8b
[cifar tutorial] improve readability (#567)
stas00 Dec 2, 2020
845921b
Add 'latest' checkpoint save/load support (#569)
jeffra Dec 2, 2020
2d1f7c0
[engine] train should be able to get `mode` arg (#571)
stas00 Dec 3, 2020
be33bea
Add compute capability 8.0 if on cuda 11+ (#572)
jeffra Dec 3, 2020
ff58fa7
[build] build against installed cuda-11.1 while torch built w/ cuda-1…
stas00 Dec 3, 2020
1e44d48
Fix potential random layout inconsistency issues in sparse attention …
Justin1904 Dec 4, 2020
ce363d0
[build] make builder smarter and configurable wrt compute capabilitie…
stas00 Dec 7, 2020
e8b126d
[build] add compute_86 (#577)
stas00 Dec 7, 2020
2f62697
Pipeline warnings and checkpoint portability (#588)
Dec 8, 2020
d901a6d
Pin triton to 0.2.3 for now, 0.3.0 is broken
jeffra Dec 9, 2020
cb7c7da
bump to 0.3.8
jeffra Dec 9, 2020
19acd6c
Add papers/videos to readme/website (#592)
jeffra Dec 9, 2020
7300f3e
Add AML video link
jeffra Dec 9, 2020
0518252
add manual workflow to run tests with precompiled ops
jeffra Dec 11, 2020
8a184b6
[build] fix computer capability arch flags, add PTX, handle PTX (#591)
stas00 Dec 11, 2020
66268bd
add DeepSpeedZeroConfig repr method (#596)
stas00 Dec 11, 2020
a4763f5
Supported customizing kwargs for lr_scheduler (#584)
carefree0910 Dec 11, 2020
c5a449f
Update launcher to set local rank environ variable (#597)
jeffra Dec 11, 2020
9f8e8f3
implement missing get_last_lr (#595)
stas00 Dec 14, 2020
007466e
[doc] xref to hostfile discussion (#604)
stas00 Dec 15, 2020
6380ee3
Fixes for RTD build errors (#606)
jeffra Dec 15, 2020
fd2f970
Transformer-kernel - supporting any arbitrary sequence-length (#587)
RezaYazdaniAminabadi Dec 17, 2020
7435b2f
Ability to initialize distributed backend outside deepspeed runtime (…
jeffra Dec 18, 2020
81aeea3
Elastic training support (#602)
jeffra Dec 23, 2020
24e0739
update SA comp check to fix torch-cpu issue (#631)
jeffra Jan 4, 2021
e6ac731
Support initialization with dict configuration (#632)
tjruwase Jan 4, 2021
a9a83a6
Allow DeepSpeed models to be initialized with optimizer=None (#469)
gcooper-isi Jan 5, 2021
d38ad6a
change dist to torch.distributed to fix bug in assert. (#638)
awan-10 Jan 5, 2021
46d2e28
docs: minor spelling tweaks (#623)
brettkoonce Jan 5, 2021
5ab1279
Fix docstring format (#640)
tjruwase Jan 5, 2021
44bd538
Module replacement support (#586)
jeffra Jan 6, 2021
64461da
Update builder.py (#642)
sxjscience Jan 7, 2021
8cea96d
Bump nokogiri from 1.10.10 to 1.11.0 in /docs (#630)
dependabot[bot] Jan 7, 2021
4e2dc4e
Add deepspeed.init_distributed to RTD page (#645)
jeffra Jan 7, 2021
828d75b
document deepspeed.initialize() (#644)
stas00 Jan 8, 2021
bc046dc
add additional validation checks in elastic config (#646)
jeffra Jan 8, 2021
af212f6
Remove a very verbose print statement. (#649)
awan-10 Jan 8, 2021
c14b839
version bump to 0.3.10
jeffra Jan 8, 2021
da5563a
LR scheduler unit tests (#429)
tjruwase Jan 8, 2021
adcfd26
Handle actvitation checkpointing args that are None or non-tensors (#…
Jan 12, 2021
e2fbe4d
squash latest flops profiling changes (#1) (#664)
cli99 Jan 13, 2021
981bc7d
Move workspace memory-allocation to PyTorch (#661)
RezaYazdaniAminabadi Jan 13, 2021
f032e56
Validate consistent ckpt tags across ranks (#667)
jeffra Jan 14, 2021
865104b
Support optimizer AdamW type (#670)
tjruwase Jan 15, 2021
6217a6c
skip empty lines in hostfile (#669)
jeffra Jan 15, 2021
c5e4264
Add AdamW to the supported optimizers (#672)
stas00 Jan 15, 2021
e729a3f
add missing config menu entries (#652)
stas00 Jan 15, 2021
7b07e12
doc fix (#651)
stas00 Jan 15, 2021
82cecf6
add zero-offload paper (#680)
jeffra Jan 19, 2021
7b0bee0
[tutorials] typos (#676)
stas00 Jan 20, 2021
e59ba12
make test_pipe more stable (#683)
Jan 20, 2021
34c83a5
Fix ZeRO 2 + Pipelining (#677)
leogao2 Jan 20, 2021
2 changes: 1 addition & 1 deletion .clang-format
@@ -20,7 +20,7 @@ AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
51 changes: 51 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,51 @@
# This is a basic workflow to help you get started with Actions

name: Build

# Controls when the action will run.
on:
push:
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "build"
build:
# The type of runner that the job will run on
runs-on: self-hosted

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2

# Runs a single command using the runners shell
- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

# Runs a set of commands using the runners shell
- name: Install deepspeed
run: |
pip install .[dev]
ds_report

- name: Formatting checks
run: |
pre-commit run --all-files

# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
47 changes: 47 additions & 0 deletions .github/workflows/pre-compile-ops.yml
@@ -0,0 +1,47 @@
# This is a basic workflow to help you get started with Actions

name: Tests-w-precompiled-ops

# Controls when the action will run.
on:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "build"
build:
# The type of runner that the job will run on
runs-on: self-hosted

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2

# Runs a single command using the runners shell
- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

# Runs a set of commands using the runners shell
- name: Install deepspeed
run: |
DS_BUILD_OPS=1 pip install .[dev]
ds_report

- name: Formatting checks
run: |
pre-commit run --all-files

# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
11 changes: 9 additions & 2 deletions .gitignore
@@ -3,21 +3,28 @@
*~
*.swp
*.log
deepspeed/git_version_info.py
deepspeed/git_version_info_installed.py

# Build + installation data
build/
dist/
fused_lamb_*.so
*.so
deepspeed.egg-info/
build.txt

# Website
docs/_site/
docs/build
docs/code-docs/source/_build
docs/code-docs/_build
docs/code-docs/build
.sass-cache/
.jekyll-cache/
.jekyll-metadata

# Testing data
tests/unit/saved_checkpoint/

# Dev/IDE data
.vscode
.theia
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,6 +1,3 @@
[submodule "third_party/apex"]
path = third_party/apex
url = https://github.com/NVIDIA/apex.git
[submodule "DeepSpeedExamples"]
path = DeepSpeedExamples
url = https://github.com/microsoft/DeepSpeedExamples
18 changes: 18 additions & 0 deletions .readthedocs.yml
@@ -0,0 +1,18 @@

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/code-docs/source/conf.py
fail_on_warning: false

# Optionally build your docs in additional formats such as PDF
formats:
- pdf

# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: requirements/requirements-readthedocs.txt
1 change: 1 addition & 0 deletions CODEOWNERS
@@ -0,0 +1 @@
* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @arashashari @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @niumanar
2 changes: 1 addition & 1 deletion DeepSpeedExamples
4 changes: 4 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,4 @@
include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
115 changes: 91 additions & 24 deletions README.md
@@ -1,6 +1,8 @@
[![Build Status](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_apis/build/status/microsoft.DeepSpeed?branchName=master)](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_build/latest?definitionId=1&branchName=master)
[![Build Status](https://github.com/microsoft/deepspeed/workflows/Build/badge.svg)](https://github.com/microsoft/DeepSpeed/actions)
[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
[![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)
[![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
[![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed)

[DeepSpeed](https://www.deepspeed.ai/) is a deep learning optimization
library that makes distributed training easy, efficient, and effective.
@@ -9,9 +9,13 @@ library that makes distributed training easy, efficient, and effective.
<p align="center"><i><b>10x Faster Training</b></i></p>
<p align="center"><i><b>Minimal Code Change</b></i></p>

DeepSpeed can train deep learning models with over a hundred billion parameters on current
generation of GPU clusters, while achieving over 10x in system performance
compared to the state-of-art. Early adopters of DeepSpeed have already produced
DeepSpeed delivers extreme-scale model training for everyone, from data scientists training on massive supercomputers to those training on low-end clusters or even on a single GPU:
* Extreme scale: Using the current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters.
* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of the art, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models.
* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution compared with dense transformers.
* Extremely communication efficient: 3D parallelism improves communication efficiency and allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks.

Early adopters of DeepSpeed have already produced
a language model (LM) with over 17B parameters called
[Turing-NLG](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft),
establishing a new SOTA in the LM category.
@@ -25,25 +31,26 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)


# News

* [2020/05/19] [ZeRO-2 & DeepSpeed: Shattering Barriers of Deep Learning Speed & Scale](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/)
<span style="color:dodgerblue">**[_NEW_]**</span>
* [2020/05/19] [An Order-of-Magnitude Larger and Faster Training with ZeRO-2](https://www.deepspeed.ai/news/2020/05/18/zero-stage2.html)
<span style="color:dodgerblue">**[_NEW_]**</span>
* [2020/05/19] [The Fastest and Most Efficient BERT Training through Optimized Transformer Kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html)
<span style="color:dodgerblue">**[_NEW_]**</span>
* [2020/02/13] [Turing-NLG: A 17-billion-parameter language model by Microsoft](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
* [2020/02/13] [ZeRO & DeepSpeed: New system optimizations enable training models with over 100 billion parameters](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
* [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html)
* [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html)
* [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html)
* [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html)
* [2020/08/07] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) is now available on-demand


# Table of Contents
| Section | Description |
| --------------------------------------- | ------------------------------------------- |
| [Why DeepSpeed?](#why-deepspeed) | DeepSpeed overview |
| [Features](#features) | DeepSpeed features |
| [Further Reading](#further-reading) | DeepSpeed documentation, tutorials, etc. |
| [Contributing](#contributing) | Instructions for contributing to DeepSpeed |
| [Publications](#publications) | DeepSpeed publications |
| [Install](#installation) | Installation details |
| [Features](#features) | Feature list and overview |
| [Further Reading](#further-reading) | Documentation, tutorials, etc. |
| [Contributing](#contributing) | Instructions for contributing |
| [Publications](#publications) | Publications related to DeepSpeed |
| [Videos](#videos) | Videos related to DeepSpeed |

# Why DeepSpeed?
Training advanced deep learning models is challenging. Beyond model design,
@@ -55,8 +62,35 @@ a large model easily runs out of memory with pure data parallelism and it is
difficult to use model parallelism. DeepSpeed addresses these challenges to
accelerate model development *and* training.

# Features
# Installation

The quickest way to get started with DeepSpeed is via pip; this will install
the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA
versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
to as our 'ops'. By default, all of these extensions/ops will be built
just-in-time (JIT) using [torch's JIT C++ extension loader that relies on
ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and
dynamically link them at runtime.

**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ installing
DeepSpeed.

```bash
pip install deepspeed
```

After installation, you can validate your install and see which extensions/ops
your machine is compatible with via the DeepSpeed environment report.

```bash
ds_report
```

If you would like to pre-install any of the DeepSpeed extensions/ops (instead
of JIT compiling) or install pre-compiled ops via PyPI, please see our [advanced
installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/).
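
For readers skimming this diff, a minimal usage sketch may help place the install steps above in context. The snippet below is illustrative only: the model, batch size, and config values are hypothetical, and it assumes the dict-style configuration support added in this PR (#632). In practice such a script is started with the `deepspeed` launcher so that ranks and the distributed backend are set up automatically.

```python
# Hypothetical minimal training loop -- a sketch, not part of this diff.
import torch
import deepspeed

model = torch.nn.Linear(784, 10)  # stand-in model for illustration

ds_config = {  # dict config (see #632); a JSON file passed via args also works
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

# initialize() wraps the model in a DeepSpeed engine and builds the optimizer from the config
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config_params=ds_config,
)

for step in range(10):
    x = torch.randn(8, 784, device=model_engine.device)
    y = torch.randint(0, 10, (8,), device=model_engine.device)
    loss = torch.nn.functional.cross_entropy(model_engine(x), y)
    model_engine.backward(loss)  # the engine handles loss scaling and accumulation boundaries
    model_engine.step()
```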

# Features
Below we provide a brief feature list, see our detailed [feature
overview](https://www.deepspeed.ai/features/) for descriptions and usage.

@@ -66,10 +100,27 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage.
* [Model Parallelism](https://www.deepspeed.ai/features/#model-parallelism)
* Support for Custom Model Parallelism
* Integration with Megatron-LM
* [Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#memory-and-bandwidth-optimizations)
* The Zero Redundancy Optimizer (ZeRO)
* Constant Buffer Optimization (CBO)
* [Pipeline Parallelism](https://www.deepspeed.ai/tutorials/pipeline/)
* 3D Parallelism
* [The Zero Redundancy Optimizer (ZeRO)](https://www.deepspeed.ai/tutorials/zero/)
* Optimizer State and Gradient Partitioning
* Activation Partitioning
* Constant Buffer Optimization
* Contiguous Memory Optimization
* [ZeRO-Offload](https://www.deepspeed.ai/tutorials/zero-offload/)
* Leverage both CPU/GPU memory for model training
* Support 10B model training on a single GPU
* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html)
* [Sparse attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html)
* Memory- and compute-efficient sparse kernels
* Support 10x longer sequences than dense
* Flexible support to different sparse structures
* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html)
* Custom communication collective
* Up to 5x communication volume saving
* [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations)
* Smart Gradient Accumulation
* Communication/Computation Overlap
* [Training Features](https://www.deepspeed.ai/features/#training-features)
* Simplified training API
* Gradient Clipping
@@ -79,6 +130,7 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage.
* Memory bandwidth optimized FP16 Optimizer
* Large Batch Training with LAMB Optimizer
* Memory efficient Training with ZeRO Optimizer
* CPU-Adam
* [Training Agnostic Checkpointing](https://www.deepspeed.ai/features/#training-agnostic-checkpointing)
* [Advanced Parameter Search](https://www.deepspeed.ai/features/#advanced-parameter-search)
* Learning Rate Range Test
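
As a rough illustration of how several of the features listed above are switched on, the sketch below shows a DeepSpeed configuration expressed as a Python dict. Key names follow the documentation of this release line (for example, ZeRO-Offload was enabled through `cpu_offload` at this point), and the numeric values are placeholders, not recommendations.

```python
# Illustrative configuration sketch only -- verify key names and values against the docs.
ds_config = {
    "train_batch_size": 32,
    "gradient_accumulation_steps": 2,   # smart gradient accumulation
    "gradient_clipping": 1.0,           # gradient clipping
    "fp16": {"enabled": True},          # memory-bandwidth optimized FP16 training path
    "zero_optimization": {
        "stage": 2,                     # ZeRO: optimizer state + gradient partitioning
        "cpu_offload": True,            # ZeRO-Offload: keep optimizer state in CPU memory (CPU-Adam)
    },
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
}
```
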
@@ -127,8 +179,23 @@ all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of
Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
[[email protected]](mailto:[email protected]) with any additional questions or
comments.
[[email protected]](mailto:[email protected]) with any additional questions or comments.

# Publications
1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. [ArXiv:1910.02054](https://arxiv.org/abs/1910.02054)
1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840).

# Videos
1. DeepSpeed KDD 2020 Tutorial
1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29)
2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28)
3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27)
4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26)
5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93)
6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24)
2. Microsoft Research Webinar
* Registration is free and all videos are available on-demand.
* [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html).
3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8)
36 changes: 0 additions & 36 deletions azure-pipelines-docker.yml

This file was deleted.
