From 2e145eb843d0eee5bacd6b19b8a806677892c3ef Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 15:44:27 -0700 Subject: [PATCH 01/18] moving to .readthedocs.yml --- .readthedocs.yml | 17 +++++++++++++++++ DeepSpeedExamples | 2 +- deepspeed/git_version_info.py | 9 +++++++++ docs/code-docs/requirements.local.txt | 3 --- requirements/requirements-dev.txt | 3 +++ .../requirements-readthedocs.txt | 0 6 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 .readthedocs.yml create mode 100644 deepspeed/git_version_info.py delete mode 100755 docs/code-docs/requirements.local.txt rename docs/code-docs/requirements.readthedocs.txt => requirements/requirements-readthedocs.txt (100%) diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000000..84188849f014 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,17 @@ + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/code-docs/conf.py + +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: requirements/requirements-readthedocs.txt diff --git a/DeepSpeedExamples b/DeepSpeedExamples index b989b41b526d..9e2c34e31cec 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit b989b41b526db164611bedd3e73c09b8c2c5cbfc +Subproject commit 9e2c34e31cec99f7d5785c6a1a3b0854c322f883 diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py new file mode 100644 index 000000000000..7072127e3aea --- /dev/null +++ b/deepspeed/git_version_info.py @@ -0,0 +1,9 @@ +version = '0.3.0' +git_hash = '[none]' +git_branch = '[none]' +installed_ops = { + 'lamb': False, + 'transformer': False, + 'sparse-attn': False, + 'cpu-adam': False +} diff --git a/docs/code-docs/requirements.local.txt b/docs/code-docs/requirements.local.txt deleted file mode 100755 index fdce5922d1b3..000000000000 --- a/docs/code-docs/requirements.local.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx -recommonmark -sphinx-rtd-theme diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index b6a873656eac..0118f6ee760c 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -2,3 +2,6 @@ pytest pytest-forked pre-commit clang-format +sphinx +recommonmark +sphinx-rtd-theme diff --git a/docs/code-docs/requirements.readthedocs.txt b/requirements/requirements-readthedocs.txt similarity index 100% rename from docs/code-docs/requirements.readthedocs.txt rename to requirements/requirements-readthedocs.txt From 88fab2b4eed1764d977bc72485c7cb77ae425de1 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 15:46:02 -0700 Subject: [PATCH 02/18] catching up DSE --- DeepSpeedExamples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 9e2c34e31cec..b989b41b526d 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 9e2c34e31cec99f7d5785c6a1a3b0854c322f883 +Subproject commit b989b41b526db164611bedd3e73c09b8c2c5cbfc From 975cbb7408385a9f80ffcbf54fd363341dda7518 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 15:55:39 -0700 Subject: [PATCH 03/18] conf.py location --- .readthedocs.yml | 2 +- docs/code-docs/source/training.rst | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 84188849f014..b6e6a8dda828 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,7 +4,7 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: - configuration: docs/code-docs/conf.py + configuration: docs/code-docs/source/conf.py # Optionally build your docs in additional formats such as PDF formats: diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst index 55ce4642ff70..f72b721076b3 100644 --- a/docs/code-docs/source/training.rst +++ b/docs/code-docs/source/training.rst @@ -2,7 +2,7 @@ Training API ============ :func:`deepspeed.initialize` returns a *model engine* in its first argument -of type ``DeepSpeedLight``. This engine is used to progress training: +of type :class:`DeepSpeedEngine`. This engine is used to progress training: .. code-block:: python @@ -18,12 +18,14 @@ of type ``DeepSpeedLight``. This engine is used to progress training: Forward Propagation ------------------- -.. autofunction:: deepspeed.DeepSpeedLight.forward +.. automodule:: deepspeed.DeepSpeedEngine + +.. autofunction:: deepspeed.DeepSpeedEngine.forward Backward Propagation -------------------- -.. autofunction:: deepspeed.DeepSpeedLight.backward +.. autofunction:: deepspeed.DeepSpeedEngine.backward Optimizer Step -------------- -.. autofunction:: deepspeed.DeepSpeedLight.step +.. autofunction:: deepspeed.DeepSpeedEngine.step From 2c2e9d163cefb7ebe2918542a714f8f4fa171215 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 15:59:23 -0700 Subject: [PATCH 04/18] mock numpy import --- .readthedocs.yml | 1 + docs/code-docs/source/conf.py | 9 +-------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index b6e6a8dda828..7375338f09fe 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,6 +5,7 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/code-docs/source/conf.py + fail_on_warning: true # Optionally build your docs in additional formats such as PDF formats: diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index d49496c5ba7c..f0f111c3a879 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -78,11 +78,4 @@ autoclass_content = 'both' -autodoc_mock_imports = [ - "torch", - "apex", - "mpi4py", - "tensorboardX", - "deepspeed_transformer_cuda", - "deepspeed_stochastic_transformer_cuda", -] +autodoc_mock_imports = ["torch", "apex", "mpi4py", "tensorboardX", "numpy"] From f1dea531a86aca8faebcb60cf6722227bd4ca4ac Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:15:37 -0700 Subject: [PATCH 05/18] version info redirect --- .gitignore | 2 +- deepspeed/git_version_info.py | 13 ++++--------- deepspeed/git_version_info_template.py | 9 +++++++++ setup.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) create mode 100644 deepspeed/git_version_info_template.py diff --git a/.gitignore b/.gitignore index 8bf6f1d2b33d..ae2fb660b176 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ *~ *.swp *.log -deepspeed/git_version_info.py +deepspeed/git_version_info_installed.py # Build + installation data build/ diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index 7072127e3aea..b2773b2aef82 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -1,9 +1,4 @@ -version = '0.3.0' -git_hash = '[none]' -git_branch = '[none]' -installed_ops = { - 'lamb': False, - 'transformer': False, - 'sparse-attn': False, - 'cpu-adam': False -} +try: + from .git_version_info_installed import * +except ImportError: + from .git_version_info_sample import * diff --git a/deepspeed/git_version_info_template.py b/deepspeed/git_version_info_template.py new file mode 100644 index 000000000000..53040ef01547 --- /dev/null +++ b/deepspeed/git_version_info_template.py @@ -0,0 +1,9 @@ +version = '0.3.0+[none]' +git_hash = '[none]' +git_branch = '[none]' +installed_ops = { + 'lamb': False, + 'transformer': False, + 'sparse-attn': False, + 'cpu-adam': False +} diff --git a/setup.py b/setup.py index 55459395ec79..8c38826f84e9 100755 --- a/setup.py +++ b/setup.py @@ -297,7 +297,7 @@ def command_exists(cmd): git_hash = "unknown" git_branch = "unknown" print(f"version={VERSION}+{git_hash}, git_hash={git_hash}, git_branch={git_branch}") -with open('deepspeed/git_version_info.py', 'w') as fd: +with open('deepspeed/git_version_info_installed.py', 'w') as fd: fd.write(f"version='{VERSION}+{git_hash}'\n") fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_branch='{git_branch}'\n") From 19e65b988b00962ea39b31e3d8bd18fe94a6712a Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:21:48 -0700 Subject: [PATCH 06/18] import documentation --- deepspeed/git_version_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index b2773b2aef82..5bb2735599a7 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -1,4 +1,6 @@ try: + # This is populated by setup.py from .git_version_info_installed import * -except ImportError: - from .git_version_info_sample import * +except ModuleNotFoundError: + # Will be missing from checkouts that haven't been installed (e.g., readthedocs) + from .git_version_info_template import * From 4fb0451d56821d058136199d8d823118e3cd77dc Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:28:02 -0700 Subject: [PATCH 07/18] cleaning up --- docs/code-docs/source/deepspeed.pt.rst | 134 ------------------------- docs/code-docs/source/deepspeed.rst | 38 ------- 2 files changed, 172 deletions(-) delete mode 100644 docs/code-docs/source/deepspeed.pt.rst delete mode 100644 docs/code-docs/source/deepspeed.rst diff --git a/docs/code-docs/source/deepspeed.pt.rst b/docs/code-docs/source/deepspeed.pt.rst deleted file mode 100644 index 991963e32a43..000000000000 --- a/docs/code-docs/source/deepspeed.pt.rst +++ /dev/null @@ -1,134 +0,0 @@ -deepspeed.pt package -==================== - -Submodules ----------- - -deepspeed.pt.deepspeed\_config module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_config - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_constants module ----------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_constants - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_csr\_tensor module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_csr_tensor - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_dataloader module ------------------------------------------ - -.. automodule:: deepspeed.pt.deepspeed_dataloader - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_fused\_lamb module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_fused_lamb - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_launch module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_launch - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_light module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_light - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_lr\_schedules module --------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_lr_schedules - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_run module ----------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_run - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_timer module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_timer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_utils module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_utils - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_zero\_optimizer module ----------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_zero_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_optimizer module ------------------------------------ - -.. automodule:: deepspeed.pt.fp16_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_unfused\_optimizer module --------------------------------------------- - -.. automodule:: deepspeed.pt.fp16_unfused_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.loss\_scaler module --------------------------------- - -.. automodule:: deepspeed.pt.loss_scaler - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: deepspeed.pt - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/code-docs/source/deepspeed.rst b/docs/code-docs/source/deepspeed.rst deleted file mode 100644 index 480793bbcaba..000000000000 --- a/docs/code-docs/source/deepspeed.rst +++ /dev/null @@ -1,38 +0,0 @@ -deepspeed package -================= - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - deepspeed.pt - -Submodules ----------- - -deepspeed.git\_version\_info module ------------------------------------ - -.. automodule:: deepspeed.git_version_info - :members: - :undoc-members: - :show-inheritance: - -deepspeed.install\_config module --------------------------------- - -.. automodule:: deepspeed.install_config - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: deepspeed - :members: - :undoc-members: - :show-inheritance: From 6e5872ef40338c403fa6a2fc0e325c8a0badfec8 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:38:18 -0700 Subject: [PATCH 08/18] checkpoint updates --- docs/code-docs/source/model-checkpointing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/code-docs/source/model-checkpointing.rst b/docs/code-docs/source/model-checkpointing.rst index eaf349b27ccb..064f228f1e2c 100644 --- a/docs/code-docs/source/model-checkpointing.rst +++ b/docs/code-docs/source/model-checkpointing.rst @@ -5,8 +5,8 @@ DeepSpeed provides routines for checkpointing model state during training. Loading Training Checkpoints ---------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.load_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.load_checkpoint Saving Training Checkpoints --------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.save_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.save_checkpoint From aa58ce8f0ad88d02c6a807f86475f01b1cf97057 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:40:15 -0700 Subject: [PATCH 09/18] clean old file --- .readthedocs.yml | 2 +- docs/code-docs/source/modules.rst | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 docs/code-docs/source/modules.rst diff --git a/.readthedocs.yml b/.readthedocs.yml index 7375338f09fe..a2da36620152 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,7 +5,7 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/code-docs/source/conf.py - fail_on_warning: true + fail_on_warning: false # Optionally build your docs in additional formats such as PDF formats: diff --git a/docs/code-docs/source/modules.rst b/docs/code-docs/source/modules.rst deleted file mode 100644 index ffb76bdd7102..000000000000 --- a/docs/code-docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -deepspeed -========= - -.. toctree:: - :maxdepth: 4 - - deepspeed From e75ccd39635e707dc003d8aa03f871c14d979d90 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:45:58 -0700 Subject: [PATCH 10/18] docstring indents --- deepspeed/ops/transformer/transformer.py | 60 +++++++++---------- .../activation_checkpointing/checkpointing.py | 15 +++-- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 54ed407c6ddb..8abc04d50079 100644 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -37,52 +37,52 @@ def __init__(self, class DeepSpeedTransformerConfig(TransformerConfig): """Initialize the DeepSpeed Transformer Config. - Arguments: - batch_size: The maximum batch size used for running the kernel on each GPU + Arguments: + batch_size: The maximum batch size used for running the kernel on each GPU - max_seq_length: The sequence-length of the model being trained with DeepSpeed + max_seq_length: The sequence-length of the model being trained with DeepSpeed - hidden_size: The hidden size of the transformer layer + hidden_size: The hidden size of the transformer layer - heads: The number of heads in the self-attention of the transformer layer + heads: The number of heads in the self-attention of the transformer layer - attn_dropout_ratio: The ratio of dropout for the attention's output + attn_dropout_ratio: The ratio of dropout for the attention's output - hidden_dropout_ratio: The ratio of dropout for the transformer's output + hidden_dropout_ratio: The ratio of dropout for the transformer's output - num_hidden_layers: The number of transformer layers + num_hidden_layers: The number of transformer layers - initializer_range: BERT model's initializer range for initializing parameter data + initializer_range: BERT model's initializer range for initializing parameter data - local_rank: Optional: The rank of GPU running the transformer kernel, it is not required - to use if the model already set the current device, otherwise need to set it - so that the transformer kernel can work on the right device + local_rank: Optional: The rank of GPU running the transformer kernel, it is not required + to use if the model already set the current device, otherwise need to set it + so that the transformer kernel can work on the right device - seed: The random seed for the dropout layers + seed: The random seed for the dropout layers - fp16: Enable half-precision computation + fp16: Enable half-precision computation - pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture + pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture - normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation), - default is False + normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation), + default is False - gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory, - default is False + gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory, + default is False - adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of - its self-attention output and layer output, False keeps the initializer_range no change. - See the adjustment below: - output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers) + adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of + its self-attention output and layer output, False keeps the initializer_range no change. + See the adjustment below: + output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers) - attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory, - default is False + attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory, + default is False - stochastic_mode: Enable for high performance, please note that this flag has some level of - non-determinism and can produce different results on different runs. However, we have seen - that by enabling it, the pretraining tasks such as BERT are not affected and can obtain - a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend - to turn it off in order to be able to reproduce the same result through the regular kernel execution. + stochastic_mode: Enable for high performance, please note that this flag has some level of + non-determinism and can produce different results on different runs. However, we have seen + that by enabling it, the pretraining tasks such as BERT are not affected and can obtain + a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend + to turn it off in order to be able to reproduce the same result through the regular kernel execution. """ def __init__(self, batch_size=-1, diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 3950e7eced20..d95d04e7df1e 100755 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -236,6 +236,9 @@ def model_parallel_cuda_manual_seed(seed): parallel GPUs, but the same across data parallel groups. This is used for example for dropout in model parallel regions. + + Args: + seed (int): The seed to use. """ global mpu # 2718 is just for fun and any POSITIVE value will work. @@ -314,12 +317,12 @@ def get_full_inputs(tensors, device=None): class CheckpointFunction(torch.autograd.Function): """This function is adapted from torch.utils.checkpoint with two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. - 3) Performance activation partitioning, contiguous memory optimization - 4) CPU Checkpointing - 5) Profile forward and backward functions + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + 3) Performance activation partitioning, contiguous memory optimization + 4) CPU Checkpointing + 5) Profile forward and backward functions """ @staticmethod def forward(ctx, run_function, *args): From 213f6105442f00510706ae5feebf061ba8951a88 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:49:46 -0700 Subject: [PATCH 11/18] docstring indents --- deepspeed/ops/transformer/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 8abc04d50079..acb9f7bccc7c 100644 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -55,8 +55,8 @@ class DeepSpeedTransformerConfig(TransformerConfig): initializer_range: BERT model's initializer range for initializing parameter data local_rank: Optional: The rank of GPU running the transformer kernel, it is not required - to use if the model already set the current device, otherwise need to set it - so that the transformer kernel can work on the right device + to use if the model already set the current device, otherwise need to set it + so that the transformer kernel can work on the right device seed: The random seed for the dropout layers From 7b0f5ffc638dd69f84dcee023042c4f23368464c Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:52:32 -0700 Subject: [PATCH 12/18] ignoring warnings for now --- deepspeed/ops/transformer/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index acb9f7bccc7c..8abc04d50079 100644 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -55,8 +55,8 @@ class DeepSpeedTransformerConfig(TransformerConfig): initializer_range: BERT model's initializer range for initializing parameter data local_rank: Optional: The rank of GPU running the transformer kernel, it is not required - to use if the model already set the current device, otherwise need to set it - so that the transformer kernel can work on the right device + to use if the model already set the current device, otherwise need to set it + so that the transformer kernel can work on the right device seed: The random seed for the dropout layers From dd436518f93717f058cccb8250e25a557456239b Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:54:35 -0700 Subject: [PATCH 13/18] view source --- docs/code-docs/source/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index f0f111c3a879..32bd66c17752 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -20,10 +20,12 @@ author = 'Microsoft' # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = '0.3.0' master_doc = 'index' +html_show_sourcelink = True + autodoc_member_order = 'bysource' # -- General configuration --------------------------------------------------- From 12b4d55e0e47f72e2fba7ca473a3f1910b98381c Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 16:57:54 -0700 Subject: [PATCH 14/18] viewcode extension --- docs/code-docs/source/conf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index 32bd66c17752..167f6427d7b4 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -24,8 +24,6 @@ master_doc = 'index' -html_show_sourcelink = True - autodoc_member_order = 'bysource' # -- General configuration --------------------------------------------------- @@ -36,6 +34,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', 'recommonmark', 'sphinx_rtd_theme', ] From cd460d195b7971e16b4487872d7987e8cc0a3fc5 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 17:22:52 -0700 Subject: [PATCH 15/18] version info cleanup --- deepspeed/git_version_info_template.py | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 deepspeed/git_version_info_template.py diff --git a/deepspeed/git_version_info_template.py b/deepspeed/git_version_info_template.py deleted file mode 100644 index 53040ef01547..000000000000 --- a/deepspeed/git_version_info_template.py +++ /dev/null @@ -1,9 +0,0 @@ -version = '0.3.0+[none]' -git_hash = '[none]' -git_branch = '[none]' -installed_ops = { - 'lamb': False, - 'transformer': False, - 'sparse-attn': False, - 'cpu-adam': False -} From c3316407366d463ed86ea3b1e615cd4c36880c5c Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 17:24:19 -0700 Subject: [PATCH 16/18] version info cleanup --- deepspeed/git_version_info.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index 5bb2735599a7..82f60a86a6f6 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -3,4 +3,12 @@ from .git_version_info_installed import * except ModuleNotFoundError: # Will be missing from checkouts that haven't been installed (e.g., readthedocs) - from .git_version_info_template import * + version = '0.3.0+[none]' + git_hash = '[none]' + git_branch = '[none]' + installed_ops = { + 'lamb': False, + 'transformer': False, + 'sparse-attn': False, + 'cpu-adam': False + } From 3d12352b571cb595f15e5f33ed7cf1b1e3bfd859 Mon Sep 17 00:00:00 2001 From: Shaden Smith Date: Mon, 14 Sep 2020 17:25:12 -0700 Subject: [PATCH 17/18] reverting docstring indentation changes --- deepspeed/ops/transformer/transformer.py | 60 +++++++++---------- .../activation_checkpointing/checkpointing.py | 15 ++--- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index 8abc04d50079..54ed407c6ddb 100644 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -37,52 +37,52 @@ def __init__(self, class DeepSpeedTransformerConfig(TransformerConfig): """Initialize the DeepSpeed Transformer Config. - Arguments: - batch_size: The maximum batch size used for running the kernel on each GPU + Arguments: + batch_size: The maximum batch size used for running the kernel on each GPU - max_seq_length: The sequence-length of the model being trained with DeepSpeed + max_seq_length: The sequence-length of the model being trained with DeepSpeed - hidden_size: The hidden size of the transformer layer + hidden_size: The hidden size of the transformer layer - heads: The number of heads in the self-attention of the transformer layer + heads: The number of heads in the self-attention of the transformer layer - attn_dropout_ratio: The ratio of dropout for the attention's output + attn_dropout_ratio: The ratio of dropout for the attention's output - hidden_dropout_ratio: The ratio of dropout for the transformer's output + hidden_dropout_ratio: The ratio of dropout for the transformer's output - num_hidden_layers: The number of transformer layers + num_hidden_layers: The number of transformer layers - initializer_range: BERT model's initializer range for initializing parameter data + initializer_range: BERT model's initializer range for initializing parameter data - local_rank: Optional: The rank of GPU running the transformer kernel, it is not required - to use if the model already set the current device, otherwise need to set it - so that the transformer kernel can work on the right device + local_rank: Optional: The rank of GPU running the transformer kernel, it is not required + to use if the model already set the current device, otherwise need to set it + so that the transformer kernel can work on the right device - seed: The random seed for the dropout layers + seed: The random seed for the dropout layers - fp16: Enable half-precision computation + fp16: Enable half-precision computation - pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture + pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture - normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation), - default is False + normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation), + default is False - gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory, - default is False + gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory, + default is False - adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of - its self-attention output and layer output, False keeps the initializer_range no change. - See the adjustment below: - output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers) + adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of + its self-attention output and layer output, False keeps the initializer_range no change. + See the adjustment below: + output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers) - attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory, - default is False + attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory, + default is False - stochastic_mode: Enable for high performance, please note that this flag has some level of - non-determinism and can produce different results on different runs. However, we have seen - that by enabling it, the pretraining tasks such as BERT are not affected and can obtain - a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend - to turn it off in order to be able to reproduce the same result through the regular kernel execution. + stochastic_mode: Enable for high performance, please note that this flag has some level of + non-determinism and can produce different results on different runs. However, we have seen + that by enabling it, the pretraining tasks such as BERT are not affected and can obtain + a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend + to turn it off in order to be able to reproduce the same result through the regular kernel execution. """ def __init__(self, batch_size=-1, diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index d95d04e7df1e..3950e7eced20 100755 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -236,9 +236,6 @@ def model_parallel_cuda_manual_seed(seed): parallel GPUs, but the same across data parallel groups. This is used for example for dropout in model parallel regions. - - Args: - seed (int): The seed to use. """ global mpu # 2718 is just for fun and any POSITIVE value will work. @@ -317,12 +314,12 @@ def get_full_inputs(tensors, device=None): class CheckpointFunction(torch.autograd.Function): """This function is adapted from torch.utils.checkpoint with two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. - 3) Performance activation partitioning, contiguous memory optimization - 4) CPU Checkpointing - 5) Profile forward and backward functions + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly + tracked/set/reset. + 3) Performance activation partitioning, contiguous memory optimization + 4) CPU Checkpointing + 5) Profile forward and backward functions """ @staticmethod def forward(ctx, run_function, *args): From 026bea75eac40728ad933bbac1c4b1c54cf615b3 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Wed, 16 Sep 2020 16:08:59 -0700 Subject: [PATCH 18/18] remove git_version_info_installed.py if exists --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index b8360e392659..7389a6f993a0 100755 --- a/install.sh +++ b/install.sh @@ -137,7 +137,7 @@ rm_if_exist() { if [ "$no_clean" == "0" ]; then # remove deepspeed build files - rm_if_exist deepspeed/git_version_info.py + rm_if_exist deepspeed/git_version_info_installed.py rm_if_exist dist rm_if_exist build rm_if_exist deepspeed.egg-info