
Commit

Merge branch 'master' into deepspeed_mics_init
lantiga authored Dec 10, 2024
2 parents 5409bc9 + 030f36b commit 6ca2bac
Showing 11 changed files with 103 additions and 10 deletions.
4 changes: 3 additions & 1 deletion .azure/gpu-benchmarks.yml
@@ -75,7 +75,9 @@ jobs:
pip list
displayName: "Image info & NVIDIA"
- bash: pip install -e .[dev] --find-links ${TORCH_URL}
- bash: |
pip install -e .[dev] --find-links ${TORCH_URL}
pip install setuptools==75.6.0
env:
FREEZE_REQUIREMENTS: "1"
displayName: "Install package"
1 change: 1 addition & 0 deletions .azure/gpu-tests-fabric.yml
@@ -107,6 +107,7 @@ jobs:
- bash: |
extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
pip install setuptools==75.6.0
displayName: "Install package & dependencies"
- bash: |
1 change: 1 addition & 0 deletions .azure/gpu-tests-pytorch.yml
@@ -111,6 +111,7 @@ jobs:
- bash: |
extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
pip install setuptools==75.6.0
displayName: "Install package & dependencies"
- bash: pip uninstall -y lightning
3 changes: 2 additions & 1 deletion dockers/base-cuda/Dockerfile
@@ -59,7 +59,6 @@ RUN \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python3-setuptools \
python${PYTHON_VERSION}-dev \
&& \
update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
@@ -79,6 +78,8 @@ RUN \
curl https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} && \
# Disable cache \
pip config set global.cache-dir false && \
# Install recent setuptools to obtain pkg_resources \
pip install setuptools==75.6.0 && \
# set particular PyTorch version \
pip install -q wget packaging && \
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py && \
2 changes: 1 addition & 1 deletion dockers/release/Dockerfile
@@ -39,7 +39,7 @@ RUN \
fi && \
# otherwise there is collision with folder name and pkg name on Pypi
cd pytorch-lightning && \
pip install setuptools && \
pip install setuptools==75.6.0 && \
PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \
PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \
cd .. && \
8 changes: 8 additions & 0 deletions docs/source-pytorch/common/index.rst
@@ -23,6 +23,7 @@
../data/data
../model/own_your_loop
../advanced/model_init
../common/tbptt


#############
@@ -202,6 +203,13 @@ How-to Guides
:col_css: col-md-4
:height: 180

.. displayitem::
:header: Truncated Back-Propagation Through Time
:description: Efficiently step through time when training recurrent models
:button_link: ../common/tbptt.html
:col_css: col-md-4
:height: 180

.. raw:: html

</div>
59 changes: 59 additions & 0 deletions docs/source-pytorch/common/tbptt.rst
@@ -0,0 +1,59 @@
##############################################
Truncated Backpropagation Through Time (TBPTT)
##############################################

Truncated Backpropagation Through Time (TBPTT) performs backpropagation every k steps of
a much longer sequence. This is made possible by splitting training batches along the
time dimension into chunks of size k and passing them to ``training_step``. To keep the
forward-propagation behavior unchanged, the hidden state must be carried over from one
time-dimension split to the next.


.. code-block:: python

    import torch
    import torch.optim as optim
    import pytorch_lightning as pl
    from pytorch_lightning import LightningModule


    class LitModel(LightningModule):
        def __init__(self):
            super().__init__()
            # 1. Switch to manual optimization
            self.automatic_optimization = False
            self.truncated_bptt_steps = 10
            self.my_rnn = ParityModuleRNN()  # Define your RNN module (ParityModuleRNN is a placeholder)

        # 2. Remove the `hiddens` argument
        def training_step(self, batch, batch_idx):
            opt = self.optimizers()

            # 3. Split the batch into chunks along the time dimension
            # (`split_batch` is a user-defined helper returning chunks of length `truncated_bptt_steps`)
            split_batches = split_batch(batch, self.truncated_bptt_steps)

            batch_size = 10
            hidden_dim = 20
            hiddens = torch.zeros(1, batch_size, hidden_dim, device=self.device)
            for split in split_batches:
                # 4. Perform the optimization in a loop
                loss, hiddens = self.my_rnn(split, hiddens)
                self.manual_backward(loss)
                opt.step()
                opt.zero_grad()

                # 5. "Truncate": detach the hidden state so gradients stop at the split boundary
                hiddens = hiddens.detach()

            # 6. Remove the return of `hiddens`
            # Returning a loss is not needed in manual optimization
            return None

        def configure_optimizers(self):
            return optim.Adam(self.my_rnn.parameters(), lr=0.001)


    if __name__ == "__main__":
        model = LitModel()
        trainer = pl.Trainer(max_epochs=5)
        trainer.fit(model, train_dataloader)  # Define your own dataloader
4 changes: 3 additions & 1 deletion docs/source-pytorch/conf.py
@@ -462,7 +462,9 @@ def _load_py_module(name: str, location: str) -> ModuleType:
("py:obj", "lightning.pytorch.utilities.memory.is_out_of_cpu_memory"),
("py:func", "lightning.pytorch.utilities.rank_zero.rank_zero_only"),
("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfig"),
("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfig"),
("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfigType"),
("py:class", "lightning.pytorch.utilities.types.OptimizerConfigType"),
("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfigType"),
("py:class", "lightning_habana.pytorch.plugins.precision.HPUPrecisionPlugin"),
("py:class", "lightning_habana.pytorch.strategies.HPUDDPStrategy"),
("py:class", "lightning_habana.pytorch.strategies.HPUParallelStrategy"),
8 changes: 7 additions & 1 deletion src/lightning/fabric/strategies/deepspeed.py
@@ -18,6 +18,7 @@
import platform
from collections.abc import Mapping
from contextlib import AbstractContextManager, ExitStack
from datetime import timedelta
from itertools import chain
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
@@ -29,6 +30,7 @@
from typing_extensions import override

from lightning.fabric.accelerators import Accelerator, CUDAAccelerator
from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout
from lightning.fabric.plugins.environments.cluster_environment import ClusterEnvironment
from lightning.fabric.plugins.precision import Precision
from lightning.fabric.strategies.ddp import DDPStrategy
@@ -97,6 +99,7 @@ def __init__(
load_full_weights: bool = False,
precision: Optional[Precision] = None,
process_group_backend: Optional[str] = None,
timeout: Optional[timedelta] = default_pg_timeout,
) -> None:
"""Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
billion parameter models. `For more information: https://pytorch-
@@ -241,6 +244,7 @@ def __init__(
process_group_backend=process_group_backend,
)
self._backward_sync_control = None # DeepSpeed handles gradient accumulation internally
self._timeout: Optional[timedelta] = timeout

self.config = self._load_config(config)
if self.config is None:
@@ -662,7 +666,9 @@ def _init_deepspeed_distributed(self) -> None:
f"MEMBER: {self.global_rank + 1}/{self.world_size}"
)
self._process_group_backend = self._get_process_group_backend()
deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port)
deepspeed.init_distributed(
self._process_group_backend, distributed_port=self.cluster_environment.main_port, timeout=self._timeout
)

def _set_node_environment_variables(self) -> None:
assert self.cluster_environment is not None
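
Note: with this change, Fabric's ``DeepSpeedStrategy`` accepts a ``timeout`` that is forwarded to ``deepspeed.init_distributed``. A minimal usage sketch (assuming DeepSpeed is installed and at least two CUDA devices are available; the 30-minute value is illustrative, not a recommendation):

    from datetime import timedelta

    from lightning.fabric import Fabric
    from lightning.fabric.strategies import DeepSpeedStrategy

    # Raise the process-group timeout for slow rendezvous, e.g. on large clusters.
    strategy = DeepSpeedStrategy(stage=2, timeout=timedelta(minutes=30))
    fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy)
    fabric.launch()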
8 changes: 7 additions & 1 deletion src/lightning/pytorch/strategies/deepspeed.py
@@ -19,6 +19,7 @@
from collections import OrderedDict
from collections.abc import Generator, Mapping
from contextlib import contextmanager
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union

@@ -30,6 +31,7 @@

import lightning.pytorch as pl
from lightning.fabric.plugins import ClusterEnvironment
from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout
from lightning.fabric.strategies import _StrategyRegistry
from lightning.fabric.strategies.deepspeed import (
_DEEPSPEED_AVAILABLE,
@@ -119,6 +121,7 @@ def __init__(
load_full_weights: bool = False,
precision_plugin: Optional[Precision] = None,
process_group_backend: Optional[str] = None,
timeout: Optional[timedelta] = default_pg_timeout,
) -> None:
"""Provides capabilities to run training using the DeepSpeed library, with training optimizations for large
billion parameter models. `For more information: https://pytorch-
@@ -264,6 +267,7 @@ def __init__(
precision_plugin=precision_plugin,
process_group_backend=process_group_backend,
)
self._timeout: Optional[timedelta] = timeout

self.config = self._load_config(config)
if self.config is None:
@@ -364,7 +368,9 @@ def _init_deepspeed_distributed(self) -> None:
f"MEMBER: {self.global_rank + 1}/{self.world_size}"
)
self._process_group_backend = self._get_process_group_backend()
deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port)
deepspeed.init_distributed(
self._process_group_backend, distributed_port=self.cluster_environment.main_port, timeout=self._timeout
)

def _set_node_environment_variables(self) -> None:
assert self.cluster_environment is not None
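
The Trainer-side ``DeepSpeedStrategy`` gains the same ``timeout`` parameter. A hedged sketch of passing it through the ``Trainer`` (``MyModel`` is a placeholder LightningModule; multi-GPU and DeepSpeed availability are assumed):

    from datetime import timedelta

    from lightning.pytorch import Trainer
    from lightning.pytorch.strategies import DeepSpeedStrategy

    # The timeout is forwarded to deepspeed.init_distributed().
    trainer = Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DeepSpeedStrategy(stage=3, timeout=timedelta(minutes=30)),
    )
    # trainer.fit(MyModel())  # MyModel is a placeholder for your LightningModule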
15 changes: 11 additions & 4 deletions src/lightning/pytorch/utilities/types.py
@@ -104,18 +104,25 @@ class LRSchedulerConfigType(TypedDict, total=False):
strict: bool


class OptimizerLRSchedulerConfig(TypedDict):
class OptimizerConfigType(TypedDict):
optimizer: Optimizer
lr_scheduler: NotRequired[Union[LRSchedulerTypeUnion, LRSchedulerConfigType]]


class OptimizerLRSchedulerConfigType(TypedDict):
optimizer: Optimizer
lr_scheduler: Union[LRSchedulerTypeUnion, LRSchedulerConfigType]
monitor: NotRequired[str]


OptimizerLRScheduler = Optional[
Union[
Optimizer,
Sequence[Optimizer],
tuple[Sequence[Optimizer], Sequence[Union[LRSchedulerTypeUnion, LRSchedulerConfig]]],
OptimizerLRSchedulerConfig,
Sequence[OptimizerLRSchedulerConfig],
OptimizerConfigType,
OptimizerLRSchedulerConfigType,
Sequence[OptimizerConfigType],
Sequence[OptimizerLRSchedulerConfigType],
]
]
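
For reference, a minimal sketch of ``configure_optimizers`` return values that match the new TypedDicts (``SketchModel`` is illustrative and not part of the change):

    import torch
    from lightning.pytorch import LightningModule


    class SketchModel(LightningModule):  # hypothetical module for illustration
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
            # {"optimizer": optimizer} alone matches OptimizerConfigType;
            # optimizer + scheduler (+ optional "monitor") matches OptimizerLRSchedulerConfigType.
            return {"optimizer": optimizer, "lr_scheduler": scheduler}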
