From e15a66412cc220fa241ec7cbb64b339a2f124761 Mon Sep 17 00:00:00 2001
From: four4fish <88516121+four4fish@users.noreply.github.com>
Date: Sun, 20 Feb 2022 10:11:47 -0800
Subject: [PATCH] Add back deterministic support in accelerator_connector
 (#11999)

Co-authored-by: ananthsub
Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>
---
 CHANGELOG.md                             |  3 +++
 .../connectors/accelerator_connector.py  | 22 ++++++++++++++++++-
 .../test_accelerator_connector.py        |  9 ++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8bb34adeef05..c88e4fa0e9564 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -299,6 +299,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Changed default logger name to `lightning_logs` for consistency ([#11762](https://github.com/PyTorchLightning/pytorch-lightning/pull/11762))
 
+
+- Rewrote `accelerator_connector` ([#11448](https://github.com/PyTorchLightning/pytorch-lightning/pull/11448))
+
 ### Deprecated
 
 - Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141))
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 20c5f485b4e71..8d451f97249fc 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -74,7 +74,12 @@
     rank_zero_warn,
 )
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE, _IPU_AVAILABLE, _TPU_AVAILABLE
+from pytorch_lightning.utilities.imports import (
+    _HOROVOD_AVAILABLE,
+    _IPU_AVAILABLE,
+    _TORCH_GREATER_EQUAL_1_8,
+    _TPU_AVAILABLE,
+)
 
 log = logging.getLogger(__name__)
 
@@ -141,6 +146,7 @@ def __init__(
         torch.backends.cudnn.benchmark = benchmark
         self.replace_sampler_ddp = replace_sampler_ddp
         self.sync_batchnorm = sync_batchnorm
+        self._init_deterministic(deterministic)
 
         # 1. Parsing flags
         # Get registered strategies, built-in accelerators and precision plugins
@@ -196,6 +202,20 @@ def __init__(
         # 6. Instantiate Strategy - Part 2
         self._lazy_init_strategy()
 
+    def _init_deterministic(self, deterministic: bool) -> None:
+        self.deterministic = deterministic
+        if _TORCH_GREATER_EQUAL_1_8:
+            torch.use_deterministic_algorithms(deterministic)
+        else:
+            torch.set_deterministic(deterministic)
+        if deterministic:
+            # fixing non-deterministic part of horovod
+            # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
+            os.environ["HOROVOD_FUSION_THRESHOLD"] = "0"
+
+            # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
+            os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
     def _check_config_and_set_final_flags(
         self,
         strategy: Optional[Union[str, Strategy]],
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 76fa6d64f5a56..d616abd76bfa5 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -947,3 +947,12 @@ def test_passing_zero_and_empty_list_to_devices_flag():
 
     with pytest.warns(UserWarning, match=r"switching to `cpu` accelerator"):
         Trainer(accelerator="gpu", devices=[])
+
+
+@pytest.mark.parametrize("deterministic", [True, False])
+def test_deterministic_init(deterministic):
+    trainer = Trainer(accelerator="auto", deterministic=deterministic)
+    assert trainer._accelerator_connector.deterministic == deterministic
+    if deterministic:
+        assert os.environ.get("CUBLAS_WORKSPACE_CONFIG") == ":4096:8"
+        assert os.environ.get("HOROVOD_FUSION_THRESHOLD") == "0"
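
Note (not part of the patch): a minimal standalone sketch of the setup that
_init_deterministic wires up, assuming PyTorch >= 1.8; the helper name
init_deterministic below is illustrative only, not the library's API.

import os

import torch


def init_deterministic(deterministic: bool) -> None:
    # Globally request deterministic algorithm implementations (PyTorch >= 1.8).
    # On older versions the patch falls back to torch.set_deterministic().
    torch.use_deterministic_algorithms(deterministic)
    if deterministic:
        # A fusion threshold of 0 disables Horovod tensor fusion, whose
        # buffer packing order is otherwise non-deterministic.
        os.environ["HOROVOD_FUSION_THRESHOLD"] = "0"
        # cuBLAS needs a fixed workspace configuration for reproducible
        # results on CUDA >= 10.2.
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"


# Roughly the effect that Trainer(deterministic=True) now has at construction time.
init_deterministic(True)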