diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 3a94ef67589108..729d258cfcd63b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,14 +1,14 @@ --- name: Bug report -about: Create a report to help us improve +about: Create a bug report to help us improve title: '' -labels: bug / fix, help wanted +labels: bug assignees: '' --- ## 🐛 Bug - + ### To Reproduce diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 75919587387a9f..f5ff43d6f093a2 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -1,8 +1,8 @@ --- name: Typos and doc fixes -about: Typos and doc fixes +about: Tell us about how we can improve our documentation title: '' -labels: documentation +labels: docs assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index ab95a714e6dd71..11da695decfe01 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,8 +1,8 @@ --- name: Feature request -about: Suggest an idea for this project +about: Propose a feature for this project title: '' -labels: enhancement +labels: feature assignees: '' --- @@ -12,7 +12,7 @@ assignees: '' ### Motivation - + ### Pitch diff --git a/.github/ISSUE_TEMPLATE/code_improvement.md b/.github/ISSUE_TEMPLATE/refactor.md similarity index 83% rename from .github/ISSUE_TEMPLATE/code_improvement.md rename to .github/ISSUE_TEMPLATE/refactor.md index 7608b604e611b7..5e07b0aae2df13 100644 --- a/.github/ISSUE_TEMPLATE/code_improvement.md +++ b/.github/ISSUE_TEMPLATE/refactor.md @@ -1,18 +1,18 @@ --- -name: Code improvement -about: Suggest a code improvement, i.e. refactoring, deprecation, etc. +name: Refactor +about: Suggest a code refactor or deprecation title: '' -labels: refactors / code health +labels: refactor assignees: '' --- -## Proposed refactoring or deprecation +## Proposed refactor - + ### Motivation - + ### Pitch diff --git a/.github/lightning-probot.yml b/.github/lightning-probot.yml new file mode 100644 index 00000000000000..bd6a330a448a4c --- /dev/null +++ b/.github/lightning-probot.yml @@ -0,0 +1 @@ +tracking_issue: 10530 diff --git a/.github/mergify.yml b/.github/mergify.yml index a2b1e8aede6de9..53ec106873dfe0 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -45,7 +45,7 @@ pull_request_rules: - "#changes-requested-reviews-by=0" # no requested changes actions: label: - add: [ "0:] Ready-To-Go" ] + add: [ "ready" ] - name: Not ready yet conditions: @@ -54,13 +54,13 @@ pull_request_rules: - "#changes-requested-reviews-by>=1" # no requested changes actions: label: - remove: [ "0:] Ready-To-Go" ] + remove: [ "ready" ] - name: add core reviewer conditions: - -conflict # skip if conflict - -draft # filter-out GH draft PRs - - label="0:] Ready-To-Go" + - label="ready" - "#approved-reviews-by<3" # number of review approvals - "#review-requested<3" # number of requested reviews actions: diff --git a/.github/stale.yml b/.github/stale.yml index 84049394d3aab5..1ac5e7448c9ff9 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -8,8 +8,8 @@ issues: daysUntilClose: 7 # Issues with these labels will never be considered stale exemptLabels: - - Important - - Priority + - p0 + - p1 # Comment to post when marking an issue as stale. 
Set to `false` to disable markComment: > This issue has been automatically marked as stale because it hasn't had any recent activity. diff --git a/.github/workflows/probot-auto-cc.yml b/.github/workflows/probot-auto-cc.yml new file mode 100644 index 00000000000000..0595c4eee65f7f --- /dev/null +++ b/.github/workflows/probot-auto-cc.yml @@ -0,0 +1,18 @@ +name: Probot + +on: + issues: + types: + - labeled + pull_request: + types: + - labeled + +jobs: + auto-cc: + if: ${{ github.repository_owner == 'PyTorchLightning' }} + runs-on: ubuntu-latest + steps: + - uses: carmocca/probot@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 24e68bb24f16e6..712141aaffa61e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,7 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/issues/10103)) -- +- Deprecated `DistributedType` in favor of `_StrategyType` ([#10505](https://github.com/PyTorchLightning/pytorch-lightning/pull/10505)) - @@ -57,6 +57,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed +- Removed deprecated parameter `method` in `pytorch_lightning.utilities.model_helpers.is_overridden` ([#10507](https://github.com/PyTorchLightning/pytorch-lightning/pull/10507)) + + - Remove deprecated method `ClusterEnvironment.creates_children` ([#10339](https://github.com/PyTorchLightning/pytorch-lightning/issues/10339)) @@ -123,6 +126,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/PyTorchLightning/pytorch-lightning/pull/10482)) + +- Removed deprecated `disable_validation` property from Trainer ([#10450](https://github.com/PyTorchLightning/pytorch-lightning/pull/10450)) + + +- Removed deprecated `CheckpointConnector.hpc_load` property in favor of `CheckpointConnector.restore` ([#10525](https://github.com/PyTorchLightning/pytorch-lightning/pull/10525)) + + + ### Fixed - Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702)) @@ -134,6 +145,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed sampler replacement logic with `overfit_batches` ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486)) +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) + + - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463)) diff --git a/docs/source/advanced/training_tricks.rst b/docs/source/advanced/training_tricks.rst index 28f81d98dcbd31..a389b0db69a2e1 100644 --- a/docs/source/advanced/training_tricks.rst +++ b/docs/source/advanced/training_tricks.rst @@ -64,10 +64,7 @@ read `this post None: def apply_fn(module: Union["DeviceDtypeModuleMixin", Module]) -> None: - if not isinstance(module, DeviceDtypeModuleMixin): + # TODO: Find why `isinstance(module, DeviceDtypeModuleMixin)` doesn't + # work when using `init_meta_context`. + if not isinstance(module, (DeviceDtypeModuleMixin, pl.LightningModule)): return if device is not None: module._device = device diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py index d36e874cbae7b9..2a2ed9586b420e 100644 --- a/pytorch_lightning/lite/lite.py +++ b/pytorch_lightning/lite/lite.py @@ -41,7 +41,7 @@ ) from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin -from pytorch_lightning.utilities import DeviceType, DistributedType, move_data_to_device +from pytorch_lightning.utilities import _StrategyType, DeviceType, move_data_to_device from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import has_iterable_dataset from pytorch_lightning.utilities.device_parser import _parse_devices @@ -477,14 +477,14 @@ def _supported_device_types() -> Sequence[DeviceType]: ) @staticmethod - def _supported_strategy_types() -> Sequence[DistributedType]: + def _supported_strategy_types() -> Sequence[_StrategyType]: return ( - DistributedType.DP, - DistributedType.DDP, - DistributedType.DDP_SPAWN, - DistributedType.DEEPSPEED, - DistributedType.DDP_SHARDED, - DistributedType.DDP_SHARDED_SPAWN, + _StrategyType.DP, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, + _StrategyType.DEEPSPEED, + _StrategyType.DDP_SHARDED, + _StrategyType.DDP_SHARDED_SPAWN, ) @staticmethod diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 84e9b55b9ee085..0285859a6714a4 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -55,7 +55,7 @@ ReduceOp, sync_ddp_if_available, ) -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -79,7 +79,7 @@ class DDPPlugin(ParallelPlugin): devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.launch` launches processes. 
""" - distributed_backend = DistributedType.DDP + distributed_backend = _StrategyType.DDP def __init__( self, diff --git a/pytorch_lightning/plugins/training_type/ddp2.py b/pytorch_lightning/plugins/training_type/ddp2.py index ef623a794da42b..a142d518a0f2f0 100644 --- a/pytorch_lightning/plugins/training_type/ddp2.py +++ b/pytorch_lightning/plugins/training_type/ddp2.py @@ -15,14 +15,14 @@ from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.types import _METRIC_COLLECTION class DDP2Plugin(DDPPlugin): """DDP2 behaves like DP in one node, but synchronization across nodes behaves like in DDP.""" - distributed_backend = DistributedType.DDP2 + distributed_backend = _StrategyType.DDP2 @property def global_rank(self) -> int: diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 677e031cd04af4..a77027adb6dcf6 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -43,7 +43,7 @@ ReduceOp, sync_ddp_if_available, ) -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -58,7 +58,7 @@ class DDPSpawnPlugin(ParallelPlugin): """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes.""" - distributed_backend = DistributedType.DDP_SPAWN + distributed_backend = _StrategyType.DDP_SPAWN def __init__( self, diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 2464a8ba4eecad..94235f361d9458 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -36,7 +36,7 @@ from pytorch_lightning.utilities import AMPType, GradClipAlgorithmType from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.distributed import log, rank_zero_info, rank_zero_only -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities.model_helpers import is_overridden @@ -82,7 +82,7 @@ def _move_float_tensors_to_half(self, batch: Any): class DeepSpeedPlugin(DDPPlugin): - distributed_backend = DistributedType.DEEPSPEED + distributed_backend = _StrategyType.DEEPSPEED DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" def __init__( diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index a0f53791bc373d..83328e8c472717 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -20,7 +20,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities.apply_func import apply_to_collection -from 
pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import _METRIC_COLLECTION @@ -29,7 +29,7 @@ class DataParallelPlugin(ParallelPlugin): """Implements data-parallel training in a single process, i.e., the model gets replicated to each device and each gets a split of the data.""" - distributed_backend = DistributedType.DP + distributed_backend = _StrategyType.DP def __init__( self, diff --git a/pytorch_lightning/plugins/training_type/fully_sharded.py b/pytorch_lightning/plugins/training_type/fully_sharded.py index 704afa1a91aaac..c9601a905df1c0 100644 --- a/pytorch_lightning/plugins/training_type/fully_sharded.py +++ b/pytorch_lightning/plugins/training_type/fully_sharded.py @@ -20,7 +20,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_FULLY_SHARDED_AVAILABLE: @@ -30,7 +30,7 @@ class DDPFullyShardedPlugin(DDPPlugin): - distributed_backend = DistributedType.DDP_FULLY_SHARDED + distributed_backend = _StrategyType.DDP_FULLY_SHARDED def __init__( self, diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 30360e1ab458f3..51558189a3d35f 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -26,7 +26,7 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.distributed import group as dist_group from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType if _HOROVOD_AVAILABLE: import horovod.torch as hvd @@ -35,7 +35,7 @@ class HorovodPlugin(ParallelPlugin): """Plugin for Horovod distributed training integration.""" - distributed_backend = DistributedType.HOROVOD + distributed_backend = _StrategyType.HOROVOD def __init__( self, diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 5955f3a46f38e6..d7563437bd16b1 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -23,7 +23,7 @@ from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, rank_zero_only -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_AVAILABLE: @@ -36,7 +36,7 @@ class DDPShardedPlugin(DDPPlugin): """Optimizer and gradient sharded training provided by FairScale.""" - distributed_backend = DistributedType.DDP_SHARDED + distributed_backend = _StrategyType.DDP_SHARDED _REDUCE_BUFFER_SIZE_DEFAULT: int = 2 ** 23 # 8M def __init__(self, *args, **kwargs): diff --git 
a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index e0ae5c7bba1879..12e627edbe5cbd 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -24,7 +24,7 @@ from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_AVAILABLE: @@ -38,7 +38,7 @@ class DDPSpawnShardedPlugin(DDPSpawnPlugin): """Optimizer sharded training provided by FairScale.""" - distributed_backend = DistributedType.DDP_SHARDED_SPAWN + distributed_backend = _StrategyType.DDP_SHARDED_SPAWN def configure_ddp(self) -> None: trainer = self.lightning_module.trainer diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 43eb65ce21a22b..47deeed2dca1d5 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -61,10 +61,10 @@ TorchElasticEnvironment, ) from pytorch_lightning.utilities import ( + _StrategyType, AMPType, device_parser, DeviceType, - DistributedType, rank_zero_deprecation, rank_zero_info, rank_zero_warn, @@ -278,7 +278,7 @@ def _set_devices_if_none(self) -> None: self.devices = self.num_processes def _handle_accelerator_and_strategy(self) -> None: - deprecated_types = [t for t in DistributedType if t not in (DistributedType.TPU_SPAWN, DistributedType.DDP_CPU)] + deprecated_types = [t for t in _StrategyType if t not in (_StrategyType.TPU_SPAWN, _StrategyType.DDP_CPU)] if self.distributed_backend is not None and self.distributed_backend in deprecated_types: rank_zero_deprecation( f"Passing `Trainer(accelerator={self.distributed_backend!r})` has been deprecated" @@ -290,12 +290,12 @@ def _handle_accelerator_and_strategy(self) -> None: f" also passed `Trainer(accelerator={self.distributed_backend!r})`." f" HINT: Use just `Trainer(strategy={self.strategy!r})` instead." ) - if self.strategy == DistributedType.TPU_SPAWN: + if self.strategy == _StrategyType.TPU_SPAWN: raise MisconfigurationException( "`Trainer(strategy='tpu_spawn')` is not a valid strategy," " you can use `Trainer(strategy='ddp_spawn', accelerator='tpu')` instead." ) - if self.strategy == DistributedType.DDP_CPU: + if self.strategy == _StrategyType.DDP_CPU: raise MisconfigurationException( "`Trainer(strategy='ddp_cpu')` is not a valid strategy," " you can use `Trainer(strategy='ddp'|'ddp_spawn', accelerator='cpu')` instead." 
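For illustration, a short usage sketch of the replacement spelling that the two `MisconfigurationException` messages above point to. The specific argument values (`num_processes=2`, `tpu_cores=8`) are illustrative assumptions, not taken from this diff.

```python
from pytorch_lightning import Trainer

# Spawn-based DDP on CPU, instead of the rejected `strategy="ddp_cpu"` spelling.
trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", num_processes=2)

# Likewise for TPUs, instead of `strategy="tpu_spawn"`:
# trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", tpu_cores=8)
```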
@@ -505,31 +505,31 @@ def _map_devices_to_accelerator(self, accelerator: str) -> bool: @property def use_dp(self) -> bool: - return self._distrib_type == DistributedType.DP + return self._distrib_type == _StrategyType.DP @property def use_ddp(self) -> bool: return self._distrib_type in ( - DistributedType.DDP, - DistributedType.DDP_SPAWN, - DistributedType.DDP_SHARDED, - DistributedType.DDP_SHARDED_SPAWN, - DistributedType.DDP_FULLY_SHARDED, - DistributedType.DEEPSPEED, - DistributedType.TPU_SPAWN, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, + _StrategyType.DDP_SHARDED, + _StrategyType.DDP_SHARDED_SPAWN, + _StrategyType.DDP_FULLY_SHARDED, + _StrategyType.DEEPSPEED, + _StrategyType.TPU_SPAWN, ) @property def use_ddp2(self) -> bool: - return self._distrib_type == DistributedType.DDP2 + return self._distrib_type == _StrategyType.DDP2 @property def use_horovod(self) -> bool: - return self._distrib_type == DistributedType.HOROVOD + return self._distrib_type == _StrategyType.HOROVOD @property def use_deepspeed(self) -> bool: - return self._distrib_type == DistributedType.DEEPSPEED + return self._distrib_type == _StrategyType.DEEPSPEED @property def _is_sharded_training_type(self) -> bool: @@ -590,7 +590,7 @@ def root_gpu(self) -> Optional[int]: @staticmethod def _is_plugin_training_type(plugin: Union[str, TrainingTypePlugin]) -> bool: - if isinstance(plugin, str) and (plugin in TrainingTypePluginsRegistry or plugin in list(DistributedType)): + if isinstance(plugin, str) and (plugin in TrainingTypePluginsRegistry or plugin in list(_StrategyType)): return True return isinstance(plugin, TrainingTypePlugin) @@ -635,7 +635,7 @@ def select_precision_plugin(self) -> PrecisionPlugin: ) return TPUBf16PrecisionPlugin() - if self._distrib_type == DistributedType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): + if self._distrib_type == _StrategyType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin): return DeepSpeedPrecisionPlugin(self.precision) if self.precision == 32: @@ -706,15 +706,15 @@ def select_training_type_plugin(self) -> TrainingTypePlugin: use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic() use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow() - use_ddp_spawn = self._distrib_type == DistributedType.DDP_SPAWN + use_ddp_spawn = self._distrib_type == _StrategyType.DDP_SPAWN use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu - use_tpu_spawn = self.use_tpu and self._distrib_type == DistributedType.TPU_SPAWN + use_tpu_spawn = self.use_tpu and self._distrib_type == _StrategyType.TPU_SPAWN use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic() use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow() use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks - use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED - use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN - use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED + use_ddp_sharded = self._distrib_type == _StrategyType.DDP_SHARDED + use_ddp_sharded_spawn = self._distrib_type == _StrategyType.DDP_SHARDED_SPAWN + use_ddp_fully_sharded = self._distrib_type == _StrategyType.DDP_FULLY_SHARDED if use_tpu_spawn: ddp_plugin_cls = TPUSpawnPlugin @@ -839,27 +839,27 @@ def set_distributed_mode(self, strategy: Optional[str] = None): if 
self.has_horovodrun(): self._set_horovod_backend() elif self.num_gpus == 0 and self.num_nodes > 1: - self._distrib_type = DistributedType.DDP + self._distrib_type = _StrategyType.DDP elif self.num_gpus == 0 and self.num_processes > 1: - self.distributed_backend = DistributedType.DDP_SPAWN + self.distributed_backend = _StrategyType.DDP_SPAWN elif self.num_gpus > 1 and not _use_cpu: rank_zero_warn( "You requested multiple GPUs but did not specify a backend, e.g." ' `Trainer(strategy="dp"|"ddp"|"ddp2")`. Setting `strategy="ddp_spawn"` for you.' ) - self.distributed_backend = DistributedType.DDP_SPAWN + self.distributed_backend = _StrategyType.DDP_SPAWN # special case with DDP on CPUs - if self.distributed_backend == DistributedType.DDP_CPU: + if self.distributed_backend == _StrategyType.DDP_CPU: if _TPU_AVAILABLE: raise MisconfigurationException( "`accelerator='ddp_cpu'` is not supported on TPU machines. " "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" ) if self.num_processes == 1 and self.num_nodes > 1: - self._distrib_type = DistributedType.DDP + self._distrib_type = _StrategyType.DDP else: - self._distrib_type = DistributedType.DDP_SPAWN + self._distrib_type = _StrategyType.DDP_SPAWN if self.num_gpus > 0: rank_zero_warn( "You requested one or more GPUs, but set `accelerator='ddp_cpu'`. Training will not use GPUs." @@ -872,25 +872,25 @@ def set_distributed_mode(self, strategy: Optional[str] = None): elif self.has_tpu and not _use_cpu: self._device_type = DeviceType.TPU if isinstance(self.tpu_cores, int): - self._distrib_type = DistributedType.TPU_SPAWN + self._distrib_type = _StrategyType.TPU_SPAWN elif self.has_ipu and not _use_cpu: self._device_type = DeviceType.IPU elif self.distributed_backend and self._distrib_type is None: - self._distrib_type = DistributedType(self.distributed_backend) + self._distrib_type = _StrategyType(self.distributed_backend) if self.num_gpus > 0 and not _use_cpu: self._device_type = DeviceType.GPU - _gpu_distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) + _gpu_distrib_types = (_StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, _StrategyType.DDP2) # DP and DDP2 cannot run without GPU if self.num_gpus == 0 and self._distrib_type in _gpu_distrib_types and not _use_cpu: if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1): - if self._distrib_type in (DistributedType.DP, DistributedType.DDP2): + if self._distrib_type in (_StrategyType.DP, _StrategyType.DDP2): rank_zero_warn( f"{self._distrib_type.value!r} is not supported on CPUs, hence setting `strategy='ddp'`." ) - self._distrib_type = DistributedType.DDP + self._distrib_type = _StrategyType.DDP else: rank_zero_warn("You are running on single node with no parallelization, so distributed has no effect.") self._distrib_type = None @@ -900,28 +900,28 @@ def set_distributed_mode(self, strategy: Optional[str] = None): # for DDP overwrite nb processes by requested GPUs if self._device_type == DeviceType.GPU and self._distrib_type in ( - DistributedType.DDP, - DistributedType.DDP_SPAWN, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, ): self.num_processes = self.num_gpus - if self._device_type == DeviceType.GPU and self._distrib_type == DistributedType.DDP2: + if self._device_type == DeviceType.GPU and self._distrib_type == _StrategyType.DDP2: self.num_processes = self.num_nodes # Horovod is an extra case... 
- if self.distributed_backend == DistributedType.HOROVOD: + if self.distributed_backend == _StrategyType.HOROVOD: self._set_horovod_backend() using_valid_distributed = self.use_ddp or self.use_ddp2 if self.num_nodes > 1 and not using_valid_distributed: - # throw error to force user to choose a supported distributed type such as ddp or ddp2 + # throw error to force user to choose a supported strategy type such as ddp or ddp2 raise MisconfigurationException( "Your chosen strategy does not support `num_nodes > 1`. Please set `strategy=('ddp'|'ddp2')`." ) def _set_horovod_backend(self): self.check_horovod() - self._distrib_type = DistributedType.HOROVOD + self._distrib_type = _StrategyType.HOROVOD # Initialize Horovod to get rank / size info hvd.init() @@ -941,7 +941,7 @@ def check_interactive_compatibility(self): f"`Trainer(strategy={self._distrib_type.value!r})` or" f" `Trainer(accelerator={self._distrib_type.value!r})` is not compatible with an interactive" " environment. Run your code as a script, or choose one of the compatible backends:" - f" {', '.join(DistributedType.interactive_compatible_types())}." + f" {', '.join(_StrategyType.interactive_compatible_types())}." " In case you are spawning processes yourself, make sure to include the Trainer" " creation inside the worker function." ) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 921c2e0a7e160b..ab0d3aa4288fa8 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -413,17 +413,6 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: return checkpoint - def hpc_load(self, checkpoint_path: _PATH) -> None: - """Attempts to restore the full training and model state from a HPC checkpoint file. - - .. deprecated:: v1.4 Will be removed in v1.6. Use :meth:`restore` instead. - """ - rank_zero_deprecation( - "`CheckpointConnector.hpc_load()` was deprecated in v1.4 and will be removed in v1.6." - " Use `CheckpointConnector.restore()` instead." - ) - self.restore(checkpoint_path) - def max_ckpt_version_in_folder(self, dir_path: _PATH, name_key: str = "ckpt_") -> Optional[int]: """List up files in `dir_path` with `name_key`, then yield maximum suffix number. diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 9c40e728391c1c..bdc051091b50c7 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -38,7 +38,7 @@ FastForwardSampler, ) from pytorch_lightning.utilities.data import get_len, has_iterable_dataset, has_len_all_ranks -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training from pytorch_lightning.utilities.model_helpers import is_overridden @@ -70,7 +70,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: if not isinstance(dataloader, DataLoader): return - using_spawn = self._accelerator_connector._distrib_type == DistributedType.DDP_SPAWN + using_spawn = self._accelerator_connector._distrib_type == _StrategyType.DDP_SPAWN num_cpus = multiprocessing.cpu_count() # ddp_spawn + num_workers > 0 don't mix! 
tell the user diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b84f03393309b3..19efdce8e35492 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -64,10 +64,10 @@ from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import ( _IPU_AVAILABLE, + _StrategyType, _TPU_AVAILABLE, device_parser, DeviceType, - DistributedType, GradClipAlgorithmType, parsing, rank_zero_deprecation, @@ -84,7 +84,7 @@ from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training -from pytorch_lightning.utilities.meta import materialize_module +from pytorch_lightning.utilities.meta import is_on_meta_device, materialize_module from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import ( @@ -1406,10 +1406,21 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.accelerator.model_sharded_context(): - materialize_module(self.lightning_module) + self._handle_meta_model() self.call_hook("configure_sharded_model") self.call_hook("on_configure_sharded_model") + def _handle_meta_model(self) -> None: + if not is_on_meta_device(self.lightning_module): + return + + if isinstance(self.training_type_plugin, DDPSpawnPlugin): + raise MisconfigurationException("LightningModule on meta device isn't supported with spawn.") + + materialize_module(self.lightning_module) + # the trainer reference is lost during materialization + self.lightning_module.trainer = proxy(self) + def _call_teardown_hook(self) -> None: fn = self.state.fn._setup_fn @@ -1591,7 +1602,7 @@ def should_rank_save_checkpoint(self) -> bool: return self.training_type_plugin.should_rank_save_checkpoint @property - def _distrib_type(self) -> DistributedType: + def _distrib_type(self) -> _StrategyType: return self._accelerator_connector._distrib_type @property @@ -1754,10 +1765,10 @@ def distributed_sampler_kwargs(self) -> Optional[dict]: @property def data_parallel(self) -> bool: return self._distrib_type in ( - DistributedType.DP, - DistributedType.DDP, - DistributedType.DDP_SPAWN, - DistributedType.DDP2, + _StrategyType.DP, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, + _StrategyType.DDP2, ) @property @@ -1783,15 +1794,6 @@ def _should_reload_dl_epoch(self) -> bool: n_epochs = self.reload_dataloaders_every_n_epochs return n_epochs and (not self.current_epoch % n_epochs) - @property - def disable_validation(self) -> bool: - """Check if validation is disabled during training.""" - rank_zero_deprecation( - "`trainer.disable_validation` is deprecated in v1.4 and will be removed in v1.6." - " Use `not trainer.enable_validation` instead." 
- ) - return not self.enable_validation - @property def enable_validation(self) -> bool: """Check if we should run validation during training.""" diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 7343e28d6d8112..22164908a3e3f1 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -18,6 +18,7 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 from pytorch_lightning.utilities.distributed import AllGatherGrad, rank_zero_info, rank_zero_only # noqa: F401 from pytorch_lightning.utilities.enums import ( # noqa: F401 + _StrategyType, AMPType, DeviceType, DistributedType, diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index 436c675c382c26..18b0336b82d5f6 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Enumerated utilities.""" -from enum import Enum -from typing import List, Optional, Union +from enum import Enum, EnumMeta +from typing import Any, List, Optional, Union + +from pytorch_lightning.utilities.warnings import rank_zero_deprecation class LightningEnum(str, Enum): @@ -37,6 +39,31 @@ def __hash__(self) -> int: return hash(self.value.lower()) +class _OnAccessEnumMeta(EnumMeta): + """Enum with a hook to run a function whenever a member is accessed. + + Adapted from: + https://www.buzzphp.com/posts/how-do-i-detect-and-invoke-a-function-when-a-python-enum-member-is-accessed + """ + + def __getattribute__(cls, name: str) -> Any: + obj = super().__getattribute__(name) + if isinstance(obj, Enum): + obj.deprecate() + return obj + + def __getitem__(cls, name: str) -> Any: + member = super().__getitem__(name) + member.deprecate() + return member + + def __call__(cls, value: str, *args: Any, **kwargs: Any) -> Any: + obj = super().__call__(value, *args, **kwargs) + if isinstance(obj, Enum): + obj.deprecate() + return obj + + class AMPType(LightningEnum): """Type of Automatic Mixed Precission used for training. @@ -73,8 +100,8 @@ def supported_types() -> List[str]: return [x.value for x in PrecisionType] -class DistributedType(LightningEnum): - """Define type of distributed computing. +class DistributedType(LightningEnum, metaclass=_OnAccessEnumMeta): + """Define type of training strategy. >>> # you can match the type with string >>> DistributedType.DDP == 'ddp' @@ -82,8 +109,24 @@ class DistributedType(LightningEnum): >>> # which is case invariant >>> DistributedType.DDP2 in ('ddp2', ) True + + Deprecated since v1.6.0 and will be removed in v1.8.0. + + Use `_StrategyType` instead. 
""" + DP = "dp" + DDP = "ddp" + DDP2 = "ddp2" + DDP_CPU = "ddp_cpu" + DDP_SPAWN = "ddp_spawn" + TPU_SPAWN = "tpu_spawn" + DEEPSPEED = "deepspeed" + HOROVOD = "horovod" + DDP_SHARDED = "ddp_sharded" + DDP_SHARDED_SPAWN = "ddp_sharded_spawn" + DDP_FULLY_SHARDED = "ddp_fully_sharded" + @staticmethod def interactive_compatible_types() -> List["DistributedType"]: """Returns a list containing interactive compatible DistributeTypes.""" @@ -98,17 +141,11 @@ def is_interactive_compatible(self) -> bool: """Returns whether self is interactive compatible.""" return self in DistributedType.interactive_compatible_types() - DP = "dp" - DDP = "ddp" - DDP2 = "ddp2" - DDP_CPU = "ddp_cpu" - DDP_SPAWN = "ddp_spawn" - TPU_SPAWN = "tpu_spawn" - DEEPSPEED = "deepspeed" - HOROVOD = "horovod" - DDP_SHARDED = "ddp_sharded" - DDP_SHARDED_SPAWN = "ddp_sharded_spawn" - DDP_FULLY_SHARDED = "ddp_fully_sharded" + def deprecate(self) -> None: + rank_zero_deprecation( + "`DistributedType` Enum has been deprecated in v1.6 and will be removed in v1.8." + " Use the string value `{self.value!r}` instead." + ) class DeviceType(LightningEnum): @@ -188,3 +225,41 @@ def get_max_depth(mode: str) -> int: @staticmethod def supported_types() -> List[str]: return [x.value for x in ModelSummaryMode] + + +class _StrategyType(LightningEnum): + """Define type of training strategy. + + >>> # you can match the type with string + >>> _StrategyType.DDP == 'ddp' + True + >>> # which is case invariant + >>> _StrategyType.DDP2 in ('ddp2', ) + True + """ + + DP = "dp" + DDP = "ddp" + DDP2 = "ddp2" + DDP_CPU = "ddp_cpu" + DDP_SPAWN = "ddp_spawn" + TPU_SPAWN = "tpu_spawn" + DEEPSPEED = "deepspeed" + HOROVOD = "horovod" + DDP_SHARDED = "ddp_sharded" + DDP_SHARDED_SPAWN = "ddp_sharded_spawn" + DDP_FULLY_SHARDED = "ddp_fully_sharded" + + @staticmethod + def interactive_compatible_types() -> List["_StrategyType"]: + """Returns a list containing interactive compatible _StrategyTypes.""" + return [ + _StrategyType.DP, + _StrategyType.DDP_SPAWN, + _StrategyType.DDP_SHARDED_SPAWN, + _StrategyType.TPU_SPAWN, + ] + + def is_interactive_compatible(self) -> bool: + """Returns whether self is interactive compatible.""" + return self in _StrategyType.interactive_compatible_types() diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index 60e6cc791b7aee..6d3c1d6b5f11bf 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -18,13 +18,14 @@ from functools import partial from itertools import chain from types import ModuleType -from typing import Callable, Dict, Generator, Iterator, List, Optional, Set, Type +from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Type import torch from torch import nn, Tensor from torch.nn import Module from torch.nn.modules.container import ModuleDict, ModuleList, Sequential +import pytorch_lightning as pl from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 @@ -191,7 +192,6 @@ def materialize_module(root_module: nn.Module) -> nn.Module: # cache subclasses to optimize the search when resetting the meta device later on. 
__STORAGE_META__ = {} - __CREATED_MODULES__ = set() @@ -237,45 +237,52 @@ def _set_meta_device() -> None: for subclass in get_all_subclasses(torch.nn.modules.module.Module): - if isinstance(subclass, (Sequential, ModuleList, ModuleDict)): + if subclass in (Sequential, ModuleList, ModuleDict, pl.LightningModule): continue # if a subclass has already been stored, we should use the cache if str(subclass) in __STORAGE_META__: - # reset the class import package to its rightfull state. + # reset the class import package to its rightful state. mods, subclass, meta_class = __STORAGE_META__[subclass] for mod in mods: setattr(mod, subclass.__name__, meta_class) continue + class _IsinstanceMetaclass(type(subclass)): + def __instancecheck__(self, instance: Any) -> bool: + """Overrides the ``isinstance`` check on ``_MaterializerModule`` objects.""" + return isinstance(instance, self.__bases__[0]) + # Create a class subclassing current `subclass` overriding its new method. # this will enable use to use `torch.distributed.nn.utils.init_meta` to create a `meta` # version of the current subclass module - class _MetaClass(subclass): + class _MaterializerModule(subclass, metaclass=_IsinstanceMetaclass): @classmethod @contextmanager - def instantiation_context(cls, materialize: bool): + def instantiation_context(cls): _unset_meta_device(from_created=True) yield _set_meta_device_populated(from_created=True) @classmethod def materialize(cls, materialize_fn: Callable): - with cls.instantiation_context(materialize=True): + with cls.instantiation_context(): obj = materialize_fn() return obj @staticmethod def add_subclasses(subclass): - """This is used to unrol the instantion tree while creating the modules.""" - __CREATED_MODULES__.add(subclass) + """This is used to unroll the instantiation tree while creating the modules.""" + # Don't store the LightningModule as skipped from the Meta process. + if subclass != pl.LightningModule: + __CREATED_MODULES__.add(subclass) if subclass.__bases__[0] != torch.nn.modules.module.Module: - _MetaClass.add_subclasses(subclass.__bases__[0]) + _MaterializerModule.add_subclasses(subclass.__bases__[0]) def __new__(cls, *args, **kwargs): subclass = cls.__bases__[0] cls.add_subclasses(subclass) - with cls.instantiation_context(materialize=False): + with cls.instantiation_context(): obj = init_meta(subclass, *args, **kwargs) obj.materialize = partial(cls.materialize, materialize_fn=obj.materialize) @@ -294,9 +301,8 @@ def search(mod: ModuleType) -> List[ModuleType]: # nn.Module class can be imported at different level and they all need to be mocked. 
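The `_IsinstanceMetaclass` above relies on a standard metaclass hook. Here is a minimal, self-contained sketch of that mechanism with generic names (not from the codebase): a wrapper class whose `isinstance` checks are deferred to the wrapped base.

```python
class _IsinstanceMeta(type):
    def __instancecheck__(cls, instance):
        # Defer the check to the first (wrapped) base class.
        return isinstance(instance, cls.__bases__[0])


class Base:
    pass


class Proxy(Base, metaclass=_IsinstanceMeta):
    pass


assert isinstance(Base(), Proxy)   # a plain Base instance passes the Proxy check
assert isinstance(Proxy(), Proxy)  # Proxy instances are Base instances too
```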
# Example: torch.nn.Linear is actually torch.nn.modules.linear.Linear # Therefore, torch.nn.Linear, torch.nn.modules.Linear, torch.nn.modules.linear.Linear - # needs to be replaced by the torch.nn.linear.modules.Linear _MetaClass - out = [] - out.append(search(mod)) + # needs to be replaced by the torch.nn.linear.modules.Linear _MaterializerModule + out = [search(mod)] for name in submodules[1:]: mod = getattr(mod, name) out.append(search(mod)) @@ -305,11 +311,11 @@ def search(mod: ModuleType) -> List[ModuleType]: mods = [mod for mod in chain(*out) if mod] # store the modules search so it doesn't have to be performed again for this class - __STORAGE_META__[subclass] = (mods, subclass, _MetaClass) + __STORAGE_META__[subclass] = (mods, subclass, _MaterializerModule) # replace all subclass by its meta form for mod in mods: - setattr(mod, subclass.__name__, _MetaClass) + setattr(mod, subclass.__name__, _MaterializerModule) @contextmanager @@ -321,3 +327,11 @@ def init_meta_context() -> Generator: _set_meta_device() yield _unset_meta_device() + + +def is_on_meta_device(module: nn.Module) -> bool: + try: + param = next(module.parameters()) + return param.device.type == "meta" + except StopIteration: + return False diff --git a/pytorch_lightning/utilities/model_helpers.py b/pytorch_lightning/utilities/model_helpers.py index 3146b33fe153dc..bb48b481e625f1 100644 --- a/pytorch_lightning/utilities/model_helpers.py +++ b/pytorch_lightning/utilities/model_helpers.py @@ -12,26 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import partial -from typing import Optional, Type, Union +from typing import Optional, Type from unittest.mock import Mock import pytorch_lightning as pl -from pytorch_lightning.utilities import rank_zero_deprecation -def is_overridden( - method_name: str, - instance: Optional[object] = None, - parent: Optional[Type[object]] = None, - model: Optional[Union["pl.LightningModule", "pl.LightningDataModule"]] = None, -) -> bool: - if model is not None and instance is None: - rank_zero_deprecation( - "`is_overriden(model=...)` has been deprecated and will be removed in v1.6." 
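A hedged usage sketch of the meta-device helpers touched above (`init_meta_context`, `materialize_module`, and the new `is_on_meta_device`). It assumes PyTorch >= 1.10, per the `_TORCH_GREATER_EQUAL_1_10` guard in this file, and the materialization semantics (children replaced, materialized module returned) are inferred from the code above rather than documented behaviour.

```python
from torch import nn

from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module

# Build the module under the meta context: parameters are allocated on the
# "meta" device, so no real storage is created yet.
with init_meta_context():
    module = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1))

assert is_on_meta_device(module)

# Materialize real parameters before the module is moved to a device or trained.
module = materialize_module(module)
assert not is_on_meta_device(module)
```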
- "Please use `is_overriden(instance=...)`" - ) - instance = model - +def is_overridden(method_name: str, instance: Optional[object] = None, parent: Optional[Type[object]] = None) -> bool: if instance is None: # if `self.lightning_module` was passed as instance, it can be `None` return False diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index d95f5c8e6f9ea6..e70d862b048e0a 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -43,7 +43,7 @@ SLURMEnvironment, TorchElasticEnvironment, ) -from pytorch_lightning.utilities import DeviceType, DistributedType +from pytorch_lightning.utilities import _StrategyType, DeviceType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -636,7 +636,7 @@ def test_unsupported_distrib_types_on_cpu(training_type): with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"): trainer = Trainer(accelerator=training_type, num_processes=2) - assert trainer._distrib_type == DistributedType.DDP + assert trainer._distrib_type == _StrategyType.DDP def test_accelerator_ddp_for_cpu(tmpdir): diff --git a/tests/base/model_test_epoch_ends.py b/tests/base/model_test_epoch_ends.py index 746ceb94a5de07..b001298e93dd0c 100644 --- a/tests/base/model_test_epoch_ends.py +++ b/tests/base/model_test_epoch_ends.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.utilities import DistributedType +from pytorch_lightning.utilities import _StrategyType class TestEpochEndVariations(ABC): @@ -34,13 +34,13 @@ def test_epoch_end(self, outputs): test_loss = self.get_output_metric(output, "test_loss") # reduce manually when using dp - if self.trainer._distrib_type == DistributedType.DP: + if self.trainer._distrib_type == _StrategyType.DP: test_loss = torch.mean(test_loss) test_loss_mean += test_loss # reduce manually when using dp test_acc = self.get_output_metric(output, "test_acc") - if self.trainer._distrib_type == DistributedType.DP: + if self.trainer._distrib_type == _StrategyType.DP: test_acc = torch.mean(test_acc) test_acc_mean += test_acc @@ -69,13 +69,13 @@ def test_epoch_end__multiple_dataloaders(self, outputs): test_loss = output["test_loss"] # reduce manually when using dp - if self.trainer._distrib_type == DistributedType.DP: + if self.trainer._distrib_type == _StrategyType.DP: test_loss = torch.mean(test_loss) test_loss_mean += test_loss # reduce manually when using dp test_acc = output["test_acc"] - if self.trainer._distrib_type == DistributedType.DP: + if self.trainer._distrib_type == _StrategyType.DP: test_acc = torch.mean(test_acc) test_acc_mean += test_acc diff --git a/tests/deprecated_api/test_remove_1-6.py b/tests/deprecated_api/test_remove_1-6.py index d2f3cec5cba4f7..1ded07734a7de2 100644 --- a/tests/deprecated_api/test_remove_1-6.py +++ b/tests/deprecated_api/test_remove_1-6.py @@ -17,7 +17,6 @@ import pytest from pytorch_lightning import Trainer -from pytorch_lightning.utilities.model_helpers import is_overridden from tests.helpers import BoringModel @@ -48,27 +47,3 @@ def test_v1_6_0_reload_dataloaders_every_epoch(tmpdir): [call.val_dataloader()] + [call.train_dataloader(), call.val_dataloader()] * 3 + [call.test_dataloader()] ) assert tracker.mock_calls == expected_sequence - - -def test_v1_6_0_is_overridden_model(): - model = BoringModel() - with pytest.deprecated_call(match="and 
will be removed in v1.6"): - assert is_overridden("validation_step", model=model) - with pytest.deprecated_call(match="and will be removed in v1.6"): - assert not is_overridden("foo", model=model) - - -def test_v1_6_0_deprecated_disable_validation(): - trainer = Trainer() - with pytest.deprecated_call(match="disable_validation` is deprecated in v1.4"): - _ = trainer.disable_validation - - -def test_v1_6_0_deprecated_hpc_load(tmpdir): - model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, max_steps=1) - trainer.fit(model) - trainer.checkpoint_connector.hpc_save(tmpdir, trainer.logger) - checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(str(tmpdir)) - with pytest.deprecated_call(match=r"`CheckpointConnector.hpc_load\(\)` was deprecated in v1.4"): - trainer.checkpoint_connector.hpc_load(checkpoint_path) diff --git a/tests/deprecated_api/test_remove_1-8.py b/tests/deprecated_api/test_remove_1-8.py new file mode 100644 index 00000000000000..f668f63b9f4503 --- /dev/null +++ b/tests/deprecated_api/test_remove_1-8.py @@ -0,0 +1,23 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test deprecated functionality which will be removed in v1.8.0.""" +import pytest + +from pytorch_lightning.utilities.enums import DistributedType + + +def test_v1_8_0_deprecated_distributed_type_enum(): + + with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."): + _ = DistributedType.DDP diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index 643d3e50cb8940..6fa3bbb5dc9433 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -15,7 +15,7 @@ from torchmetrics.functional import accuracy from pytorch_lightning import LightningDataModule, LightningModule, Trainer -from pytorch_lightning.utilities import DistributedType +from pytorch_lightning.utilities import _StrategyType from tests.helpers import BoringModel from tests.helpers.utils import get_default_logger, load_model_from_checkpoint, reset_seed @@ -82,7 +82,7 @@ def run_model_test( run_prediction_eval_model_template(model, dataloader, min_acc=min_acc) if with_hpc: - if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): + if trainer._distrib_type in (_StrategyType.DDP, _StrategyType.DDP_SPAWN, _StrategyType.DDP2): # on hpc this would work fine... 
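To show the simplified `is_overridden` signature in use, a small sketch mirroring the deleted v1.6 deprecation test above: the instance is now passed directly instead of through the removed `model=` keyword.

```python
from pytorch_lightning.utilities.model_helpers import is_overridden
from tests.helpers import BoringModel

model = BoringModel()
# Pass the instance as the second positional argument (or via `instance=`).
assert is_overridden("validation_step", model)
assert not is_overridden("foo", model)
```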
but need to hack it for the purpose of the test trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers( pretrained_model diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index bd69cf359473e0..7c79cb7f2e709c 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -31,7 +31,7 @@ _replace_dataloader_init_method, ) from pytorch_lightning.plugins import DeepSpeedPlugin, PrecisionPlugin, TrainingTypePlugin -from pytorch_lightning.utilities import DistributedType +from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import pl_worker_init_function from tests.helpers.runif import RunIf @@ -251,12 +251,12 @@ def test_seed_everything(): @pytest.mark.parametrize( "strategy", [ - DistributedType.DP, - DistributedType.DDP, - DistributedType.DDP_SPAWN, - pytest.param(DistributedType.DEEPSPEED, marks=RunIf(deepspeed=True)), - pytest.param(DistributedType.DDP_SHARDED, marks=RunIf(fairscale=True)), - pytest.param(DistributedType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), + _StrategyType.DP, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, + pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), + pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), + pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), ], ) def test_setup_dataloaders_replace_custom_sampler(strategy): @@ -279,12 +279,12 @@ def test_setup_dataloaders_replace_custom_sampler(strategy): @pytest.mark.parametrize( "strategy", [ - DistributedType.DP, - DistributedType.DDP, - DistributedType.DDP_SPAWN, - pytest.param(DistributedType.DEEPSPEED, marks=RunIf(deepspeed=True)), - pytest.param(DistributedType.DDP_SHARDED, marks=RunIf(fairscale=True)), - pytest.param(DistributedType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), + _StrategyType.DP, + _StrategyType.DDP, + _StrategyType.DDP_SPAWN, + pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), + pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), + pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), ], ) @pytest.mark.parametrize("shuffle", [True, False]) diff --git a/tests/trainer/test_data_loading.py b/tests/trainer/test_data_loading.py index 97097b2074ca16..4f3a482e37ac47 100644 --- a/tests/trainer/test_data_loading.py +++ b/tests/trainer/test_data_loading.py @@ -20,7 +20,7 @@ from torch.utils.data.sampler import BatchSampler, Sampler, SequentialSampler from pytorch_lightning import Trainer -from pytorch_lightning.utilities.enums import DistributedType +from pytorch_lightning.utilities.enums import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -137,7 +137,7 @@ def _get_warning_msg(): @pytest.mark.parametrize("num_workers", [0, 1]) def test_dataloader_warnings(tmpdir, num_workers): trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_spawn", num_processes=2, fast_dev_run=4) - assert trainer._accelerator_connector._distrib_type == DistributedType.DDP_SPAWN + assert trainer._accelerator_connector._distrib_type == _StrategyType.DDP_SPAWN trainer.fit(TestSpawnBoringModel(num_workers)) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index d2e5f771a9c408..dc0ce2b68452c8 100644 --- a/tests/trainer/test_trainer.py +++ 
b/tests/trainer/test_trainer.py @@ -48,7 +48,7 @@ DDPSpawnShardedPlugin, ) from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import DeviceType, DistributedType +from pytorch_lightning.utilities import _StrategyType, DeviceType from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything @@ -1154,15 +1154,15 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): ), ( dict(accelerator="ddp", num_processes=2, gpus=None), - dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), + dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), ), ( dict(accelerator="ddp", num_nodes=2, gpus=None), - dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1), + dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=None), - dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), + dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), ), ( dict(accelerator="ddp2", gpus=None), @@ -1174,43 +1174,43 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): ), ( dict(accelerator="dp", gpus=1), - dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), + dict(_distrib_type=_StrategyType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), ), ( dict(accelerator="ddp", gpus=1), - dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), + dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), ), ( dict(accelerator="ddp_cpu", num_processes=2, gpus=1), - dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), + dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2), ), ( dict(accelerator="ddp2", gpus=1), - dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), + dict(_distrib_type=_StrategyType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1), ), ( dict(accelerator=None, gpus=2), - dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2), + dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2), ), ( dict(accelerator="dp", gpus=2), - dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1), + dict(_distrib_type=_StrategyType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1), ), ( dict(accelerator="ddp", gpus=2), - dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2), + dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2), ), ( dict(accelerator="ddp2", gpus=2), - dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1), + dict(_distrib_type=_StrategyType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1), ), ( dict(accelerator="ddp2", num_processes=2, gpus=None), - dict(_distrib_type=DistributedType.DDP, 
@@ -2096,11 +2096,11 @@ def training_step(self, batch, batch_idx):
         ),
         (
             dict(strategy="ddp", num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy="ddp", num_nodes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
         ),
         (
             dict(strategy="ddp2", gpus=None),
@@ -2112,47 +2112,47 @@ def training_step(self, batch, batch_idx):
         ),
         (
             dict(strategy="dp", gpus=1),
-            dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
+            dict(_distrib_type=_StrategyType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(strategy="ddp", gpus=1),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(strategy="ddp_spawn", gpus=1),
-            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(strategy="ddp2", gpus=1),
-            dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(strategy=None, gpus=2),
-            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
         ),
         (
             dict(strategy="dp", gpus=2),
-            dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy="ddp", gpus=2),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
         ),
         (
             dict(strategy="ddp2", gpus=2),
-            dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy="ddp2", num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy="dp", num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy="ddp_spawn", num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy="ddp_spawn", num_processes=1, gpus=None),
@@ -2161,7 +2161,7 @@ def training_step(self, batch, batch_idx):
         (
             dict(strategy="ddp_fully_sharded", gpus=1),
             dict(
-                _distrib_type=DistributedType.DDP_FULLY_SHARDED,
+                _distrib_type=_StrategyType.DDP_FULLY_SHARDED,
                 _device_type=DeviceType.GPU,
                 num_gpus=1,
                 num_processes=1,
@@ -2169,32 +2169,32 @@ def training_step(self, batch, batch_idx):
         ),
         (
             dict(strategy=DDPSpawnPlugin(), num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy=DDPSpawnPlugin(), gpus=2),
-            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy=DDPPlugin(), num_processes=2, gpus=None),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(strategy=DDPPlugin(), gpus=2),
-            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy=DDP2Plugin(), gpus=2),
-            dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy=DataParallelPlugin(), gpus=2),
-            dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(strategy=DDPFullyShardedPlugin(), gpus=2),
             dict(
-                _distrib_type=DistributedType.DDP_FULLY_SHARDED,
+                _distrib_type=_StrategyType.DDP_FULLY_SHARDED,
                 _device_type=DeviceType.GPU,
                 num_gpus=2,
                 num_processes=1,
@@ -2203,7 +2203,7 @@ def training_step(self, batch, batch_idx):
         (
             dict(strategy=DDPSpawnShardedPlugin(), gpus=2),
             dict(
-                _distrib_type=DistributedType.DDP_SHARDED_SPAWN,
+                _distrib_type=_StrategyType.DDP_SHARDED_SPAWN,
                 _device_type=DeviceType.GPU,
                 num_gpus=2,
                 num_processes=1,
@@ -2211,7 +2211,7 @@ def training_step(self, batch, batch_idx):
         ),
         (
             dict(strategy=DDPShardedPlugin(), gpus=2),
-            dict(_distrib_type=DistributedType.DDP_SHARDED, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
+            dict(_distrib_type=_StrategyType.DDP_SHARDED, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
     ],
 )
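The parametrizations above also cover passing training-type plugin instances rather than strategy strings, with both spellings expected to resolve to the same `_StrategyType` member. A minimal sketch of that mapping, assuming `DDPPlugin` is importable from `pytorch_lightning.plugins` in this release and using the same private connector attribute as the tests:

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning.utilities import _StrategyType

# A plugin instance selects the strategy directly; the connector still records it as an enum member.
trainer = Trainer(strategy=DDPPlugin(), num_processes=2, gpus=None)
assert trainer._accelerator_connector._distrib_type == _StrategyType.DDP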
diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py
index 8e36a86c3beef0..581b949d9167f5 100644
--- a/tests/utilities/test_meta.py
+++ b/tests/utilities/test_meta.py
@@ -14,7 +14,7 @@
 from torch import nn

 from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities.meta import init_meta_context, materialize_module
+from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module
 from tests.helpers.runif import RunIf
@@ -31,18 +31,23 @@ def __init__(self, num_layers: int):
         self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)])


-@RunIf(min_torch="1.10.0")
+@RunIf(special=True, min_torch="1.10.0")
 def test_init_meta_context():
     with init_meta_context():
         m = nn.Linear(in_features=1, out_features=1)
+        assert isinstance(m, nn.Linear)
         assert m.weight.device.type == "meta"
+        assert is_on_meta_device(m)
         mlp = MLP(4)
         assert mlp.layer[0].weight.device.type == "meta"

         mlp = materialize_module(mlp)
         assert mlp.layer[0].weight.device.type == "cpu"

+        assert not is_on_meta_device(mlp)
+        assert not is_on_meta_device(nn.Module())
+
         model = BoringModel(4)
         assert model.layer[0].weight.device.type == "meta"
         materialize_module(model)
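For readers unfamiliar with the meta-device utilities exercised by `test_init_meta_context`, here is a minimal sketch of the workflow, assuming torch >= 1.10 and the `pytorch_lightning.utilities.meta` helpers imported in the diff:

from torch import nn

from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module

with init_meta_context():
    # Modules created inside the context live on the "meta" device: shapes and dtypes only, no storage.
    mlp = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))
    assert is_on_meta_device(mlp)
    assert mlp[0].weight.device.type == "meta"

    # materialize_module() replaces the meta tensors with real, CPU-backed parameters.
    mlp = materialize_module(mlp)
    assert mlp[0].weight.device.type == "cpu"
    assert not is_on_meta_device(mlp)

The `special=True` flag added to `@RunIf` changes how the test is scheduled in CI (it runs as a standalone "special" test); it does not affect the behaviour sketched here.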