Lightning-AI · SeanNaren · Aug 24, 2021 · Aug 23, 2021 · Aug 23, 2021 · Aug 23, 2021
@@ -57,6 +57,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added DeepSpeed Stage 1 support ([#8974](https://github.com/PyTorchLightning/pytorch-lightning/pull/8974))
 
 
+- Added bfloat16 support for Lightning Trainer ([#9049](https://github.com/PyTorchLightning/pytorch-lightning/pull/9049))
+
+
 ### Changed
 
 - Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices. ([#8770](https://github.com/PyTorchLightning/pytorch-lightning/pull/8770))

@@ -12,32 +12,60 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import contextmanager
-from typing import Any, Callable, Dict, Generator
+from typing import Any, Callable, Dict, Generator, Union
 
 import torch
 from torch.optim import LBFGS, Optimizer
 
 import pytorch_lightning as pl
 from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, AMPType
+from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, AMPType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.warnings import WarningCache
+
+warning_cache = WarningCache()
 
 
 class NativeMixedPrecisionPlugin(MixedPrecisionPlugin):
-    """Plugin for native mixed precision training with :mod:`torch.cuda.amp`."""
+    """
+    Plugin for native mixed precision training with :mod:`torch.cuda.amp`.
+
+    Args:
+        precision: Whether to use ``torch.float16`` (``16``) or ``torch.bfloat16`` (``"bfloat16"``).
+    """
 
-    def __init__(self) -> None:
+    def __init__(self, precision: Union[int, str] = 16) -> None:
         super().__init__()
+
         if not _NATIVE_AMP_AVAILABLE:
             raise MisconfigurationException(
                 "You have asked for native AMP but your PyTorch version does not support it."
                 " Consider upgrading with `pip install torch>=1.6`."
             )
-
+        self.fast_dtype = self._select_precision_dtype(precision)
         self.backend = AMPType.NATIVE
-        self.scaler = torch.cuda.amp.GradScaler()
+        # bfloat16 needs no loss scaling; `scaler` stays defined (None) so attribute access cannot raise
+        self.scaler = None if self.is_bfloat16 else torch.cuda.amp.GradScaler()
+
+    def _select_precision_dtype(self, precision: Union[int, str] = 16) -> torch.dtype:
+        if precision != "bfloat16":
+            return torch.float16  # every non-bfloat16 request maps to half precision
+        if _TORCH_GREATER_EQUAL_1_10:
+            return torch.bfloat16
+        raise MisconfigurationException(
+            "To use bfloat16 with native amp you must install torch greater or equal to 1.10."
+        )
+
+    @property
+    def is_bfloat16(self) -> bool:
+        return torch.bfloat16 == self.fast_dtype  # bfloat16 mode runs without a GradScaler
 
     def pre_backward(self, model: "pl.LightningModule", closure_loss: torch.Tensor) -> torch.Tensor:
+        if self.is_bfloat16:  # bfloat16 path: warn, then hand the loss on unscaled
+            warning_cache.warn(
+                "Skipping torch.cuda.amp.GradScaler in NativeMixedPrecisionPlugin as torch.bfloat16 is used."
+            )
+            return super().pre_backward(model, closure_loss)
         closure_loss = self.scaler.scale(closure_loss)
         return super().pre_backward(model, closure_loss)
 
@@ -49,6 +77,9 @@ def pre_optimizer_step(
         lambda_closure: Callable,
         **kwargs: Any,
     ) -> bool:
+        if self.is_bfloat16:
+            # skip scaler logic, as bfloat16 does not require scaler
+            return super().pre_optimizer_step(model, optimizer, optimizer_idx, lambda_closure, **kwargs)
         if isinstance(optimizer, LBFGS):
             raise MisconfigurationException(
                 f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})."
@@ -65,33 +96,40 @@ def pre_optimizer_step(
             self.scaler.update()
         return False
 
+    @property
+    def autocast(self) -> torch.cuda.amp.autocast:
+        if self.is_bfloat16:
+            return torch.cuda.amp.autocast(dtype=self.fast_dtype)  # kwarg is `dtype` in released torch 1.10
+        return torch.cuda.amp.autocast()  # float16 path relies on autocast's own default dtype
+
     @contextmanager
     def train_step_context(self) -> Generator[None, None, None]:
         """Enable autocast context"""
-        with torch.cuda.amp.autocast():
+        with self.autocast:  # the property builds a new, bfloat16-aware autocast context on each access
             yield
 
     @contextmanager
     def val_step_context(self) -> Generator[None, None, None]:
         """Enable autocast context"""
-        with torch.cuda.amp.autocast():
+        with self.autocast:  # the property builds a new, bfloat16-aware autocast context on each access
             yield
 
     @contextmanager
     def test_step_context(self) -> Generator[None, None, None]:
         """Enable autocast context"""
-        with torch.cuda.amp.autocast():
+        with self.autocast:  # the property builds a new, bfloat16-aware autocast context on each access
             yield
 
     @contextmanager
     def predict_step_context(self) -> Generator[None, None, None]:
         """Enable autocast context"""
-        with torch.cuda.amp.autocast():
+        with self.autocast:  # the property builds a new, bfloat16-aware autocast context on each access
             yield
 
     def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
-        if "native_amp_scaling_state" in checkpoint:
+        if "native_amp_scaling_state" in checkpoint and not self.is_bfloat16:  # bfloat16 keeps no scaler state
             self.scaler.load_state_dict(checkpoint["native_amp_scaling_state"])
 
     def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
-        checkpoint["native_amp_scaling_state"] = self.scaler.state_dict()
+        if not self.is_bfloat16:  # bfloat16 runs create no GradScaler, so there is no state to save
+            checkpoint["native_amp_scaling_state"] = self.scaler.state_dict()
@@ -24,8 +24,8 @@
 class ShardedNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin):
     """Mixed Precision for Sharded Training"""
 
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, precision: Union[int, str] = 16) -> None:
+        super().__init__(precision)  # NOTE(review): the scaler below is also created for bfloat16 — confirm intended
         self.scaler = ShardedGradScaler()
 
     def clip_grad_by_norm(

@@ -560,7 +560,7 @@ def select_precision_plugin(self) -> PrecisionPlugin:
             return PrecisionPlugin()
         if self.precision == 64:
             return DoublePrecisionPlugin()
-        if self.precision == 16:
+        if self.precision in (16, "bfloat16"):
             if self.use_tpu:
                 return TPUHalfPrecisionPlugin()
 
@@ -581,12 +581,12 @@ def select_precision_plugin(self) -> PrecisionPlugin:
                     else:
                         raise MisconfigurationException(msg)
                 else:
-                    log.info("Using native 16bit precision.")
+                    log.info(f"Using native {self.precision}bit precision")
                     if self._is_sharded_training_type:
-                        return ShardedNativeMixedPrecisionPlugin()
+                        return ShardedNativeMixedPrecisionPlugin(self.precision)
                     if self._is_fully_sharded_training_type:
-                        return FullyShardedNativeMixedPrecisionPlugin()
-                    return NativeMixedPrecisionPlugin()
+                        return FullyShardedNativeMixedPrecisionPlugin(self.precision)
+                    return NativeMixedPrecisionPlugin(self.precision)
 
             if self.amp_type == AMPType.APEX:
                 if not _APEX_AVAILABLE:

@@ -137,7 +137,7 @@ def __init__(
         log_every_n_steps: int = 50,
         accelerator: Optional[Union[str, Accelerator]] = None,
         sync_batchnorm: bool = False,
-        precision: int = 32,
+        precision: Union[int, str] = 32,
         weights_summary: Optional[str] = "top",
         weights_save_path: Optional[str] = None,
         num_sanity_val_steps: int = 2,
@@ -255,8 +255,8 @@ def __init__(
 
             plugins: Plugins allow modification of core behavior like ddp and amp, and enable custom lightning plugins.
 
-            precision: Double precision (64), full precision (32) or half precision (16). Can be used on CPU, GPU or
-                TPUs.
+            precision: Double precision (64), full precision (32), half precision (16) or bfloat16 precision (bfloat16).
+                Can be used on CPU, GPU or TPUs.
 
             max_epochs: Stop training once this number of epochs is reached. Disabled by default (None).
                 If both max_epochs and max_steps are not specified, defaults to ``max_epochs`` = 1000.

@@ -46,6 +46,7 @@
     _TORCH_GREATER_EQUAL_1_7,
     _TORCH_GREATER_EQUAL_1_8,
     _TORCH_GREATER_EQUAL_1_9,
+    _TORCH_GREATER_EQUAL_1_10,
     _TORCH_QUANTIZE_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _TORCHVISION_AVAILABLE,

@@ -68,6 +68,8 @@ def _compare_version(package: str, op, version) -> bool:
 _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0")
 _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1")
 _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0")
+# NOTE(review): the "dev" suffix presumably lets 1.10 pre-release builds pass — confirm against `_compare_version`
+_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0dev")
 
 _APEX_AVAILABLE = _module_available("apex.amp")
 _BOLTS_AVAILABLE = _module_available("pl_bolts")

@@ -21,6 +21,7 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin
 from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -174,3 +175,68 @@ def test_amp_apex_ddp_spawn_fit(amp_level, tmpdir):
     assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)
     model = BoringModel()
     trainer.fit(model)
+
+
+@RunIf(min_gpus=1, amp_native=True)
+def test_amp_precision_16_bfloat_disabled(tmpdir):
+    """``precision=16`` must select the native AMP plugin with bfloat16 switched off."""
+    trainer_kwargs = dict(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        precision=16,
+        gpus=1,
+    )
+    precision_plugin = Trainer(**trainer_kwargs).precision_plugin
+    assert isinstance(precision_plugin, NativeMixedPrecisionPlugin)
+    assert not precision_plugin.is_bfloat16
+
+
+@RunIf(min_gpus=1, amp_native=True, min_torch="1.10.0dev")
+def test_amp_precision_bfloat(tmpdir):
+    """Fit with ``precision="bfloat16"`` and check that the step outputs come out as bfloat16."""
+
+    class TestModel(BoringModel):
+        def _bfloat16_loss(self, batch):
+            # shared by every step hook: the forward output must be bfloat16 before the loss is computed
+            output = self(batch)
+            assert output.dtype == torch.bfloat16
+            return self.loss(batch, output)
+
+        def training_step(self, batch, batch_idx):
+            return {"loss": self._bfloat16_loss(batch)}
+
+        def validation_step(self, batch, batch_idx):
+            return {"x": self._bfloat16_loss(batch)}
+
+        def test_step(self, batch, batch_idx):
+            return {"y": self._bfloat16_loss(batch)}
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        precision="bfloat16",
+        gpus=1,
+    )
+    precision_plugin = trainer.precision_plugin
+    assert isinstance(precision_plugin, NativeMixedPrecisionPlugin)
+    assert precision_plugin.is_bfloat16
+    assert precision_plugin.autocast.fast_dtype == torch.bfloat16
+
+    # fitting is expected to emit the plugin's "scaler skipped" warning
+    skip_message = "Skipping torch.cuda.amp.GradScaler in NativeMixedPrecisionPlugin as torch.bfloat16 is used."
+    with pytest.warns(UserWarning, match=skip_message):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1, amp_native=True, max_torch="1.9")
+def test_amp_precision_16_bfloat_throws_error(tmpdir):
+    """bfloat16 must be rejected with a clear error when the installed torch is too old."""
+    # the error is expected from Trainer construction itself, so no fit() call is made
+    expected = "To use bfloat16 with native amp you must install torch greater or equal to 1.10"
+    with pytest.raises(MisconfigurationException, match=expected):
+        Trainer(
+            default_root_dir=tmpdir,
+            precision="bfloat16",
+            gpus=1,
+        )