diff --git a/CHANGELOG.md b/CHANGELOG.md index 84db9b385009e..c04dd2b673481 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,11 +10,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added argument `trainer.predict(ckpt_path)` ([#7430](https://github.com/PyTorchLightning/pytorch-lightning/pull/7430)) + + - Added `clip_grad_by_value` support for TPUs ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) ### Changed + - Log epoch metrics before the `on_evaluation_end` hook ([#7272](https://github.com/PyTorchLightning/pytorch-lightning/pull/7272)) @@ -25,6 +29,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Refactored Loops * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7437)) + * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) - `DataModule`s now avoid duplicate `{setup,teardown,prepare_data}` calls for the same stage ([#7238](https://github.com/PyTorchLightning/pytorch-lightning/pull/7238)) @@ -35,10 +40,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed the behaviour when logging evaluation step metrics to no longer append `/epoch_*` to the metric name ([#7351](https://github.com/PyTorchLightning/pytorch-lightning/pull/7351)) -- Changed `resolve_training_type_plugins` to allow setting `num_nodes` and `sync_batchnorm` from `Trainer` setting ([7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026)) +- Changed `resolve_training_type_plugins` to allow setting `num_nodes` and `sync_batchnorm` from `Trainer` setting ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026)) + + +- Default `seed_everything(workers=True)` in the `LightningCLI` ([#7504](https://github.com/PyTorchLightning/pytorch-lightning/pull/7504)) -- Changed `model.state_dict()` in `CheckpointConnector` to allow `training_type_plugin` to customize the model's `state_dict()` ([7474](https://github.com/PyTorchLightning/pytorch-lightning/pull/7474)) +- Changed `model.state_dict()` in `CheckpointConnector` to allow `training_type_plugin` to customize the model's `state_dict()` ([#7474](https://github.com/PyTorchLightning/pytorch-lightning/pull/7474)) + + +- MLflowLogger now uses the env variable `MLFLOW_TRACKING_URI` as default tracking uri ([#7457](https://github.com/PyTorchLightning/pytorch-lightning/pull/7457)) ### Deprecated @@ -52,14 +63,31 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed +- Pruned deprecated classification
metrics from `pytorch_lightning.metrics.functional.classification` ([#7499](https://github.com/PyTorchLightning/pytorch-lightning/pull/7499)) + + +- Removed deprecated data parallel classes `LightningDataParallel` and `LightningDistributedDataParallel` from `pytorch_lightning.overrides.data_parallel` ([#7510](https://github.com/PyTorchLightning/pytorch-lightning/pull/7510)) + + +- Removed deprecated trainer attributes - `get_model` and `accelerator_backend` ([#7502](https://github.com/PyTorchLightning/pytorch-lightning/pull/7502)) + + +- Removed deprecated utils modules `model_utils`, `warning_utils`, `xla_device_utils` and partially `argparse_utils` ([#7503](https://github.com/PyTorchLightning/pytorch-lightning/pull/7503)) + + +- Removed deprecated trainer attributes - `on_cpu`, `on_tpu`, `use_tpu`, `on_gpu`, `use_dp`, `use_ddp`, `use_ddp2`, `use_horovod`, `use_single_gpu` ([#7501](https://github.com/PyTorchLightning/pytorch-lightning/pull/7501)) + ### Fixed + - Fixed parsing of multiple training dataloaders ([#7433](https://github.com/PyTorchLightning/pytorch-lightning/pull/7433)) + - Fixed recursive passing of `wrong_type` keyword argument in `pytorch_lightning.utilities.apply_to_collection` ([#7433](https://github.com/PyTorchLightning/pytorch-lightning/pull/7433)) + ## [1.3.1] - 2021-05-11 ### Fixed diff --git a/README.md b/README.md index f14205fb7e382..8da7836fb689e 100644 --- a/README.md +++ b/README.md @@ -118,22 +118,22 @@ pip install pytorch-lightning conda install pytorch-lightning -c conda-forge ``` - #### Install stable 1.2.x + #### Install stable 1.3.x - the actual status of 1.2 [stable] is following: + the actual status of 1.3 [stable] is the following: - ![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2.x&event=push) - ![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2.x&event=push) - ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2.x&event=push) - ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2.x&event=push) - ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2.x&event=push) + ![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.3.x&event=push) + ![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.3.x&event=push) + ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.3.x&event=push) + ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.3.x&event=push) + ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.3.x&event=push) Install future release from the source ```bash - pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2.x --upgrade + pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.3.x --upgrade
``` - #### Install bleeding-edge - future 1.3 + #### Install bleeding-edge - future 1.4 Install nightly from the source (no guarantees) ```bash diff --git a/dockers/README.md b/dockers/README.md index 549006ec62c02..581c03c530d26 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -71,14 +71,14 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. Build the docker image: ```bash docker image build \ - -t pytorch-lightning:v1.2.9 \ + -t pytorch-lightning:v1.3.1 \ -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.2.9 \ + --build-arg LIGHTNING_VERSION=1.3.1 \ . ``` 2. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.2.9 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 ``` 3. Connect in local browser: - copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 8c3f3693fda50..13f70deed43ca 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -22,7 +22,7 @@ local tputests = base.BaseTest { ||| cd pytorch-lightning coverage run --source=pytorch_lightning -m pytest -v --capture=no \ - pytorch_lightning/utilities/xla_device_utils.py \ + pytorch_lightning/utilities/xla_device.py \ tests/accelerators/test_tpu_backend.py \ tests/models/test_tpu.py test_exit_code=$? diff --git a/docs/source/common/lightning_cli.rst b/docs/source/common/lightning_cli.rst index b11d505c502ad..1df80e1ccf830 100644 --- a/docs/source/common/lightning_cli.rst +++ b/docs/source/common/lightning_cli.rst @@ -91,8 +91,8 @@ practice to create a configuration file and provide this to the tool. A way to d The instantiation of the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class takes care of parsing command line and config file options, instantiating the classes, setting up a callback to save the config in the log directory and -finally running :func:`trainer.fit`. The resulting object :code:`cli` can be used for instance to get the result of fit, -i.e., :code:`cli.fit_result`. +finally running the trainer. The resulting object :code:`cli` can be used for example to get the instance of the +model, (:code:`cli.model`). After multiple trainings with different configurations, each run will have in its respective log directory a :code:`config.yaml` file. 
This file can be used for reference to know in detail all the settings that were used for each diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index a574adb40d6e0..8ea03dabc9bdb 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -116,8 +116,7 @@ def test_dataloader(self): def cli_main(): cli = LightningCLI(LitAutoEncoder, MyDataModule, seed_everything_default=1234) - result = cli.trainer.test(cli.model, datamodule=cli.datamodule) - print(result) + cli.trainer.test(cli.model, datamodule=cli.datamodule) if __name__ == '__main__': diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index 53a24dfdb221f..57cf97be00023 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -129,8 +129,7 @@ def test_dataloader(self): def cli_main(): cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234) - result = cli.trainer.test(cli.model, datamodule=cli.datamodule) - print(result) + cli.trainer.test(cli.model, datamodule=cli.datamodule) if __name__ == '__main__': diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index 1a9dc46c81137..eca5c21b3242c 100644 --- a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -222,8 +222,7 @@ def cli_main(): return cli = LightningCLI(LitClassifier, MyDataModule, seed_everything_default=1234) - result = cli.trainer.test(cli.model, datamodule=cli.datamodule) - print(result) + cli.trainer.test(cli.model, datamodule=cli.datamodule) if __name__ == "__main__": diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index d401e884a2f18..ffb6434352b2e 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -77,8 +77,7 @@ def configure_optimizers(self): def cli_main(): cli = LightningCLI(LitClassifier, MNISTDataModule, seed_everything_default=1234) - result = cli.trainer.test(cli.model, datamodule=cli.datamodule) - print(result) + cli.trainer.test(cli.model, datamodule=cli.datamodule) if __name__ == '__main__': diff --git a/pytorch_lightning/core/datamodule.py b/pytorch_lightning/core/datamodule.py index 23626ed9cbeae..84210e9d7b667 100644 --- a/pytorch_lightning/core/datamodule.py +++ b/pytorch_lightning/core/datamodule.py @@ -37,7 +37,7 @@ def __init__(self): def prepare_data(self): # download, split, etc... # only called on 1 GPU/TPU in distributed - def setup(self): + def setup(self, stage): # make assignments here (val/train/test split) # called on every process in DDP def train_dataloader(self): diff --git a/pytorch_lightning/loggers/mlflow.py b/pytorch_lightning/loggers/mlflow.py index 516ed55de9fcf..fbcd4bbcc5183 100644 --- a/pytorch_lightning/loggers/mlflow.py +++ b/pytorch_lightning/loggers/mlflow.py @@ -16,6 +16,7 @@ ------------- """ import logging +import os import re from argparse import Namespace from time import time @@ -85,7 +86,8 @@ def any_lightning_module_function_or_hook(self): Args: experiment_name: The name of the experiment tracking_uri: Address of local or remote tracking server. - If not provided, defaults to `file:`. 
+ If not provided, defaults to `MLFLOW_TRACKING_URI` environment variable if set, otherwise it falls + back to `file:`. tags: A dictionary tags for the experiment. save_dir: A path to a local directory where the MLflow runs get saved. Defaults to `./mlflow` if `tracking_uri` is not provided. @@ -104,7 +106,7 @@ def any_lightning_module_function_or_hook(self): def __init__( self, experiment_name: str = 'default', - tracking_uri: Optional[str] = None, + tracking_uri: Optional[str] = os.getenv('MLFLOW_TRACKING_URI'), tags: Optional[Dict[str, Any]] = None, save_dir: Optional[str] = './mlruns', prefix: str = '', diff --git a/pytorch_lightning/metrics/functional/__init__.py b/pytorch_lightning/metrics/functional/__init__.py index 3b31dad5d3411..2bd5ca9b2e579 100644 --- a/pytorch_lightning/metrics/functional/__init__.py +++ b/pytorch_lightning/metrics/functional/__init__.py @@ -15,13 +15,6 @@ from pytorch_lightning.metrics.functional.auc import auc # noqa: F401 from pytorch_lightning.metrics.functional.auroc import auroc # noqa: F401 from pytorch_lightning.metrics.functional.average_precision import average_precision # noqa: F401 -from pytorch_lightning.metrics.functional.classification import ( # noqa: F401 - dice_score, - get_num_classes, - multiclass_auroc, - stat_scores_multiple_classes, - to_categorical, -) from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix # noqa: F401 from pytorch_lightning.metrics.functional.explained_variance import explained_variance # noqa: F401 from pytorch_lightning.metrics.functional.f_beta import f1, fbeta # noqa: F401 diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py deleted file mode 100644 index de2f21e26438a..0000000000000 --- a/pytorch_lightning/metrics/functional/classification.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from functools import wraps -from typing import Callable, Optional, Sequence, Tuple - -import torch -from torchmetrics.utilities import class_reduce, reduce -from torchmetrics.utilities.data import get_num_classes, to_categorical - -from pytorch_lightning.metrics.functional.auc import auc as __auc -from pytorch_lightning.metrics.functional.auroc import auroc as __auroc -from pytorch_lightning.metrics.functional.iou import iou as __iou -from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn - - -def stat_scores( - pred: torch.Tensor, - target: torch.Tensor, - class_index: int, - argmax_dim: int = 1, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.stat_scores`. Will be removed in v1.4.0. 
- """ - if pred.ndim == target.ndim + 1: - pred = to_categorical(pred, argmax_dim=argmax_dim) - - tp = ((pred == class_index) * (target == class_index)).to(torch.long).sum() - fp = ((pred == class_index) * (target != class_index)).to(torch.long).sum() - tn = ((pred != class_index) * (target != class_index)).to(torch.long).sum() - fn = ((pred != class_index) * (target == class_index)).to(torch.long).sum() - sup = (target == class_index).to(torch.long).sum() - - return tp, fp, tn, fn, sup - - -# todo: remove in 1.4 -def stat_scores_multiple_classes( - pred: torch.Tensor, - target: torch.Tensor, - num_classes: Optional[int] = None, - argmax_dim: int = 1, - reduction: str = 'none', -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.stat_scores`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `stat_scores_multiple_classes` was deprecated in v1.2.0 in favor of" - " `from pytorch_lightning.metrics.functional import stat_scores`." - " It will be removed in v1.4.0" - ) - if pred.ndim == target.ndim + 1: - pred = to_categorical(pred, argmax_dim=argmax_dim) - - num_classes = get_num_classes(pred, target, num_classes=num_classes) - - if pred.dtype != torch.bool: - pred = pred.clamp_max(max=num_classes) - if target.dtype != torch.bool: - target = target.clamp_max(max=num_classes) - - possible_reductions = ('none', 'sum', 'elementwise_mean') - if reduction not in possible_reductions: - raise ValueError("reduction type %s not supported" % reduction) - - if reduction == 'none': - pred = pred.view((-1, )).long() - target = target.view((-1, )).long() - - tps = torch.zeros((num_classes + 1, ), device=pred.device) - fps = torch.zeros((num_classes + 1, ), device=pred.device) - fns = torch.zeros((num_classes + 1, ), device=pred.device) - sups = torch.zeros((num_classes + 1, ), device=pred.device) - - match_true = (pred == target).float() - match_false = 1 - match_true - - tps.scatter_add_(0, pred, match_true) - fps.scatter_add_(0, pred, match_false) - fns.scatter_add_(0, target, match_false) - tns = pred.size(0) - (tps + fps + fns) - sups.scatter_add_(0, target, torch.ones_like(match_true)) - - tps = tps[:num_classes] - fps = fps[:num_classes] - tns = tns[:num_classes] - fns = fns[:num_classes] - sups = sups[:num_classes] - - elif reduction == 'sum' or reduction == 'elementwise_mean': - count_match_true = (pred == target).sum().float() - oob_tp, oob_fp, oob_tn, oob_fn, oob_sup = stat_scores(pred, target, num_classes, argmax_dim) - - tps = count_match_true - oob_tp - fps = pred.nelement() - count_match_true - oob_fp - fns = pred.nelement() - count_match_true - oob_fn - tns = pred.nelement() * (num_classes + 1) - (tps + fps + fns + oob_tn) - sups = pred.nelement() - oob_sup.float() - - if reduction == 'elementwise_mean': - tps /= num_classes - fps /= num_classes - fns /= num_classes - tns /= num_classes - sups /= num_classes - - return tps.float(), fps.float(), tns.float(), fns.float(), sups.float() - - -def _confmat_normalize(cm): - """ Normalization function for confusion matrix """ - cm = cm / cm.sum(-1, keepdim=True) - nan_elements = cm[torch.isnan(cm)].nelement() - if nan_elements != 0: - cm[torch.isnan(cm)] = 0 - rank_zero_warn(f'{nan_elements} nan values found in confusion matrix have been replaced with zeros.') - return cm - - -# todo: remove in 1.4 -def precision_recall( - pred: torch.Tensor, - target: torch.Tensor, - num_classes: Optional[int] = None, - class_reduction: str = 'micro', - 
return_support: bool = False, - return_state: bool = False -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.precision_recall`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `precision_recall` was deprecated in v1.2.0 in favor of" - " `from pytorch_lightning.metrcs.functional import precision_recall`." - " It will be removed in v1.4.0" - ) - - tps, fps, tns, fns, sups = stat_scores_multiple_classes(pred=pred, target=target, num_classes=num_classes) - - precision = class_reduce(tps, tps + fps, sups, class_reduction=class_reduction) - recall = class_reduce(tps, tps + fns, sups, class_reduction=class_reduction) - if return_state: - return {'tps': tps, 'fps': fps, 'fns': fns, 'sups': sups} - if return_support: - return precision, recall, sups - return precision, recall - - -# todo: remove in 1.4 -def precision( - pred: torch.Tensor, - target: torch.Tensor, - num_classes: Optional[int] = None, - class_reduction: str = 'micro', -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.precision`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `precision` was deprecated in v1.2.0 in favor of" - " `from pytorch_lightning.metrics.functional import precision`." - " It will be removed in v1.4.0" - ) - - return precision_recall(pred=pred, target=target, num_classes=num_classes, class_reduction=class_reduction)[0] - - -# todo: remove in 1.4 -def recall( - pred: torch.Tensor, - target: torch.Tensor, - num_classes: Optional[int] = None, - class_reduction: str = 'micro', -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.recall`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `recall` was deprecated in v1.2.0 in favor of" - " `from pytorch_lightning.metrics.functional import recall`." - " It will be removed in v1.4.0" - ) - - return precision_recall(pred=pred, target=target, num_classes=num_classes, class_reduction=class_reduction)[1] - - -# todo: remove in 1.4 -def auc( - x: torch.Tensor, - y: torch.Tensor, -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.auc`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `auc` was deprecated in v1.2.0 in favor of" - " `pytorch_lightning.metrics.functional.auc import auc`." - " It will be removed in v1.4.0" - ) - return __auc(x, y) - - -# todo: remove in 1.4 -def _auc_decorator() -> Callable: - - def wrapper(func_to_decorate: Callable) -> Callable: - - @wraps(func_to_decorate) - def new_func(*args, **kwargs) -> torch.Tensor: - x, y = func_to_decorate(*args, **kwargs)[:2] - - return auc(x, y) - - return new_func - - return wrapper - - -# todo: remove in 1.4 -def _multiclass_auc_decorator() -> Callable: - - def wrapper(func_to_decorate: Callable) -> Callable: - - @wraps(func_to_decorate) - def new_func(*args, **kwargs) -> torch.Tensor: - results = [] - for class_result in func_to_decorate(*args, **kwargs): - x, y = class_result[:2] - results.append(auc(x, y)) - - return torch.stack(results) - - return new_func - - return wrapper - - -# todo: remove in 1.4 -def auroc( - pred: torch.Tensor, - target: torch.Tensor, - sample_weight: Optional[Sequence] = None, - pos_label: int = 1., - max_fpr: float = None, -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.auroc`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `auroc` was deprecated in v1.2.0 in favor of `pytorch_lightning.metrics.functional.auroc import auroc`." 
- " It will be removed in v1.4.0" - ) - return __auroc( - preds=pred, target=target, sample_weights=sample_weight, pos_label=pos_label, max_fpr=max_fpr, num_classes=1 - ) - - -# todo: remove in 1.4 -def multiclass_auroc( - pred: torch.Tensor, - target: torch.Tensor, - sample_weight: Optional[Sequence] = None, - num_classes: Optional[int] = None, -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.auroc`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `multiclass_auroc` was deprecated in v1.2.0 in favor of" - " `pytorch_lightning.metrics.functional.auroc import auroc`." - " It will be removed in v1.4.0" - ) - - return __auroc(preds=pred, target=target, sample_weights=sample_weight, num_classes=num_classes) - - -def dice_score( - pred: torch.Tensor, - target: torch.Tensor, - bg: bool = False, - nan_score: float = 0.0, - no_fg_score: float = 0.0, - reduction: str = 'elementwise_mean', -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.dice_score`. Will be removed in v1.4.0. - """ - num_classes = pred.shape[1] - bg = (1 - int(bool(bg))) - scores = torch.zeros(num_classes - bg, device=pred.device, dtype=torch.float32) - for i in range(bg, num_classes): - if not (target == i).any(): - # no foreground class - scores[i - bg] += no_fg_score - continue - - tp, fp, tn, fn, sup = stat_scores(pred=pred, target=target, class_index=i) - denom = (2 * tp + fp + fn).to(torch.float) - # nan result - score_cls = (2 * tp).to(torch.float) / denom if torch.is_nonzero(denom) else nan_score - - scores[i - bg] += score_cls - return reduce(scores, reduction=reduction) - - -# todo: remove in 1.4 -def iou( - pred: torch.Tensor, - target: torch.Tensor, - ignore_index: Optional[int] = None, - absent_score: float = 0.0, - num_classes: Optional[int] = None, - reduction: str = 'elementwise_mean', -) -> torch.Tensor: - """ - .. deprecated:: - Use :func:`torchmetrics.functional.iou`. Will be removed in v1.4.0. - """ - rank_zero_deprecation( - "This `iou` was deprecated in v1.2.0 in favor of `from pytorch_lightning.metrics.functional.iou import iou`." - " It will be removed in v1.4.0" - ) - return __iou( - pred, - target, - ignore_index=ignore_index, - absent_score=absent_score, - threshold=0.5, - num_classes=num_classes, - reduction=reduction - ) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 272f4c67502c7..3d6e527ef95a9 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -16,12 +16,9 @@ from typing import Any import torch -from torch.nn import DataParallel -from torch.nn.parallel import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase -from pytorch_lightning.overrides.distributed import LightningDistributedModule from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -35,27 +32,6 @@ def _ignore_scalar_return_in_dp(): ) -class LightningDataParallel(DataParallel): - - def __init__(self, module: LightningModule, *args, **kwargs): - warnings.warn( - "The usage of `LightningDataParallel` is deprecated since v1.2 and will be removed in v1.4." 
- " From now on we recommend to directly subclass `torch.nn.parallel.DataParallel`.", DeprecationWarning - ) - super().__init__(LightningParallelModule(module), *args, **kwargs) - - -class LightningDistributedDataParallel(DistributedDataParallel): - - def __init__(self, module: LightningModule, *args, **kwargs): - warnings.warn( - "The usage of `LightningDistributedDataParallel` is deprecated since v1.2 and will be removed in v1.4." - " From now on we recommend to directly subclass `torch.nn.parallel.DistributedDataParallel`.", - DeprecationWarning - ) - super().__init__(LightningDistributedModule(module), *args, **kwargs) - - class LightningParallelModule(_LightningModuleWrapperBase): """ Wraps the user's LightningModule and redirects the forward call to the appropriate diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 8c09de075147a..1c8298557662b 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -195,14 +195,14 @@ def cache_training_step_metrics(self, opt_closure_result): self._callback_metrics.update(callback_metrics_tmp) self._logged_metrics.update(logged_metrics_tmp) - def log_metrics(self, metrics, grad_norm_dic, step=None): + def log_metrics(self, metrics, grad_norm_dict, step=None): """Logs the metric dict passed in. If `step` parameter is None and `step` key is presented is metrics, uses metrics["step"] as a step Args: metrics (dict): Metric values - grad_norm_dic (dict): Gradient norms + grad_norm_dict (dict): Gradient norms step (int): Step for which metrics should be logged. Default value is `self.global_step` during training or the total validation / test log step count during validation and testing. """ @@ -212,7 +212,7 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): metrics.update(mem_map) # add norms - metrics.update(grad_norm_dic) + metrics.update(grad_norm_dict) # turn all tensors to scalars scalar_metrics = metrics_to_scalars(metrics) @@ -368,11 +368,11 @@ def log_train_step_metrics(self, batch_output): # when metrics should be logged if self.should_update_logs or self.trainer.fast_dev_run is True: # logs user requested information to logger - grad_norm_dic = batch_output.grad_norm_dic - if grad_norm_dic is None: - grad_norm_dic = {} - if len(batch_log_metrics) > 0 or len(grad_norm_dic) > 0: - self.log_metrics(batch_log_metrics, grad_norm_dic) + grad_norm_dict = batch_output.grad_norm_dict + if grad_norm_dict is None: + grad_norm_dict = {} + if len(batch_log_metrics) > 0 or len(grad_norm_dict) > 0: + self.log_metrics(batch_log_metrics, grad_norm_dict) self._callback_metrics.update(batch_log_metrics) @property diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 32dbc8c4088a3..7e7817d277dae 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -11,141 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.accelerators import Accelerator -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector -from pytorch_lightning.utilities import DeviceType, DistributedType, rank_zero_deprecation - -class DeprecatedDistDeviceAttributes: - - num_gpus: int - accelerator_connector: AcceleratorConnector - - @property - def on_cpu(self) -> bool: - rank_zero_deprecation("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._device_type == DeviceType.CPU - - @on_cpu.setter - def on_cpu(self, val: bool) -> None: - rank_zero_deprecation("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._device_type = DeviceType.CPU - - @property - def on_tpu(self) -> bool: - rank_zero_deprecation("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._device_type == DeviceType.TPU - - @on_tpu.setter - def on_tpu(self, val: bool) -> None: - rank_zero_deprecation("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._device_type = DeviceType.TPU - - @property - def use_tpu(self) -> bool: - rank_zero_deprecation("Internal: `use_tpu` is deprecated in v1.2 and will be removed in v1.4.") - return self.on_tpu - - @use_tpu.setter - def use_tpu(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_tpu` is deprecated in v1.2 and will be removed in v1.4.") - self.on_tpu = val - - @property - def on_gpu(self) -> bool: - rank_zero_deprecation("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._device_type == DeviceType.GPU - - @on_gpu.setter - def on_gpu(self, val: bool) -> None: - rank_zero_deprecation("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._device_type = DeviceType.GPU - - @property - def use_dp(self) -> bool: - rank_zero_deprecation("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._distrib_type == DistributedType.DP - - @use_dp.setter - def use_dp(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._distrib_type = DistributedType.DP - - @property - def use_ddp(self) -> bool: - rank_zero_deprecation("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - - @use_ddp.setter - def use_ddp(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._distrib_type = DistributedType.DDP - - @property - def use_ddp2(self) -> bool: - rank_zero_deprecation("Internal: `use_ddp2` is deprecated in v1.2 and will be removed in v1.4.") - return self.accelerator_connector._distrib_type == DistributedType.DDP2 - - @use_ddp2.setter - def use_ddp2(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_ddp2` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._distrib_type = DistributedType.DDP2 - - @property - def use_horovod(self) -> bool: - rank_zero_deprecation("Internal: `use_horovod` is deprecated in v1.2 and will be removed in 
v1.4.") - return self.accelerator_connector._distrib_type == DistributedType.HOROVOD - - @use_horovod.setter - def use_horovod(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_horovod` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._distrib_type = DistributedType.HOROVOD - - @property - def use_single_gpu(self) -> bool: - rank_zero_deprecation("Internal: `use_single_gpu` is deprecated in v1.2 and will be removed in v1.4.") - # todo, limiting to exclude DDP2 is not clear but it comes from connectors... - return ( - self.accelerator_connector._device_type and self.accelerator_connector._device_type == DeviceType.GPU - and self.num_gpus == 1 and self.accelerator_connector._distrib_type not in (DistributedType.DDP2, ) - ) - - @use_single_gpu.setter - def use_single_gpu(self, val: bool) -> None: - rank_zero_deprecation("Internal: `use_single_gpu` is deprecated in v1.2 and will be removed in v1.4.") - if val: - self.accelerator_connector._device_type = DeviceType.GPU +from pytorch_lightning.utilities import rank_zero_deprecation class DeprecatedTrainerAttributes: - accelerator: Accelerator - lightning_module: LightningModule sanity_checking: bool - @property - def accelerator_backend(self) -> Accelerator: - rank_zero_deprecation( - "The `Trainer.accelerator_backend` attribute is deprecated in favor of `Trainer.accelerator`" - " since 1.2 and will be removed in v1.4." - ) - return self.accelerator - - def get_model(self) -> LightningModule: - rank_zero_deprecation( - "The use of `Trainer.get_model()` is deprecated in favor of `Trainer.lightning_module`" - " and will be removed in v1.4." - ) - return self.lightning_module - @property def running_sanity_check(self) -> bool: rank_zero_deprecation( diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py index fb1ad3b054c9e..77dfde7f771da 100644 --- a/pytorch_lightning/trainer/predict_loop.py +++ b/pytorch_lightning/trainer/predict_loop.py @@ -60,6 +60,7 @@ def should_store_predictions(self) -> bool: def on_trainer_init(self): self.trainer.num_predict_batches = [] + self.trainer.predicted_ckpt_path = None def get_predict_dataloaders(self): self.trainer.reset_predict_dataloader(self.trainer.lightning_module) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8732d8c33dce7..a9a431ddbba5e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -47,7 +47,7 @@ from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin -from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes, DeprecatedTrainerAttributes +from pytorch_lightning.trainer.deprecated_api import DeprecatedTrainerAttributes from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin @@ -83,7 +83,6 @@ class Trainer( TrainerLoggingMixin, TrainerTrainingTricksMixin, TrainerDataLoadingMixin, - DeprecatedDistDeviceAttributes, DeprecatedTrainerAttributes, ): @@ -379,6 +378,7 @@ def __init__( terminate_on_nan, ) self.evaluation_loop.on_trainer_init() + self.predict_loop.on_trainer_init() # configure tuner self.tuner.on_trainer_init(auto_lr_find, 
auto_scale_batch_size) @@ -585,6 +585,7 @@ def predict( dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None, datamodule: Optional[LightningDataModule] = None, return_predictions: Optional[bool] = None, + ckpt_path: Optional[str] = 'best', ) -> Optional[_PREDICT_OUTPUT]: r""" @@ -601,6 +602,10 @@ def predict( return_predictions: Whether to return predictions. ``True`` by default except when an accelerator that spawns processes is used (not supported). + ckpt_path: Either ``best`` or path to the checkpoint you wish to use to predict. + If ``None``, use the current weights of the model. + When the model is given as argument, this parameter will not apply. + Returns: Returns a list of dictionaries, one for each provided dataloader containing their respective predictions. """ @@ -610,8 +615,6 @@ def predict( # -------------------- Trainer._log_api_event("predict") - model = model or self.lightning_module - self.state.fn = TrainerFn.PREDICTING self.state.status = TrainerStatus.RUNNING self.predicting = True @@ -621,9 +624,15 @@ def predict( if dataloaders is not None and datamodule: raise MisconfigurationException('You cannot pass both `trainer.predict(dataloaders=..., datamodule=...)`') + model_provided = model is not None + model = model or self.lightning_module + # links data to the trainer self.data_connector.attach_data(model, predict_dataloaders=dataloaders, datamodule=datamodule) + if not model_provided: + self.predicted_ckpt_path = self.__load_ckpt_weights(ckpt_path) + results = self._run(model) assert self.state.stopped diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 43ed2c7ffa964..3f269a1cbc146 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -51,8 +51,6 @@ def __init__( self.warning_cache = WarningCache() self._teardown_already_run = False self.running_loss = TensorRunningAccum(window_length=20) - self._curr_step_result = None - self._cur_grad_norm_dict = None self._multiple_trainloader_mode = multiple_trainloader_mode self._skip_backward = False self.trainer._multiple_trainloader_mode = multiple_trainloader_mode @@ -437,15 +435,15 @@ def on_before_zero_grad(self, optimizer): def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - def track_and_norm_grad(self, optimizer): + def track_and_norm_grad(self, optimizer) -> dict: # track gradient norms - grad_norm_dic = self._track_gradient_norm() + grad_norm_dict = self._track_gradient_norm() # clip gradients self.trainer.accelerator.clip_gradients( optimizer, self.trainer.gradient_clip_val, gradient_clip_algorithm=self.trainer.gradient_clip_algorithm ) - self._cur_grad_norm_dict = grad_norm_dic + return grad_norm_dict def _track_gradient_norm(self): grad_norm_dict = {} @@ -654,7 +652,7 @@ def _on_train_epoch_end_hook(self, processed_epoch_output) -> None: def run_training_batch(self, batch, batch_idx, dataloader_idx): # track grad norms - grad_norm_dic = {} + grad_norm_dict = {} # bookkeeping self.trainer.hiddens = None @@ -668,19 +666,19 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): self.warning_cache.warn("train_dataloader yielded None. 
If this was on purpose, ignore this warning...") return AttributeDict( signal=0, - grad_norm_dic=grad_norm_dic, + grad_norm_dict={}, training_step_output_for_epoch_end=batch_outputs, ) # hook response = self.trainer.call_hook("on_batch_start") if response == -1: - return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + return AttributeDict(signal=-1, grad_norm_dict={}) # hook response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx) if response == -1: - return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + return AttributeDict(signal=-1, grad_norm_dict={}) # lightning module hook splits = self._tbptt_split_batch(batch) @@ -693,6 +691,7 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): # toggle model params + set info to logger_connector self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer) + result = AttributeDict() if self.should_accumulate(): # For gradient accumulation @@ -703,24 +702,19 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): # automatic_optimization=True: perform dpp sync only when performing optimizer_step # automatic_optimization=False: don't block synchronization here with self.block_ddp_sync_behaviour(): - self.training_step_and_backward( + result = self.training_step_and_backward( split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens ) - batch_outputs = self._process_closure_result( - batch_outputs=batch_outputs, - opt_idx=opt_idx, - ) - # ------------------------------ # BACKWARD PASS # ------------------------------ # gradient update with accumulated gradients - else: if self.trainer.lightning_module.automatic_optimization: def train_step_and_backward_closure(): + nonlocal result result = self.training_step_and_backward( split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens ) @@ -730,30 +724,28 @@ def train_step_and_backward_closure(): self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure) else: - self._curr_step_result = self.training_step( - split_batch, batch_idx, opt_idx, self.trainer.hiddens - ) + result = self.training_step(split_batch, batch_idx, opt_idx, self.trainer.hiddens) - if self._curr_step_result is None: + if not result: # user decided to skip optimization # make sure to zero grad. 
continue - batch_outputs = self._process_closure_result( - batch_outputs=batch_outputs, - opt_idx=opt_idx, - ) - # todo: Properly aggregate grad_norm accros opt_idx and split_idx - grad_norm_dic = self._cur_grad_norm_dict - self._cur_grad_norm_dict = None + grad_norm_dict = result.get("grad_norm_dict", {}) # update running loss + reset accumulated loss - self.update_running_loss() + self.update_running_loss(result.loss) + + batch_outputs = self._process_closure_result( + opt_closure_result=result, + batch_outputs=batch_outputs, + opt_idx=opt_idx, + ) result = AttributeDict( signal=0, - grad_norm_dic=grad_norm_dic, + grad_norm_dict=grad_norm_dict, training_step_output_for_epoch_end=batch_outputs, ) return result @@ -782,11 +774,10 @@ def block_ddp_sync_behaviour(self, should_block_sync: bool = False): else: yield None - def _process_closure_result(self, batch_outputs: list, opt_idx: int) -> list: - opt_closure_result = self._curr_step_result - - if opt_closure_result is not None: - + def _process_closure_result( + self, opt_closure_result: Optional[AttributeDict], batch_outputs: list, opt_idx: int + ) -> list: + if opt_closure_result: # cache metrics self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result) @@ -798,12 +789,6 @@ def _process_closure_result(self, batch_outputs: list, opt_idx: int) -> list: batch_opt_idx = opt_idx if len(batch_outputs) > 1 else 0 batch_outputs[batch_opt_idx].append(opt_closure_result.training_step_output_for_epoch_end) - if self.trainer.lightning_module.automatic_optimization: - # track total loss for logging (avoid mem leaks) - self.accumulated_loss.append(opt_closure_result.loss) - - self._curr_step_result = None - return batch_outputs def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens): @@ -811,7 +796,6 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, with self.trainer.profiler.profile("training_step_and_backward"): # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) - self._curr_step_result = result if not self._skip_backward and self.trainer.lightning_module.automatic_optimization: is_first_batch_to_accumulate = batch_idx % self.trainer.accumulate_grad_batches == 0 @@ -866,7 +850,7 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): if not self.should_accumulate(): # track gradients - self.track_and_norm_grad(optimizer=optimizer) + result.grad_norm_dict = self.track_and_norm_grad(optimizer=optimizer) def update_train_loop_lr_schedulers(self, monitor_metrics=None): num_accumulated_batches_reached = self._accumulated_batches_reached() @@ -995,7 +979,11 @@ def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer): # use to track metrics internally self.trainer.logger_connector.on_train_split_start(split_idx, opt_idx, split_batch) - def update_running_loss(self): + def update_running_loss(self, current_loss: torch.Tensor) -> None: + if self.trainer.lightning_module.automatic_optimization: + # track total loss for logging (avoid mem leaks) + self.accumulated_loss.append(current_loss) + accumulated_loss = self.accumulated_loss.mean() if accumulated_loss is not None: diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index 17f0e9b8cc4a9..92cb7804da878 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -2,8 +2,6 @@ rank_zero_deprecation("`argparse_utils` package has been renamed 
to `argparse` since v1.2 and will be removed in v1.4") -from pytorch_lightning.utilities.argparse import * # noqa: F403 E402 F401 - # for backward compatibility with old checkpoints (versions < 1.2.0) # that need to be able to unpickle the function from the checkpoint from pytorch_lightning.utilities.argparse import _gpus_arg_default # noqa: E402 F401 # isort: skip diff --git a/pytorch_lightning/utilities/cli.py b/pytorch_lightning/utilities/cli.py index 413b06f39f7a6..da6592ae66c18 100644 --- a/pytorch_lightning/utilities/cli.py +++ b/pytorch_lightning/utilities/cli.py @@ -128,13 +128,14 @@ def __init__( .. warning:: ``LightningCLI`` is in beta and subject to change. Args: - model_class: The LightningModule class to train on. - datamodule_class: An optional LightningDataModule class. + model_class: :class:`~pytorch_lightning.core.lightning.LightningModule` class to train on. + datamodule_class: An optional :class:`~pytorch_lightning.core.datamodule.LightningDataModule` class. save_config_callback: A callback class to save the training config. - trainer_class: An optional extension of the Trainer class. + trainer_class: An optional subclass of the :class:`~pytorch_lightning.trainer.trainer.Trainer` class. trainer_defaults: Set to override Trainer defaults or add persistent callbacks. - seed_everything_default: Default value for seed_everything argument. - description: Description of the tool shown when running --help. + seed_everything_default: Default value for the :func:`~pytorch_lightning.utilities.seed.seed_everything` + seed argument. + description: Description of the tool shown when running ``--help``. env_prefix: Prefix for environment variables. env_parse: Whether environment variable parsing is enabled. parser_kwargs: Additional arguments to instantiate LightningArgumentParser. @@ -165,7 +166,7 @@ def __init__( self.add_arguments_to_parser(self.parser) self.parse_arguments() if self.config['seed_everything'] is not None: - seed_everything(self.config['seed_everything']) + seed_everything(self.config['seed_everything'], workers=True) self.before_instantiate_classes() self.instantiate_classes() self.prepare_fit_kwargs() diff --git a/pytorch_lightning/utilities/model_utils.py b/pytorch_lightning/utilities/model_utils.py deleted file mode 100644 index 728f73f4f0d32..0000000000000 --- a/pytorch_lightning/utilities/model_utils.py +++ /dev/null @@ -1,7 +0,0 @@ -from pytorch_lightning.utilities import rank_zero_deprecation - -rank_zero_deprecation( - "`model_utils` package has been renamed to `model_helpers` since v1.2 and will be removed in v1.4" -) - -from pytorch_lightning.utilities.model_helpers import * # noqa: F403 E402 F401 diff --git a/pytorch_lightning/utilities/types.py b/pytorch_lightning/utilities/types.py index ecb0101a2279e..fdfdb95b08692 100644 --- a/pytorch_lightning/utilities/types.py +++ b/pytorch_lightning/utilities/types.py @@ -1,12 +1,26 @@ -from typing import Any, Dict, Iterator, List, Union - -import torch -from torchmetrics import Metric +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. """ Convention: - Do not include any `_TYPE` suffix - Types used in public hooks (as those in the `LightningModule` and `Callback`) should be public (no trailing `_`) """ +from typing import Any, Dict, Iterator, List, Union + +import torch +from torchmetrics import Metric + _METRIC = Union[Metric, torch.Tensor, int, float] STEP_OUTPUT = Union[torch.Tensor, Dict[str, Any]] EPOCH_OUTPUT = List[STEP_OUTPUT] diff --git a/pytorch_lightning/utilities/warning_utils.py b/pytorch_lightning/utilities/warning_utils.py deleted file mode 100644 index 0668bababa609..0000000000000 --- a/pytorch_lightning/utilities/warning_utils.py +++ /dev/null @@ -1,5 +0,0 @@ -from pytorch_lightning.utilities import rank_zero_deprecation - -rank_zero_deprecation("`warning_utils` package has been renamed to `warnings` since v1.2 and will be removed in v1.4") - -from pytorch_lightning.utilities.warnings import * # noqa: F403 E402 F401 diff --git a/pytorch_lightning/utilities/xla_device_utils.py b/pytorch_lightning/utilities/xla_device_utils.py deleted file mode 100644 index f028222e3930b..0000000000000 --- a/pytorch_lightning/utilities/xla_device_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pytorch_lightning.utilities import rank_zero_deprecation - -rank_zero_deprecation( - "`xla_device_utils` package has been renamed to `xla_device` since v1.2 and will be removed in v1.4" -) - -from pytorch_lightning.utilities.xla_device import * # noqa: F403 E402 F401 diff --git a/setup.cfg b/setup.cfg index 34ad80e7cbed0..d747530ec2c2e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,6 @@ exclude_lines = # *metrics (94%+) are temporarily removed from testing while tests speed up omit = pytorch_lightning/cluster_environments/*.py - pytorch_lightning/utilities/xla_device_utils.py pytorch_lightning/utilities/distributed.py pytorch_lightning/tuner/auto_gpu_select.py diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index a57fbb4afcbdc..fc24401aa106c 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -105,7 +105,6 @@ def test_accelerator_choice_ddp_slurm(setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks assert isinstance(trainer.accelerator, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) @@ -144,7 +143,6 @@ def test_accelerator_choice_ddp2_slurm(device_count_mock, setup_distributed_mock class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp2 assert trainer.accelerator_connector.is_slurm_managing_tasks assert isinstance(trainer.accelerator, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) @@ -183,7 +181,6 @@ def test_accelerator_choice_ddp_te(device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp assert isinstance(trainer.accelerator, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -221,7 +218,6 @@ def test_accelerator_choice_ddp2_te(device_count_mock, setup_distributed_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp2 assert isinstance(trainer.accelerator, GPUAccelerator) assert isinstance(trainer.training_type_plugin, DDP2Plugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -257,7 +253,6 @@ def test_accelerator_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @@ -294,7 +289,6 @@ def test_accelerator_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_m class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp assert trainer.accelerator_connector.is_slurm_managing_tasks assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) @@ -342,7 +336,6 @@ def creates_children(self) -> bool: class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer.use_ddp assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.training_type_plugin, DDPPlugin) assert isinstance(trainer.training_type_plugin.cluster_environment, 
CustomCluster) diff --git a/tests/deprecated_api/test_remove_1-4.py b/tests/deprecated_api/test_remove_1-4.py index 99e1b31f6edad..37d8abfdf905d 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -14,176 +14,16 @@ """Test deprecated functionality which will be removed in v1.4.0""" import pytest -import torch from pytorch_lightning import Trainer -from pytorch_lightning.overrides.data_parallel import ( - LightningDataParallel, - LightningDistributedDataParallel, - LightningParallelModule, -) -from pytorch_lightning.overrides.distributed import LightningDistributedModule -from pytorch_lightning.plugins import DDPSpawnPlugin -from pytorch_lightning.plugins.environments import LightningEnvironment from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel -from tests.helpers.runif import RunIf - - -def test_v1_4_0_deprecated_trainer_attributes(): - with pytest.deprecated_call(match="will be removed in v1.4."): - trainer = Trainer() - _ = trainer.accelerator_backend - assert trainer.accelerator == trainer.accelerator_backend - - -def test_v1_4_0_deprecated_trainer_methods(): - with pytest.deprecated_call(match='will be removed in v1.4'): - trainer = Trainer() - _ = trainer.get_model() - assert trainer.get_model() == trainer.lightning_module def test_v1_4_0_deprecated_imports(): _soft_unimport_module('pytorch_lightning.utilities.argparse_utils') with pytest.deprecated_call(match='will be removed in v1.4'): - from pytorch_lightning.utilities.argparse_utils import from_argparse_args # noqa: F811 F401 - - _soft_unimport_module('pytorch_lightning.utilities.model_utils') - with pytest.deprecated_call(match='will be removed in v1.4'): - from pytorch_lightning.utilities.model_utils import is_overridden # noqa: F811 F401 - - _soft_unimport_module('pytorch_lightning.utilities.warning_utils') - with pytest.deprecated_call(match='will be removed in v1.4'): - from pytorch_lightning.utilities.warning_utils import WarningCache # noqa: F811 F401 - - _soft_unimport_module('pytorch_lightning.utilities.xla_device_utils') - with pytest.deprecated_call(match='will be removed in v1.4'): - from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils # noqa: F811 F401 - - -def test_v1_4_0_deprecated_trainer_device_distrib(): - """Test that Trainer attributes works fine.""" - trainer = Trainer() - trainer.accelerator_connector._distrib_type = None - trainer.accelerator_connector._device_type = None - - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - trainer.on_cpu = True - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - assert trainer.on_cpu - - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - trainer.on_gpu = True - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - assert trainer.on_gpu - - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - trainer.on_tpu = True - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - assert trainer.on_tpu - trainer.accelerator_connector._device_type = None - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - trainer.use_tpu = True - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - assert trainer.use_tpu - - with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): - 
-        trainer.use_dp = True
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        assert trainer.use_dp
-
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        trainer.use_ddp = True
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        assert trainer.use_ddp
-
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        trainer.use_ddp2 = True
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        assert trainer.use_ddp2
-
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        trainer.use_horovod = True
-    with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'):
-        assert trainer.use_horovod
-
-
-def test_v1_4_0_deprecated_metrics():
-    from pytorch_lightning.metrics.functional.classification import stat_scores_multiple_classes
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        stat_scores_multiple_classes(pred=torch.tensor([0, 1]), target=torch.tensor([0, 1]))
-
-    from pytorch_lightning.metrics.functional.classification import iou
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        iou(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))
-
-    from pytorch_lightning.metrics.functional.classification import recall
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        recall(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))
-
-    from pytorch_lightning.metrics.functional.classification import precision
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        precision(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))
-
-    from pytorch_lightning.metrics.functional.classification import precision_recall
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        precision_recall(torch.randint(0, 2, (10, 3, 3)), torch.randint(0, 2, (10, 3, 3)))
-
-    from pytorch_lightning.metrics.functional.classification import auc
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        auc(torch.rand(10, ).sort().values, torch.rand(10, ))
-
-    from pytorch_lightning.metrics.functional.classification import auroc
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        auroc(torch.rand(10, ), torch.randint(0, 2, (10, )))
-
-    from pytorch_lightning.metrics.functional.classification import multiclass_auroc
-    with pytest.deprecated_call(match='will be removed in v1.4'):
-        multiclass_auroc(torch.rand(20, 5).softmax(dim=-1), torch.randint(0, 5, (20, )), num_classes=5)
-
-
-class CustomDDPPlugin(DDPSpawnPlugin):
-
-    def configure_ddp(self):
-        # old, deprecated implementation
-        with pytest.deprecated_call(
-            match='`LightningDistributedDataParallel` is deprecated since v1.2 and will be removed in v1.4.'
-        ):
-            self._model = LightningDistributedDataParallel(
-                module=self.lightning_module,
-                device_ids=self.determine_ddp_device_ids(),
-                **self._ddp_kwargs,
-            )
-        assert isinstance(self.model, torch.nn.parallel.DistributedDataParallel)
-        assert isinstance(self.model.module, LightningDistributedModule)
-
-
-@RunIf(min_gpus=2, skip_windows=True)
-def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir):
-    model = BoringModel()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        fast_dev_run=True,
-        gpus=2,
-        accelerator="ddp_spawn",
-        plugins=[
-            CustomDDPPlugin(
-                parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
-                cluster_environment=LightningEnvironment(),
-            )
-        ]
-    )
-    trainer.fit(model)
-
-
-@RunIf(min_gpus=1)
-def test_v1_4_0_deprecated_lightning_data_parallel():
-    model = BoringModel()
-    with pytest.deprecated_call(match="`LightningDataParallel` is deprecated since v1.2 and will be removed in v1.4."):
-        dp_model = LightningDataParallel(model, device_ids=[0])
-    assert isinstance(dp_model, torch.nn.DataParallel)
-    assert isinstance(dp_model.module, LightningParallelModule)
+        from pytorch_lightning.utilities.argparse_utils import _gpus_arg_default  # noqa: F811 F401


 def test_v1_4_0_deprecated_manual_optimization_optimizer(tmpdir):
diff --git a/tests/trainer/loops/test_evaluation_loop_flow.py b/tests/trainer/loops/test_evaluation_loop_flow.py
index 8fdb321b6f230..3177a3aa09156 100644
--- a/tests/trainer/loops/test_evaluation_loop_flow.py
+++ b/tests/trainer/loops/test_evaluation_loop_flow.py
@@ -71,7 +71,7 @@ def backward(self, loss, optimizer, optimizer_idx):
     out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
     assert out.signal == 0
-    assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict)
+    assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict)

     train_step_out = out.training_step_output_for_epoch_end
     assert len(train_step_out) == 1
@@ -140,7 +140,7 @@ def backward(self, loss, optimizer, optimizer_idx):
     out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
     assert out.signal == 0
-    assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict)
+    assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict)

     train_step_out = out.training_step_output_for_epoch_end
     assert len(train_step_out) == 1
diff --git a/tests/trainer/loops/test_training_loop_flow_scalar.py b/tests/trainer/loops/test_training_loop_flow_scalar.py
index 816134ee52941..f14f7d339d83f 100644
--- a/tests/trainer/loops/test_training_loop_flow_scalar.py
+++ b/tests/trainer/loops/test_training_loop_flow_scalar.py
@@ -155,7 +155,7 @@ def backward(self, loss, optimizer, optimizer_idx):
     out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
     assert out.signal == 0
-    assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict)
+    assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict)

     train_step_out = out.training_step_output_for_epoch_end
     assert len(train_step_out) == 1
@@ -231,7 +231,7 @@ def backward(self, loss, optimizer, optimizer_idx):
     out = trainer.train_loop.run_training_batch(batch, batch_idx, 0)
     assert out.signal == 0
-    assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict)
+    assert len(out.grad_norm_dict) == 0 and isinstance(out.grad_norm_dict, dict)

     train_step_out = out.training_step_output_for_epoch_end
     assert len(train_step_out) == 1
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index f04061a23e096..19de7b0a985d4 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -38,6 +38,7 @@
 from pytorch_lightning.plugins import DDPSpawnPlugin
 from pytorch_lightning.profiler import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler
 from pytorch_lightning.trainer.states import TrainerFn
+from pytorch_lightning.utilities import DeviceType, DistributedType
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything
@@ -611,7 +612,7 @@ def test_benchmark_option(tmpdir):

 @pytest.mark.parametrize("ckpt_path", (None, "best", "specific"))
 @pytest.mark.parametrize("save_top_k", (-1, 0, 1, 2))
-@pytest.mark.parametrize("fn", ("validate", "test"))
+@pytest.mark.parametrize("fn", ("validate", "test", "predict"))
 def test_tested_checkpoint_path(tmpdir, ckpt_path, save_top_k, fn):

     class TestModel(BoringModel):
@@ -620,48 +621,55 @@ def validation_step(self, batch, batch_idx):
             self.log("foo", -batch_idx)
             return super().validation_step(batch, batch_idx)

+        def test_step(self, *args):
+            return self.validation_step(*args)
+
+        def predict_step(self, *args):
+            args = args[:-1]  # remove `dataloader_idx`
+            return self.validation_step(*args)
+
     model = TestModel()
+    model.test_epoch_end = None
     trainer = Trainer(
         max_epochs=2,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        limit_predict_batches=1,
         progress_bar_refresh_rate=0,
         default_root_dir=tmpdir,
         callbacks=[ModelCheckpoint(monitor="foo", save_top_k=save_top_k)],
     )
     trainer.fit(model)

-    test_or_validate = getattr(trainer, fn)
+    trainer_fn = getattr(trainer, fn)
+    path_attr = f"{fn}{'d' if fn == 'validate' else 'ed'}_ckpt_path"
+    assert getattr(trainer, path_attr) is None
+
     if ckpt_path == "best":
         # ckpt_path is 'best', meaning we load the best weights
         if save_top_k == 0:
             with pytest.raises(MisconfigurationException, match=".*is not configured to save the best.*"):
-                test_or_validate(ckpt_path=ckpt_path)
+                trainer_fn(ckpt_path=ckpt_path)
         else:
-            test_or_validate(ckpt_path=ckpt_path)
-            if fn == "test":
-                assert trainer.tested_ckpt_path == trainer.checkpoint_callback.best_model_path
-            else:
-                assert trainer.validated_ckpt_path == trainer.checkpoint_callback.best_model_path
+            trainer_fn(ckpt_path=ckpt_path)
+            assert getattr(trainer, path_attr) == trainer.checkpoint_callback.best_model_path
     elif ckpt_path is None:
         # ckpt_path is None, meaning we don't load any checkpoints and
         # use the weights from the end of training
-        test_or_validate(ckpt_path=ckpt_path)
-        assert trainer.tested_ckpt_path is None
-        assert trainer.validated_ckpt_path is None
+        trainer_fn(ckpt_path=ckpt_path)
+        assert getattr(trainer, path_attr) is None
     else:
         # specific checkpoint, pick one from saved ones
         if save_top_k == 0:
             with pytest.raises(FileNotFoundError):
-                test_or_validate(ckpt_path="random.ckpt")
+                trainer_fn(ckpt_path="random.ckpt")
         else:
             ckpt_path = str(
                 list((Path(tmpdir) / f"lightning_logs/version_{trainer.logger.version}/checkpoints").iterdir()
                      )[0].absolute()
             )
-            test_or_validate(ckpt_path=ckpt_path)
-            if fn == "test":
-                assert trainer.tested_ckpt_path == ckpt_path
-            else:
-                assert trainer.validated_ckpt_path == ckpt_path
+            trainer_fn(ckpt_path=ckpt_path)
+            assert getattr(trainer, path_attr) == ckpt_path


 def test_disabled_training(tmpdir):
@@ -1042,14 +1050,8 @@ def test_gpu_choice(tmpdir):

 @pytest.mark.parametrize(
-    ["limit_val_batches"],
-    [
-        pytest.param(0.0),  # this should run no sanity checks
-        pytest.param(1),
-        pytest.param(1.0),
-        pytest.param(0.5),
-        pytest.param(5),
-    ],
+    "limit_val_batches",
+    [0.0, 1, 1.0, 0.5, 5],
 )
 def test_num_sanity_val_steps(tmpdir, limit_val_batches):
     """
@@ -1079,15 +1081,7 @@ def test_num_sanity_val_steps(tmpdir, limit_val_batches):
     )


-@pytest.mark.parametrize(
-    ["limit_val_batches"],
-    [
-        pytest.param(0.0),  # this should run no sanity checks
-        pytest.param(1),
-        pytest.param(1.0),
-        pytest.param(0.3),
-    ],
-)
+@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.3])
 def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
     """
     Test that `num_sanity_val_steps=-1` runs through all validation data once, and as many batches as
@@ -1118,207 +1112,67 @@
     [
         (
             dict(accelerator=None, gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
         ),
         (
             dict(accelerator="dp", gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
-        ),
-        (
-            dict(accelerator="dp", gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
         ),
         (
             dict(accelerator="ddp", gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
         ),
         (
             dict(accelerator="ddp", num_processes=2, gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=2,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(accelerator="ddp", num_nodes=2, gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
         ),
         (
             dict(accelerator="ddp_cpu", num_processes=2, gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=2,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(accelerator="ddp2", gpus=None),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
        ),
         (
             dict(accelerator=None, gpus=1),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=1,
-                on_gpu=True,
-                use_single_gpu=True,
-                num_processes=1,
-            ),
+            dict(_distrib_type=None, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(accelerator="dp", gpus=1),
-            dict(
-                use_dp=True,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=1,
-                on_gpu=True,
-                use_single_gpu=True,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(accelerator="ddp", gpus=1),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=1,
-                on_gpu=True,
-                use_single_gpu=True,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(accelerator="ddp_cpu", num_processes=2, gpus=1),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=0,
-                on_gpu=False,
-                use_single_gpu=False,
-                num_processes=2,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
         ),
         (
             dict(accelerator="ddp2", gpus=1),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=True,
-                num_gpus=1,
-                on_gpu=True,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
         ),
         (
             dict(accelerator=None, gpus=2),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=2,
-                on_gpu=True,
-                use_single_gpu=False,
-                num_processes=2,
-            ),
+            dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
         ),
         (
             dict(accelerator="dp", gpus=2),
-            dict(
-                use_dp=True,
-                use_ddp=False,
-                use_ddp2=False,
-                num_gpus=2,
-                on_gpu=True,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
         (
             dict(accelerator="ddp", gpus=2),
-            dict(
-                use_dp=False,
-                use_ddp=True,
-                use_ddp2=False,
-                num_gpus=2,
-                on_gpu=True,
-                use_single_gpu=False,
-                num_processes=2,
-            ),
+            dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
         ),
         (
             dict(accelerator="ddp2", gpus=2),
-            dict(
-                use_dp=False,
-                use_ddp=False,
-                use_ddp2=True,
-                num_gpus=2,
-                on_gpu=True,
-                use_single_gpu=False,
-                num_processes=1,
-            ),
+            dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
         ),
     ],
 )
@@ -1327,7 +1181,7 @@ def test_trainer_config(trainer_kwargs, expected, monkeypatch):
     monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
     monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["gpus"])
     trainer = Trainer(**trainer_kwargs)
-    assert len(expected) == 7
+    assert len(expected) == 4
     for k, v in expected.items():
         assert getattr(trainer, k) == v, f"Failed {k}: {v}"
@@ -1371,17 +1225,10 @@ def __init__(self, **kwargs):

 @pytest.mark.parametrize(
-    "trainer_params",
-    [
-        OmegaConf.create({
-            "max_epochs": 1,
-            "gpus": 1
-        }),
-        OmegaConf.create({
-            "max_epochs": 1,
-            "gpus": [0]
-        }),
-    ],
+    "trainer_params", [
+        OmegaConf.create(dict(max_epochs=1, gpus=1)),
+        OmegaConf.create(dict(max_epochs=1, gpus=[0])),
+    ]
 )
 @RunIf(min_gpus=1)
 def test_trainer_omegaconf(trainer_params):
@@ -2001,8 +1848,9 @@ def on_predict_start(self) -> None:
         assert not self.training


-@pytest.mark.parametrize(['accelerator', 'num_processes'],
-                         [(None, 1), pytest.param('ddp', 2, marks=RunIf(skip_windows=True))])
+@pytest.mark.parametrize(
+    'accelerator,num_processes', [(None, 1), pytest.param('ddp', 2, marks=RunIf(skip_windows=True))]
+)
 def test_model_in_correct_mode_during_stages(tmpdir, accelerator, num_processes):
     model = TrainerStagesModel()
     trainer = Trainer(default_root_dir=tmpdir, accelerator=accelerator, num_processes=num_processes, fast_dev_run=True)