[Metrics] Unification of FBeta (#4656)
* implementation

* init files

* more stable reduction

* add tests

* docs

* remove old implementation

* pep8

* changelog

* Apply suggestions from code review

Co-authored-by: Adrian Wälchli <[email protected]>

Co-authored-by: Nicki Skafte <[email protected]>
Co-authored-by: Teddy Koker <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>

(cherry picked from commit 6831ba9)
SkafteNicki authored and Borda committed Nov 23, 2020
1 parent c8afcab commit f00c246
Showing 13 changed files with 424 additions and 216 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added casting to python types for numpy scalars when logging hparams ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647))


- Added `F1` class metric ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656))


### Changed

- Consistently use `step=trainer.global_step` in `LearningRateMonitor` independently of `logging_interval` ([#4376](https://github.com/PyTorchLightning/pytorch-lightning/pull/4376))
@@ -20,6 +23,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Metric states are no longer added to `state_dict` by default ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/))


- Renamed class metric `Fbeta` >> `FBeta` ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656))


### Deprecated


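For readers updating code against these changelog entries, here is a minimal before/after sketch, grounded in the class docstrings added by this commit. Both calls return the same value on this toy input because, with micro averaging, precision equals recall, so the F-beta score is the same for any beta:

>>> import torch
>>> from pytorch_lightning.metrics import FBeta, F1   # previously: from pytorch_lightning.metrics import Fbeta
>>> target = torch.tensor([0, 1, 2, 0, 1, 2])
>>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
>>> FBeta(num_classes=3, beta=0.5)(preds, target)
tensor(0.3333)
>>> F1(num_classes=3)(preds, target)
tensor(0.3333)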
18 changes: 12 additions & 6 deletions docs/source/metrics.rst
@@ -221,10 +221,16 @@ Recall
.. autoclass:: pytorch_lightning.metrics.classification.Recall
:noindex:

Fbeta
FBeta
~~~~~

.. autoclass:: pytorch_lightning.metrics.classification.Fbeta
.. autoclass:: pytorch_lightning.metrics.classification.FBeta
:noindex:

F1
~~

.. autoclass:: pytorch_lightning.metrics.classification.F1
:noindex:

Regression Metrics
@@ -325,17 +331,17 @@ dice_score [func]
:noindex:


f1_score [func]
f1 [func]
~~~~~~~~~~~~~~~

.. autofunction:: pytorch_lightning.metrics.functional.classification.f1_score
.. autofunction:: pytorch_lightning.metrics.functional.f1
:noindex:


fbeta_score [func]
fbeta [func]
~~~~~~~~~~~~~~~~~~

.. autofunction:: pytorch_lightning.metrics.functional.classification.fbeta_score
.. autofunction:: pytorch_lightning.metrics.functional.fbeta
:noindex:


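The functional counterparts were renamed as well (`f1_score` → `f1`, `fbeta_score` → `fbeta`) and now live in `pytorch_lightning.metrics.functional.f_beta`, as the import changes further down show. A hedged usage sketch, assuming the functional versions take the same `num_classes`, `beta`, `threshold`, and `average` arguments as the class metrics above (the functional module itself is not expanded in this view):

>>> import torch
>>> from pytorch_lightning.metrics.functional import f1, fbeta
>>> target = torch.tensor([0, 1, 2, 0, 1, 2])
>>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
>>> f1(preds, target, num_classes=3)
tensor(0.3333)
>>> fbeta(preds, target, num_classes=3, beta=0.5)
tensor(0.3333)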
4 changes: 3 additions & 1 deletion pytorch_lightning/metrics/__init__.py
@@ -15,9 +15,11 @@

from pytorch_lightning.metrics.classification import (
Accuracy,
ConfusionMatrix,
Precision,
Recall,
Fbeta
F1,
FBeta,
)

from pytorch_lightning.metrics.regression import (
3 changes: 2 additions & 1 deletion pytorch_lightning/metrics/classification/__init__.py
@@ -13,4 +13,5 @@
# limitations under the License.
from pytorch_lightning.metrics.classification.accuracy import Accuracy
from pytorch_lightning.metrics.classification.precision_recall import Precision, Recall
from pytorch_lightning.metrics.classification.f_beta import Fbeta
from pytorch_lightning.metrics.classification.f_beta import FBeta, F1
from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix
136 changes: 100 additions & 36 deletions pytorch_lightning/metrics/classification/f_beta.py
100644 → 100755
@@ -11,21 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import functools
from abc import ABC, abstractmethod
from typing import Any, Callable, Optional, Union
from collections.abc import Mapping, Sequence
from collections import namedtuple
from typing import Any, Optional

import torch
from torch import nn

from pytorch_lightning.metrics.metric import Metric
from pytorch_lightning.metrics.functional.reduction import class_reduce
from pytorch_lightning.metrics.classification.precision_recall import _input_format
from pytorch_lightning.metrics.functional.f_beta import (
_fbeta_update,
_fbeta_compute
)


class Fbeta(Metric):
class FBeta(Metric):
"""
Computes f_beta metric.
@@ -51,7 +48,10 @@ class Fbeta(Metric):
average:
* `'micro'` computes metric globally
* `'macro'` computes metric for each class and then takes the mean
* `'macro'` computes metric for each class and uniformly averages them
* `'weighted'` computes metric for each class and does a weighted-average,
where each class is weighted by their support (accounts for class imbalance)
* `None` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
compute_on_step:
@@ -64,29 +64,28 @@ class Fbeta(Metric):
Example:
>>> from pytorch_lightning.metrics import Fbeta
>>> from pytorch_lightning.metrics import FBeta
>>> target = torch.tensor([0, 1, 2, 0, 1, 2])
>>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
>>> f_beta = Fbeta(num_classes=3, beta=0.5)
>>> f_beta = FBeta(num_classes=3, beta=0.5)
>>> f_beta(preds, target)
tensor(0.3333)
"""

def __init__(
self,
num_classes: int = 1,
beta: float = 1.,
num_classes: int,
beta: float = 1.0,
threshold: float = 0.5,
average: str = 'micro',
average: str = "micro",
multilabel: bool = False,
compute_on_step: bool = True,
dist_sync_on_step: bool = False,
process_group: Optional[Any] = None,
):
super().__init__(
compute_on_step=compute_on_step,
dist_sync_on_step=dist_sync_on_step,
process_group=process_group,
compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group,
)

self.num_classes = num_classes
@@ -95,8 +94,10 @@ def __init__(
self.average = average
self.multilabel = multilabel

assert self.average in ('micro', 'macro'), \
"average passed to the function must be either `micro` or `macro`"
allowed_average = ("micro", "macro", "weighted", None)
if self.average not in allowed_average:
raise ValueError('Argument `average` expected to be one of the following:'
f' {allowed_average} but got {self.average}')

self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum")
self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum")
@@ -110,25 +111,88 @@ def update(self, preds: torch.Tensor, target: torch.Tensor):
preds: Predictions from model
target: Ground truth values
"""
preds, target = _input_format(self.num_classes, preds, target, self.threshold, self.multilabel)
true_positives, predicted_positives, actual_positives = _fbeta_update(
preds, target, self.num_classes, self.threshold, self.multilabel
)

self.true_positives += torch.sum(preds * target, dim=1)
self.predicted_positives += torch.sum(preds, dim=1)
self.actual_positives += torch.sum(target, dim=1)
self.true_positives += true_positives
self.predicted_positives += predicted_positives
self.actual_positives += actual_positives

def compute(self):
def compute(self) -> torch.Tensor:
"""
Computes accuracy over state.
Computes fbeta over state.
"""
if self.average == 'micro':
precision = self.true_positives.sum().float() / (self.predicted_positives.sum())
recall = self.true_positives.sum().float() / (self.actual_positives.sum())
elif self.average == 'macro':
precision = self.true_positives.float() / (self.predicted_positives)
recall = self.true_positives.float() / (self.actual_positives)
num = (1 + self.beta ** 2) * precision * recall
denom = self.beta ** 2 * precision + recall
return class_reduce(num=num, denom=denom, weights=None, class_reduction='macro')
return _fbeta_compute(self.true_positives, self.predicted_positives,
self.actual_positives, self.beta, self.average)


class F1(FBeta):
"""
Computes the F1 metric, the harmonic mean of precision and recall (equivalent to F-beta with ``beta=1``).

Works with binary, multiclass, and multilabel data.
Accepts logits from a model output or integer class values in prediction.
Works with multi-dimensional preds and target.

Forward accepts

- ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes
- ``target`` (long tensor): ``(N, ...)``

If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument.
This is the case for binary and multi-label logits.
If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``.

Args:
num_classes: Number of classes in the dataset.
threshold:
Threshold value for binary or multi-label logits. default: 0.5
average:
* `'micro'` computes metric globally
* `'macro'` computes metric for each class and uniformly averages them
* `'weighted'` computes metric for each class and does a weighted-average,
where each class is weighted by their support (accounts for class imbalance)
* `None` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
compute_on_step:
Forward only calls ``update()`` and returns None if this is set to False. default: True
dist_sync_on_step:
Synchronize metric state across processes at each ``forward()``
before returning the value at the step. default: False
process_group:
Specify the process group on which synchronization is called. default: None (which selects the entire world)

Example:
>>> from pytorch_lightning.metrics import F1
>>> target = torch.tensor([0, 1, 2, 0, 1, 2])
>>> preds = torch.tensor([0, 2, 1, 0, 0, 1])
>>> f1 = F1(num_classes=3)
>>> f1(preds, target)
tensor(0.3333)
"""
def __init__(
self,
num_classes: int = 1,
beta: float = 1.0,
threshold: float = 0.5,
average: str = "micro",
multilabel: bool = False,
compute_on_step: bool = True,
dist_sync_on_step: bool = False,
process_group: Optional[Any] = None,
):
super().__init__(
num_classes=num_classes,
beta=1.0,
threshold=threshold,
average=average,
compute_on_step=compute_on_step,
dist_sync_on_step=dist_sync_on_step,
process_group=process_group,
)
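The reduction itself now lives in `pytorch_lightning/metrics/functional/f_beta.py` (`_fbeta_update` / `_fbeta_compute`), which this view does not expand. As a rough, hedged sketch of what `compute()` delegates to, based on the states accumulated in `update()` and the standard F-beta definition: the `eps` guard is an assumption motivated by the "more stable reduction" note in the commit message, and `fbeta_from_counts` is a hypothetical name, not the library's.

import torch

def fbeta_from_counts(true_positives: torch.Tensor,
                      predicted_positives: torch.Tensor,
                      actual_positives: torch.Tensor,
                      beta: float = 1.0,
                      average: str = "micro",
                      eps: float = 1e-15) -> torch.Tensor:
    # Collapse the per-class counts first for micro averaging.
    if average == "micro":
        true_positives = true_positives.sum()
        predicted_positives = predicted_positives.sum()
        actual_positives = actual_positives.sum()
    precision = true_positives.float() / (predicted_positives + eps)
    recall = true_positives.float() / (actual_positives + eps)
    # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
    score = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + eps)
    if average == "macro":
        return score.mean()                      # unweighted mean over classes
    if average == "weighted":
        weights = actual_positives.float() / actual_positives.sum()
        return (score * weights).sum()           # weighted by per-class support
    return score                                 # 'micro': scalar; None: per-class scores

With the toy input from the docstring examples (micro averaging, 2 correct out of 6 predictions), this reduces to 2 / 6 ≈ 0.3333, matching the doctest output above.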
37 changes: 7 additions & 30 deletions pytorch_lightning/metrics/classification/precision_recall.py
@@ -21,34 +21,7 @@
import torch
from torch import nn
from pytorch_lightning.metrics.metric import Metric
from pytorch_lightning.metrics.utils import to_onehot, METRIC_EPS


def _input_format(num_classes: int, preds: torch.Tensor, target: torch.Tensor, threshold=0.5, multilabel=False):
if not (len(preds.shape) == len(target.shape) or len(preds.shape) == len(target.shape) + 1):
raise ValueError(
"preds and target must have same number of dimensions, or one additional dimension for preds"
)

if len(preds.shape) == len(target.shape) + 1:
# multi class probabilites
preds = torch.argmax(preds, dim=1)

if len(preds.shape) == len(target.shape) and preds.dtype == torch.long and num_classes > 1 and not multilabel:
# multi-class
preds = to_onehot(preds, num_classes=num_classes)
target = to_onehot(target, num_classes=num_classes)

elif len(preds.shape) == len(target.shape) and preds.dtype == torch.float:
# binary or multilabel probablities
preds = (preds >= threshold).long()

# transpose class as first dim and reshape
if len(preds.shape) > 1:
preds = preds.transpose(1, 0)
target = target.transpose(1, 0)

return preds.reshape(num_classes, -1), target.reshape(num_classes, -1)
from pytorch_lightning.metrics.utils import to_onehot, METRIC_EPS, _input_format_classification_one_hot


class Precision(Metric):
@@ -126,7 +99,9 @@ def __init__(
self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum")

def update(self, preds: torch.Tensor, target: torch.Tensor):
preds, target = _input_format(self.num_classes, preds, target, self.threshold, self.multilabel)
preds, target = _input_format_classification_one_hot(
self.num_classes, preds, target, self.threshold, self.multilabel
)

# multiply because we are counting (1, 1) pair for true positives
self.true_positives += torch.sum(preds * target, dim=1)
@@ -221,7 +196,9 @@ def update(self, preds: torch.Tensor, target: torch.Tensor):
preds: Predictions from model
target: Ground truth values
"""
preds, target = _input_format(self.num_classes, preds, target, self.threshold, self.multilabel)
preds, target = _input_format_classification_one_hot(
self.num_classes, preds, target, self.threshold, self.multilabel
)

# multiply because we are counting (1, 1) pair for true positives
self.true_positives += torch.sum(preds * target, dim=1)
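The input-formatting helper removed above now lives in `pytorch_lightning.metrics.utils` as `_input_format_classification_one_hot` and is shared by `Precision`, `Recall`, and `FBeta`. Judging from the removed code, its job is to bring every accepted input style (class indices, multi-class scores, binary/multilabel probabilities) into a one-hot, class-first `(num_classes, -1)` layout so the metric states can be summed along `dim=1`. A toy illustration of that layout using plain `torch` rather than the library helper:

>>> import torch
>>> preds = torch.tensor([0, 2, 1])                    # integer class predictions, num_classes=3
>>> onehot = torch.nn.functional.one_hot(preds, 3)     # (N, C)
>>> onehot.transpose(1, 0).reshape(3, -1).shape        # class-first layout the metric states sum over
torch.Size([3, 3])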
4 changes: 2 additions & 2 deletions pytorch_lightning/metrics/functional/__init__.py
@@ -18,8 +18,6 @@
average_precision,
confusion_matrix,
dice_score,
f1_score,
fbeta_score,
multiclass_precision_recall_curve,
multiclass_roc,
precision,
@@ -44,3 +42,5 @@
from pytorch_lightning.metrics.functional.mean_squared_log_error import mean_squared_log_error
from pytorch_lightning.metrics.functional.psnr import psnr
from pytorch_lightning.metrics.functional.ssim import ssim
from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix
from pytorch_lightning.metrics.functional.f_beta import fbeta, f1