From c3944eb2024613f83245fa3f13ebdad78a4ccd13 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Tue, 19 Dec 2023 18:11:37 +0100
Subject: [PATCH 01/57] update

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_img/__init__.py | 1 +
 tests/unit/metrics/per_img/__init__.py   | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 src/anomalib/metrics/per_img/__init__.py
 create mode 100644 tests/unit/metrics/per_img/__init__.py

diff --git a/src/anomalib/metrics/per_img/__init__.py b/src/anomalib/metrics/per_img/__init__.py
new file mode 100644
index 0000000000..85c5c5cd4f
--- /dev/null
+++ b/src/anomalib/metrics/per_img/__init__.py
@@ -0,0 +1 @@
+"""Per-Image Metrics."""
diff --git a/tests/unit/metrics/per_img/__init__.py b/tests/unit/metrics/per_img/__init__.py
new file mode 100644
index 0000000000..b925ba9aa3
--- /dev/null
+++ b/tests/unit/metrics/per_img/__init__.py
@@ -0,0 +1 @@
+"""Per-Image Metrics Tests."""

From b61c47fbee429989420b25269f3679d0ac0c7e7a Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Wed, 20 Dec 2023 12:25:04 +0100
Subject: [PATCH 02/57] test binclf curves numpy and numba and fixes

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 .../metrics/{per_img => per_image}/__init__.py  |   0
 .../metrics/per_image/_binclf_curve_numba.py    | 105 ++++
 .../metrics/per_image/binclf_curve_numpy.py     | 470 ++++++++++++++++++
 tests/unit/metrics/{per_img => per_image}/__init__.py |   0
 .../metrics/per_image/test_binclf_curve_numpy.py | 405 +++++++++++++++
 5 files changed, 980 insertions(+)
 rename src/anomalib/metrics/{per_img => per_image}/__init__.py (100%)
 create mode 100644 src/anomalib/metrics/per_image/_binclf_curve_numba.py
 create mode 100644 src/anomalib/metrics/per_image/binclf_curve_numpy.py
 rename tests/unit/metrics/{per_img => per_image}/__init__.py (100%)
 create mode 100644 tests/unit/metrics/per_image/test_binclf_curve_numpy.py

diff --git a/src/anomalib/metrics/per_img/__init__.py b/src/anomalib/metrics/per_image/__init__.py
similarity index 100%
rename from src/anomalib/metrics/per_img/__init__.py
rename to src/anomalib/metrics/per_image/__init__.py
diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py
new file mode 100644
index 0000000000..56e4a0e29e
--- /dev/null
+++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py
@@ -0,0 +1,105 @@
+"""Binary classification matrix curve (NUMBA implementation of low level functions)."""
+
+import numba
+import numpy as np
+from numpy import ndarray
+
+
+@numba.jit(nopython=True)
+def binclf_one_curve_numba(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray:
+    """ONE binary classification matrix at each threshold (NUMBA implementation).
+
+    This does the same as `_binclf_one_curve_python` but with numba, using just-in-time compilation.
+
+    ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function.
+
+    Args:
+        scores (ndarray): Anomaly scores (D,).
+        gts (ndarray): Binary (bool) ground truth of shape (D,).
+        threshs (ndarray): Sequence of thresholds in ascending order (K,).
+
+    Returns:
+        ndarray: Binary classification matrix curve (K, 2, 2)
+
+        See docstring of `binclf_multiple_curves` for details.
+    """
+    num_th = len(threshs)
+
+    # POSITIVES
+    scores_pos = scores[gts]
+    # the sorting is very important for the correctness of the algorithm and for the speedup
+    scores_pos = np.sort(scores_pos)
+    # start counting with the lowest thresh, so everything is predicted as positive (this variable is updated in the loop)
+    num_pos = current_count_tp = len(scores_pos)
+
+    tps = np.empty((num_th,), dtype=np.int64)
+
+    # NEGATIVES
+    # same thing but for the negative samples
+    scores_neg = scores[~gts]
+    scores_neg = np.sort(scores_neg)
+    num_neg = current_count_fp = len(scores_neg)
+
+    fps = np.empty((num_th,), dtype=np.int64)
+
+    # it will progressively drop the scores that are below the current thresh
+    for thidx, th in enumerate(threshs):
+        num_drop = 0
+        num_scores = len(scores_pos)
+        while num_drop < num_scores and scores_pos[num_drop] < th:  # ! scores_pos !
+            num_drop += 1
+        # ---
+        scores_pos = scores_pos[num_drop:]
+        current_count_tp -= num_drop
+        tps[thidx] = current_count_tp
+
+        # same with the negatives
+        num_drop = 0
+        num_scores = len(scores_neg)
+        while num_drop < num_scores and scores_neg[num_drop] < th:  # ! scores_neg !
+            num_drop += 1
+        # ---
+        scores_neg = scores_neg[num_drop:]
+        current_count_fp -= num_drop
+        fps[thidx] = current_count_fp
+
+    fns = num_pos * np.ones((num_th,), dtype=np.int64) - tps
+    tns = num_neg * np.ones((num_th,), dtype=np.int64) - fps
+
+    # sequence of dimensions is (threshs, true class, predicted class) (see docstring)
+    return np.stack(
+        (
+            np.stack((tns, fps), axis=-1),
+            np.stack((fns, tps), axis=-1),
+        ),
+        axis=-1,
+    ).transpose(0, 2, 1)
+
+
+@numba.jit(nopython=True, parallel=True)
+def binclf_multiple_curves_numba(scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray) -> ndarray:
+    """MULTIPLE binary classification matrices at each threshold (NUMBA implementation).
+
+    This does the same as `_binclf_multiple_curves_python` but with numba,
+    using parallelization and just-in-time compilation.
+
+    ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function.
+
+    Args:
+        scores_batch (ndarray): Anomaly scores (N, D,).
+        gts_batch (ndarray): Binary (bool) ground truth of shape (N, D,).
+        threshs (ndarray): Sequence of thresholds in ascending order (K,).
+
+    Returns:
+        ndarray: Binary classification matrix curves (N, K, 2, 2)
+
+        See docstring of `binclf_multiple_curves` for details.
+    """
+    num_imgs = scores_batch.shape[0]
+    num_th = len(threshs)
+    ret = np.empty((num_imgs, num_th, 2, 2), dtype=np.int64)
+    for imgidx in numba.prange(num_imgs):
+        scoremap = scores_batch[imgidx]
+        mask = gts_batch[imgidx]
+        ret[imgidx] = binclf_one_curve_numba(scoremap, mask, threshs)
+    return ret
diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
new file mode 100644
index 0000000000..4d639ff692
--- /dev/null
+++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
@@ -0,0 +1,470 @@
+"""Binary classification curve (numpy-only implementation)."""
+
+import itertools
+import logging
+
+import numpy as np
+from numpy import ndarray
+
+try:
+    import numba  # noqa: F401
+except ImportError:
+    HAS_NUMBA = False
+else:
+    HAS_NUMBA = True
+    from . import _binclf_curve_numba
+
+
+ALGORITHM_PYTHON = "python"
+ALGORITHM_NUMBA = "numba"
+ALGORITHMS = (ALGORITHM_PYTHON, ALGORITHM_NUMBA)
+
+THRESHS_CHOICE_GIVEN = "given"
+THRESHS_CHOICE_MINMAX_LINSPACE = "minmax-linspace"
+THRESHS_CHOICE_MEAN_FPR_OPTIMIZED = "mean-fpr-optimized"
+THRESHS_CHOICES = (
+    THRESHS_CHOICE_GIVEN,
+    THRESHS_CHOICE_MINMAX_LINSPACE,
+    THRESHS_CHOICE_MEAN_FPR_OPTIMIZED,
+)
+
+# TODO(jpcbertoldo): warn when the ratios from binclf are too imprecise  # noqa: TD003
+
+
+logger = logging.getLogger(__name__)
+
+
+# =========================================== ARGS VALIDATION ===========================================
+
+
+def _validate_scores_batch(scores_batch: ndarray) -> None:
+    """scores_batch (ndarray): floating (N, D)."""
+    if not isinstance(scores_batch, ndarray):
+        msg = f"Expected `scores_batch` to be an ndarray, but got {type(scores_batch)}"
+        raise TypeError(msg)
+
+    if scores_batch.dtype.kind != "f":
+        msg = (
+            "Expected `scores_batch` to be a floating ndarray with anomaly scores,"
+            f" but got ndarray with dtype {scores_batch.dtype}"
+        )
+        raise TypeError(msg)
+
+    if scores_batch.ndim != 2:
+        msg = f"Expected `scores_batch` to be 2D, but got {scores_batch.ndim}"
+        raise ValueError(msg)
+
+
+def _validate_gts_batch(gts_batch: ndarray) -> None:
+    """gts_batch (ndarray): boolean (N, D)."""
+    if not isinstance(gts_batch, ndarray):
+        msg = f"Expected `gts_batch` to be an ndarray, but got {type(gts_batch)}"
+        raise TypeError(msg)
+
+    if gts_batch.dtype.kind != "b":
+        msg = (
+            "Expected `gts_batch` to be a boolean ndarray with ground truth labels,"
+            f" but got ndarray with dtype {gts_batch.dtype}"
+        )
+        raise TypeError(msg)
+
+    if gts_batch.ndim != 2:
+        msg = f"Expected `gts_batch` to be 2D, but got {gts_batch.ndim}"
+        raise ValueError(msg)
+
+
+def _validate_threshs(threshs: ndarray) -> None:
+    if not isinstance(threshs, ndarray):
+        msg = f"Expected `threshs` to be an ndarray, but got {type(threshs)}"
+        raise TypeError(msg)
+
+    if threshs.ndim != 1:
+        msg = f"Expected `threshs` to be 1D, but got {threshs.ndim}"
+        raise ValueError(msg)
+
+    if threshs.dtype.kind != "f":
+        msg = (
+            "Expected `threshs` to be a floating ndarray with anomaly scores,"
+            f" but got ndarray with dtype {threshs.dtype}"
+        )
+        raise TypeError(msg)
+
+    # make sure they are strictly increasing
+    if any(thresh <= prev_th for prev_th, thresh in itertools.pairwise(threshs)):
+        msg = "Expected `threshs` to be strictly increasing, but it is not."
+        raise ValueError(msg)
+
+
+def _validate_num_threshs(num_threshs: int) -> None:
+    if not isinstance(num_threshs, int):
+        msg = f"Expected `num_threshs` to be an integer, but got {type(num_threshs)}"
+        raise TypeError(msg)
+
+    if num_threshs < 2:
+        msg = f"If argument `num_threshs` is an integer, expected it to be larger than 1, but got {num_threshs}"
+        raise ValueError(msg)
+
+
+def _validate_thresh_bounds(thresh_bounds: tuple[float, float]) -> None:
+    if not isinstance(thresh_bounds, tuple):
+        msg = f"Expected `thresh_bounds` to be a tuple, but got {type(thresh_bounds)}"
+        raise TypeError(msg)
+
+    if len(thresh_bounds) != 2:
+        msg = f"Expected `thresh_bounds` to be a tuple of length 2, but got {len(thresh_bounds)}"
+        raise ValueError(msg)
+
+    lower, upper = thresh_bounds
+
+    if not isinstance(lower, float) or not isinstance(upper, float):
+        msg = f"Expected `thresh_bounds` to be a tuple of floats, but got {type(lower)} and {type(upper)}"
+        raise TypeError(msg)
+
+    if lower >= upper:
+        msg = f"Expected `thresh_bounds[1]` > `thresh_bounds[0]`, but got {thresh_bounds[1]} <= {thresh_bounds[0]}"
+        raise ValueError(msg)
+
+
+def _validate_anomaly_maps(anomaly_maps: ndarray) -> None:
+    if not isinstance(anomaly_maps, ndarray):
+        msg = f"Expected `anomaly_maps` to be an ndarray, but got {type(anomaly_maps)}"
+        raise TypeError(msg)
+
+    if anomaly_maps.ndim != 3:
+        msg = f"Expected `anomaly_maps` to have 3 dimensions (N, H, W), but got {anomaly_maps.ndim} dimensions"
+        raise ValueError(msg)
+
+    if anomaly_maps.dtype.kind != "f":
+        msg = (
+            "Expected `anomaly_maps` to be a floating ndarray with anomaly scores,"
+            f" but got ndarray with dtype {anomaly_maps.dtype}"
+        )
+        raise TypeError(msg)
+
+
+def _validate_masks(masks: ndarray) -> None:
+    if not isinstance(masks, ndarray):
+        msg = f"Expected `masks` to be an ndarray, but got {type(masks)}"
+        raise TypeError(msg)
+
+    if masks.ndim != 3:
+        msg = f"Expected `masks` to have 3 dimensions (N, H, W), but got {masks.ndim} dimensions"
+        raise ValueError(msg)
+
+    if masks.dtype.kind == "b":
+        pass
+
+    elif masks.dtype.kind in ("i", "u"):
+        masks_unique_vals = np.unique(masks)
+        if np.any((masks_unique_vals != 0) & (masks_unique_vals != 1)):
+            msg = (
+                "Expected `masks` to be a *binary* ndarray with ground truth labels, "
+                f"but got ndarray with unique values {masks_unique_vals}"
+            )
+            raise ValueError(msg)
+
+    else:
+        msg = (
+            "Expected `masks` to be an integer or boolean ndarray with ground truth labels, "
+            f"but got ndarray with dtype {masks.dtype}"
+        )
+        raise TypeError(msg)
+
+
+def _validate_same_shape(*args) -> None:
+    assert len(args) > 0
+    shapes = [tuple(arg.shape) for arg in args]
+    if not all(shape == shapes[0] for shape in shapes):
+        msg = f"Expected all arguments to have the same shape, but got {shapes}"
+        raise ValueError(msg)
+
+
+def _validate_binclf_curves(binclf_curves: ndarray) -> None:
+    if not isinstance(binclf_curves, ndarray):
+        msg = f"Expected `binclf_curves` to be an ndarray, but got {type(binclf_curves)}"
+        raise TypeError(msg)
+
+    if binclf_curves.ndim != 4:
+        msg = f"Expected `binclf_curves` to be 4D, but got {binclf_curves.ndim}D"
+        raise ValueError(msg)
+
+    if binclf_curves.shape[-2:] != (2, 2):
+        msg = f"Expected `binclf_curves` to have shape (..., 2, 2), but got {binclf_curves.shape}"
+        raise ValueError(msg)
+
+    if binclf_curves.dtype != np.int64:
+        msg = f"Expected `binclf_curves` to have dtype int64, but got {binclf_curves.dtype}."
+        raise TypeError(msg)
+
+    if (binclf_curves < 0).any():
+        msg = "Expected `binclf_curves` to have non-negative values, but got negative values."
+        raise ValueError(msg)
+
+    neg = binclf_curves[:, :, 0, :].sum(axis=-1)  # (num_images, num_threshs)
+
+    if (neg != neg[:, :1]).any():
+        msg = "Expected `binclf_curves` to have the same number of negatives per image for every thresh."
+        raise ValueError(msg)
+
+    pos = binclf_curves[:, :, 1, :].sum(axis=-1)  # (num_images, num_threshs)
+
+    if (pos != pos[:, :1]).any():
+        msg = "Expected `binclf_curves` to have the same number of positives per image for every thresh."
+        raise ValueError(msg)
+
+
+# =========================================== PYTHON VERSION ===========================================
+
+
+def _binclf_one_curve_python(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray:
+    """ONE binary classification matrix at each threshold (PYTHON implementation).
+
+    In the case where the thresholds are given (i.e. not considering all possible thresholds based on the scores),
+    this weird-looking function is faster than the two options in `torchmetrics` on the CPU:
+        - `_binary_precision_recall_curve_update_vectorized`
+        - `_binary_precision_recall_curve_update_loop`
+
+    (both in module `torchmetrics.functional.classification.precision_recall_curve` in `torchmetrics==1.1.0`).
+
+    ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function.
+
+    Args:
+        scores (ndarray): Anomaly scores (D,).
+        gts (ndarray): Binary (bool) ground truth of shape (D,).
+        threshs (ndarray): Sequence of thresholds in ascending order (K,).
+
+    Returns:
+        ndarray: Binary classification matrix curve (K, 2, 2)
+
+        See docstring of `binclf_multiple_curves` for details.
+    """
+    num_th = len(threshs)
+
+    # POSITIVES
+    scores_positives = scores[gts]
+    # the sorting is very important for the correctness of the algorithm and for the speedup
+    scores_positives = np.sort(scores_positives)
+    # variable updated in the loop; start counting with the lowest thresh ==> everything is predicted as positive
+    num_pos = current_count_tp = scores_positives.size
+    tps = np.empty((num_th,), dtype=np.int64)
+
+    # NEGATIVES
+    # same thing but for the negative samples
+    scores_negatives = scores[~gts]
+    scores_negatives = np.sort(scores_negatives)
+    num_neg = current_count_fp = scores_negatives.size
+    fps = np.empty((num_th,), dtype=np.int64)
+
+    def score_less_than_thresh(thresh):  # noqa: ANN001, ANN202
+        def func(score) -> bool:  # noqa: ANN001
+            return score < thresh
+
+        return func
+
+    # it will progressively drop the scores that are below the current thresh
+    for thresh_idx, thresh in enumerate(threshs):
+        # UPDATE POSITIVES
+        # `<` because it is the same as `~(>=)`
+        num_drop = sum(1 for _ in itertools.takewhile(score_less_than_thresh(thresh), scores_positives))
+        scores_positives = scores_positives[num_drop:]
+        current_count_tp -= num_drop
+        tps[thresh_idx] = current_count_tp
+
+        # UPDATE NEGATIVES
+        # same with the negatives
+        num_drop = sum(1 for _ in itertools.takewhile(score_less_than_thresh(thresh), scores_negatives))
+        scores_negatives = scores_negatives[num_drop:]
+        current_count_fp -= num_drop
+        fps[thresh_idx] = current_count_fp
+
+    # deduce the rest of the matrix counts
+    fns = num_pos * np.ones((num_th,), dtype=np.int64) - tps
+    tns = num_neg * np.ones((num_th,), dtype=np.int64) - fps
+
+    # sequence of dimensions is (threshs, true class, predicted class) (see docstring)
+    return np.stack(
+        [
+            np.stack([tns, fps], axis=-1),
+            np.stack([fns, tps], axis=-1),
+        ],
+        axis=-1,
+    ).transpose(0, 2, 1)
+
+
+_binclf_multiple_curves_python = np.vectorize(_binclf_one_curve_python, signature="(n),(n),(k)->(k,2,2)")
+_binclf_multiple_curves_python.__doc__ = """
+MULTIPLE binary classification matrices at each threshold (PYTHON implementation).
+Vectorized version of `_binclf_one_curve_python` (see above).
+"""
+
+# =========================================== INTERFACE ===========================================
+
+
+def binclf_multiple_curves(
+    scores_batch: ndarray,
+    gts_batch: ndarray,
+    threshs: ndarray,
+    algorithm: str = ALGORITHM_NUMBA,
+) -> ndarray:
+    """Multiple binary classification matrices (per-instance scope) at each threshold (shared).
+
+    This is a wrapper around `_binclf_multiple_curves_python` and `_binclf_multiple_curves_numba`.
+    Validation of the arguments is done here (not in the actual implementation functions).
+
+    Note: predicted as positive condition is `score >= thresh`.
+
+    Args:
+        scores_batch (ndarray): Anomaly scores (N, D,).
+        gts_batch (ndarray): Binary (bool) ground truth of shape (N, D,).
+        threshs (ndarray): Sequence of thresholds in ascending order (K,).
+        algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
+
+    Returns:
+        ndarray: Binary classification matrix curves (N, K, 2, 2)
+
+        The last two dimensions are the confusion matrix (ground truth, predictions)
+        So for each thresh it gives:
+            - `tp`: `[... , 1, 1]`
+            - `fp`: `[... , 0, 1]`
+            - `fn`: `[... , 1, 0]`
+            - `tn`: `[... , 0, 0]`
+
+        `t` is for `true` and `f` is for `false`, `p` is for `positive` and `n` is for `negative`, so:
+            - `tp` stands for `true positive`
+            - `fp` stands for `false positive`
+            - `fn` stands for `false negative`
+            - `tn` stands for `true negative`
+
+        The numbers in each confusion matrix are the counts (not the ratios).
+
+        Counts are relative to each instance (i.e. from 0 to D, e.g. the total is the number of pixels in the image).
+
+        Thresholds are shared across all instances, so all confusion matrices, for instance,
+        at position [:, 0, :, :] are relative to the 1st threshold in `threshs`.
+    """
+    _validate_scores_batch(scores_batch)
+    _validate_gts_batch(gts_batch)
+    _validate_same_shape(scores_batch, gts_batch)
+    _validate_threshs(threshs)
+
+    if algorithm == ALGORITHM_PYTHON:
+        return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs)
+
+    if algorithm == ALGORITHM_NUMBA:
+        if not HAS_NUMBA:
+            logger.warning(
+                "Algorithm 'numba' was selected, but numba is not installed. Falling back to 'python' algorithm.",
+            )
+            return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs)
+        return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs)
+
+    msg = f"Expected `algorithm` to be one of {ALGORITHMS}, but got {algorithm}"
+    raise NotImplementedError(msg)
+
+
+# ========================================= PER-IMAGE ===========================================
+
+
+def per_img_binclf_curve(
+    anomaly_maps: ndarray,
+    masks: ndarray,
+    algorithm: str = ALGORITHM_NUMBA,
+    threshs_choice: str = THRESHS_CHOICE_MINMAX_LINSPACE,
+    threshs_given: ndarray | None = None,
+    num_threshs: int | None = None,
+) -> tuple[ndarray, ndarray]:
+    """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared).
+
+    Args:
+        anomaly_maps (Tensor): Anomaly score maps of shape (N, H, W [, D, ...])
+        masks (Tensor): Binary ground truth masks of shape (N, H, W [, D, ...])
+        algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
+        threshs_choice (str, optional): Sequence of thresholds to use. Defaults to THRESH_SEQUENCE_MINMAX_LINSPACE.
+        #
+        # `threshs_choice`-dependent arguments
+        #
+        # THRESH_SEQUENCE_GIVEN
+        threshs_given (Tensor, optional): Sequence of thresholds to use.
+        #
+        # THRESH_SEQUENCE_MINMAX_LINSPACE
+        num_threshs (int, optional): Number of thresholds between the min and max of the anomaly maps.
+
+    Returns:
+        tuple[ndarray, ndarray]:
+            [0] Thresholds of shape (K,) and dtype is the same as `anomaly_maps.dtype`.
+
+            [1] Binary classification matrices of shape (N, K, 2, 2)
+
+            N: number of images/instances
+            K: number of thresholds
+
+        The last two dimensions are the confusion matrix (ground truth, predictions)
+        So for each thresh it gives:
+            - `tp`: `[... , 1, 1]`
+            - `fp`: `[... , 0, 1]`
+            - `fn`: `[... , 1, 0]`
+            - `tn`: `[... , 0, 0]`
+
+        `t` is for `true` and `f` is for `false`, `p` is for `positive` and `n` is for `negative`, so:
+            - `tp` stands for `true positive`
+            - `fp` stands for `false positive`
+            - `fn` stands for `false negative`
+            - `tn` stands for `true negative`
+
+        The numbers in each confusion matrix are the counts of pixels in the image (not the ratios).
+
+        Thresholds are shared across all images, so all confusion matrices, for instance,
+        at position [:, 0, :, :] are relative to the 1st threshold in `threshs`.
+
+    """
+    # validate inputs
+    _validate_anomaly_maps(anomaly_maps)
+    _validate_masks(masks)
+    _validate_same_shape(anomaly_maps, masks)
+
+    threshs: ndarray
+
+    if threshs_choice == THRESHS_CHOICE_GIVEN:
+        assert threshs_given is not None
+        _validate_threshs(threshs_given)
+        if num_threshs is not None:
+            logger.warning(
+                f"Argument `num_threshs` was given, but it is ignored because `threshs_choice` is {threshs_choice}.",
+            )
+        threshs = threshs_given.astype(anomaly_maps.dtype)
+
+    elif threshs_choice == THRESHS_CHOICE_MINMAX_LINSPACE:
+        assert num_threshs is not None
+        if threshs_given is not None:
+            logger.warning(
+                f"Argument `threshs_given` was given, but it is ignored because `threshs_choice` is {threshs_choice}.",
+            )
+        thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item())
+        try:
+            _validate_thresh_bounds(thresh_bounds)
+        except ValueError as ex:
+            msg = "Invalid `thresh_bounds` computed from `anomaly_maps`."
+            raise ValueError(msg) from ex
+        threshs = np.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype)
+
+    elif threshs_choice == THRESHS_CHOICE_MEAN_FPR_OPTIMIZED:
+        raise NotImplementedError(f"TODO implement {threshs_choice}")  # noqa: EM102
+
+    else:
+        msg = f"Expected `threshs_choice` to be one of {THRESHS_CHOICES}, but got {threshs_choice}"
+        raise NotImplementedError(msg)
+
+    # keep the batch dimension and flatten the rest
+    scores_batch = anomaly_maps.reshape(anomaly_maps.shape[0], -1)
+    gts_batch = masks.reshape(masks.shape[0], -1).astype(bool)  # make sure it is boolean
+
+    binclf_curves = binclf_multiple_curves(scores_batch, gts_batch, threshs, algorithm=algorithm)
+
+    try:
+        _validate_binclf_curves(binclf_curves)
+
+    except (TypeError, ValueError) as ex:
+        msg = "Invalid `binclf_curves` was computed."
+        raise RuntimeError(msg) from ex
+
+    return threshs, binclf_curves
diff --git a/tests/unit/metrics/per_img/__init__.py b/tests/unit/metrics/per_image/__init__.py
similarity index 100%
rename from tests/unit/metrics/per_img/__init__.py
rename to tests/unit/metrics/per_image/__init__.py
diff --git a/tests/unit/metrics/per_image/test_binclf_curve_numpy.py b/tests/unit/metrics/per_image/test_binclf_curve_numpy.py
new file mode 100644
index 0000000000..edc8f31565
--- /dev/null
+++ b/tests/unit/metrics/per_image/test_binclf_curve_numpy.py
@@ -0,0 +1,405 @@
+"""Tests for per-image binary classification curves using numpy and numba versions."""
+# ruff: noqa: SLF001, PT011
+
+import numpy as np
+import pytest
+from numpy import ndarray
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
+    """Generate test cases."""
+    pred = np.arange(1, 5, dtype=np.float32)
+    threshs = np.arange(1, 5, dtype=np.float32)
+
+    gt_norm = np.zeros(4).astype(bool)
+    gt_anom = np.concatenate([np.zeros(2), np.ones(2)]).astype(bool)
+
+    # in the case where thresholds are all unique values in the predictions
+    expected_norm = np.stack(
+        [
+            np.array([[0, 4], [0, 0]]),
+            np.array([[1, 3], [0, 0]]),
+            np.array([[2, 2], [0, 0]]),
+            np.array([[3, 1], [0, 0]]),
+        ],
+        axis=0,
+    ).astype(int)
+    expected_anom = np.stack(
+        [
+            np.array([[0, 2], [0, 2]]),
+            np.array([[1, 1], [0, 2]]),
+            np.array([[2, 0], [0, 2]]),
+            np.array([[2, 0], [1, 1]]),
+        ],
+        axis=0,
+    ).astype(int)
+
+    # in the case where all thresholds are higher than the highest prediction
+    expected_norm_threshs_too_high = np.stack(
+        [
+            np.array([[4, 0], [0, 0]]),
+            np.array([[4, 0], [0, 0]]),
+            np.array([[4, 0], [0, 0]]),
+            np.array([[4, 0], [0, 0]]),
+        ],
+        axis=0,
+    ).astype(int)
+    expected_anom_threshs_too_high = np.stack(
+        [
+            np.array([[2, 0], [2, 0]]),
+            np.array([[2, 0], [2, 0]]),
+            np.array([[2, 0], [2, 0]]),
+            np.array([[2, 0], [2, 0]]),
+        ],
+        axis=0,
+    ).astype(int)
+
+    # in the case where all thresholds are lower than the lowest prediction
+    expected_norm_threshs_too_low = np.stack(
+        [
+            np.array([[0, 4], [0, 0]]),
+            np.array([[0, 4], [0, 0]]),
+            np.array([[0, 4], [0, 0]]),
+            np.array([[0, 4], [0, 0]]),
+        ],
+        axis=0,
+    ).astype(int)
+    expected_anom_threshs_too_low = np.stack(
+        [
+            np.array([[0, 2], [0, 2]]),
+            np.array([[0, 2], [0, 2]]),
+            np.array([[0, 2], [0, 2]]),
+            np.array([[0, 2], [0, 2]]),
+        ],
+        axis=0,
+    ).astype(int)
+
+    if metafunc.function is test__binclf_one_curve_python or metafunc.function is test__binclf_one_curve_numba:
+        metafunc.parametrize(
+            argnames=("pred", "gt", "threshs", "expected"),
+            argvalues=[
+                (pred, gt_anom, threshs[:3], expected_anom[:3]),
+                (pred, gt_anom, threshs, expected_anom),
+                (pred, gt_norm, threshs, expected_norm),
+                (pred, gt_norm, 10 * threshs, expected_norm_threshs_too_high),
+                (pred, gt_anom, 10 * threshs, expected_anom_threshs_too_high),
+                (pred, gt_norm, 0.001 * threshs, expected_norm_threshs_too_low),
+                (pred, gt_anom, 0.001 * threshs, expected_anom_threshs_too_low),
+            ],
+        )
+
+    preds = np.stack([pred, pred], axis=0)
+    gts = np.stack([gt_anom, gt_norm], axis=0)
+    binclf_curves = np.stack([expected_anom, expected_norm], axis=0)
+    binclf_curves_threshs_too_high = np.stack([expected_anom_threshs_too_high, expected_norm_threshs_too_high], axis=0)
+    binclf_curves_threshs_too_low = np.stack([expected_anom_threshs_too_low, expected_norm_threshs_too_low], axis=0)
+
+    if (
+        metafunc.function is test__binclf_multiple_curves_python
+        or metafunc.function is test__binclf_multiple_curves_numba
+    ):
+        metafunc.parametrize(
+            argnames=("preds", "gts", "threshs", "expecteds"),
+            argvalues=[
+                (preds, gts, threshs[:3], binclf_curves[:, :3]),
+                (preds, gts, threshs, binclf_curves),
+            ],
+        )
+
+    if metafunc.function is test_binclf_multiple_curves:
+        metafunc.parametrize(
+            argnames=(
+                "preds",
+                "gts",
+                "threshs",
+                "expected_binclf_curves",
+            ),
+            argvalues=[
+                (preds[:1], gts[:1], threshs, binclf_curves[:1]),
+                (preds, gts, threshs, binclf_curves),
+                (10 * preds, gts, 10 * threshs, binclf_curves),
+            ],
+        )
+        metafunc.parametrize(
+            argnames=("algorithm",),
+            argvalues=[
+                ("python",),
+                ("numba",),
+            ],
+        )
+
+    if metafunc.function is test_binclf_multiple_curves_validations:
+        metafunc.parametrize(
+            argnames=("args", "exception"),
+            argvalues=[
+                # `scores` and `gts` must be 2D
+                ([preds.reshape(2, 2, 2), gts, threshs], ValueError),
+                ([preds, gts.flatten(), threshs], ValueError),
+                # `threshs` must be 1D
+                ([preds, gts, threshs.reshape(2, 2)], ValueError),
+                # `scores` and `gts` must have the same shape
+                ([preds, gts[:1], threshs], ValueError),
+                ([preds[:, :2], gts, threshs], ValueError),
+                # `scores` must be of type float
+                ([preds.astype(int), gts, threshs], TypeError),
+                # `gts` must be of type bool
+                ([preds, gts.astype(int), threshs], TypeError),
+                # `threshs` must be of type float
+                ([preds, gts, threshs.astype(int)], TypeError),
+                # `threshs` must be sorted in ascending order
+                ([preds, gts, np.flip(threshs)], ValueError),
+                ([preds, gts, np.concatenate([threshs[-2:], threshs[:2]])], ValueError),
+            ],
+        )
+        metafunc.parametrize(
+            argnames=("kwargs",),
+            argvalues=[
+                ({"algorithm": "python"},),
+                ({"algorithm": "numba"},),
+            ],
+        )
+
+    # the following tests are for `per_img_binclf_curve()`, which expects
+    # inputs in image spatial format, i.e. (height, width)
+    preds = preds.reshape(2, 2, 2)
+    gts = gts.reshape(2, 2, 2)
+
+    if metafunc.function is test_per_img_binclf_curve:
+        metafunc.parametrize(
+            argnames=(
+                "anomaly_maps",
+                "masks",
+                "threshs_choice",
+                "threshs_given",
+                "num_threshs",
+                "expected_threshs",
+                "expected_binclf_curves",
+            ),
+            argvalues=[
+                # `threshs_choice` = "given"
+                (
+                    preds,
+                    gts,
+                    "given",
+                    threshs,
+                    None,
+                    threshs,
+                    binclf_curves,
+                ),
+                (
+                    preds,
+                    gts,
+                    "given",
+                    10 * threshs,
+                    2,
+                    10 * threshs,
+                    binclf_curves_threshs_too_high,
+                ),
+                (
+                    preds,
+                    gts,
+                    "given",
+                    0.01 * threshs,
+                    None,
+                    0.01 * threshs,
+                    binclf_curves_threshs_too_low,
+                ),
+                # `threshs_choice` = "minmax-linspace"
+                (
+                    preds,
+                    gts,
+                    "minmax-linspace",
+                    None,
+                    len(threshs),
+                    threshs,
+                    binclf_curves,
+                ),
+                (
+                    2 * preds,
+                    gts.astype(int),  # this is ok
+                    "minmax-linspace",
+                    None,
+                    len(threshs),
+                    2 * threshs,
+                    binclf_curves,
+                ),
+            ],
+        )
+        metafunc.parametrize(
+            argnames=("algorithm",),
+            argvalues=[
+                ("python",),
+                ("numba",),
+            ],
+        )
+
+    if metafunc.function is test_per_img_binclf_curve_validations:
+        metafunc.parametrize(
+            argnames=("args", "exception"),
+            argvalues=[
+                # `scores` and `gts` must be 3D
+                ([preds.reshape(2, 2, 2, 1), gts], ValueError),
+                ([preds, gts.flatten()], ValueError),
+                # `scores` and `gts` must have the same shape
+                ([preds, gts[:1]], ValueError),
+                ([preds[:, :1], gts], ValueError),
+                # `scores` must be of type float
+                ([preds.astype(int), gts], TypeError),
+                # `gts` must be of type bool or int
+                ([preds, gts.astype(float)], TypeError),
+                # `threshs` must be of type float
+                ([preds, gts, threshs.astype(int)], TypeError),
+            ],
+        )
+        metafunc.parametrize(
+            argnames=("kwargs",),
+            argvalues=[
+                ({"algorithm": "numba", "threshs_choice": "given", "threshs_given": threshs, "num_threshs": None},),
+                (
+                    {
+                        "algorithm": "python",
+                        "threshs_choice": "minmax-linspace",
+                        "threshs_given": None,
+                        "num_threshs": len(threshs),
+                    },
+                ),
+            ],
+        )
+
+
+# ==================================================================================================
+# LOW-LEVEL FUNCTIONS (PYTHON)
+
+
+def test__binclf_one_curve_python(pred: ndarray, gt: ndarray, threshs: ndarray, expected: ndarray) -> None:
+    """Test if `_binclf_one_curve_python()` returns the expected values."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    computed = binclf_curve_numpy._binclf_one_curve_python(pred, gt, threshs)
+    assert computed.shape == (threshs.size, 2, 2)
+    assert (computed == expected).all()
+
+
+def test__binclf_multiple_curves_python(
+    preds: ndarray,
+    gts: ndarray,
+    threshs: ndarray,
+    expecteds: ndarray,
+) -> None:
+    """Test if `_binclf_multiple_curves_python()` returns the expected values."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    computed = binclf_curve_numpy._binclf_multiple_curves_python(preds, gts, threshs)
+    assert computed.shape == (preds.shape[0], threshs.size, 2, 2)
+    assert (computed == expecteds).all()
+
+
+# ==================================================================================================
+# LOW-LEVEL FUNCTIONS (NUMBA)
+
+
+def test__binclf_one_curve_numba(pred: ndarray, gt: ndarray, threshs: ndarray, expected: ndarray) -> None:
+    """Test if `binclf_one_curve_numba()` returns the expected values."""
+    from anomalib.metrics.per_image import _binclf_curve_numba
+
+    computed = _binclf_curve_numba.binclf_one_curve_numba(pred, gt, threshs)
+    assert computed.shape == (threshs.size, 2, 2)
+    assert (computed == expected).all()
+
+
+def test__binclf_multiple_curves_numba(preds: ndarray, gts: ndarray, threshs: ndarray, expecteds: ndarray) -> None:
+    """Test if `binclf_multiple_curves_numba()` returns the expected values."""
+    from anomalib.metrics.per_image import _binclf_curve_numba
+
+    computed = _binclf_curve_numba.binclf_multiple_curves_numba(preds, gts, threshs)
+    assert computed.shape == (preds.shape[0], threshs.size, 2, 2)
+    assert (computed == expecteds).all()
+
+
+# ==================================================================================================
+# API FUNCTIONS
+
+
+def test_binclf_multiple_curves(
+    preds: ndarray,
+    gts: ndarray,
+    threshs: ndarray,
+    expected_binclf_curves: ndarray,
+    algorithm: str,
+) -> None:
+    """Test if `binclf_multiple_curves()` returns the expected values."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    computed = binclf_curve_numpy.binclf_multiple_curves(
+        preds,
+        gts,
+        threshs,
+        algorithm=algorithm,
+    )
+    assert computed.shape == expected_binclf_curves.shape
+    assert (computed == expected_binclf_curves).all()
+
+    # it's ok to have thresholds beyond the range of the preds
+    binclf_curve_numpy.binclf_multiple_curves(preds, gts, 2 * threshs, algorithm=algorithm)
+
+    # or inside the bounds without reaching them
+    binclf_curve_numpy.binclf_multiple_curves(preds, gts, 0.5 * threshs, algorithm=algorithm)
+
+    # it's also ok to have more threshs than unique values in the preds
+    # add values in between the threshs
+    threshs_unnecessary = 0.5 * (threshs[:-1] + threshs[1:])
+    threshs_unnecessary = np.concatenate([threshs_unnecessary, threshs])
+    threshs_unnecessary = np.sort(threshs_unnecessary)
+    binclf_curve_numpy.binclf_multiple_curves(preds, gts, threshs_unnecessary, algorithm=algorithm)
+
+    # or fewer
+    binclf_curve_numpy.binclf_multiple_curves(preds, gts, threshs[1:3], algorithm=algorithm)
+
+
+def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception: Exception) -> None:
+    """Test if `binclf_multiple_curves()` raises the expected errors."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    with pytest.raises(exception):
+        binclf_curve_numpy.binclf_multiple_curves(*args, **kwargs)
+
+
+def test_per_img_binclf_curve(
+    anomaly_maps: ndarray,
+    masks: ndarray,
+    algorithm: str,
+    threshs_choice: str,
+    threshs_given: ndarray | None,
+    num_threshs: int | None,
+    expected_threshs: ndarray,
+    expected_binclf_curves: ndarray,
+) -> None:
+    """Test if `per_img_binclf_curve()` returns the expected values."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    computed_threshs, computed_binclf_curves = binclf_curve_numpy.per_img_binclf_curve(
+        anomaly_maps,
+        masks,
+        algorithm=algorithm,
+        threshs_choice=threshs_choice,
+        threshs_given=threshs_given,
+        num_threshs=num_threshs,
+    )
+
+    # threshs
+    assert computed_threshs.shape == expected_threshs.shape
+    assert computed_threshs.dtype == expected_threshs.dtype
+    assert (computed_threshs == expected_threshs).all()
+
+    # binclf_curves
+    assert computed_binclf_curves.shape == expected_binclf_curves.shape
+    assert computed_binclf_curves.dtype == expected_binclf_curves.dtype
+    assert (computed_binclf_curves == expected_binclf_curves).all()
+
+
+def test_per_img_binclf_curve_validations(args: list, kwargs: dict, exception: Exception) -> None:
+    """Test if `per_img_binclf_curve()` raises the expected errors."""
+    from anomalib.metrics.per_image import binclf_curve_numpy
+
+    with pytest.raises(exception):
+        binclf_curve_numpy.per_img_binclf_curve(*args, **kwargs)

From e6006b44fa3969f7bee7b2093dac0804979d7da7 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 21 Dec 2023 14:23:32 +0100
Subject: [PATCH 03/57] correct some docstrings

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 .../metrics/per_image/_binclf_curve_numba.py  |  5 ++++-
 .../metrics/per_image/binclf_curve_numpy.py   | 13 +++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py
index 56e4a0e29e..60b9336f74 100644
--- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py
+++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py
@@ -1,4 +1,7 @@
-"""Binary classification matrix curve (NUMBA implementation of low level functions)."""
+"""Binary classification matrix curve (NUMBA implementation of low level functions).
+
+See docstring of `binclf_curve` or `binclf_curve_numpy` for more details.
+"""
 
 import numba
 import numpy as np
diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
index 4d639ff692..5170d45081 100644
--- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py
+++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
@@ -1,4 +1,9 @@
-"""Binary classification curve (numpy-only implementation)."""
+"""Binary classification curve (numpy-only implementation).
+
+A binary classification (binclf) matrix (TP, FP, FN, TN) is evaluated at multiple thresholds.
+
+The thresholds are shared by all instances/images, but their binclf matrices are computed independently for each instance/image.
+"""
 
 import itertools
 import logging
@@ -376,15 +381,15 @@ def per_img_binclf_curve(
     """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared).
 
     Args:
-        anomaly_maps (Tensor): Anomaly score maps of shape (N, H, W [, D, ...])
-        masks (Tensor): Binary ground truth masks of shape (N, H, W [, D, ...])
+        anomaly_maps (ndarray): Anomaly score maps of shape (N, H, W [, D, ...])
+        masks (ndarray): Binary ground truth masks of shape (N, H, W [, D, ...])
         algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
         threshs_choice (str, optional): Sequence of thresholds to use. Defaults to THRESH_SEQUENCE_MINMAX_LINSPACE.
         #
         # `threshs_choice`-dependent arguments
         #
         # THRESH_SEQUENCE_GIVEN
-        threshs_given (Tensor, optional): Sequence of thresholds to use.
+        threshs_given (ndarray, optional): Sequence of thresholds to use.
         #
        # THRESH_SEQUENCE_MINMAX_LINSPACE
         num_threshs (int, optional): Number of thresholds between the min and max of the anomaly maps.
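
[Editorial illustration -- not part of the patch series. The confusion-matrix convention used
throughout these patches is (ground truth, prediction) with tn=[0,0], fp=[0,1], fn=[1,0],
tp=[1,1], and `score >= thresh` counting as a positive prediction. A minimal brute-force sketch
(the function name `binclf_curve_bruteforce` is hypothetical, introduced only here) reproduces the
`expected_anom` case from `test_binclf_curve_numpy.py` above and can serve as a mental reference
for the faster sort-and-drop implementations:

import numpy as np

def binclf_curve_bruteforce(scores: np.ndarray, gts: np.ndarray, threshs: np.ndarray) -> np.ndarray:
    """Naive reference: one (2, 2) confusion matrix per threshold, layout (gt, pred)."""
    curves = np.empty((len(threshs), 2, 2), dtype=np.int64)
    for k, thresh in enumerate(threshs):
        preds = scores >= thresh  # "predicted as positive" condition, as in the patches
        curves[k, 1, 1] = (gts & preds).sum()    # tp
        curves[k, 0, 1] = (~gts & preds).sum()   # fp
        curves[k, 1, 0] = (gts & ~preds).sum()   # fn
        curves[k, 0, 0] = (~gts & ~preds).sum()  # tn
    return curves

scores = np.asarray([1.0, 2.0, 3.0, 4.0])
gts = np.asarray([False, False, True, True])
threshs = np.asarray([1.0, 2.0, 3.0, 4.0])
print(binclf_curve_bruteforce(scores, gts, threshs))
# gives [[0,2],[0,2]], [[1,1],[0,2]], [[2,0],[0,2]], [[2,0],[1,1]] -- i.e. `expected_anom` in the tests]
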
From 101d646005b3095a2516723dcf2ec6103080b752 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 21 Dec 2023 14:35:31 +0100
Subject: [PATCH 04/57] torch interface and tests

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 .../metrics/per_image/binclf_curve.py         | 109 ++++++++++++
 ...lf_curve_numpy.py => test_binclf_curve.py} | 167 ++++++++++++------
 2 files changed, 224 insertions(+), 52 deletions(-)
 create mode 100644 src/anomalib/metrics/per_image/binclf_curve.py
 rename tests/unit/metrics/per_image/{test_binclf_curve_numpy.py => test_binclf_curve.py} (78%)

diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py
new file mode 100644
index 0000000000..4685ddac7d
--- /dev/null
+++ b/src/anomalib/metrics/per_image/binclf_curve.py
@@ -0,0 +1,109 @@
+"""Binary classification curve (torch and torchmetrics interfaces).
+
+This module implements interfaces for the code in `binclf_curve_numpy.py`. Check its docstring for more details.
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import Tensor
+
+from . import binclf_curve_numpy
+
+# =========================================== ARGS VALIDATION ===========================================
+
+
+def _validate_is_tensor(tensor: Tensor, argname: str | None = None) -> None:
+    """Validate that `tensor` is a tensor.
+
+    Validations will preferably happen in ndarray so the numpy code can be reused without torch,
+    so the Tensor arguments will often be converted to ndarray and then validated.
+    """
+    argname = f"'{argname}'" if argname is not None else "argument"
+
+    if not isinstance(tensor, Tensor):
+        msg = f"Expected {argname} to be a tensor, but got {type(tensor)}"
+        raise TypeError(msg)
+
+
+# =========================================== FUNCTIONAL ===========================================
+
+
+def per_img_binclf_curve(
+    anomaly_maps: Tensor,
+    masks: Tensor,
+    algorithm: str = binclf_curve_numpy.ALGORITHM_NUMBA,
+    threshs_choice: str = binclf_curve_numpy.THRESHS_CHOICE_MINMAX_LINSPACE,
+    threshs_given: Tensor | None = None,
+    num_threshs: int | None = None,
+) -> tuple[Tensor, Tensor]:
+    """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared).
+
+    ATTENTION: tensors are converted to numpy arrays and then converted back to tensors.
+
+    Args:
+        anomaly_maps (Tensor): Anomaly score maps of shape (N, H, W [, D, ...])
+        masks (Tensor): Binary ground truth masks of shape (N, H, W [, D, ...])
+        algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
+        threshs_choice (str, optional): Sequence of thresholds to use. Defaults to THRESH_SEQUENCE_MINMAX_LINSPACE.
+        #
+        # `threshs_choice`-dependent arguments
+        #
+        # THRESH_SEQUENCE_GIVEN
+        threshs_given (Tensor, optional): Sequence of thresholds to use.
+        #
+        # THRESH_SEQUENCE_MINMAX_LINSPACE
+        num_threshs (int, optional): Number of thresholds between the min and max of the anomaly maps.
+
+    Returns:
+        tuple[Tensor, Tensor]:
+            [0] Thresholds of shape (K,) and dtype is the same as `anomaly_maps.dtype`.
+
+            [1] Binary classification matrices of shape (N, K, 2, 2)
+
+            N: number of images/instances
+            K: number of thresholds
+
+        The last two dimensions are the confusion matrix (ground truth, predictions)
+        So for each thresh it gives:
+            - `tp`: `[... , 1, 1]`
+            - `fp`: `[... , 0, 1]`
+            - `fn`: `[... , 1, 0]`
+            - `tn`: `[... , 0, 0]`
+
+        `t` is for `true` and `f` is for `false`, `p` is for `positive` and `n` is for `negative`, so:
+            - `tp` stands for `true positive`
+            - `fp` stands for `false positive`
+            - `fn` stands for `false negative`
+            - `tn` stands for `true negative`
+
+        The numbers in each confusion matrix are the counts of pixels in the image (not the ratios).
+
+        Thresholds are shared across all images, so all confusion matrices, for instance,
+        at position [:, 0, :, :] are relative to the 1st threshold in `threshs`.
+
+    """
+    _validate_is_tensor(anomaly_maps, argname="anomaly_maps")
+    anomaly_maps_array = anomaly_maps.detach().cpu().numpy()
+
+    _validate_is_tensor(masks, argname="masks")
+    masks_array = masks.detach().cpu().numpy()
+
+    if threshs_given is not None:
+        _validate_is_tensor(threshs_given, argname="threshs_given")
+        threshs_given_array = threshs_given.detach().cpu().numpy()
+    else:
+        threshs_given_array = None
+
+    threshs_array, binclf_curves_array = binclf_curve_numpy.per_img_binclf_curve(
+        anomaly_maps=anomaly_maps_array,
+        masks=masks_array,
+        algorithm=algorithm,
+        threshs_choice=threshs_choice,
+        threshs_given=threshs_given_array,
+        num_threshs=num_threshs,
+    )
+    threshs = torch.from_numpy(threshs_array).to(anomaly_maps.device)
+    binclf_curves = torch.from_numpy(binclf_curves_array).to(anomaly_maps.device).long()
+
+    return threshs, binclf_curves
diff --git a/tests/unit/metrics/per_image/test_binclf_curve_numpy.py b/tests/unit/metrics/per_image/test_binclf_curve.py
similarity index 78%
rename from tests/unit/metrics/per_image/test_binclf_curve_numpy.py
rename to tests/unit/metrics/per_image/test_binclf_curve.py
index edc8f31565..bb8c9ca400 100644
--- a/tests/unit/metrics/per_image/test_binclf_curve_numpy.py
+++ b/tests/unit/metrics/per_image/test_binclf_curve.py
@@ -3,7 +3,9 @@
 
 import numpy as np
 import pytest
+import torch
 from numpy import ndarray
+from torch import Tensor
 
 
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
@@ -164,7 +166,72 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
     preds = preds.reshape(2, 2, 2)
     gts = gts.reshape(2, 2, 2)
 
-    if metafunc.function is test_per_img_binclf_curve:
+    per_img_binclf_curves_numpy_argvalues = [
+        # `threshs_choice` = "given"
+        (
+            preds,
+            gts,
+            "given",
+            threshs,
+            None,
+            threshs,
+            binclf_curves,
+        ),
+        (
+            preds,
+            gts,
+            "given",
+            10 * threshs,
+            2,
+            10 * threshs,
+            binclf_curves_threshs_too_high,
+        ),
+        (
+            preds,
+            gts,
+            "given",
+            0.01 * threshs,
+            None,
+            0.01 * threshs,
+            binclf_curves_threshs_too_low,
+        ),
+        # `threshs_choice` = "minmax-linspace"
+        (
+            preds,
+            gts,
+            "minmax-linspace",
+            None,
+            len(threshs),
+            threshs,
+            binclf_curves,
+        ),
+        (
+            2 * preds,
+            gts.astype(int),  # this is ok
+            "minmax-linspace",
+            None,
+            len(threshs),
+            2 * threshs,
+            binclf_curves,
+        ),
+    ]
+
+    if metafunc.function is test_per_img_binclf_curve_numpy:
+        metafunc.parametrize(
+            argnames=(
+                "anomaly_maps",
+                "masks",
+                "threshs_choice",
+                "threshs_given",
+                "num_threshs",
+                "expected_threshs",
+                "expected_binclf_curves",
+            ),
+            argvalues=per_img_binclf_curves_numpy_argvalues,
+        )
+
+    # the tests with the torch interface are the same; we just convert ndarray to Tensor
+    if metafunc.function is test_per_img_binclf_curve_torch:
         metafunc.parametrize(
             argnames=(
                 "anomaly_maps",
@@ -176,55 +243,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
                 "expected_binclf_curves",
             ),
             argvalues=[
-                # `threshs_choice` = "given"
-                (
-                    preds,
-                    gts,
-                    "given",
-                    threshs,
-                    None,
-                    threshs,
-                    binclf_curves,
-                ),
-                (
-                    preds,
-                    gts,
-                    "given",
-                    10 * threshs,
-                    2,
-                    10 * threshs,
-                    binclf_curves_threshs_too_high,
-                ),
-                (
-                    preds,
-                    gts,
-                    "given",
-                    0.01 * threshs,
-                    None,
-                    0.01 * threshs,
-                    binclf_curves_threshs_too_low,
-                ),
-                # `threshs_choice` = "minmax-linspace"
-                (
-                    preds,
-                    gts,
-                    "minmax-linspace",
-                    None,
-                    len(threshs),
-                    threshs,
-                    binclf_curves,
-                ),
-                (
-                    2 * preds,
-                    gts.astype(int),  # this is ok
-                    "minmax-linspace",
-                    None,
-                    len(threshs),
-                    2 * threshs,
-                    binclf_curves,
-                ),
+                tuple(torch.from_numpy(v) if isinstance(v, np.ndarray) else v for v in argvals)
+                for argvals in per_img_binclf_curves_numpy_argvalues
             ],
         )
+
+    if metafunc.function is test_per_img_binclf_curve_numpy or metafunc.function is test_per_img_binclf_curve_torch:
         metafunc.parametrize(
             argnames=("algorithm",),
             argvalues=[
@@ -233,7 +257,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             ],
         )
 
-    if metafunc.function is test_per_img_binclf_curve_validations:
+    if metafunc.function is test_per_img_binclf_curve_numpy_validations:
         metafunc.parametrize(
             argnames=("args", "exception"),
             argvalues=[
@@ -317,7 +343,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
 
 
 # ==================================================================================================
-# API FUNCTIONS
+# API FUNCTIONS (NUMPY)
 
 
 def test_binclf_multiple_curves(
@@ -364,7 +390,7 @@ def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception:
         binclf_curve_numpy.binclf_multiple_curves(*args, **kwargs)
 
 
-def test_per_img_binclf_curve(
+def test_per_img_binclf_curve_numpy(
     anomaly_maps: ndarray,
     masks: ndarray,
     algorithm: str,
@@ -397,9 +423,46 @@ def test_per_img_binclf_curve_numpy(
     assert (computed_binclf_curves == expected_binclf_curves).all()
 
 
-def test_per_img_binclf_curve_validations(args: list, kwargs: dict, exception: Exception) -> None:
+def test_per_img_binclf_curve_numpy_validations(args: list, kwargs: dict, exception: Exception) -> None:
     """Test if `per_img_binclf_curve()` raises the expected errors."""
     from anomalib.metrics.per_image import binclf_curve_numpy
 
     with pytest.raises(exception):
         binclf_curve_numpy.per_img_binclf_curve(*args, **kwargs)
+
+
+# ==================================================================================================
+# API FUNCTIONS (TORCH)
+
+
+def test_per_img_binclf_curve_torch(
+    anomaly_maps: Tensor,
+    masks: Tensor,
+    algorithm: str,
+    threshs_choice: str,
+    threshs_given: Tensor | None,
+    num_threshs: int | None,
+    expected_threshs: Tensor,
+    expected_binclf_curves: Tensor,
+) -> None:
+    """Test if `per_img_binclf_curve()` returns the expected values."""
+    from anomalib.metrics.per_image import binclf_curve
+
+    computed_threshs, computed_binclf_curves = binclf_curve.per_img_binclf_curve(
+        anomaly_maps,
+        masks,
+        algorithm=algorithm,
+        threshs_choice=threshs_choice,
+        threshs_given=threshs_given,
+        num_threshs=num_threshs,
+    )
+
+    # threshs
+    assert computed_threshs.shape == expected_threshs.shape
+    assert computed_threshs.dtype == expected_threshs.dtype
+    assert (computed_threshs == expected_threshs).all()
+
+    # binclf_curves
+    assert computed_binclf_curves.shape == expected_binclf_curves.shape
+    assert computed_binclf_curves.dtype == expected_binclf_curves.dtype
+    assert (computed_binclf_curves == expected_binclf_curves).all()

From 62d54801a6eff8bbb935dfd5c8b599fbc21abb51 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 21 Dec 2023 15:34:44 +0100
Subject: [PATCH 05/57] torch interface and tests

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 .../metrics/per_image/binclf_curve.py         | 121 ++++++++++++++++--
 .../metrics/per_image/binclf_curve_numpy.py   |  27 +++-
 .../metrics/per_image/test_binclf_curve.py    |   7 +-
 3 files changed, 140 insertions(+), 15 deletions(-)

diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py
index 4685ddac7d..04ed5894ac 100644
--- a/src/anomalib/metrics/per_image/binclf_curve.py
+++ b/src/anomalib/metrics/per_image/binclf_curve.py
@@ -1,10 +1,12 @@
-"""Binary classification curve (torch and torchmetrics interfaces).
+"""Binary classification curve (torch interface).
 
 This module implements interfaces for the code in `binclf_curve_numpy.py`. Check its docstring for more details.
 """
 
 from __future__ import annotations
 
+from dataclasses import dataclass
+
 import torch
 from torch import Tensor
 
@@ -26,6 +28,95 @@ def _validate_is_tensor(tensor: Tensor, argname: str | None = None) -> None:
         raise TypeError(msg)
 
 
+def _validate_threshs(threshs: Tensor) -> None:
+    _validate_is_tensor(threshs, argname="threshs")
+    binclf_curve_numpy._validate_threshs(threshs.detach().cpu().numpy())  # noqa: SLF001
+
+
+def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None:
+    _validate_is_tensor(binclf_curves, argname="binclf_curves")
+    if valid_threshs is not None:
+        _validate_threshs(valid_threshs)
+    binclf_curve_numpy._validate_binclf_curves(  # noqa: SLF001
+        binclf_curves.detach().cpu().numpy(),
+        valid_threshs=valid_threshs.detach().cpu().numpy() if valid_threshs is not None else None,
+    )
+
+
+# =========================================== RESULTS DATACLASS ===========================================
+
+
+@dataclass
+class PerImageBinClfCurveResult:
+    """Class interface for the results of `per_img_binclf_curve()`."""
+
+    algorithm: str
+    threshs_choice: str
+    threshs: Tensor
+    binclf_curves: Tensor
+
+    def __post_init__(self):  # noqa: D105, ANN204
+        try:
+            _validate_threshs(self.threshs)
+
+        except (TypeError, ValueError) as ex:
+            msg = "Invalid `threshs`!"
+            raise RuntimeError(msg) from ex
+
+        try:
+            _validate_binclf_curves(self.binclf_curves, valid_threshs=self.threshs)
+
+        except (TypeError, ValueError) as ex:
+            msg = "Invalid `binclf_curves`!"
+            raise RuntimeError(msg) from ex
+
+    @property
+    def num_images(self) -> int:
+        """Number of images (N)."""
+        return self.binclf_curves.shape[0]
+
+    @property
+    def num_threshs(self) -> int:
+        """Number of thresholds (K)."""
+        return self.threshs.shape[0]
+
+    @property
+    def tprs(self) -> Tensor:
+        """True positive rates (TPR) per image for each thresh.
+
+        TPR = TP / P = TP / (TP + FN)
+
+        Returns:
+            Tensor: shape (N, K), dtype float64
+            N: number of images
+            K: number of thresholds
+        """
+        # shape: (num images, num threshs)
+        tps = self.binclf_curves[..., 1, 1]
+        pos = self.binclf_curves[..., 1, :].sum(dim=-1)
+
+        # tprs will be nan if pos == 0 (normal image), which is expected
+        return tps.to(torch.float64) / pos.to(torch.float64)
+
+    @property
+    def fprs(self) -> Tensor:
+        """False positive rates (FPR) per image for each thresh.
+
+        FPR = FP / N = FP / (FP + TN)
+
+        Returns:
+            Tensor: shape (N, K), dtype float64
+            N: number of images
+            K: number of thresholds
+        """
+        # shape: (num images, num threshs)
+        fps = self.binclf_curves[..., 0, 1]
+        neg = self.binclf_curves[..., 0, :].sum(dim=-1)
+
+        # it can be `nan` if an anomalous image is fully covered by the mask
+        return fps.to(torch.float64) / neg.to(torch.float64)
+
+
 # =========================================== FUNCTIONAL ===========================================
 
 
@@ -34,25 +125,27 @@ def per_img_binclf_curve(
     masks: Tensor,
     algorithm: str = binclf_curve_numpy.ALGORITHM_NUMBA,
     threshs_choice: str = binclf_curve_numpy.THRESHS_CHOICE_MINMAX_LINSPACE,
+    return_result_object: bool = True,
     threshs_given: Tensor | None = None,
     num_threshs: int | None = None,
-) -> tuple[Tensor, Tensor]:
+) -> PerImageBinClfCurveResult | tuple[Tensor, Tensor]:
     """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared).
 
-    ATTENTION: tensors are converted to numpy arrays and then converted back to tensors.
+    ATTENTION: tensors are converted to numpy arrays and then converted back to tensors (same device as `anomaly_maps`).
 
     Args:
         anomaly_maps (Tensor): Anomaly score maps of shape (N, H, W [, D, ...])
         masks (Tensor): Binary ground truth masks of shape (N, H, W [, D, ...])
         algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
         threshs_choice (str, optional): Sequence of thresholds to use. Defaults to THRESH_SEQUENCE_MINMAX_LINSPACE.
-        #
-        # `threshs_choice`-dependent arguments
-        #
-        # THRESH_SEQUENCE_GIVEN
+        return_result_object (bool, optional): Whether to return a `PerImageBinClfCurveResult` object. Defaults to True.
+
+        *** `threshs_choice`-dependent arguments ***
+
+        THRESH_SEQUENCE_GIVEN
         threshs_given (Tensor, optional): Sequence of thresholds to use.
-        #
-        # THRESH_SEQUENCE_MINMAX_LINSPACE
+
+        THRESH_SEQUENCE_MINMAX_LINSPACE
         num_threshs (int, optional): Number of thresholds between the min and max of the anomaly maps.
 
     Returns:
@@ -106,4 +199,12 @@ def per_img_binclf_curve(
     threshs = torch.from_numpy(threshs_array).to(anomaly_maps.device)
     binclf_curves = torch.from_numpy(binclf_curves_array).to(anomaly_maps.device).long()
 
-    return threshs, binclf_curves
+    if not return_result_object:
+        return threshs, binclf_curves
+
+    return PerImageBinClfCurveResult(
+        algorithm=algorithm,
+        threshs_choice=threshs_choice,
+        threshs=threshs,
+        binclf_curves=binclf_curves,
+    )
diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
index 5170d45081..15b6664720 100644
--- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py
+++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
@@ -33,8 +33,6 @@
     THRESHS_CHOICE_MEAN_FPR_OPTIMIZED,
 )
 
-# TODO(jpcbertoldo): warn when the ratios from binclf are too imprecise  # noqa: TD003
-
 
 logger = logging.getLogger(__name__)
 
@@ -184,7 +182,7 @@ def _validate_same_shape(*args) -> None:
         raise ValueError(msg)
 
 
-def _validate_binclf_curves(binclf_curves: ndarray) -> None:
+def _validate_binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None:
     if not isinstance(binclf_curves, ndarray):
         msg = f"Expected `binclf_curves` to be an ndarray, but got {type(binclf_curves)}"
         raise TypeError(msg)
@@ -217,6 +215,16 @@ def _validate_binclf_curves(binclf_curves: ndarray) -> None:
         msg = "Expected `binclf_curves` to have the same number of positives per image for every thresh."
         raise ValueError(msg)
 
+    if valid_threshs is None:
+        return
+
+    if binclf_curves.shape[1] != valid_threshs.shape[0]:
+        msg = (
+            "Expected `binclf_curves` to have the same number of thresholds as `threshs`, "
+            f"but got {binclf_curves.shape[1]} and {valid_threshs.shape[0]}"
+        )
+        raise RuntimeError(msg)
+
 
 # =========================================== PYTHON VERSION ===========================================
 
@@ -465,8 +473,19 @@ def per_img_binclf_curve(
 
     binclf_curves = binclf_multiple_curves(scores_batch, gts_batch, threshs, algorithm=algorithm)
 
+    num_images = anomaly_maps.shape[0]
+
     try:
-        _validate_binclf_curves(binclf_curves)
+        _validate_binclf_curves(binclf_curves, valid_threshs=threshs)
+
+        # this validation cannot be done in `_validate_binclf_curves` because it does not have access to the
+        # original shape of `anomaly_maps`
+        if binclf_curves.shape[0] != num_images:
+            msg = (
+                "Expected `binclf_curves` to have the same number of images as `anomaly_maps`, "
+                f"but got {binclf_curves.shape[0]} and {anomaly_maps.shape[0]}"
+            )
+            raise RuntimeError(msg)
 
     except (TypeError, ValueError) as ex:
         msg = "Invalid `binclf_curves` was computed."
diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py
index bb8c9ca400..1fa220c4a6 100644
--- a/tests/unit/metrics/per_image/test_binclf_curve.py
+++ b/tests/unit/metrics/per_image/test_binclf_curve.py
@@ -448,7 +448,7 @@ def test_per_img_binclf_curve_torch(
     """Test if `per_img_binclf_curve()` returns the expected values."""
     from anomalib.metrics.per_image import binclf_curve
 
-    computed_threshs, computed_binclf_curves = binclf_curve.per_img_binclf_curve(
+    results_object = binclf_curve.per_img_binclf_curve(
         anomaly_maps,
         masks,
         algorithm=algorithm,
@@ -456,6 +456,7 @@ def test_per_img_binclf_curve_torch(
         threshs_given=threshs_given,
         num_threshs=num_threshs,
     )
+    computed_threshs, computed_binclf_curves = results_object.threshs, results_object.binclf_curves
 
     # threshs
     assert computed_threshs.shape == expected_threshs.shape
@@ -466,3 +467,7 @@ def test_per_img_binclf_curve_torch(
     assert computed_binclf_curves.shape == expected_binclf_curves.shape
     assert computed_binclf_curves.dtype == expected_binclf_curves.dtype
     assert (computed_binclf_curves == expected_binclf_curves).all()
+
+    # test properties
+    results_object.fprs  # noqa: B018
+    results_object.tprs  # noqa: B018

From 0f0b4248a12749b5e5e03764d7051e6021a8bc8e Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:18:07 +0100
Subject: [PATCH 06/57] constants regrouped in dataclass as class vars

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 .../metrics/per_image/binclf_curve.py         |  5 +-
 .../metrics/per_image/binclf_curve_numpy.py   | 52 +++++++++++--------
 .../metrics/per_image/test_binclf_curve.py    | 52 ++++++++++++-------
 3 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py
index 04ed5894ac..391e2edfbd 100644
--- a/src/anomalib/metrics/per_image/binclf_curve.py
+++ b/src/anomalib/metrics/per_image/binclf_curve.py
@@ -11,6 +11,7 @@
 from torch import Tensor
 
 from . import binclf_curve_numpy
import binclf_curve_numpy +from .binclf_curve_numpy import Algorithm, ThreshsChoice # =========================================== ARGS VALIDATION =========================================== @@ -123,8 +124,8 @@ def fprs(self) -> Tensor: def per_img_binclf_curve( anomaly_maps: Tensor, masks: Tensor, - algorithm: str = binclf_curve_numpy.ALGORITHM_NUMBA, - threshs_choice: str = binclf_curve_numpy.THRESHS_CHOICE_MINMAX_LINSPACE, + algorithm: str = Algorithm.NUMBA, + threshs_choice: str = ThreshsChoice.MINMAX_LINSPACE, return_result_object: bool = True, threshs_given: Tensor | None = None, num_threshs: int | None = None, diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 15b6664720..8726937ecd 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -7,6 +7,8 @@ import itertools import logging +from dataclasses import dataclass +from typing import ClassVar import numpy as np from numpy import ndarray @@ -19,22 +21,28 @@ HAS_NUMBA = True from . import _binclf_curve_numba +logger = logging.getLogger(__name__) -ALGORITHM_PYTHON = "python" -ALGORITHM_NUMBA = "numba" -ALGORIGHTMS = (ALGORITHM_PYTHON, ALGORITHM_NUMBA) +# =========================================== CONSTANTS =========================================== -THRESHS_CHOICE_GIVEN = "given" -THRESHS_CHOICE_MINMAX_LINSPACE = "minmax-linspace" -THRESHS_CHOICE_MEAN_FPR_OPTIMIZED = "mean-fpr-optimized" -THRESHS_CHOICES = ( - THRESHS_CHOICE_GIVEN, - THRESHS_CHOICE_MINMAX_LINSPACE, - THRESHS_CHOICE_MEAN_FPR_OPTIMIZED, -) +@dataclass +class Algorithm: + """Algorithm to use.""" -logger = logging.getLogger(__name__) + PYTHON: ClassVar[str] = "python" + NUMBA: ClassVar[str] = "numba" + ALGORITHMS: ClassVar[tuple[str, ...]] = (PYTHON, NUMBA) + + +@dataclass +class ThreshsChoice: + """Sequence of thresholds to use.""" + + GIVEN: ClassVar[str] = "given" + MINMAX_LINSPACE: ClassVar[str] = "minmax-linspace" + MEAN_FPR_OPTIMIZED: ClassVar[str] = "mean-fpr-optimized" + CHOICES: ClassVar[tuple[str, ...]] = (GIVEN, MINMAX_LINSPACE, MEAN_FPR_OPTIMIZED) # =========================================== ARGS VALIDATION =========================================== @@ -317,7 +325,7 @@ def binclf_multiple_curves( scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray, - algorithm: str = ALGORITHM_NUMBA, + algorithm: str = Algorithm.NUMBA, ) -> ndarray: """Multiple binary classification matrix (per-instance scope) at each threshold (shared). @@ -360,10 +368,10 @@ def binclf_multiple_curves( _validate_same_shape(scores_batch, gts_batch) _validate_threshs(threshs) - if algorithm == ALGORITHM_PYTHON: + if algorithm == Algorithm.PYTHON: return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) - if algorithm == ALGORITHM_NUMBA: + if algorithm == Algorithm.NUMBA: if not HAS_NUMBA: logger.warning( "Algorithm 'numba' was selected, but numba is not installed. 
Fallback to 'python' algorithm.", @@ -371,7 +379,7 @@ def binclf_multiple_curves( return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) - msg = f"Expected `algorithm` to be one of {ALGORIGHTMS}, but got {algorithm}" + msg = f"Expected `algorithm` to be one of {Algorithm.ALGORITHMS}, but got {algorithm}" raise NotImplementedError(msg) @@ -381,8 +389,8 @@ def binclf_multiple_curves( def per_img_binclf_curve( anomaly_maps: ndarray, masks: ndarray, - algorithm: str = ALGORITHM_NUMBA, - threshs_choice: str = THRESHS_CHOICE_MINMAX_LINSPACE, + algorithm: str = Algorithm.NUMBA, + threshs_choice: str = ThreshsChoice.MINMAX_LINSPACE, threshs_given: ndarray | None = None, num_threshs: int | None = None, ) -> tuple[ndarray, ndarray]: @@ -437,7 +445,7 @@ def per_img_binclf_curve( threshs: ndarray - if threshs_choice == THRESHS_CHOICE_GIVEN: + if threshs_choice == ThreshsChoice.GIVEN: assert threshs_given is not None _validate_threshs(threshs_given) if num_threshs is not None: @@ -446,7 +454,7 @@ def per_img_binclf_curve( ) threshs = threshs_given.astype(anomaly_maps.dtype) - elif threshs_choice == THRESHS_CHOICE_MINMAX_LINSPACE: + elif threshs_choice == ThreshsChoice.MINMAX_LINSPACE: assert num_threshs is not None if threshs_given is not None: logger.warning( @@ -460,11 +468,11 @@ def per_img_binclf_curve( raise ValueError(msg) from ex threshs = np.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) - elif threshs_choice == THRESHS_CHOICE_MEAN_FPR_OPTIMIZED: + elif threshs_choice == ThreshsChoice.MEAN_FPR_OPTIMIZED: raise NotImplementedError(f"TODO implement {threshs_choice}") # noqa: EM102 else: - msg = f"Expected `threshs_choice` to be one of {THRESHS_CHOICES}, but got {threshs_choice}" + msg = f"Expected `threshs_choice` to be one of {ThreshsChoice.CHOICES}, but got {threshs_choice}" raise NotImplementedError(msg) # keep the batch dimension and flatten the rest diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 1fa220c4a6..2b1b84c4a5 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -132,32 +132,29 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if metafunc.function is test_binclf_multiple_curves_validations: metafunc.parametrize( - argnames=("args", "exception"), + argnames=("args", "kwargs", "exception"), argvalues=[ # `scores` and `gts` must be 2D - ([preds.reshape(2, 2, 2), gts, threshs], ValueError), - ([preds, gts.flatten(), threshs], ValueError), + ([preds.reshape(2, 2, 2), gts, threshs], {"algorithm": "numba"}, ValueError), + ([preds, gts.flatten(), threshs], {"algorithm": "numba"}, ValueError), # `threshs` must be 1D - ([preds, gts, threshs.reshape(2, 2)], ValueError), + ([preds, gts, threshs.reshape(2, 2)], {"algorithm": "numba"}, ValueError), # `scores` and `gts` must have the same shape - ([preds, gts[:1], threshs], ValueError), - ([preds[:, :2], gts, threshs], ValueError), + ([preds, gts[:1], threshs], {"algorithm": "numba"}, ValueError), + ([preds[:, :2], gts, threshs], {"algorithm": "numba"}, ValueError), # `scores` be of type float - ([preds.astype(int), gts, threshs], TypeError), + ([preds.astype(int), gts, threshs], {"algorithm": "numba"}, TypeError), # `gts` be of type bool - ([preds, gts.astype(int), threshs], TypeError), + ([preds, gts.astype(int), threshs], {"algorithm": "numba"}, TypeError), # 
`threshs` be of type float - ([preds, gts, threshs.astype(int)], TypeError), + ([preds, gts, threshs.astype(int)], {"algorithm": "numba"}, TypeError), # `threshs` must be sorted in ascending order - ([preds, gts, np.flip(threshs)], ValueError), - ([preds, gts, np.concatenate([threshs[-2:], threshs[:2]])], ValueError), - ], - ) - metafunc.parametrize( - argnames=("kwargs",), - argvalues=[ - ({"algorithm": "python"},), - ({"algorithm": "numba"},), + ([preds, gts, np.flip(threshs)], {"algorithm": "numba"}, ValueError), + ([preds, gts, np.concatenate([threshs[-2:], threshs[:2]])], {"algorithm": "numba"}, ValueError), + # `threshs` must be unique + ([preds, gts, np.sort(np.concatenate([threshs, threshs]))], {"algorithm": "numba"}, ValueError), + # invalid `algorithm` + ([preds, gts, threshs], {"algorithm": "blurp"}, NotImplementedError), ], ) @@ -292,6 +289,20 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) + # same as above but testing other validations + if metafunc.function is test_per_img_binclf_curve_numpy_validations_alt: + metafunc.parametrize( + argnames=("args", "kwargs", "exception"), + argvalues=[ + # invalid `threshs_choice` + ( + [preds, gts], + {"algorithm": "glfrb", "threshs_choice": "given", "threshs_given": threshs, "num_threshs": None}, + NotImplementedError, + ), + ], + ) + # ================================================================================================== # LOW-LEVEL FUNCTIONS (PYTHON) @@ -431,6 +442,11 @@ def test_per_img_binclf_curve_numpy_validations(args: list, kwargs: dict, except binclf_curve_numpy.per_img_binclf_curve(*args, **kwargs) +def test_per_img_binclf_curve_numpy_validations_alt(args: list, kwargs: dict, exception: Exception) -> None: + """Test if `per_img_binclf_curve()` raises the expected errors.""" + test_per_img_binclf_curve_numpy_validations(args, kwargs, exception) + + # ================================================================================================== # API FUNCTIONS (TORCH) From 8df211e9e9a0b1f621141c7681dcd944c5786414 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 21 Dec 2023 17:41:18 +0100 Subject: [PATCH 07/57] result class was unneccesary for per_image_binclf_curve Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- .../metrics/per_image/binclf_curve.py | 136 +++++++----------- .../metrics/per_image/binclf_curve_numpy.py | 61 +++++++- .../metrics/per_image/test_binclf_curve.py | 100 +++++++++---- 3 files changed, 178 insertions(+), 119 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 391e2edfbd..ad132e27ce 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -5,8 +5,6 @@ from __future__ import annotations -from dataclasses import dataclass - import torch from torch import Tensor @@ -44,92 +42,17 @@ def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None ) -# =========================================== RESULTS DATACLASS =========================================== - - -@dataclass -class PerImageBinClfCurveResult: - """Class interface for the results of `per_img_binclf_curve()`.""" - - algorithm: str - threshs_choice: str - threshs: Tensor - binclf_curves: Tensor - - def __post_init__(self): # noqa: D105, ANN204 - try: - _validate_threshs(self.threshs) - - except (TypeError, ValueError) as ex: - msg = "Invalid `threshs`!" 
- raise RuntimeError(msg) from ex - - try: - _validate_binclf_curves(self.binclf_curves, valid_threshs=self.threshs) - - except (TypeError, ValueError) as ex: - msg = "Invalid `binclf_curves`!" - raise RuntimeError(msg) from ex - - @property - def num_images(self) -> int: - """Number of images (N).""" - return self.binclf_curves.shape[0] - - @property - def num_threshs(self) -> int: - """Number of thresholds (K).""" - return self.threshs.shape[0] - - @property - def tprs(self) -> Tensor: - """True positive rates (TPR) for image for each thresh. - - TPR = TP / P = TP / (TP + FN) - - Returns: - Tensor: shape (N, K), dtype float64 - N: number of images - K: number of thresholds - """ - # shape: (num images, num threshs) - tps = self.binclf_curves[..., 1, 1] - pos = self.binclf_curves[..., 1, :].sum(dim=-1) - - # tprs will be nan if pos == 0 (normal image), which is expected - return tps.to(torch.float64) / pos.to(torch.float64) - - @property - def fprs(self) -> Tensor: - """False positive rates (TPR) for image for each thresh. - - FPR = FP / N = FP / (FP + TN) - - Returns: - Tensor: shape (N, K), dtype float64 - N: number of images - K: number of thresholds - """ - # shape: (num images, num threshs) - fps = self.binclf_curves[..., 0, 1] - neg = self.binclf_curves[..., 0, :].sum(dim=-1) - - # it can be `nan` if an anomalous image is fully covered by the mask - return fps.to(torch.float64) / neg.to(torch.float64) - - # =========================================== FUNCTIONAL =========================================== -def per_img_binclf_curve( +def per_image_binclf_curve( anomaly_maps: Tensor, masks: Tensor, algorithm: str = Algorithm.NUMBA, threshs_choice: str = ThreshsChoice.MINMAX_LINSPACE, - return_result_object: bool = True, threshs_given: Tensor | None = None, num_threshs: int | None = None, -) -> PerImageBinClfCurveResult | tuple[Tensor, Tensor]: +) -> tuple[Tensor, Tensor]: """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared). ATTENTION: tensors are converted to numpy arrays and then converted back to tensors (same device as `anomaly_maps`). @@ -189,7 +112,7 @@ def per_img_binclf_curve( else: threshs_given_array = None - threshs_array, binclf_curves_array = binclf_curve_numpy.per_img_binclf_curve( + threshs_array, binclf_curves_array = binclf_curve_numpy.per_image_binclf_curve( anomaly_maps=anomaly_maps_array, masks=masks_array, algorithm=algorithm, @@ -200,12 +123,49 @@ def per_img_binclf_curve( threshs = torch.from_numpy(threshs_array).to(anomaly_maps.device) binclf_curves = torch.from_numpy(binclf_curves_array).to(anomaly_maps.device).long() - if not return_result_object: - return threshs, binclf_curves + return threshs, binclf_curves - return PerImageBinClfCurveResult( - algorithm=algorithm, - threshs_choice=threshs_choice, - threshs=threshs, - binclf_curves=binclf_curves, - ) + +# =========================================== RATE METRICS =========================================== + + +def per_image_tpr(binclf_curves: Tensor) -> Tensor: + """Compute the true positive rates (TPR) for each image in the batch. + + Args: + binclf_curves (Tensor): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`. + + Returns: + Tensor: True positive rates (TPR) of shape (N, K) + + N: number of images/instances + K: number of thresholds + + The last dimension is the TPR for each threshold. 
+
+    """
+    _validate_binclf_curves(binclf_curves)
+    binclf_curves_array = binclf_curves.detach().cpu().numpy()
+    tprs_array = binclf_curve_numpy.per_image_tpr(binclf_curves_array)
+    return torch.from_numpy(tprs_array).to(binclf_curves.device)
+
+
+def per_image_fpr(binclf_curves: Tensor) -> Tensor:
+    """Compute the false positive rates (FPR) for each image in the batch.
+
+    Args:
+        binclf_curves (Tensor): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`.
+
+    Returns:
+        Tensor: False positive rates (FPR) of shape (N, K)
+
+        N: number of images/instances
+        K: number of thresholds
+
+        The last dimension is the FPR for each threshold.
+
+    """
+    _validate_binclf_curves(binclf_curves)
+    binclf_curves_array = binclf_curves.detach().cpu().numpy()
+    fprs_array = binclf_curve_numpy.per_image_fpr(binclf_curves_array)
+    return torch.from_numpy(fprs_array).to(binclf_curves.device)
diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
index 8726937ecd..c7035309f6 100644
--- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py
+++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
@@ -383,10 +383,10 @@ def binclf_multiple_curves(
     raise NotImplementedError(msg)
 
 
-# ========================================= PER-IMAGE ===========================================
+# ========================================= PER-IMAGE BINCLF CURVE =========================================
 
 
-def per_img_binclf_curve(
+def per_image_binclf_curve(
     anomaly_maps: ndarray,
     masks: ndarray,
     algorithm: str = Algorithm.NUMBA,
@@ -397,8 +397,8 @@ def per_img_binclf_curve(
     """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared).
 
     Args:
-        anomaly_maps (ndarray): Anomaly score maps of shape (N, H, W [, D, ...])
-        masks (ndarray): Binary ground truth masks of shape (N, H, W [, D, ...])
+        anomaly_maps (ndarray): Anomaly score maps of shape (N, H, W)
+        masks (ndarray): Binary ground truth masks of shape (N, H, W)
         algorithm (str, optional): Algorithm to use. Defaults to ALGORITHM_NUMBA.
         threshs_choice (str, optional): Sequence of thresholds to use. Defaults to THRESH_SEQUENCE_MINMAX_LINSPACE.
         #
@@ -500,3 +500,56 @@ def per_img_binclf_curve(
         raise RuntimeError(msg) from ex
 
     return threshs, binclf_curves
+
+
+# =========================================== RATE METRICS ===========================================
+
+
+def per_image_tpr(binclf_curves: ndarray) -> ndarray:
+    """True positive rates (TPR) for each image at each thresh.
+
+    TPR = TP / P = TP / (TP + FN)
+
+    TP: true positives
+    FN: false negatives
+    P: positives (TP + FN)
+
+    Args:
+        binclf_curves (ndarray): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`.
+
+    Returns:
+        Tensor: shape (N, K), dtype float64
+        N: number of images
+        K: number of thresholds
+    """
+    # shape: (num images, num threshs)
+    tps = binclf_curves[..., 1, 1]
+    pos = binclf_curves[..., 1, :].sum(axis=2)  # sum over the predicted class axis, shape -> (N, K)
+
+    # tprs will be nan if pos == 0 (normal image), which is expected
+    return tps.astype(np.float64) / pos.astype(np.float64)
+
+
+def per_image_fpr(binclf_curves: ndarray) -> ndarray:
+    """False positive rates (FPR) for each image at each thresh.
+
+    FPR = FP / N = FP / (FP + TN)
+
+    FP: false positives
+    TN: true negatives
+    N: negatives (FP + TN)
+
+    Args:
+        binclf_curves (ndarray): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`.
+ + Returns: + Tensor: shape (N, K), dtype float64 + N: number of images + K: number of thresholds + """ + # shape: (num images, num threshs) + fps = binclf_curves[..., 0, 1] + neg = binclf_curves[..., 0, :].sum(axis=2) # 2 was the 3 originally + + # it can be `nan` if an anomalous image is fully covered by the mask + return fps.astype(np.float64) / neg.astype(np.float64) diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 2b1b84c4a5..0ed258ffe9 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -36,6 +36,14 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: axis=0, ).astype(int) + expected_tprs_norm = np.array([np.nan, np.nan, np.nan, np.nan]) + expected_tprs_anom = np.array([1.0, 1.0, 1.0, 0.5]) + expected_tprs = np.stack([expected_tprs_anom, expected_tprs_norm], axis=0).astype(np.float64) + + expected_fprs_norm = np.array([1.0, 0.75, 0.5, 0.25]) + expected_fprs_anom = np.array([1.0, 0.5, 0.0, 0.0]) + expected_fprs = np.stack([expected_fprs_anom, expected_fprs_norm], axis=0).astype(np.float64) + # in the case where all thresholds are higher than the highest prediction expected_norm_threshs_too_high = np.stack( [ @@ -158,12 +166,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) - # the following tests are for `per_img_binclf_curve()`, which expects + # the following tests are for `per_image_binclf_curve()`, which expects # inputs in image spatial format, i.e. (height, width) preds = preds.reshape(2, 2, 2) gts = gts.reshape(2, 2, 2) - per_img_binclf_curves_numpy_argvalues = [ + per_image_binclf_curves_numpy_argvalues = [ # `threshs_choice` = "given" ( preds, @@ -213,7 +221,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ), ] - if metafunc.function is test_per_img_binclf_curve_numpy: + if metafunc.function is test_per_image_binclf_curve_numpy: metafunc.parametrize( argnames=( "anomaly_maps", @@ -224,11 +232,11 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: "expected_threshs", "expected_binclf_curves", ), - argvalues=per_img_binclf_curves_numpy_argvalues, + argvalues=per_image_binclf_curves_numpy_argvalues, ) # the test with the torch interface are the same we just convert ndarray to Tensor - if metafunc.function is test_per_img_binclf_curve_torch: + if metafunc.function is test_per_image_binclf_curve_torch: metafunc.parametrize( argnames=( "anomaly_maps", @@ -241,11 +249,11 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ), argvalues=[ tuple(torch.from_numpy(v) if isinstance(v, np.ndarray) else v for v in argvals) - for argvals in per_img_binclf_curves_numpy_argvalues + for argvals in per_image_binclf_curves_numpy_argvalues ], ) - if metafunc.function is test_per_img_binclf_curve_numpy or metafunc.function is test_per_img_binclf_curve_torch: + if metafunc.function is test_per_image_binclf_curve_numpy or metafunc.function is test_per_image_binclf_curve_torch: metafunc.parametrize( argnames=("algorithm",), argvalues=[ @@ -254,9 +262,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) - # if metafunc.function is test_per_img_binclf_curve_numpy: - - if metafunc.function is test_per_img_binclf_curve_numpy_validations: + if metafunc.function is test_per_image_binclf_curve_numpy_validations: metafunc.parametrize( argnames=("args", "exception"), argvalues=[ @@ -290,7 +296,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ) # same as 
above but testing other validations - if metafunc.function is test_per_img_binclf_curve_numpy_validations_alt: + if metafunc.function is test_per_image_binclf_curve_numpy_validations_alt: metafunc.parametrize( argnames=("args", "kwargs", "exception"), argvalues=[ @@ -303,6 +309,23 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) + if metafunc.function is test_rate_metrics_numpy: + metafunc.parametrize( + argnames=("binclf_curves", "expected_fprs", "expected_tprs"), + argvalues=[ + (binclf_curves, expected_fprs, expected_tprs), + (10 * binclf_curves, expected_fprs, expected_tprs), + ], + ) + + if metafunc.function is test_rate_metrics_torch: + metafunc.parametrize( + argnames=("binclf_curves", "expected_fprs", "expected_tprs"), + argvalues=[ + (torch.from_numpy(binclf_curves), torch.from_numpy(expected_fprs), torch.from_numpy(expected_tprs)), + ], + ) + # ================================================================================================== # LOW-LEVEL FUNCTIONS (PYTHON) @@ -401,7 +424,7 @@ def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception: binclf_curve_numpy.binclf_multiple_curves(*args, **kwargs) -def test_per_img_binclf_curve_numpy( +def test_per_image_binclf_curve_numpy( anomaly_maps: ndarray, masks: ndarray, algorithm: str, @@ -411,10 +434,10 @@ def test_per_img_binclf_curve_numpy( expected_threshs: ndarray, expected_binclf_curves: ndarray, ) -> None: - """Test if `per_img_binclf_curve()` returns the expected values.""" + """Test if `per_image_binclf_curve()` returns the expected values.""" from anomalib.metrics.per_image import binclf_curve_numpy - computed_threshs, computed_binclf_curves = binclf_curve_numpy.per_img_binclf_curve( + computed_threshs, computed_binclf_curves = binclf_curve_numpy.per_image_binclf_curve( anomaly_maps, masks, algorithm=algorithm, @@ -434,24 +457,38 @@ def test_per_img_binclf_curve_numpy( assert (computed_binclf_curves == expected_binclf_curves).all() -def test_per_img_binclf_curve_numpy_validations(args: list, kwargs: dict, exception: Exception) -> None: - """Test if `per_img_binclf_curve()` raises the expected errors.""" +def test_per_image_binclf_curve_numpy_validations(args: list, kwargs: dict, exception: Exception) -> None: + """Test if `per_image_binclf_curve()` raises the expected errors.""" from anomalib.metrics.per_image import binclf_curve_numpy with pytest.raises(exception): - binclf_curve_numpy.per_img_binclf_curve(*args, **kwargs) + binclf_curve_numpy.per_image_binclf_curve(*args, **kwargs) + + +def test_per_image_binclf_curve_numpy_validations_alt(args: list, kwargs: dict, exception: Exception) -> None: + """Test if `per_image_binclf_curve()` raises the expected errors.""" + test_per_image_binclf_curve_numpy_validations(args, kwargs, exception) + + +def test_rate_metrics_numpy(binclf_curves: ndarray, expected_fprs: ndarray, expected_tprs: ndarray) -> None: + """Test if rate metrics are computed correctly.""" + from anomalib.metrics.per_image import binclf_curve_numpy + tprs = binclf_curve_numpy.per_image_tpr(binclf_curves) + fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) -def test_per_img_binclf_curve_numpy_validations_alt(args: list, kwargs: dict, exception: Exception) -> None: - """Test if `per_img_binclf_curve()` raises the expected errors.""" - test_per_img_binclf_curve_numpy_validations(args, kwargs, exception) + assert tprs.shape == expected_tprs.shape + assert fprs.shape == expected_fprs.shape + + assert np.allclose(tprs, expected_tprs, equal_nan=True) + assert 
np.allclose(fprs, expected_fprs, equal_nan=True) # ================================================================================================== # API FUNCTIONS (TORCH) -def test_per_img_binclf_curve_torch( +def test_per_image_binclf_curve_torch( anomaly_maps: Tensor, masks: Tensor, algorithm: str, @@ -461,10 +498,10 @@ def test_per_img_binclf_curve_torch( expected_threshs: Tensor, expected_binclf_curves: Tensor, ) -> None: - """Test if `per_img_binclf_curve()` returns the expected values.""" + """Test if `per_image_binclf_curve()` returns the expected values.""" from anomalib.metrics.per_image import binclf_curve - results_object = binclf_curve.per_img_binclf_curve( + computed_threshs, computed_binclf_curves = binclf_curve.per_image_binclf_curve( anomaly_maps, masks, algorithm=algorithm, @@ -472,7 +509,6 @@ def test_per_img_binclf_curve_torch( threshs_given=threshs_given, num_threshs=num_threshs, ) - computed_threshs, computed_binclf_curves = results_object.threshs, results_object.binclf_curves # threshs assert computed_threshs.shape == expected_threshs.shape @@ -484,6 +520,16 @@ def test_per_img_binclf_curve_torch( assert computed_binclf_curves.dtype == expected_binclf_curves.dtype assert (computed_binclf_curves == expected_binclf_curves).all() - # test properties - results_object.fprs # noqa: B018 - results_object.tprs # noqa: B018 + +def test_rate_metrics_torch(binclf_curves: Tensor, expected_fprs: Tensor, expected_tprs: Tensor) -> None: + """Test if rate metrics are computed correctly.""" + from anomalib.metrics.per_image import binclf_curve + + tprs = binclf_curve.per_image_tpr(binclf_curves) + fprs = binclf_curve.per_image_fpr(binclf_curves) + + assert tprs.shape == expected_tprs.shape + assert fprs.shape == expected_fprs.shape + + assert torch.allclose(tprs, expected_tprs, equal_nan=True) + assert torch.allclose(fprs, expected_fprs, equal_nan=True) From 9e74226eacaf43c6a108a1afaf42e931c52d339f Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 21 Dec 2023 19:46:49 +0100 Subject: [PATCH 08/57] factorize function _get_threshs_minmax_linspace Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- .../metrics/per_image/binclf_curve_numpy.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index c7035309f6..46da4e47f1 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -386,6 +386,19 @@ def binclf_multiple_curves( # ========================================= PER-IMAGE BINCLF CURVE ========================================= +def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> ndarray: + """Get thresholds linearly spaced between the min and max of the anomaly maps.""" + _validate_num_threshs(num_threshs) + # this operation can be a bit expensive + thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) + try: + _validate_thresh_bounds(thresh_bounds) + except ValueError as ex: + msg = "Invalid `thresh_bounds` computed from `anomaly_maps`." 
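+        # e.g. a constant anomaly map gives min == max, which the bounds validation is expected to reject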
+ raise ValueError(msg) from ex + return np.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) + + def per_image_binclf_curve( anomaly_maps: ndarray, masks: ndarray, @@ -437,6 +450,7 @@ def per_image_binclf_curve( Thresholds are shared across all images, so all confusion matrices, for instance, at position [:, 0, :, :] are relative to the 1st threshold in `threshs`. + Thresholds are sorted in ascending order. """ # validate inputs _validate_anomaly_maps(anomaly_maps) @@ -460,13 +474,7 @@ def per_image_binclf_curve( logger.warning( f"Argument `threshs_given` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", ) - thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) - try: - _validate_thresh_bounds(thresh_bounds) - except ValueError as ex: - msg = "Invalid `thresh_bounds` computed from `anomaly_maps`." - raise ValueError(msg) from ex - threshs = np.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) + threshs = _get_threshs_minmax_linspace(anomaly_maps, num_threshs) elif threshs_choice == ThreshsChoice.MEAN_FPR_OPTIMIZED: raise NotImplementedError(f"TODO implement {threshs_choice}") # noqa: EM102 From 283d704401b6d0a33f4b780dd0b5a9638a4742fd Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 21 Dec 2023 19:48:03 +0100 Subject: [PATCH 09/57] small docs fixes Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/binclf_curve.py | 8 +++++++- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index ad132e27ce..c1d353cf7c 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -1,6 +1,9 @@ """Binary classification curve (torch interface). -This module implements interfaces for the code in `binclf_curve_numpy.py`. Check its docstring for more details. +This module implements torch interfaces to access the numpy code in `binclf_curve_numpy.py`. +Check its docstring for more details. + +Tensors are build with `torch.from_numpy` and so the returned tensors will share the same memory as the numpy arrays. """ from __future__ import annotations @@ -99,6 +102,7 @@ def per_image_binclf_curve( Thresholds are shared across all images, so all confusion matrices, for instance, at position [:, 0, :, :] are relative to the 1st threshold in `threshs`. + Thresholds are sorted in ascending order. """ _validate_is_tensor(anomaly_maps, argname="anomaly_maps") anomaly_maps_array = anomaly_maps.detach().cpu().numpy() @@ -143,6 +147,7 @@ def per_image_tpr(binclf_curves: Tensor) -> Tensor: The last dimension is the TPR for each threshold. + Thresholds are sorted in ascending order, so TPR is in descending order. """ _validate_binclf_curves(binclf_curves) binclf_curves_array = binclf_curves.detach().cpu().numpy() @@ -164,6 +169,7 @@ def per_image_fpr(binclf_curves: Tensor) -> Tensor: The last dimension is the FPR for each threshold. + Thresholds are sorted in ascending order, so FPR is in descending order. 
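+
+    A minimal usage sketch (with this module's names): `fprs = per_image_fpr(binclf_curves)` gives a
+    float64 tensor of shape (N, K); an image whose mask marks every pixel as anomalous has no negative
+    pixels, so its row is `nan`.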
""" _validate_binclf_curves(binclf_curves) binclf_curves_array = binclf_curves.detach().cpu().numpy() diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 46da4e47f1..6e5bbeaf9d 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -362,6 +362,8 @@ def binclf_multiple_curves( Thresholds are shared across all instances, so all confusion matrices, for instance, at position [:, 0, :, :] are relative to the 1st threshold in `threshs`. + + Thresholds are sorted in ascending order. """ _validate_scores_batch(scores_batch) _validate_gts_batch(gts_batch) @@ -526,9 +528,11 @@ def per_image_tpr(binclf_curves: ndarray) -> ndarray: binclf_curves (ndarray): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`. Returns: - Tensor: shape (N, K), dtype float64 + ndarray: shape (N, K), dtype float64 N: number of images K: number of thresholds + + Thresholds are sorted in ascending order, so TPR is in descending order. """ # shape: (num images, num threshs) tps = binclf_curves[..., 1, 1] @@ -551,9 +555,11 @@ def per_image_fpr(binclf_curves: ndarray) -> ndarray: binclf_curves (ndarray): Binary classification matrix curves (N, K, 2, 2). See `per_image_binclf_curve`. Returns: - Tensor: shape (N, K), dtype float64 + ndarray: shape (N, K), dtype float64 N: number of images K: number of thresholds + + Thresholds are sorted in ascending order, so FPR is in descending order. """ # shape: (num images, num threshs) fps = binclf_curves[..., 0, 1] From 2bc3c06332eb93c4881e4c33914ee015e4d0d170 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 21 Dec 2023 20:23:23 +0100 Subject: [PATCH 10/57] add pimo numpy version and test Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo_numpy.py | 371 +++++++++++++++++++ tests/unit/metrics/per_image/test_pimo.py | 272 ++++++++++++++ 2 files changed, 643 insertions(+) create mode 100644 src/anomalib/metrics/per_image/pimo_numpy.py create mode 100644 tests/unit/metrics/per_image/test_pimo.py diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py new file mode 100644 index 0000000000..8f0fdd2b3f --- /dev/null +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -0,0 +1,371 @@ +"""Per-Image Overlap curve (PIMO, pronounced pee-mo) and its area under the curve (AUPIMO). + +# PIMO + +PIMO is a measure of average True Positive Rate (TPR), on each image, across multiple anomaly score thresholds. +The anomaly score thresholds are indexed by an False Positive Rate (FPR) measure on the normal images. + +Each *anomalous* image has its own curve such that the X-axis is shared by all of them. + +At a given threshold: + X-axis: Shared FPR (may vary) + 1. Log of the Average of per-image FPR on normal images. + SEE NOTE BELOW. + Y-axis: per-image TP Rate (TPR), or "Overlap" between the ground truth and the predicted masks. + +*** Note about other shared FPR alternatives *** +The shared FPR metric can be made harder by using the cross-image max (or high-percentile) FPRs instead of the mean. +Rationale: this will further punish models that have exceptional FPs in normal images. +So far there is only one shared FPR metric implemented but others will be added in the future. 
+
+# AUPIMO
+
+`AUPIMO` is the area under each `PIMO` curve with a bounded integration range in terms of shared FPR.
+
+TODO(jpcbertoldo): add ref to paper
+"""
+
+import logging
+import warnings
+from dataclasses import dataclass
+from typing import ClassVar
+
+import numpy as np
+from numpy import ndarray
+
+from . import binclf_curve_numpy
+from .binclf_curve_numpy import Algorithm as BinclfAlgorithm
+from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice
+
+logger = logging.getLogger(__name__)
+
+# =========================================== CONSTANTS ===========================================
+
+
+@dataclass
+class SharedFPRMetric:
+    """Shared FPR metric (x-axis of the PIMO curve)."""
+
+    MEAN_PERIMAGE_FPR: ClassVar[str] = "mean_perimage_fpr"
+
+    METRICS: ClassVar[tuple[str, ...]] = (MEAN_PERIMAGE_FPR,)
+
+
+# =========================================== ARGS VALIDATION ===========================================
+
+
+def _validate_rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None:
+    if not isinstance(rate, float | int):
+        msg = f"Expected rate to be a float or int, but got {type(rate)}."
+        raise TypeError(msg)
+
+    if rate < 0.0 or rate > 1.0:
+        msg = f"Rate `{rate}` is not valid because it must be in [0, 1]."
+        raise ValueError(msg)
+
+    if not zero_ok and rate == 0.0:
+        msg = "Rate cannot be 0."
+        raise ValueError(msg)
+
+    if not one_ok and rate == 1.0:
+        msg = "Rate cannot be 1."
+        raise ValueError(msg)
+
+
+def _validate_fpr_bounds(fpr_bounds: tuple[float, float]) -> None:
+    if not isinstance(fpr_bounds, tuple):
+        msg = f"Expected `fpr_bounds` to be a tuple, but got {type(fpr_bounds)}"
+        raise TypeError(msg)
+
+    if len(fpr_bounds) != 2:
+        msg = f"Expected `fpr_bounds` to be a tuple of length 2, but got {len(fpr_bounds)}"
+        raise ValueError(msg)
+
+    lower, upper = fpr_bounds
+    _validate_rate(lower, zero_ok=False, one_ok=False)
+    _validate_rate(upper, zero_ok=False, one_ok=True)
+
+    if lower >= upper:
+        msg = f"Expected `fpr_bounds[1]` > `fpr_bounds[0]`, but got {fpr_bounds[1]} <= {fpr_bounds[0]}"
+        raise ValueError(msg)
+
+
+def _images_classes_from_masks(masks: ndarray) -> ndarray:
+    """Deduce the image classes from the masks."""
+    binclf_curve_numpy._validate_masks(masks)  # noqa: SLF001
+    return (masks == 1).any(axis=(1, 2)).astype(np.int32)
+
+
+def _validate_atleast_one_anomalous_image(masks: ndarray) -> None:
+    image_classes = _images_classes_from_masks(masks)
+    if (image_classes == 1).sum() == 0:
+        msg = "Expected at least one ANOMALOUS image, but found none."
+        raise ValueError(msg)
+
+
+def _validate_atleast_one_normal_image(masks: ndarray) -> None:
+    image_classes = _images_classes_from_masks(masks)
+    if (image_classes == 0).sum() == 0:
+        msg = "Expected at least one NORMAL image, but found none."
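+        # the shared FPR (the PIMO x-axis) is averaged over the normal images, so at least one is required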
+ raise ValueError(msg) + + +# =========================================== PIMO =========================================== + + +# TODO(jpcbertoldo): missing docstring for `pimo` # noqa: TD003 +def pimo( # noqa: D103 + anomaly_maps: ndarray, + masks: ndarray, + num_threshs: int, + binclf_algorithm: str = BinclfAlgorithm.NUMBA, +) -> tuple[ndarray, ndarray, ndarray, ndarray]: + # validate inputs + binclf_curve_numpy._validate_num_threshs(num_threshs) # noqa: SLF001 + binclf_curve_numpy._validate_anomaly_maps(anomaly_maps) # noqa: SLF001 + binclf_curve_numpy._validate_masks(masks) # noqa: SLF001 + binclf_curve_numpy._validate_same_shape(anomaly_maps, masks) # noqa: SLF001 + _validate_atleast_one_anomalous_image(masks) + _validate_atleast_one_normal_image(masks) + + image_classes = _images_classes_from_masks(masks) + + # the thresholds are computed here so that they can be restrained to the normal images + # therefore getting a better resolution in terms of FPR quantization + # otherwise the function `binclf_curve_numpy.per_image_binclf_curve` would have the range of thresholds + # computed from all the images (normal + anomalous) + threshs = binclf_curve_numpy._get_threshs_minmax_linspace( # noqa: SLF001 + anomaly_maps[image_classes == 0], + num_threshs, + ) + + # N: number of images, K: number of thresholds + # shapes are (K,) and (N, K, 2, 2) + threshs, binclf_curves = binclf_curve_numpy.per_image_binclf_curve( + anomaly_maps=anomaly_maps, + masks=masks, + algorithm=binclf_algorithm, + threshs_choice=BinclfThreshsChoice.GIVEN, + threshs_given=threshs, + num_threshs=None, + ) + + # shape -> (N, K) + per_image_fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) + # TODO(jpcbertoldo): validate per_image_fprs # noqa: TD003 + + # shape -> (K,) + # this is the only shared FPR metric implemented so far, see note about shared FPR in the module's docstring + shared_fpr = per_image_fprs[image_classes == 0].mean(axis=0) + # TODO(jpcbertoldo): validate shared_fpr # noqa: TD003 + + # shape -> (N, K) + per_image_tprs = binclf_curve_numpy.per_image_tpr(binclf_curves) + + return threshs, shared_fpr, per_image_tprs, image_classes + + +def _joint_validate_threshs_shared_fpr(threshs: ndarray, shared_fpr: ndarray) -> None: + if threshs.shape[0] != shared_fpr.shape[0]: + msg = ( + "Expected `threshs` and `shared_fpr` to have the same number of elements, " + f"but got {threshs.shape[0]} != {shared_fpr.shape[0]}" + ) + raise ValueError(msg) + + +# =========================================== AUPIMO =========================================== + + +# TODO(jpcbertoldo): missing docstring for `aupimo` # noqa: TD003 +def aupimo( # noqa: D103 + anomaly_maps: ndarray, + masks: ndarray, + num_threshs: int = 300_000, + binclf_algorithm: str = BinclfAlgorithm.NUMBA, + fpr_bounds: tuple[float, float] = (1e-5, 1e-4), + force: bool = False, +) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]: + # validate inputs + _validate_fpr_bounds(fpr_bounds) + + # other validations are done in the `pimo` function + threshs, shared_fpr, per_image_tprs, image_classes = pimo( + anomaly_maps=anomaly_maps, + masks=masks, + num_threshs=num_threshs, + binclf_algorithm=binclf_algorithm, + ) + + fpr_lower_bound, fpr_upper_bound = fpr_bounds + + # get the threshold indices where the fpr bounds are achieved + fpr_lower_bound_thresh_idx, _, fpr_lower_bound_defacto = thresh_at_shared_fpr_level( + threshs, + shared_fpr, + fpr_lower_bound, + ) + fpr_upper_bound_thresh_idx, _, fpr_upper_bound_defacto = thresh_at_shared_fpr_level( + threshs, + 
+        shared_fpr,
+        fpr_upper_bound,
+    )
+
+    if not np.isclose(fpr_lower_bound_defacto, fpr_lower_bound, rtol=(rtol := 1e-2)):
+        msg = (
+            "The lower bound of the shared FPR integration range is not exactly achieved. "
+            f"Expected {fpr_lower_bound} but got {fpr_lower_bound_defacto}, which is not within {rtol=}."
+        )
+        warnings.warn(msg, RuntimeWarning, stacklevel=1)
+
+    if not np.isclose(fpr_upper_bound_defacto, fpr_upper_bound, rtol=rtol):
+        msg = (
+            "The upper bound of the shared FPR integration range is not exactly achieved. "
+            f"Expected {fpr_upper_bound} but got {fpr_upper_bound_defacto}, which is not within {rtol=}."
+        )
+        warnings.warn(msg, RuntimeWarning, stacklevel=1)
+
+    # reminder: fpr lower/upper bound is threshold upper/lower bound (reversed)
+    thresh_lower_bound_idx = fpr_upper_bound_thresh_idx
+    thresh_upper_bound_idx = fpr_lower_bound_thresh_idx
+
+    # deal with edge cases
+    if thresh_lower_bound_idx >= thresh_upper_bound_idx:
+        msg = (
+            "The thresholds corresponding to the given `fpr_bounds` are not valid because "
+            "they match the same threshold or are in the wrong order. "
+            f"FPR upper/lower = threshold lower/upper = {thresh_lower_bound_idx} and {thresh_upper_bound_idx}."
+        )
+        raise RuntimeError(msg)
+
+    # limit the curves to the integration range [lbound, ubound]
+    shared_fpr_bounded: ndarray = shared_fpr[thresh_lower_bound_idx : (thresh_upper_bound_idx + 1)]
+    per_image_tprs_bounded: ndarray = per_image_tprs[:, thresh_lower_bound_idx : (thresh_upper_bound_idx + 1)]
+
+    # `shared_fpr` and `tprs` are in descending order; `flip()` reverts to ascending order
+    shared_fpr_bounded = np.flip(shared_fpr_bounded)
+    per_image_tprs_bounded = np.flip(per_image_tprs_bounded, axis=1)
+
+    # the log's base does not matter because it's a constant factor canceled by the normalization factor
+    shared_fpr_bounded_log = np.log(shared_fpr_bounded)
+
+    # deal with edge cases
+    invalid_shared_fpr = ~np.isfinite(shared_fpr_bounded_log)
+
+    if invalid_shared_fpr.all():
+        msg = (
+            "Cannot compute AUPIMO because the shared fpr integration range is invalid. "
+            "Try increasing the number of thresholds."
+        )
+        raise RuntimeError(msg)
+
+    if invalid_shared_fpr.any():
+        msg = (
+            "Some values in the shared fpr integration range are nan. "
+            "The AUPIMO will be computed without these values."
+        )
+        warnings.warn(msg, RuntimeWarning, stacklevel=1)
+        logger.warning(msg)
+
+        # get rid of nan values by removing them from the integration range
+        shared_fpr_bounded_log = shared_fpr_bounded_log[~invalid_shared_fpr]
+        per_image_tprs_bounded = per_image_tprs_bounded[:, ~invalid_shared_fpr]
+
+    num_points_integral = shared_fpr_bounded_log.shape[0]
+
+    if num_points_integral <= 30:
+        msg = (
+            "Cannot compute AUPIMO because the shared fpr integration range doesn't have enough points. "
+            f"Found {num_points_integral} points in the integration range. "
+            "Try increasing `num_threshs`."
+        )
+        if not force:
+            raise RuntimeError(msg)
+        msg += " Computation was forced!"
+        warnings.warn(msg, RuntimeWarning, stacklevel=1)
+        logger.warning(msg)
+
+    if num_points_integral < 300:
+        msg = (
+            "The AUPIMO may be inaccurate because the shared fpr integration range doesn't have enough points. "
+            f"Found {num_points_integral} points in the integration range. "
+            "Try increasing `num_threshs`."
+        )
+        warnings.warn(msg, RuntimeWarning, stacklevel=1)
+        logger.warning(msg)
+
+    aucs: ndarray = np.trapz(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1)
+
+    # normalize, then clip(0, 1) makes sure that the values are in [0, 1] in case of numerical errors
+    normalization_factor = _aupimo_max_integral_value(fpr_bounds)
+    aucs = (aucs / normalization_factor).clip(0, 1)
+
+    return threshs, shared_fpr, per_image_tprs, image_classes, aucs
+
+
+# =========================================== AUX ===========================================
+
+
+def thresh_at_shared_fpr_level(threshs: ndarray, shared_fpr: ndarray, fpr_level: float) -> tuple[int, float, float]:
+    """Return the threshold and its index at the given shared FPR level.
+
+    Three cases are possible:
+    - fpr_level == 0: the lowest threshold that achieves 0 FPR is returned
+    - fpr_level == 1: the highest threshold that achieves 1 FPR is returned
+    - 0 < fpr_level < 1: the threshold that achieves the closest (higher or lower) FPR to `fpr_level` is returned
+
+    Args:
+        threshs: thresholds at which the shared FPR was computed.
+        shared_fpr: shared FPR values.
+        fpr_level: shared FPR value at which to get the threshold.
+
+    Returns:
+        tuple[int, float, float]:
+            [0] index of the threshold
+            [1] threshold
+            [2] the actual shared FPR value at the returned threshold
+    """
+    binclf_curve_numpy._validate_threshs(threshs)  # noqa: SLF001
+    # TODO(jpcbertoldo): validate shared_fpr  # noqa: TD003
+    _joint_validate_threshs_shared_fpr(threshs, shared_fpr)
+    _validate_rate(fpr_level, zero_ok=True, one_ok=True)
+
+    shared_fpr_min, shared_fpr_max = shared_fpr.min(), shared_fpr.max()
+
+    if fpr_level < shared_fpr_min:
+        msg = (
+            "Invalid `fpr_level` because it's out of the range of `shared_fpr` = "
+            f"[{shared_fpr_min}, {shared_fpr_max}], and got {fpr_level}."
+        )
+        raise ValueError(msg)
+
+    if fpr_level > shared_fpr_max:
+        msg = (
+            "Invalid `fpr_level` because it's out of the range of `shared_fpr` = "
+            f"[{shared_fpr_min}, {shared_fpr_max}], and got {fpr_level}."
+        )
+        raise ValueError(msg)
+
+    # fpr_level == 0 or 1 are special cases
+    # because there may be multiple solutions, and the chosen index should be their MINIMUM/MAXIMUM, respectively
+    if fpr_level == 0.0:
+        index = np.min(np.where(shared_fpr == fpr_level))
+
+    elif fpr_level == 1.0:
+        index = np.max(np.where(shared_fpr == fpr_level))
+
+    else:
+        index = np.argmin(np.abs(shared_fpr - fpr_level))
+
+    index = int(index)
+    fpr_level_defacto = shared_fpr[index]
+    thresh = threshs[index]
+    return index, thresh, fpr_level_defacto
+
+
+def _aupimo_max_integral_value(fpr_bounds: tuple[float, float]) -> float:
+    """Constant that normalizes the AUPIMO integral to 0-1 range."""
+    _validate_fpr_bounds(fpr_bounds)
+    fpr_lower_bound, fpr_upper_bound = fpr_bounds
+    # the log's base must be the same as the one used in the integration!
+    return float(np.log(fpr_upper_bound / fpr_lower_bound))
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
new file mode 100644
index 0000000000..ec4caa56f6
--- /dev/null
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -0,0 +1,272 @@
+"""Test `anomalib.metrics.per_image.pimo_numpy`."""
+
+import numpy as np
+import pytest
+from numpy import ndarray
+
+
+def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
+    """Generate tests for all functions in this module.
+
+    All functions are parametrized with the same setting: 1 normal and 2 anomalous images.
+ The anomaly maps are the same for all functions, but the masks are different. + """ + expected_threshs = np.arange(1, 7 + 1, dtype=np.float32) + shape = (1000, 1000) # (H, W), 1 million pixels + + # --- normal --- + # histogram of scores: + # value: 7 6 5 4 3 2 1 + # count: 1 9 90 900 9k 90k 900k + # cumsum: 1 10 100 1k 10k 100k 1M + pred_norm = np.ones(1_000_000, dtype=np.float32) + pred_norm[:100_000] += 1 + pred_norm[:10_000] += 1 + pred_norm[:1_000] += 1 + pred_norm[:100] += 1 + pred_norm[:10] += 1 + pred_norm[:1] += 1 + pred_norm = pred_norm.reshape(shape) + mask_norm = np.zeros_like(pred_norm, dtype=np.int32) + + expected_fpr_norm = np.array([1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6], dtype=np.float64) + expected_tpr_norm = np.full((7,), np.nan, dtype=np.float64) + + # --- anomalous --- + pred_anom1 = pred_norm.copy() + mask_anom1 = np.ones_like(pred_anom1, dtype=np.int32) + expected_tpr_anom1 = expected_fpr_norm.copy() + + # only the first 100_000 pixels are anomalous + # which corresponds to the first 100_000 highest scores (2 to 7) + pred_anom2 = pred_norm.copy() + mask_anom2 = np.concatenate([np.ones(100_000), np.zeros(900_000)]).reshape(shape).astype(np.int32) + expected_tpr_anom2 = (10 * expected_fpr_norm).clip(0, 1) + + anomaly_maps = np.stack([pred_norm, pred_anom1, pred_anom2], axis=0) + masks = np.stack([mask_norm, mask_anom1, mask_anom2], axis=0) + + expected_shared_fpr = expected_fpr_norm + expected_per_image_tprs = np.stack([expected_tpr_norm, expected_tpr_anom1, expected_tpr_anom2], axis=0) + expected_image_classes = np.array([0, 1, 1], dtype=np.int32) + + metafunc.parametrize( + argnames=("binclf_algorithm",), + argvalues=[("python",), ("numba",)], + ) + + if metafunc.function is test_pimo or metafunc.function is test_aupimo_values: + metafunc.parametrize( + argnames=( + "anomaly_maps", + "masks", + "expected_threshs", + "expected_shared_fpr", + "expected_per_image_tprs", + "expected_image_classes", + ), + argvalues=[ + ( + anomaly_maps, + masks, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + ), + ( + 10 * anomaly_maps, + masks, + 10 * expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + ), + ], + ) + + if metafunc.function is test_aupimo_values: + metafunc.parametrize( + argnames=( + "fpr_bounds", + "expected_aupimos", # trapezoid surfaces + ), + argvalues=[ + ( + (1e-1, 1.0), + np.array( + [ + np.nan, + # recall: trapezium area = (a + b) * h / 2 + (0.10 + 1.0) * 1 / 2, + (1.0 + 1.0) * 1 / 2, + ], + dtype=np.float64, + ), + ), + ( + (1e-3, 1e-1), + np.array( + [ + np.nan, + # average of two trapezium areas / 2 (normalizing factor) + (((1e-3 + 1e-2) * 1 / 2) + ((1e-2 + 1e-1) * 1 / 2)) / 2, + (((1e-2 + 1e-1) * 1 / 2) + ((1e-1 + 1.0) * 1 / 2)) / 2, + ], + dtype=np.float64, + ), + ), + ( + (1e-5, 1e-4), + np.array( + [ + np.nan, + (1e-5 + 1e-4) * 1 / 2, + (1e-4 + 1e-3) * 1 / 2, + ], + dtype=np.float64, + ), + ), + ], + ) + + if metafunc.function is test_aupimo_edge: + metafunc.parametrize( + argnames=( + "anomaly_maps", + "masks", + ), + argvalues=[ + ( + anomaly_maps, + masks, + ), + ( + 10 * anomaly_maps, + masks, + ), + ], + ) + metafunc.parametrize( + argnames=("fpr_bounds",), + argvalues=[ + ((1e-1, 1.0),), + ((1e-3, 1e-2),), + ((1e-5, 1e-4),), + (None,), + ], + ) + + +def test_pimo( + anomaly_maps: ndarray, + masks: ndarray, + binclf_algorithm: str, + expected_threshs: ndarray, + expected_shared_fpr: ndarray, + expected_per_image_tprs: ndarray, + expected_image_classes: 
ndarray, +) -> None: + """Test if `pimo()` returns the expected values.""" + from anomalib.metrics.per_image import pimo_numpy + + threshs, shared_fpr, per_image_tprs, image_classes = pimo_numpy.pimo( + anomaly_maps, + masks, + num_threshs=7, + binclf_algorithm=binclf_algorithm, + ) + + assert threshs.ndim == 1 + assert shared_fpr.ndim == 1 + assert per_image_tprs.ndim == 2 + assert image_classes.ndim == 1 + + assert np.allclose(threshs, expected_threshs) + assert np.allclose(shared_fpr, expected_shared_fpr) + assert np.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) + assert np.allclose(image_classes, expected_image_classes) + + +def test_aupimo_values( + anomaly_maps: ndarray, + masks: ndarray, + binclf_algorithm: str, + fpr_bounds: tuple[float, float], + expected_threshs: ndarray, + expected_shared_fpr: ndarray, + expected_per_image_tprs: ndarray, + expected_image_classes: ndarray, + expected_aupimos: ndarray, +) -> None: + """Test if `aupimo()` returns the expected values.""" + from anomalib.metrics.per_image import pimo_numpy + + threshs, shared_fpr, per_image_tprs, image_classes, aupimos = pimo_numpy.aupimo( + anomaly_maps, + masks, + num_threshs=7, + binclf_algorithm=binclf_algorithm, + fpr_bounds=fpr_bounds, + force=True, + ) + + assert threshs.ndim == 1 + assert shared_fpr.ndim == 1 + assert per_image_tprs.ndim == 2 + assert image_classes.ndim == 1 + + assert np.allclose(threshs, expected_threshs) + assert np.allclose(shared_fpr, expected_shared_fpr) + assert np.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) + assert np.allclose(image_classes, expected_image_classes) + + assert aupimos.ndim == 1 + assert aupimos.shape == (3,) + assert np.allclose(aupimos, expected_aupimos, equal_nan=True) + + +def test_aupimo_edge( + anomaly_maps: ndarray, + masks: ndarray, + binclf_algorithm: str, + fpr_bounds: tuple[float, float], +) -> None: + """Test some edge cases.""" + from anomalib.metrics.per_image import pimo_numpy + + # None is the case of testing the default bounds + fpr_bounds = {"fpr_bounds": fpr_bounds} if fpr_bounds is not None else {} + + # not enough points on the curve + # 10 threshs / 6 decades = 1.6 threshs per decade < 3 + with pytest.raises(RuntimeError): # force=False --> raise error + pimo_numpy.aupimo( + anomaly_maps, + masks, + num_threshs=10, + binclf_algorithm=binclf_algorithm, + force=False, + **fpr_bounds, + ) + + with pytest.warns(RuntimeWarning): # force=True --> warn + pimo_numpy.aupimo( + anomaly_maps, + masks, + num_threshs=10, + binclf_algorithm=binclf_algorithm, + force=True, + **fpr_bounds, + ) + + # default number of points on the curve (300k threshs) should be enough + rng = np.random.default_rng(42) + pimo_numpy.aupimo( + anomaly_maps * rng.uniform(1.0, 1.1, size=anomaly_maps.shape), + masks, + # num_threshs=, + binclf_algorithm=binclf_algorithm, + force=False, + **fpr_bounds, + ) From c864b540bbbf7142ba9a6dbc6b5d3bc5931adeb3 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Fri, 22 Dec 2023 15:41:43 +0100 Subject: [PATCH 11/57] move validation Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 19 +++++++++++++ .../metrics/per_image/binclf_curve.py | 28 ++++++------------- 2 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 src/anomalib/metrics/per_image/_validate.py diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py new file mode 
100644 index 0000000000..68c77df17f --- /dev/null +++ b/src/anomalib/metrics/per_image/_validate.py @@ -0,0 +1,19 @@ +"""Utils for validating arguments and results. + +`torch` is imported in the functions that use it, so this module can be used in numpy-standalone mode. +""" + +from __future__ import annotations + +from typing import Any + + +def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401 + """Validate that `tensor` is a `torch.Tensor`.""" + from torch import Tensor + + argname = f"'{argname}'" if argname is not None else "argument" + + if not isinstance(tensor, Tensor): + msg = f"Expected {argname} to be a tensor, but got {type(tensor)}" + raise TypeError(msg) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index c1d353cf7c..b3d88e14c9 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -4,6 +4,9 @@ Check its docstring for more details. Tensors are build with `torch.from_numpy` and so the returned tensors will share the same memory as the numpy arrays. + +Validations will preferably happen in ndarray so the numpy code can be reused without torch, +so often times the Tensor arguments will be converted to ndarray and then validated. """ from __future__ import annotations @@ -11,32 +14,19 @@ import torch from torch import Tensor -from . import binclf_curve_numpy +from . import _validate, binclf_curve_numpy from .binclf_curve_numpy import Algorithm, ThreshsChoice # =========================================== ARGS VALIDATION =========================================== -def _validate_is_tensor(tensor: Tensor, argname: str | None = None) -> None: - """Validate that `tensor` is a tensor and convert it to a numpy ndarray. - - Validations will preferably happen in ndarray so the numpy code can be reused without torch, - so often times the Tensor arguments will be converted to ndarray and then validated. - """ - argname = f"'{argname}'" if argname is not None else "argument" - - if not isinstance(tensor, Tensor): - msg = f"Expected {argname} to be a tensor, but got {type(tensor)}" - raise TypeError(msg) - - def _validate_threshs(threshs: Tensor) -> None: - _validate_is_tensor(threshs, argname="threshs") + _validate.is_tensor(threshs, argname="threshs") binclf_curve_numpy._validate_threshs(threshs.detach().cpu().numpy()) # noqa: SLF001 def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None: - _validate_is_tensor(binclf_curves, argname="binclf_curves") + _validate.is_tensor(binclf_curves, argname="binclf_curves") if valid_threshs is not None: _validate_threshs(valid_threshs) binclf_curve_numpy._validate_binclf_curves( # noqa: SLF001 @@ -104,14 +94,14 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. 
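+
+    A usage sketch (with this module's names): `threshs, binclf_curves = per_image_binclf_curve(anomaly_maps,
+    masks, num_threshs=300)`, then e.g. `per_image_tpr(binclf_curves)` / `per_image_fpr(binclf_curves)` for the
+    rate curves; `num_threshs=300` here is only an illustrative choice.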
""" - _validate_is_tensor(anomaly_maps, argname="anomaly_maps") + _validate.is_tensor(anomaly_maps, argname="anomaly_maps") anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - _validate_is_tensor(masks, argname="masks") + _validate.is_tensor(masks, argname="masks") masks_array = masks.detach().cpu().numpy() if threshs_given is not None: - _validate_is_tensor(threshs_given, argname="threshs_given") + _validate.is_tensor(threshs_given, argname="threshs_given") threshs_given_array = threshs_given.detach().cpu().numpy() else: threshs_given_array = None From a3d1060c97cddaf537869c5d27929b8576339fb9 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Fri, 22 Dec 2023 18:35:40 +0100 Subject: [PATCH 12/57] add `shared_fpr_metric` option Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo_numpy.py | 24 ++++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 8f0fdd2b3f..93df1e6ae2 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -46,7 +46,7 @@ class SharedFPRMetric: """Shared FPR metric (x-axis of the PIMO curve).""" - MEAN_PERIMAGE_FPR: ClassVar[str] = "mean_perimage_fpr" + MEAN_PERIMAGE_FPR: ClassVar[str] = "mean-per-image-fpr" METRICS: ClassVar[tuple[str, ...]] = (MEAN_PERIMAGE_FPR,) @@ -119,6 +119,7 @@ def pimo( # noqa: D103 masks: ndarray, num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> tuple[ndarray, ndarray, ndarray, ndarray]: # validate inputs binclf_curve_numpy._validate_num_threshs(num_threshs) # noqa: SLF001 @@ -150,13 +151,20 @@ def pimo( # noqa: D103 num_threshs=None, ) - # shape -> (N, K) - per_image_fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) - # TODO(jpcbertoldo): validate per_image_fprs # noqa: TD003 + shared_fpr: ndarray + if shared_fpr_metric == SharedFPRMetric.MEAN_PERIMAGE_FPR: + # shape -> (N, K) + per_image_fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) + # TODO(jpcbertoldo): validate per_image_fprs # noqa: TD003 + + # shape -> (K,) + # this is the only shared FPR metric implemented so far, see note about shared FPR in the module's docstring + shared_fpr = per_image_fprs[image_classes == 0].mean(axis=0) + + else: + msg = f"Shared FPR metric `{shared_fpr_metric}` is not implemented." 
+ raise NotImplementedError(msg)

- # shape -> (K,)
- # this is the only shared FPR metric implemented so far, see note about shared FPR in the module's docstring
- shared_fpr = per_image_fprs[image_classes == 0].mean(axis=0)
 # TODO(jpcbertoldo): validate shared_fpr # noqa: TD003

 # shape -> (N, K)
@@ -183,6 +191,7 @@ def aupimo( # noqa: D103
 masks: ndarray,
 num_threshs: int = 300_000,
 binclf_algorithm: str = BinclfAlgorithm.NUMBA,
+ shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
 fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
 force: bool = False,
 ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
@@ -195,6 +204,7 @@ def aupimo( # noqa: D103
 masks=masks,
 num_threshs=num_threshs,
 binclf_algorithm=binclf_algorithm,
+ shared_fpr_metric=shared_fpr_metric,
 )

 fpr_lower_bound, fpr_upper_bound = fpr_bounds

From bfc287eb7e90b895a9be24b97f84cc0a2803b2e2 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Fri, 22 Dec 2023 20:12:37 +0100
Subject: [PATCH 13/57] add pimo torch functional version and test

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_image/pimo.py | 211 +++++++++++++++++++
 tests/unit/metrics/per_image/test_pimo.py | 241 ++++++++++++++++------
 2 files changed, 390 insertions(+), 62 deletions(-)
 create mode 100644 src/anomalib/metrics/per_image/pimo.py

diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py
new file mode 100644
index 0000000000..22e380e65d
--- /dev/null
+++ b/src/anomalib/metrics/per_image/pimo.py
@@ -0,0 +1,211 @@
+"""Per-Image Overlap curve (PIMO, pronounced pee-mo) and its area under the curve (AUPIMO).
+
+This module implements torch interfaces to access the numpy code in `pimo_numpy.py`.
+Check its docstring for more details.
+
+Tensors are built with `torch.from_numpy` and so the returned tensors will share the same memory as the numpy arrays.
+
+Validations will preferably happen in ndarray so the numpy code can be reused without torch,
+so often the Tensor arguments will be converted to ndarray and then validated.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+import torch
+from torch import Tensor
+
+from . import _validate, pimo_numpy
+from .binclf_curve_numpy import Algorithm as BinclfAlgorithm
+from .pimo_numpy import SharedFPRMetric
+
+# =========================================== ARGS VALIDATION ===========================================
+
+
+# =========================================== RESULT OBJECT ===========================================
+
+
+# TODO(jpcbertoldo): missing docstring for `PIMOResult` # noqa: TD003
+@dataclass
+class PIMOResult: # noqa: D101
+ # metadata
+ shared_fpr_metric: str
+
+ # data
+ threshs: Tensor = field(repr=False)
+ shared_fpr: Tensor = field(repr=False)
+ per_image_tprs: Tensor = field(repr=False)
+
+ @property
+ def num_threshs(self) -> int:
+ """Number of thresholds."""
+ return self.threshs.shape[0]
+
+ @property
+ def num_images(self) -> int:
+ """Number of images."""
+ return self.per_image_tprs.shape[0]
+
+ @property
+ def image_classes(self) -> Tensor:
+ """Image classes (0: normal, 1: anomalous)."""
+ return (self.per_image_tprs.flatten(1) == 1).any(dim=1).to(torch.int32)
+
+ def thresh_at(self, fpr_level: float) -> tuple[int, float, float]:
+ """Return the threshold at the given shared FPR.
+
+ See `anomalib.utils.metrics.per_image.pimo_numpy.thresh_at_shared_fpr_level` for details.
+
+ Args:
+ fpr_level (float): shared FPR level
+
+ Returns:
+ tuple[int, float, float]:
+ [0] index of the threshold
+ [1] threshold
+ [2] the actual shared FPR value at the returned threshold
+ """
+ return pimo_numpy.thresh_at_shared_fpr_level(
+ self.threshs.numpy(),
+ self.shared_fpr.numpy(),
+ fpr_level,
+ )
+
+
+# TODO(jpcbertoldo): missing docstring for `AUPIMOResult` # noqa: TD003
+# TODO(jpcbertoldo): change `aucs` in the paper supp mat to `aupimos` # noqa: TD003
+@dataclass
+class AUPIMOResult: # noqa: D101
+ # metadata
+ shared_fpr_metric: str
+ fpr_lower_bound: float
+ fpr_upper_bound: float
+ num_threshs: int
+
+ # data
+ thresh_lower_bound: float = field(repr=False)
+ thresh_upper_bound: float = field(repr=False)
+ aupimos: Tensor = field(repr=False)
+
+ @property
+ def num_images(self) -> int:
+ """Number of images."""
+ return self.aupimos.shape[0]
+
+ @property
+ def image_classes(self) -> Tensor:
+ """Image classes (0: normal, 1: anomalous)."""
+ # if an instance has `nan` aupimo it's because it's a normal image
+ return self.aupimos.isnan().to(torch.int32)
+
+
+# =========================================== FUNCTIONAL ===========================================
+
+
+# TODO(jpcbertoldo): missing docstring for `pimo` # noqa: TD003
+def pimo( # noqa: D103
+ anomaly_maps: Tensor,
+ masks: Tensor,
+ num_threshs: int,
+ binclf_algorithm: str = BinclfAlgorithm.NUMBA,
+ shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
+) -> PIMOResult:
+ _validate.is_tensor(anomaly_maps, argname="anomaly_maps")
+ anomaly_maps_array = anomaly_maps.detach().cpu().numpy()
+
+ _validate.is_tensor(masks, argname="masks")
+ masks_array = masks.detach().cpu().numpy()
+
+ # other validations are done in the numpy code
+ threshs_array, shared_fpr_array, per_image_tprs_array, _ = pimo_numpy.pimo(
+ anomaly_maps_array,
+ masks_array,
+ num_threshs,
+ binclf_algorithm=binclf_algorithm,
+ shared_fpr_metric=shared_fpr_metric,
+ )
+ # _ is `image_classes` -- not needed here because it's a property in the result object
+
+ # tensors are built with `torch.from_numpy` and so the returned tensors
+ # will share the same memory as the numpy arrays
+ device = anomaly_maps.device
+ # N: number of images, K: number of thresholds
+ # shape => (K,)
+ threshs = torch.from_numpy(threshs_array).to(device)
+ # shape => (K,)
+ shared_fpr = torch.from_numpy(shared_fpr_array).to(device)
+ # shape => (N, K)
+ per_image_tprs = torch.from_numpy(per_image_tprs_array).to(device)
+
+ return PIMOResult(
+ shared_fpr_metric=shared_fpr_metric,
+ threshs=threshs,
+ shared_fpr=shared_fpr,
+ per_image_tprs=per_image_tprs,
+ )
+
+
+# TODO(jpcbertoldo): missing docstring for `aupimo` # noqa: TD003
+def aupimo( # noqa: D103
+ anomaly_maps: Tensor,
+ masks: Tensor,
+ num_threshs: int = 300_000,
+ binclf_algorithm: str = BinclfAlgorithm.NUMBA,
+ shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
+ fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
+ force: bool = False,
+) -> tuple[PIMOResult, AUPIMOResult]:
+ _validate.is_tensor(anomaly_maps, argname="anomaly_maps")
+ anomaly_maps_array = anomaly_maps.detach().cpu().numpy()
+
+ _validate.is_tensor(masks, argname="masks")
+ masks_array = masks.detach().cpu().numpy()
+
+ # other validations are done in the numpy code
+
+ threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array = pimo_numpy.aupimo(
+ anomaly_maps_array,
+ masks_array,
+ num_threshs,
+ binclf_algorithm=binclf_algorithm,
+ shared_fpr_metric=shared_fpr_metric,
+ fpr_bounds=fpr_bounds,
+ force=force,
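+ # note: `force=True` only downgrades the "not enough points on the curve" error to a warning (cf. `test_aupimo_edge`)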
+ )
+
+ # tensors are built with `torch.from_numpy` and so the returned tensors
+ # will share the same memory as the numpy arrays
+ device = anomaly_maps.device
+ # N: number of images, K: number of thresholds
+ # shape => (K,)
+ threshs = torch.from_numpy(threshs_array).to(device)
+ # shape => (K,)
+ shared_fpr = torch.from_numpy(shared_fpr_array).to(device)
+ # shape => (N, K)
+ per_image_tprs = torch.from_numpy(per_image_tprs_array).to(device)
+ # shape => (N,)
+ aupimos = torch.from_numpy(aupimos_array).to(device)
+
+ pimoresult = PIMOResult(
+ shared_fpr_metric=shared_fpr_metric,
+ threshs=threshs,
+ shared_fpr=shared_fpr,
+ per_image_tprs=per_image_tprs,
+ )
+ fpr_lower_bound, fpr_upper_bound = fpr_bounds
+ # recall: the FPR upper/lower bounds correspond to the thresh lower/upper bounds, respectively
+ # `_` is the threshold's index, `__` is the actual fpr value
+ _, thresh_lower_bound, __ = pimoresult.thresh_at(fpr_upper_bound)
+ _, thresh_upper_bound, __ = pimoresult.thresh_at(fpr_lower_bound)
+ return (
+ pimoresult,
+ AUPIMOResult(
+ shared_fpr_metric=shared_fpr_metric,
+ fpr_lower_bound=fpr_lower_bound,
+ fpr_upper_bound=fpr_upper_bound,
+ num_threshs=num_threshs,
+ thresh_lower_bound=thresh_lower_bound,
+ thresh_upper_bound=thresh_upper_bound,
+ aupimos=aupimos,
+ ),
+ )
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
index ec4caa56f6..a7308f9afa 100644
--- a/tests/unit/metrics/per_image/test_pimo.py
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -2,7 +2,9 @@
 import numpy as np
 import pytest
+import torch
 from numpy import ndarray
+from torch import Tensor


 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
@@ -55,7 +57,35 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
 argvalues=[("python",), ("numba",)],
 )

- if metafunc.function is test_pimo or metafunc.function is test_aupimo_values:
+ if (
+ metafunc.function is test_pimo_numpy
+ or metafunc.function is test_pimo
+ or metafunc.function is test_aupimo_values_numpy
+ or metafunc.function is test_aupimo_values
+ ):
+ argvalues_arrays = [
+ (
+ anomaly_maps,
+ masks,
+ expected_threshs,
+ expected_shared_fpr,
+ expected_per_image_tprs,
+ expected_image_classes,
+ ),
+ (
+ 10 * anomaly_maps,
+ masks,
+ 10 * expected_threshs,
+ expected_shared_fpr,
+ expected_per_image_tprs,
+ expected_image_classes,
+ ),
+ ]
+ argvalues_tensors = [
+ tuple(torch.from_numpy(arg) if isinstance(arg, ndarray) else arg for arg in arvals)
+ for arvals in argvalues_arrays
+ ]
+ argvalues = argvalues_arrays if "numpy" in metafunc.function.__name__ else argvalues_tensors
 metafunc.parametrize(
 argnames=(
 "anomaly_maps",
(normalizing factor) + (((1e-3 + 1e-2) * 1 / 2) + ((1e-2 + 1e-1) * 1 / 2)) / 2, + (((1e-2 + 1e-1) * 1 / 2) + ((1e-1 + 1.0) * 1 / 2)) / 2, + ], + dtype=np.float64, + ), + ), + ( + (1e-5, 1e-4), + np.array( + [ + np.nan, + (1e-5 + 1e-4) * 1 / 2, + (1e-4 + 1e-3) * 1 / 2, + ], + dtype=np.float64, + ), + ), + ] + argvalues_tensors = [ + tuple(torch.from_numpy(arg) if isinstance(arg, ndarray) else arg for arg in arvals) + for arvals in argvalues_arrays + ] + argvalues = argvalues_arrays if "numpy" in metafunc.function.__name__ else argvalues_tensors metafunc.parametrize( argnames=( "fpr_bounds", "expected_aupimos", # trapezoid surfaces ), - argvalues=[ - ( - (1e-1, 1.0), - np.array( - [ - np.nan, - # recall: trapezium area = (a + b) * h / 2 - (0.10 + 1.0) * 1 / 2, - (1.0 + 1.0) * 1 / 2, - ], - dtype=np.float64, - ), - ), - ( - (1e-3, 1e-1), - np.array( - [ - np.nan, - # average of two trapezium areas / 2 (normalizing factor) - (((1e-3 + 1e-2) * 1 / 2) + ((1e-2 + 1e-1) * 1 / 2)) / 2, - (((1e-2 + 1e-1) * 1 / 2) + ((1e-1 + 1.0) * 1 / 2)) / 2, - ], - dtype=np.float64, - ), - ), - ( - (1e-5, 1e-4), - np.array( - [ - np.nan, - (1e-5 + 1e-4) * 1 / 2, - (1e-4 + 1e-3) * 1 / 2, - ], - dtype=np.float64, - ), - ), - ], + argvalues=argvalues, ) if metafunc.function is test_aupimo_edge: @@ -158,7 +177,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ) -def test_pimo( +def test_pimo_numpy( anomaly_maps: ndarray, masks: ndarray, binclf_algorithm: str, @@ -175,6 +194,7 @@ def test_pimo( masks, num_threshs=7, binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", ) assert threshs.ndim == 1 @@ -185,10 +205,48 @@ def test_pimo( assert np.allclose(threshs, expected_threshs) assert np.allclose(shared_fpr, expected_shared_fpr) assert np.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) - assert np.allclose(image_classes, expected_image_classes) + assert (image_classes == expected_image_classes).all() -def test_aupimo_values( +def test_pimo( + anomaly_maps: Tensor, + masks: Tensor, + binclf_algorithm: str, + expected_threshs: Tensor, + expected_shared_fpr: Tensor, + expected_per_image_tprs: Tensor, + expected_image_classes: Tensor, +) -> None: + """Test if `pimo()` returns the expected values.""" + from anomalib.metrics.per_image import pimo + + pimoresult = pimo.pimo( + anomaly_maps, + masks, + num_threshs=7, + binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", + ) + threshs = pimoresult.threshs + shared_fpr = pimoresult.shared_fpr + per_image_tprs = pimoresult.per_image_tprs + image_classes = pimoresult.image_classes + + # metadata + assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" + # data + assert threshs.ndim == 1 + assert shared_fpr.ndim == 1 + assert per_image_tprs.ndim == 2 + assert image_classes.ndim == 1 + + assert torch.allclose(threshs, expected_threshs) + assert torch.allclose(shared_fpr, expected_shared_fpr) + assert torch.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) + assert (image_classes == expected_image_classes).all() + + +def test_aupimo_values_numpy( anomaly_maps: ndarray, masks: ndarray, binclf_algorithm: str, @@ -207,6 +265,7 @@ def test_aupimo_values( masks, num_threshs=7, binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", fpr_bounds=fpr_bounds, force=True, ) @@ -219,13 +278,71 @@ def test_aupimo_values( assert np.allclose(threshs, expected_threshs) assert np.allclose(shared_fpr, expected_shared_fpr) assert np.allclose(per_image_tprs, 
expected_per_image_tprs, equal_nan=True) - assert np.allclose(image_classes, expected_image_classes) + assert (image_classes == expected_image_classes).all() assert aupimos.ndim == 1 assert aupimos.shape == (3,) assert np.allclose(aupimos, expected_aupimos, equal_nan=True) +def test_aupimo_values( + anomaly_maps: ndarray, + masks: ndarray, + binclf_algorithm: str, + fpr_bounds: tuple[float, float], + expected_threshs: ndarray, + expected_shared_fpr: ndarray, + expected_per_image_tprs: ndarray, + expected_image_classes: ndarray, + expected_aupimos: ndarray, +) -> None: + """Test if `aupimo()` returns the expected values.""" + from anomalib.metrics.per_image import pimo + + pimoresult, aupimoresult = pimo.aupimo( + anomaly_maps, + masks, + num_threshs=7, + binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", + fpr_bounds=fpr_bounds, + force=True, + ) + + # from pimo result + threshs = pimoresult.threshs + shared_fpr = pimoresult.shared_fpr + per_image_tprs = pimoresult.per_image_tprs + image_classes = pimoresult.image_classes + + # from aupimo result + fpr_lower_bound = aupimoresult.fpr_lower_bound + fpr_upper_bound = aupimoresult.fpr_upper_bound + thresh_lower_bound = aupimoresult.thresh_lower_bound + thresh_upper_bound = aupimoresult.thresh_upper_bound + num_threshs = aupimoresult.num_threshs + aupimos = aupimoresult.aupimos + + # test metadata + assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" + assert aupimoresult.shared_fpr_metric == "mean-per-image-fpr" + assert (fpr_lower_bound, fpr_upper_bound) == fpr_bounds + assert num_threshs == 7 + + # test data + assert threshs.ndim == 1 + assert shared_fpr.ndim == 1 + assert per_image_tprs.ndim == 2 + assert image_classes.ndim == 1 + assert anomaly_maps.min() <= thresh_lower_bound < thresh_upper_bound <= anomaly_maps.max() + + assert torch.allclose(threshs, expected_threshs) + assert torch.allclose(shared_fpr, expected_shared_fpr) + assert torch.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) + assert (image_classes == expected_image_classes).all() + assert torch.allclose(aupimos, expected_aupimos, equal_nan=True) + + def test_aupimo_edge( anomaly_maps: ndarray, masks: ndarray, @@ -236,7 +353,7 @@ def test_aupimo_edge( from anomalib.metrics.per_image import pimo_numpy # None is the case of testing the default bounds - fpr_bounds = {"fpr_bounds": fpr_bounds} if fpr_bounds is not None else {} + fpr_bounds = {"fpr_bounds": fpr_bounds, "shared_fpr_metric": "mean-per-image-fpr"} if fpr_bounds is not None else {} # not enough points on the curve # 10 threshs / 6 decades = 1.6 threshs per decade < 3 From 60483fc5b1d1c75ca1ac8570566bef5d48350b7c Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Fri, 22 Dec 2023 20:36:57 +0100 Subject: [PATCH 14/57] add torchmetrics interface and test Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 68 +++++ .../metrics/per_image/binclf_curve.py | 4 +- .../metrics/per_image/binclf_curve_numpy.py | 37 ++- src/anomalib/metrics/per_image/pimo.py | 214 ++++++++++++++- src/anomalib/metrics/per_image/pimo_numpy.py | 95 +++---- .../metrics/per_image/test_binclf_curve.py | 4 +- tests/unit/metrics/per_image/test_pimo.py | 251 +++++++++++++----- 7 files changed, 526 insertions(+), 147 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 68c77df17f..eb2fe61c45 100644 --- 
a/src/anomalib/metrics/per_image/_validate.py
+++ b/src/anomalib/metrics/per_image/_validate.py
@@ -17,3 +17,71 @@ def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401
 if not isinstance(tensor, Tensor):
 msg = f"Expected {argname} to be a tensor, but got {type(tensor)}"
 raise TypeError(msg)
+
+
+def num_threshs(num_threshs: int) -> None:
+ """Validate that `num_threshs` is a positive integer > 2."""
+ if not isinstance(num_threshs, int):
+ msg = f"Expected `num_threshs` to be an integer, but got {type(num_threshs)}"
+ raise TypeError(msg)
+
+ if num_threshs < 2:
+ msg = f"If argument `num_threshs` is an integer, expected it to be larger than 1, but got {num_threshs}"
+ raise ValueError(msg)
+
+
+def same_shape(*args) -> None:
+ """Validate that all arguments have the same shape (works for both tensors and ndarrays)."""
+ assert len(args) > 0
+ shapes = sorted({tuple(arg.shape) for arg in args})
+ if len(shapes) > 1:
+ msg = f"Expected arguments to have the same shape, but got {shapes}"
+ raise ValueError(msg)
+
+
+def rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None:
+ """Validates a rate parameter.
+
+ Args:
+ rate (float | int): The rate to be validated.
+ zero_ok (bool): Flag indicating if rate can be 0.
+ one_ok (bool): Flag indicating if rate can be 1.
+ """
+ if not isinstance(rate, float | int):
+ msg = f"Expected rate to be a float or int, but got {type(rate)}."
+ raise TypeError(msg)
+
+ if rate < 0.0 or rate > 1.0:
+ msg = f"Rate `{rate}` is not valid because it must be in [0, 1]."
+ raise ValueError(msg)
+
+ if not zero_ok and rate == 0.0:
+ msg = "Rate cannot be 0."
+ raise ValueError(msg)
+
+ if not one_ok and rate == 1.0:
+ msg = "Rate cannot be 1."
+ raise ValueError(msg)
+
+
+def rate_range(bounds: tuple[float, float]) -> None:
+ """Validates the range of rates within `bounds`.
+
+ Args:
+ bounds (tuple[float, float]): The lower and upper bounds of the rates.
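+
+ Example (illustrating the rules above):
+ >>> rate_range((1e-5, 1e-4)) # ok: both rates in (0, 1] and lower < upper
+ >>> rate_range((1e-4, 1e-5)) # raises ValueError (lower >= upper)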
+ """ + if not isinstance(bounds, tuple): + msg = f"Expected `bounds` to be a tuple, but got {type(bounds)}" + raise TypeError(msg) + + if len(bounds) != 2: + msg = f"Expected `bounds` to be a tuple of length 2, but got {len(bounds)}" + raise ValueError(msg) + + lower, upper = bounds + rate(lower, zero_ok=False, one_ok=False) + rate(upper, zero_ok=False, one_ok=True) + + if lower >= upper: + msg = f"Expected `bounds[1]` > `bounds[0]`, but got {bounds[1]} <= {bounds[0]}" + raise ValueError(msg) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index b3d88e14c9..c6077fef9a 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -22,7 +22,7 @@ def _validate_threshs(threshs: Tensor) -> None: _validate.is_tensor(threshs, argname="threshs") - binclf_curve_numpy._validate_threshs(threshs.detach().cpu().numpy()) # noqa: SLF001 + binclf_curve_numpy._validate_threshs(threshs.numpy()) # noqa: SLF001 def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None: @@ -31,7 +31,7 @@ def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None _validate_threshs(valid_threshs) binclf_curve_numpy._validate_binclf_curves( # noqa: SLF001 binclf_curves.detach().cpu().numpy(), - valid_threshs=valid_threshs.detach().cpu().numpy() if valid_threshs is not None else None, + valid_threshs=valid_threshs.numpy() if valid_threshs is not None else None, ) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 6e5bbeaf9d..622070caac 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -21,6 +21,8 @@ HAS_NUMBA = True from . import _binclf_curve_numba +from . 
import _validate + logger = logging.getLogger(__name__) # =========================================== CONSTANTS =========================================== @@ -34,6 +36,13 @@ class Algorithm: NUMBA: ClassVar[str] = "numba" ALGORITHMS: ClassVar[tuple[str, ...]] = (PYTHON, NUMBA) + @staticmethod + def validate(algorithm: str) -> None: + """Validate `algorithm` argument.""" + if algorithm not in Algorithm.ALGORITHMS: + msg = f"Expected `algorithm` to be one of {Algorithm.ALGORITHMS}, but got {algorithm}" + raise ValueError(msg) + @dataclass class ThreshsChoice: @@ -106,16 +115,6 @@ def _validate_threshs(threshs: ndarray) -> None: raise ValueError(msg) -def _validate_num_threshs(num_threshs: int) -> None: - if not isinstance(num_threshs, int): - msg = f"Expected `num_threshs` to be an integer, but got {type(num_threshs)}" - raise TypeError(msg) - - if num_threshs < 2: - msg = f"If argument `num_threshs` is an integer, expected it to be larger than 1, but got {num_threshs}" - raise ValueError(msg) - - def _validate_thresh_bounds(thresh_bounds: tuple[float, float]) -> None: if not isinstance(thresh_bounds, tuple): msg = f"Expected `thresh_bounds` to be a tuple, but got {type(thresh_bounds)}" @@ -182,14 +181,6 @@ def _validate_masks(masks: ndarray) -> None: raise TypeError(msg) -def _validate_same_shape(*args) -> None: - assert len(args) > 0 - shapes = [tuple(arg.shape) for arg in args] - if not all(shape == shapes[0] for shape in shapes): - msg = f"Expecteds to have the same shape, but got {shapes}" - raise ValueError(msg) - - def _validate_binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None: if not isinstance(binclf_curves, ndarray): msg = f"Expected `binclf_curves` to be an ndarray, but got {type(binclf_curves)}" @@ -365,9 +356,10 @@ def binclf_multiple_curves( Thresholds are sorted in ascending order. """ + Algorithm.validate(algorithm) _validate_scores_batch(scores_batch) _validate_gts_batch(gts_batch) - _validate_same_shape(scores_batch, gts_batch) + _validate.same_shape(scores_batch, gts_batch) _validate_threshs(threshs) if algorithm == Algorithm.PYTHON: @@ -390,7 +382,7 @@ def binclf_multiple_curves( def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> ndarray: """Get thresholds linearly spaced between the min and max of the anomaly maps.""" - _validate_num_threshs(num_threshs) + _validate.num_threshs(num_threshs) # this operation can be a bit expensive thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) try: @@ -454,10 +446,10 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. 
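+
+ Note: with `tn, fp, fn, tp = binclf_curves[i, k].ravel()`, the sums `tn + fp` and `fn + tp`
+ are constant across `k`; they are the numbers of normal and anomalous pixels in image `i`.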
""" - # validate inputs + Algorithm.validate(algorithm) _validate_anomaly_maps(anomaly_maps) _validate_masks(masks) - _validate_same_shape(anomaly_maps, masks) + _validate.same_shape(anomaly_maps, masks) threshs: ndarray @@ -476,6 +468,7 @@ def per_image_binclf_curve( logger.warning( f"Argument `threshs_given` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", ) + # `num_threshs` is validated in the function below threshs = _get_threshs_minmax_linspace(anomaly_maps, num_threshs) elif threshs_choice == ThreshsChoice.MEAN_FPR_OPTIMIZED: diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 22e380e65d..deff383d19 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -10,18 +10,30 @@ """ from __future__ import annotations +import warnings from dataclasses import dataclass, field import torch from torch import Tensor +from torchmetrics import Metric -from . import _validate, pimo_numpy +from . import _validate, binclf_curve_numpy, pimo_numpy from .binclf_curve_numpy import Algorithm as BinclfAlgorithm from .pimo_numpy import SharedFPRMetric # =========================================== ARGS VALIDATION =========================================== +def _validate_anomaly_maps(anomaly_maps: Tensor) -> None: + _validate.is_tensor(anomaly_maps, argname="anomaly_maps") + binclf_curve_numpy._validate_anomaly_maps(anomaly_maps.numpy()) # noqa: SLF001 + + +def _validate_masks(masks: Tensor) -> None: + _validate.is_tensor(masks, argname="masks") + binclf_curve_numpy._validate_masks(masks.numpy()) # noqa: SLF001 + + # =========================================== RESULT OBJECT =========================================== @@ -98,6 +110,22 @@ def image_classes(self) -> Tensor: # if an instance has `nan` aupimo it's because it's a normal image return self.aupimos.isnan().to(torch.int32) + @property + def fpr_bounds(self) -> tuple[float, float]: + """Lower and upper bounds of the FPR integration range.""" + return self.fpr_lower_bound, self.fpr_upper_bound + + @property + def thresh_bounds(self) -> tuple[float, float]: + """Lower and upper bounds of the threshold integration range. + + Recall: they correspond to the FPR bounds in reverse order. 
+ I.e.:
+ fpr_lower_bound --> thresh_upper_bound
+ fpr_upper_bound --> thresh_lower_bound
+ """
+ return self.thresh_lower_bound, self.thresh_upper_bound
+

 # =========================================== FUNCTIONAL ===========================================

@@ -209,3 +237,187 @@ def aupimo( # noqa: D103
 aupimos=aupimos,
 ),
 )
+
+
+# =========================================== TORCHMETRICS ===========================================
+
+
+# TODO(jpcbertoldo): missing docstring for `PIMO` # noqa: TD003
+class PIMO(Metric): # noqa: D101
+ is_differentiable: bool = False
+ higher_is_better: bool | None = None
+ full_state_update: bool = False
+
+ num_threshs: int
+ binclf_algorithm: str
+ shared_fpr_metric: str
+
+ anomaly_maps: list[Tensor]
+ masks: list[Tensor]
+
+ @property
+ def is_empty(self) -> bool:
+ """Return True if the metric has not been updated yet."""
+ return len(self.anomaly_maps) == 0
+
+ @property
+ def num_images(self) -> int:
+ """Number of images."""
+ return sum([am.shape[0] for am in self.anomaly_maps])
+
+ @property
+ def image_classes(self) -> Tensor:
+ """Image classes (0: normal, 1: anomalous)."""
+ return pimo_numpy._images_classes_from_masks(torch.concat(self.masks, dim=0).cpu().numpy()) # noqa: SLF001
+
+ def __init__(
+ self,
+ num_threshs: int,
+ binclf_algorithm: str = BinclfAlgorithm.NUMBA,
+ shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
+ ) -> None:
+ """Per-Image Overlap (PIMO) curve."""
+ # TODO(jpcbertoldo): docstring of `PIMO.__init__()` # noqa: TD003
+ super().__init__()
+
+ warnings.warn(
+ f"Metric `{self.__class__.__name__}` will save all targets and predictions in buffer."
+ " For large datasets this may lead to large memory footprint.",
+ UserWarning,
+ stacklevel=1,
+ )
+
+ # the options below are, redundantly, validated here to avoid reaching
+ # an error later in the execution
+
+ _validate.num_threshs(num_threshs)
+ self.num_threshs = num_threshs
+
+ # validate binclf_algorithm and shared_fpr_metric
+ BinclfAlgorithm.validate(binclf_algorithm)
+ self.binclf_algorithm = binclf_algorithm
+
+ SharedFPRMetric.validate(shared_fpr_metric)
+ self.shared_fpr_metric = shared_fpr_metric
+
+ self.add_state("anomaly_maps", default=[], dist_reduce_fx="cat")
+ self.add_state("masks", default=[], dist_reduce_fx="cat")
+
+ def update(self, anomaly_maps: Tensor, masks: Tensor) -> None:
+ """Update list of anomaly maps and masks.
+
+ Args:
+ anomaly_maps (Tensor): predictions of the model (ndim == 2, float)
+ masks (Tensor): ground truth masks (ndim == 2, binary)
+ """
+ _validate_anomaly_maps(anomaly_maps)
+ _validate_masks(masks)
+ _validate.same_shape(anomaly_maps, masks)
+ self.anomaly_maps.append(anomaly_maps)
+ self.masks.append(masks)
+
+ # TODO(jpcbertoldo): missing docstring for `PIMO.compute` # noqa: TD003
+ def compute(self) -> PIMOResult: # noqa: D102
+ if self.is_empty:
+ msg = "No anomaly maps and masks have been added yet. Please call `update()` first."
+ raise RuntimeError(msg)
+ anomaly_maps = torch.concat(self.anomaly_maps, dim=0)
+ masks = torch.concat(self.masks, dim=0)
+ return pimo(
+ anomaly_maps,
+ masks,
+ self.num_threshs,
+ binclf_algorithm=self.binclf_algorithm,
+ shared_fpr_metric=self.shared_fpr_metric,
+ )
+
+
+class AUPIMO(PIMO):
+ """Area Under the Per-Image Overlap (PIMO) curve.
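+
+ Minimal usage sketch (illustrative; `anomaly_maps` and `masks` are tensors as expected by `PIMO.update()`):
+
+ >>> metric = AUPIMO(num_threshs=300_000, fpr_bounds=(1e-5, 1e-4))
+ >>> metric.update(anomaly_maps, masks)
+ >>> pimo_result, aupimo_result = metric.compute()
+ >>> aupimo_result.aupimos # shape (N,); NaN for normal images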
+ + TODO(jpcbertoldo): docstring of `AUPIMO` # noqa: DAR101 + """ + + fpr_bounds: tuple[float, float] + force: bool + + @staticmethod + def normalizing_factor(fpr_bounds: tuple[float, float]) -> float: + """Constant that normalizes the AUPIMO integral to 0-1 range. + + It is the maximum possible value from the integral in AUPIMO's definition. + It corresponds to assuming a constant function T_i: thresh --> 1. + + Args: + fpr_bounds: lower and upper bounds of the FPR integration range. + + Returns: + float: the normalization factor (>0). + """ + return pimo_numpy.aupimo_normalizing_factor(fpr_bounds) + + @staticmethod + def random_model_score(fpr_bounds: tuple[float, float]) -> float: + """AUPIMO of a theoretical random model. + + "Random model" means that there is no discrimination between normal and anomalous pixels/patches/images. + It corresponds to assuming the functions T = F. + + For the FPR bounds (1e-5, 1e-4), the random model AUPIMO is ~4e-5. + + Args: + fpr_bounds: lower and upper bounds of the FPR integration range. + + Returns: + float: the AUPIMO score. + """ + return pimo_numpy.aupimo_random_model_score(fpr_bounds) + + def __repr__(self) -> str: + """Show the metric name and its integration bounds.""" + metric = self.shared_fpr_metric + lower, upper = self.fpr_bounds + return f"{self.__class__.__name__}({metric} in [{lower:.2g}, {upper:.2g}])" + + def __init__( + self, + num_threshs: int = 300_000, + binclf_algorithm: str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + fpr_bounds: tuple[float, float] = (1e-5, 1e-4), + force: bool = False, + ) -> None: + """Area Under the Per-Image Overlap (PIMO) curve. + + TODO(jpcbertoldo): docstring of `AUPIMO.__init__()` # noqa: DAR101 + """ + super().__init__( + num_threshs=num_threshs, + binclf_algorithm=binclf_algorithm, + shared_fpr_metric=shared_fpr_metric, + ) + + # other validations are done in PIMO.__init__() + + _validate.rate_range(fpr_bounds) + self.fpr_bounds = fpr_bounds + + self.force = force + + def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]: # type: ignore[override] + """TODO(jpcbertoldo): docstring of `AUPIMO.compute()`.""" # noqa: D402 + if self.is_empty: + msg = "No anomaly maps and masks have been added yet. Please call `update()` first." + raise RuntimeError(msg) + anomaly_maps = torch.concat(self.anomaly_maps, dim=0) + masks = torch.concat(self.masks, dim=0) + force = force if force is not None else self.force + return aupimo( + anomaly_maps, + masks, + self.num_threshs, + binclf_algorithm=self.binclf_algorithm, + shared_fpr_metric=self.shared_fpr_metric, + fpr_bounds=self.fpr_bounds, + force=force, + ) diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 93df1e6ae2..51600d0c85 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -33,7 +33,7 @@ import numpy as np from numpy import ndarray -from . import binclf_curve_numpy +from . import _validate, binclf_curve_numpy from .binclf_curve_numpy import Algorithm as BinclfAlgorithm from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice @@ -50,44 +50,15 @@ class SharedFPRMetric: METRICS: ClassVar[tuple[str, ...]] = (MEAN_PERIMAGE_FPR,) + @staticmethod + def validate(metric: str) -> None: + """Validate the argument `metric`.""" + if metric not in SharedFPRMetric.METRICS: + msg = f"Invalid `metric`. Expected one of {SharedFPRMetric.METRICS}, but got {metric} instead." 
+ raise ValueError(msg) -# =========================================== ARGS VALIDATION =========================================== - - -def _validate_rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: - if not isinstance(rate, float | int): - msg = f"Expected rate to be a float or int, but got {type(rate)}." - raise TypeError(msg) - - if rate < 0.0 or rate > 1.0: - msg = f"Rate `{rate}` is not a valid because it must be in [0, 1]." - raise ValueError(msg) - - if not zero_ok and rate == 0.0: - msg = "Rate cannot be 0." - raise ValueError(msg) - - if not one_ok and rate == 1.0: - msg = "Rate cannot be 1." - raise ValueError(msg) - - -def _validate_fpr_bounds(fpr_bounds: tuple[float, float]) -> None: - if not isinstance(fpr_bounds, tuple): - msg = f"Expected `fpr_bounds` to be a tuple, but got {type(fpr_bounds)}" - raise TypeError(msg) - - if len(fpr_bounds) != 2: - msg = f"Expected `fpr_bounds` to be a tuple of length 2, but got {len(fpr_bounds)}" - raise ValueError(msg) - - lower, upper = fpr_bounds - _validate_rate(lower, zero_ok=False, one_ok=False) - _validate_rate(upper, zero_ok=False, one_ok=True) - if lower >= upper: - msg = f"Expected `fpr_bounds[1]` > `fpr_bounds[0]`, but got {fpr_bounds[1]} <= {fpr_bounds[0]}" - raise ValueError(msg) +# =========================================== ARGS VALIDATION =========================================== def _images_classes_from_masks(masks: ndarray) -> ndarray: @@ -121,11 +92,12 @@ def pimo( # noqa: D103 binclf_algorithm: str = BinclfAlgorithm.NUMBA, shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> tuple[ndarray, ndarray, ndarray, ndarray]: - # validate inputs - binclf_curve_numpy._validate_num_threshs(num_threshs) # noqa: SLF001 + BinclfAlgorithm.validate(binclf_algorithm) + SharedFPRMetric.validate(shared_fpr_metric) + _validate.num_threshs(num_threshs) binclf_curve_numpy._validate_anomaly_maps(anomaly_maps) # noqa: SLF001 binclf_curve_numpy._validate_masks(masks) # noqa: SLF001 - binclf_curve_numpy._validate_same_shape(anomaly_maps, masks) # noqa: SLF001 + _validate.same_shape(anomaly_maps, masks) _validate_atleast_one_anomalous_image(masks) _validate_atleast_one_normal_image(masks) @@ -195,8 +167,7 @@ def aupimo( # noqa: D103 fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]: - # validate inputs - _validate_fpr_bounds(fpr_bounds) + _validate.rate_range(fpr_bounds) # other validations are done in the `pimo` function threshs, shared_fpr, per_image_tprs, image_classes = pimo( @@ -307,7 +278,7 @@ def aupimo( # noqa: D103 aucs: ndarray = np.trapz(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1) # normalize, then clip(0, 1) makes sure that the values are in [0, 1] in case of numerical errors - normalization_factor = _aupimo_max_integral_value(fpr_bounds) + normalization_factor = aupimo_normalizing_factor(fpr_bounds) aucs = (aucs / normalization_factor).clip(0, 1) return threshs, shared_fpr, per_image_tprs, image_classes, aucs @@ -338,7 +309,7 @@ def thresh_at_shared_fpr_level(threshs: ndarray, shared_fpr: ndarray, fpr_level: binclf_curve_numpy._validate_threshs(threshs) # noqa: SLF001 # TODO(jpcbertoldo): validate shared_fpr # noqa: TD003 _joint_validate_threshs_shared_fpr(threshs, shared_fpr) - _validate_rate(fpr_level, zero_ok=True, one_ok=True) + _validate.rate(fpr_level, zero_ok=True, one_ok=True) shared_fpr_min, shared_fpr_max = shared_fpr.min(), shared_fpr.max() @@ -373,9 +344,39 @@ def thresh_at_shared_fpr_level(threshs: ndarray, 
shared_fpr: ndarray, fpr_level: return index, thresh, fpr_level_defacto -def _aupimo_max_integral_value(fpr_bounds: tuple[float, float]) -> float: - """Constant that normalizes the AUPIMO integral to 0-1 range.""" - _validate_fpr_bounds(fpr_bounds) +def aupimo_normalizing_factor(fpr_bounds: tuple[float, float]) -> float: + """Constant that normalizes the AUPIMO integral to 0-1 range. + + It is the maximum possible value from the integral in AUPIMO's definition. + It corresponds to assuming a constant function T_i: thresh --> 1. + + Args: + fpr_bounds: lower and upper bounds of the FPR integration range. + + Returns: + float: the normalization factor (>0). + """ + _validate.rate_range(fpr_bounds) fpr_lower_bound, fpr_upper_bound = fpr_bounds # the log's base must be the same as the one used in the integration! return float(np.log(fpr_upper_bound / fpr_lower_bound)) + + +def aupimo_random_model_score(fpr_bounds: tuple[float, float]) -> float: + """AUPIMO of a theoretical random model. + + "Random model" means that there is no discrimination between normal and anomalous pixels/patches/images. + It corresponds to assuming the functions T = F. + + For the FPR bounds (1e-5, 1e-4), the random model AUPIMO is ~4e-5. + + Args: + fpr_bounds: lower and upper bounds of the FPR integration range. + + Returns: + float: the AUPIMO score. + """ + _validate.rate_range(fpr_bounds) + fpr_lower_bound, fpr_upper_bound = fpr_bounds + integral_value = fpr_upper_bound - fpr_lower_bound + return float(integral_value / aupimo_normalizing_factor(fpr_bounds)) diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 0ed258ffe9..5ea2a658d0 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -162,7 +162,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: # `threshs` must be unique ([preds, gts, np.sort(np.concatenate([threshs, threshs]))], {"algorithm": "numba"}, ValueError), # invalid `algorithm` - ([preds, gts, threshs], {"algorithm": "blurp"}, NotImplementedError), + ([preds, gts, threshs], {"algorithm": "blurp"}, ValueError), ], ) @@ -304,7 +304,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ( [preds, gts], {"algorithm": "glfrb", "threshs_choice": "given", "threshs_given": threshs, "num_threshs": None}, - NotImplementedError, + ValueError, ), ], ) diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index a7308f9afa..9f595c3622 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -177,6 +177,52 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ) +def _do_test_pimo_outputs( + threshs: ndarray | Tensor, + shared_fpr: ndarray | Tensor, + per_image_tprs: ndarray | Tensor, + image_classes: ndarray | Tensor, + expected_threshs: ndarray | Tensor, + expected_shared_fpr: ndarray | Tensor, + expected_per_image_tprs: ndarray | Tensor, + expected_image_classes: ndarray | Tensor, +) -> None: + """Test if the outputs of any of the PIMO interfaces are correct.""" + if isinstance(threshs, Tensor): + assert isinstance(shared_fpr, Tensor) + assert isinstance(per_image_tprs, Tensor) + assert isinstance(image_classes, Tensor) + assert isinstance(expected_threshs, Tensor) + assert isinstance(expected_shared_fpr, Tensor) + assert isinstance(expected_per_image_tprs, Tensor) + assert isinstance(expected_image_classes, Tensor) + allclose = torch.allclose + + elif 
isinstance(threshs, ndarray): + assert isinstance(shared_fpr, ndarray) + assert isinstance(per_image_tprs, ndarray) + assert isinstance(image_classes, ndarray) + assert isinstance(expected_threshs, ndarray) + assert isinstance(expected_shared_fpr, ndarray) + assert isinstance(expected_per_image_tprs, ndarray) + assert isinstance(expected_image_classes, ndarray) + allclose = np.allclose + + else: + msg = "Expected `threshs` to be a Tensor or ndarray." + raise TypeError(msg) + + assert threshs.ndim == 1 + assert shared_fpr.ndim == 1 + assert per_image_tprs.ndim == 2 + assert tuple(image_classes.shape) == (3,) + + assert allclose(threshs, expected_threshs) + assert allclose(shared_fpr, expected_shared_fpr) + assert allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) + assert (image_classes == expected_image_classes).all() + + def test_pimo_numpy( anomaly_maps: ndarray, masks: ndarray, @@ -196,16 +242,16 @@ def test_pimo_numpy( binclf_algorithm=binclf_algorithm, shared_fpr_metric="mean-per-image-fpr", ) - - assert threshs.ndim == 1 - assert shared_fpr.ndim == 1 - assert per_image_tprs.ndim == 2 - assert image_classes.ndim == 1 - - assert np.allclose(threshs, expected_threshs) - assert np.allclose(shared_fpr, expected_shared_fpr) - assert np.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) - assert (image_classes == expected_image_classes).all() + _do_test_pimo_outputs( + threshs, + shared_fpr, + per_image_tprs, + image_classes, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + ) def test_pimo( @@ -219,7 +265,26 @@ def test_pimo( ) -> None: """Test if `pimo()` returns the expected values.""" from anomalib.metrics.per_image import pimo + from anomalib.metrics.per_image.pimo import PIMOResult + + def do_assertions(pimoresult: PIMOResult) -> None: + assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" + threshs = pimoresult.threshs + shared_fpr = pimoresult.shared_fpr + per_image_tprs = pimoresult.per_image_tprs + image_classes = pimoresult.image_classes + _do_test_pimo_outputs( + threshs, + shared_fpr, + per_image_tprs, + image_classes, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + ) + # functional interface pimoresult = pimo.pimo( anomaly_maps, masks, @@ -227,23 +292,52 @@ def test_pimo( binclf_algorithm=binclf_algorithm, shared_fpr_metric="mean-per-image-fpr", ) - threshs = pimoresult.threshs - shared_fpr = pimoresult.shared_fpr - per_image_tprs = pimoresult.per_image_tprs - image_classes = pimoresult.image_classes - - # metadata - assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" - # data - assert threshs.ndim == 1 - assert shared_fpr.ndim == 1 - assert per_image_tprs.ndim == 2 - assert image_classes.ndim == 1 + do_assertions(pimoresult) - assert torch.allclose(threshs, expected_threshs) - assert torch.allclose(shared_fpr, expected_shared_fpr) - assert torch.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) - assert (image_classes == expected_image_classes).all() + # metric interface + metric = pimo.PIMO( + num_threshs=7, + binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", + ) + metric.update(anomaly_maps, masks) + pimoresult = metric.compute() + do_assertions(pimoresult) + + +def _do_test_aupimo_outputs( + threshs: ndarray | Tensor, + shared_fpr: ndarray | Tensor, + per_image_tprs: ndarray | Tensor, + image_classes: ndarray | Tensor, + aupimos: ndarray | Tensor, + expected_threshs: ndarray | Tensor, + 
expected_shared_fpr: ndarray | Tensor, + expected_per_image_tprs: ndarray | Tensor, + expected_image_classes: ndarray | Tensor, + expected_aupimos: ndarray | Tensor, +) -> None: + _do_test_pimo_outputs( + threshs, + shared_fpr, + per_image_tprs, + image_classes, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + ) + if isinstance(threshs, Tensor): + assert isinstance(aupimos, Tensor) + assert isinstance(expected_aupimos, Tensor) + allclose = torch.allclose + + elif isinstance(threshs, ndarray): + assert isinstance(aupimos, ndarray) + assert isinstance(expected_aupimos, ndarray) + allclose = np.allclose + assert tuple(aupimos.shape) == (3,) + assert allclose(aupimos, expected_aupimos, equal_nan=True) def test_aupimo_values_numpy( @@ -269,20 +363,18 @@ def test_aupimo_values_numpy( fpr_bounds=fpr_bounds, force=True, ) - - assert threshs.ndim == 1 - assert shared_fpr.ndim == 1 - assert per_image_tprs.ndim == 2 - assert image_classes.ndim == 1 - - assert np.allclose(threshs, expected_threshs) - assert np.allclose(shared_fpr, expected_shared_fpr) - assert np.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) - assert (image_classes == expected_image_classes).all() - - assert aupimos.ndim == 1 - assert aupimos.shape == (3,) - assert np.allclose(aupimos, expected_aupimos, equal_nan=True) + _do_test_aupimo_outputs( + threshs, + shared_fpr, + per_image_tprs, + image_classes, + aupimos, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + expected_aupimos, + ) def test_aupimo_values( @@ -298,8 +390,41 @@ def test_aupimo_values( ) -> None: """Test if `aupimo()` returns the expected values.""" from anomalib.metrics.per_image import pimo + from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult + + def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: + # test metadata + assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" + assert aupimoresult.shared_fpr_metric == "mean-per-image-fpr" + assert aupimoresult.fpr_bounds == fpr_bounds + assert aupimoresult.num_threshs == 7 + + # test data + # from pimo result + threshs = pimoresult.threshs + shared_fpr = pimoresult.shared_fpr + per_image_tprs = pimoresult.per_image_tprs + image_classes = pimoresult.image_classes + # from aupimo result + aupimos = aupimoresult.aupimos + _do_test_aupimo_outputs( + threshs, + shared_fpr, + per_image_tprs, + image_classes, + aupimos, + expected_threshs, + expected_shared_fpr, + expected_per_image_tprs, + expected_image_classes, + expected_aupimos, + ) + thresh_lower_bound = aupimoresult.thresh_lower_bound + thresh_upper_bound = aupimoresult.thresh_upper_bound + assert anomaly_maps.min() <= thresh_lower_bound < thresh_upper_bound <= anomaly_maps.max() - pimoresult, aupimoresult = pimo.aupimo( + # functional interface + pimoresult_from_functional, aupimoresult_from_functional = pimo.aupimo( anomaly_maps, masks, num_threshs=7, @@ -308,39 +433,19 @@ def test_aupimo_values( fpr_bounds=fpr_bounds, force=True, ) + do_assertions(pimoresult_from_functional, aupimoresult_from_functional) - # from pimo result - threshs = pimoresult.threshs - shared_fpr = pimoresult.shared_fpr - per_image_tprs = pimoresult.per_image_tprs - image_classes = pimoresult.image_classes - - # from aupimo result - fpr_lower_bound = aupimoresult.fpr_lower_bound - fpr_upper_bound = aupimoresult.fpr_upper_bound - thresh_lower_bound = aupimoresult.thresh_lower_bound - thresh_upper_bound = 
aupimoresult.thresh_upper_bound - num_threshs = aupimoresult.num_threshs - aupimos = aupimoresult.aupimos - - # test metadata - assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" - assert aupimoresult.shared_fpr_metric == "mean-per-image-fpr" - assert (fpr_lower_bound, fpr_upper_bound) == fpr_bounds - assert num_threshs == 7 - - # test data - assert threshs.ndim == 1 - assert shared_fpr.ndim == 1 - assert per_image_tprs.ndim == 2 - assert image_classes.ndim == 1 - assert anomaly_maps.min() <= thresh_lower_bound < thresh_upper_bound <= anomaly_maps.max() - - assert torch.allclose(threshs, expected_threshs) - assert torch.allclose(shared_fpr, expected_shared_fpr) - assert torch.allclose(per_image_tprs, expected_per_image_tprs, equal_nan=True) - assert (image_classes == expected_image_classes).all() - assert torch.allclose(aupimos, expected_aupimos, equal_nan=True) + # metric interface + metric = pimo.AUPIMO( + num_threshs=7, + binclf_algorithm=binclf_algorithm, + shared_fpr_metric="mean-per-image-fpr", + fpr_bounds=fpr_bounds, + force=True, + ) + metric.update(anomaly_maps, masks) + pimoresult_from_metric, aupimoresult_from_metric = metric.compute() + do_assertions(pimoresult_from_metric, aupimoresult_from_metric) def test_aupimo_edge( From 4bfe3da330d4061ef978e03e56fcb849f9cc2c7d Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sat, 23 Dec 2023 19:44:18 +0100 Subject: [PATCH 15/57] renames and put things in init Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 25 ++++++++++++++++++++ src/anomalib/metrics/per_image/pimo.py | 12 +++++----- src/anomalib/metrics/per_image/pimo_numpy.py | 14 +++++------ tests/unit/metrics/per_image/test_pimo.py | 14 +++++------ 4 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 85c5c5cd4f..9a6ef065d3 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -1 +1,26 @@ """Per-Image Metrics.""" + +from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr +from .binclf_curve_numpy import Algorithm as BinClfAlgorithm +from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice +from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves +from .pimo_numpy import SharedFPRMetric + +__all__ = [ + # constants + "BinClfAlgorithm", + "BinclfThreshsChoice", + "SharedFPRMetric", + # result classes + "PIMOResult", + "AUPIMOResult", + # functional interfaces + "per_image_binclf_curve", + "per_image_fpr", + "per_image_tpr", + "pimo_curves", + "aupimo_scores", + # torchmetrics interfaces + "PIMO", + "AUPIMO", +] diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index deff383d19..4f5126edc8 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -131,7 +131,7 @@ def thresh_bounds(self) -> tuple[float, float]: # TODO(jpcbertoldo): missing docstring for `pimo` # noqa: TD003 -def pimo( # noqa: D103 +def pimo_curves( # noqa: D103 anomaly_maps: Tensor, masks: Tensor, num_threshs: int, @@ -145,7 +145,7 @@ def pimo( # noqa: D103 masks_array = masks.detach().cpu().numpy() # other validations are done in the numpy code - threshs_array, shared_fpr_array, per_image_tprs_array, _ = pimo_numpy.pimo( + threshs_array, shared_fpr_array, per_image_tprs_array, _ = 
pimo_numpy.pimo_curves( anomaly_maps_array, masks_array, num_threshs, @@ -174,7 +174,7 @@ def pimo( # noqa: D103 # TODO(jpcbertoldo): missing docstring for `aupimo` # noqa: TD003 -def aupimo( # noqa: D103 +def aupimo_scores( # noqa: D103 anomaly_maps: Tensor, masks: Tensor, num_threshs: int = 300_000, @@ -191,7 +191,7 @@ def aupimo( # noqa: D103 # other validations are done in the numpy code - threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array = pimo_numpy.aupimo( + threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array = pimo_numpy.aupimo_scores( anomaly_maps_array, masks_array, num_threshs, @@ -323,7 +323,7 @@ def compute(self) -> PIMOResult: # noqa: D102 raise RuntimeError(msg) anomaly_maps = torch.concat(self.anomaly_maps, dim=0) masks = torch.concat(self.masks, dim=0) - return pimo( + return pimo_curves( anomaly_maps, masks, self.num_threshs, @@ -412,7 +412,7 @@ def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]: anomaly_maps = torch.concat(self.anomaly_maps, dim=0) masks = torch.concat(self.masks, dim=0) force = force if force is not None else self.force - return aupimo( + return aupimo_scores( anomaly_maps, masks, self.num_threshs, diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 51600d0c85..16f1c1133f 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -67,14 +67,14 @@ def _images_classes_from_masks(masks: ndarray) -> ndarray: return (masks == 1).any(axis=(1, 2)).astype(np.int32) -def _validate_atleast_one_anomalous_image(masks: ndarray) -> None: +def _validate_at_least_one_anomalous_image(masks: ndarray) -> None: image_classes = _images_classes_from_masks(masks) if (image_classes == 1).sum() == 0: msg = "Expected at least one ANOMALOUS image, but found none." raise ValueError(msg) -def _validate_atleast_one_normal_image(masks: ndarray) -> None: +def _validate_at_least_one_normal_image(masks: ndarray) -> None: image_classes = _images_classes_from_masks(masks) if (image_classes == 0).sum() == 0: msg = "Expected at least one NORMAL image, but found none." 
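+ # note: image classes come from `_images_classes_from_masks` above, so an image
+ # counts as normal iff its mask has no positive pixel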
@@ -85,7 +85,7 @@ def _validate_atleast_one_normal_image(masks: ndarray) -> None: # TODO(jpcbertoldo): missing docstring for `pimo` # noqa: TD003 -def pimo( # noqa: D103 +def pimo_curves( # noqa: D103 anomaly_maps: ndarray, masks: ndarray, num_threshs: int, @@ -98,8 +98,8 @@ def pimo( # noqa: D103 binclf_curve_numpy._validate_anomaly_maps(anomaly_maps) # noqa: SLF001 binclf_curve_numpy._validate_masks(masks) # noqa: SLF001 _validate.same_shape(anomaly_maps, masks) - _validate_atleast_one_anomalous_image(masks) - _validate_atleast_one_normal_image(masks) + _validate_at_least_one_anomalous_image(masks) + _validate_at_least_one_normal_image(masks) image_classes = _images_classes_from_masks(masks) @@ -158,7 +158,7 @@ def _joint_validate_threshs_shared_fpr(threshs: ndarray, shared_fpr: ndarray) -> # TODO(jpcbertoldo): missing docstring for `aupimo` # noqa: TD003 -def aupimo( # noqa: D103 +def aupimo_scores( # noqa: D103 anomaly_maps: ndarray, masks: ndarray, num_threshs: int = 300_000, @@ -170,7 +170,7 @@ def aupimo( # noqa: D103 _validate.rate_range(fpr_bounds) # other validations are done in the `pimo` function - threshs, shared_fpr, per_image_tprs, image_classes = pimo( + threshs, shared_fpr, per_image_tprs, image_classes = pimo_curves( anomaly_maps=anomaly_maps, masks=masks, num_threshs=num_threshs, diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index 9f595c3622..d6ecd79941 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -235,7 +235,7 @@ def test_pimo_numpy( """Test if `pimo()` returns the expected values.""" from anomalib.metrics.per_image import pimo_numpy - threshs, shared_fpr, per_image_tprs, image_classes = pimo_numpy.pimo( + threshs, shared_fpr, per_image_tprs, image_classes = pimo_numpy.pimo_curves( anomaly_maps, masks, num_threshs=7, @@ -285,7 +285,7 @@ def do_assertions(pimoresult: PIMOResult) -> None: ) # functional interface - pimoresult = pimo.pimo( + pimoresult = pimo.pimo_curves( anomaly_maps, masks, num_threshs=7, @@ -354,7 +354,7 @@ def test_aupimo_values_numpy( """Test if `aupimo()` returns the expected values.""" from anomalib.metrics.per_image import pimo_numpy - threshs, shared_fpr, per_image_tprs, image_classes, aupimos = pimo_numpy.aupimo( + threshs, shared_fpr, per_image_tprs, image_classes, aupimos = pimo_numpy.aupimo_scores( anomaly_maps, masks, num_threshs=7, @@ -424,7 +424,7 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: assert anomaly_maps.min() <= thresh_lower_bound < thresh_upper_bound <= anomaly_maps.max() # functional interface - pimoresult_from_functional, aupimoresult_from_functional = pimo.aupimo( + pimoresult_from_functional, aupimoresult_from_functional = pimo.aupimo_scores( anomaly_maps, masks, num_threshs=7, @@ -463,7 +463,7 @@ def test_aupimo_edge( # not enough points on the curve # 10 threshs / 6 decades = 1.6 threshs per decade < 3 with pytest.raises(RuntimeError): # force=False --> raise error - pimo_numpy.aupimo( + pimo_numpy.aupimo_scores( anomaly_maps, masks, num_threshs=10, @@ -473,7 +473,7 @@ def test_aupimo_edge( ) with pytest.warns(RuntimeWarning): # force=True --> warn - pimo_numpy.aupimo( + pimo_numpy.aupimo_scores( anomaly_maps, masks, num_threshs=10, @@ -484,7 +484,7 @@ def test_aupimo_edge( # default number of points on the curve (300k threshs) should be enough rng = np.random.default_rng(42) - pimo_numpy.aupimo( + pimo_numpy.aupimo_scores( anomaly_maps * rng.uniform(1.0, 1.1, 
size=anomaly_maps.shape), masks, # num_threshs=, From 1a7398b7d0b29272137b27b0a384213ad0e84945 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sat, 23 Dec 2023 21:24:50 +0100 Subject: [PATCH 16/57] validate inputs in result objects Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 2 +- .../metrics/per_image/binclf_curve_numpy.py | 2 +- src/anomalib/metrics/per_image/pimo.py | 89 ++++++++++- src/anomalib/metrics/per_image/pimo_numpy.py | 150 +++++++++++++++++- 4 files changed, 231 insertions(+), 12 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index eb2fe61c45..a02a8a36f2 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -20,7 +20,7 @@ def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401 def num_threshs(num_threshs: int) -> None: - """Validate that `num_threshs` is a positive integer > 2.""" + """Validate that `num_threshs` is a positive integer >= 2.""" if not isinstance(num_threshs, int): msg = f"Expected `num_threshs` to be an integer, but got {type(num_threshs)}" raise TypeError(msg) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 622070caac..87db5c3f9b 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -169,7 +169,7 @@ def _validate_masks(masks: ndarray) -> None: if np.any((masks_unique_vals != 0) & (masks_unique_vals != 1)): msg = ( "Expected `masks` to be a *binary* ndarray with ground truth labels, " - f"but got ndarray with unique values {masks_unique_vals}" + f"but got ndarray with unique values {sorted(masks_unique_vals)}" ) raise ValueError(msg) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 4f5126edc8..9de4559768 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -34,6 +34,42 @@ def _validate_masks(masks: Tensor) -> None: binclf_curve_numpy._validate_masks(masks.numpy()) # noqa: SLF001 +def _validate_threshs(threshs: Tensor) -> None: + _validate.is_tensor(threshs, argname="threshs") + binclf_curve_numpy._validate_threshs(threshs.numpy()) # noqa: SLF001 + + +def _validate_shared_fpr(shared_fpr: Tensor, nan_allowed: bool = False, decreasing: bool = True) -> None: + _validate.is_tensor(shared_fpr, argname="shared_fpr") + pimo_numpy._validate_rate_curve(shared_fpr.numpy(), nan_allowed=nan_allowed, decreasing=decreasing) # noqa: SLF001 + + +def _validate_image_classes(image_classes: Tensor) -> None: + _validate.is_tensor(image_classes, argname="image_classes") + pimo_numpy._validate_image_classes(image_classes.numpy()) # noqa: SLF001 + + +def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> None: + _validate.is_tensor(per_image_tprs, argname="per_image_tprs") + _validate_image_classes(image_classes) + + pimo_numpy._validate_per_image_rate_curves( # noqa: SLF001 + per_image_tprs[image_classes == 1].numpy(), + nan_allowed=False, + decreasing=True, + ) + + normal_images_tprs = per_image_tprs[image_classes == 0] + if not normal_images_tprs.isnan().all(): + msg = "Expected all normal images to have NaN TPRs, but some have non-NaN values." 
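+        # context: a normal image has no ground-truth positive pixels, so its TPR is 0/0 and is stored as NaN by construction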
+ raise ValueError(msg) + + +def _validate_aupimos(aupimos: Tensor) -> None: + _validate.is_tensor(aupimos, argname="aupimos") + pimo_numpy._validate_rates(aupimos.numpy(), nan_allowed=True) # noqa: SLF001 + + # =========================================== RESULT OBJECT =========================================== @@ -44,9 +80,9 @@ class PIMOResult: # noqa: D101 shared_fpr_metric: str # data - threshs: Tensor = field(repr=False) - shared_fpr: Tensor = field(repr=False) - per_image_tprs: Tensor = field(repr=False) + threshs: Tensor = field(repr=False) # shape => (K,) + shared_fpr: Tensor = field(repr=False) # shape => (K,) + per_image_tprs: Tensor = field(repr=False) # shape => (N, K) @property def num_threshs(self) -> int: @@ -63,6 +99,32 @@ def image_classes(self) -> Tensor: """Image classes (0: normal, 1: anomalous).""" return (self.per_image_tprs.flatten(1) == 1).any(dim=1).to(torch.int32) + def __post_init__(self) -> None: + """Validate the inputs for the result object are consistent.""" + try: + SharedFPRMetric.validate(self.shared_fpr_metric) + _validate_threshs(self.threshs) + _validate_shared_fpr(self.shared_fpr, nan_allowed=False) + _validate_per_image_tprs(self.per_image_tprs, self.image_classes) + + except (TypeError, ValueError) as ex: + msg = f"Invalid inputs for {self.__class__.__name__} object." + raise ValueError(msg) from ex + + if self.threshs.shape != self.shared_fpr.shape: + msg = ( + f"Invalid {self.__class__.__name__} object. Attributes have inconsistent shapes: " + f"threshs.shape={self.threshs.shape} != shared_fpr.shape={self.shared_fpr.shape}." + ) + raise ValueError(msg) + + if self.threshs.shape[0] != self.per_image_tprs.shape[1]: + msg = ( + f"Invalid {self.__class__.__name__} object. Attributes have inconsistent shapes: " + f"threshs.shape[0]={self.threshs.shape[0]} != per_image_tprs.shape[1]={self.per_image_tprs.shape[1]}." + ) + raise ValueError(msg) + def thresh_at(self, fpr_level: float) -> tuple[int, float, float]: """Return the threshold at the given shared FPR. @@ -97,7 +159,7 @@ class AUPIMOResult: # noqa: D101 # data thresh_lower_bound: float = field(repr=False) thresh_upper_bound: float = field(repr=False) - aupimos: Tensor = field(repr=False) + aupimos: Tensor = field(repr=False) # shape => (N,) @property def num_images(self) -> int: @@ -126,6 +188,25 @@ def thresh_bounds(self) -> tuple[float, float]: """ return self.thresh_lower_bound, self.thresh_upper_bound + def __post_init__(self) -> None: + """Validate the inputs for the result object are consistent.""" + try: + SharedFPRMetric.validate(self.shared_fpr_metric) + _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) + _validate.num_threshs(self.num_threshs) + _validate_aupimos(self.aupimos) + + except (TypeError, ValueError) as ex: + msg = f"Invalid inputs for {self.__class__.__name__} object." + raise ValueError(msg) from ex + + if self.thresh_lower_bound >= self.thresh_upper_bound: + msg = ( + f"Invalid {self.__class__.__name__} object. " + f"thresh_lower_bound={self.thresh_lower_bound} >= thresh_upper_bound={self.thresh_upper_bound}." 
+ ) + raise ValueError(msg) + # =========================================== FUNCTIONAL =========================================== diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 16f1c1133f..ef49e828fa 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -67,6 +67,33 @@ def _images_classes_from_masks(masks: ndarray) -> ndarray: return (masks == 1).any(axis=(1, 2)).astype(np.int32) +def _validate_image_classes(image_classes: ndarray) -> None: + if not isinstance(image_classes, ndarray): + msg = f"Expected image classes to be an ndarray, but got {type(image_classes)}." + raise TypeError(msg) + + if image_classes.ndim != 1: + msg = f"Expected image classes to be 1D, but got {image_classes.ndim}D." + raise ValueError(msg) + + if image_classes.dtype.kind == "b": + pass + elif image_classes.dtype.kind in ("i", "u"): + unique_vals = np.unique(image_classes) + if np.any((unique_vals != 0) & (unique_vals != 1)): + msg = ( + "Expected image classes to be a *binary* ndarray with ground truth labels, " + f"but got ndarray with unique values {sorted(unique_vals)}" + ) + raise ValueError(msg) + else: + msg = ( + "Expected image classes to be an integer or boolean ndarray with ground truth labels, " + f"but got ndarray with dtype {image_classes.dtype}" + ) + raise TypeError(msg) + + def _validate_at_least_one_anomalous_image(masks: ndarray) -> None: image_classes = _images_classes_from_masks(masks) if (image_classes == 1).sum() == 0: @@ -81,6 +108,106 @@ def _validate_at_least_one_normal_image(masks: ndarray) -> None: raise ValueError(msg) +def _validate_rates(rates: ndarray, nan_allowed: bool) -> None: + if not isinstance(rates, ndarray): + msg = f"Expected rates to be an ndarray, but got {type(rates)}." + raise TypeError(msg) + + if rates.ndim != 1: + msg = f"Expected rates to be 1D, but got {rates.ndim}D." + raise ValueError(msg) + + if rates.dtype.kind != "f": + msg = f"Expected rates to have dtype of float type, but got {rates.dtype}." + raise ValueError(msg) + + isnan_mask = np.isnan(rates) + if nan_allowed: + # if they are all nan, then there is nothing to validate + if isnan_mask.all(): + return + valid_values = rates[~isnan_mask] + elif isnan_mask.any(): + msg = "Expected rates to not contain NaN values, but got NaN values." + raise ValueError(msg) + else: + valid_values = rates + + if (valid_values < 0).any(): + msg = "Expected rates to have values in the interval [0, 1], but got values < 0." + raise ValueError(msg) + + if (valid_values > 1).any(): + msg = "Expected rates to have values in the interval [0, 1], but got values > 1." + raise ValueError(msg) + + +def _validate_rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None: + _validate_rates(rate_curve, nan_allowed=nan_allowed) + + diffs = np.diff(rate_curve) + diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs + + if decreasing and (diffs_valid > 0).any(): + msg = "Expected rate curve to be monotonically decreasing, but got non-monotonically decreasing values." + raise ValueError(msg) + + if not decreasing and (diffs_valid < 0).any(): + msg = "Expected rate curve to be monotonically increasing, but got non-monotonically increasing values." 
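+        # an increasing curve may still be flat in places, so only strictly negative steps are rejected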
+ raise ValueError(msg) + + +def _validate_per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool) -> None: + if not isinstance(rate_curves, ndarray): + msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}." + raise TypeError(msg) + + if rate_curves.ndim != 2: + msg = f"Expected per-image rate curves to be 2D, but got {rate_curves.ndim}D." + raise ValueError(msg) + + if rate_curves.dtype.kind != "f": + msg = f"Expected per-image rate curves to have dtype of float type, but got {rate_curves.dtype}." + raise ValueError(msg) + + isnan_mask = np.isnan(rate_curves) + if nan_allowed: + # if they are all nan, then there is nothing to validate + if isnan_mask.all(): + return + valid_values = rate_curves[~isnan_mask] + elif isnan_mask.any(): + msg = "Expected per-image rate curves to not contain NaN values, but got NaN values." + raise ValueError(msg) + else: + valid_values = rate_curves + + if (valid_values < 0).any(): + msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values < 0." + raise ValueError(msg) + + if (valid_values > 1).any(): + msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values > 1." + raise ValueError(msg) + + diffs = np.diff(rate_curves, axis=1) + diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs + + if decreasing and (diffs_valid > 0).any(): + msg = ( + "Expected per-image rate curves to be monotonically decreasing, " + "but got non-monotonically decreasing values." + ) + raise ValueError(msg) + + if not decreasing and (diffs_valid < 0).any(): + msg = ( + "Expected per-image rate curves to be monotonically increasing, " + "but got non-monotonically increasing values." + ) + raise ValueError(msg) + + # =========================================== PIMO =========================================== @@ -126,19 +253,21 @@ def pimo_curves( # noqa: D103 shared_fpr: ndarray if shared_fpr_metric == SharedFPRMetric.MEAN_PERIMAGE_FPR: # shape -> (N, K) - per_image_fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) - # TODO(jpcbertoldo): validate per_image_fprs # noqa: TD003 + per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) + try: + _validate_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) + except ValueError as ex: + msg = "Cannot compute PIMO because the per-image FPR curves from normal images are invalid." + raise RuntimeError(msg) from ex # shape -> (K,) # this is the only shared FPR metric implemented so far, see note about shared FPR in the module's docstring - shared_fpr = per_image_fprs[image_classes == 0].mean(axis=0) + shared_fpr = per_image_fprs_normals.mean(axis=0) else: msg = f"Shared FPR metric `{shared_fpr_metric}` is not implemented." raise NotImplementedError(msg) - # TODO(jpcbertoldo): validate shared_fpr # noqa: TD003 - # shape -> (N, K) per_image_tprs = binclf_curve_numpy.per_image_tpr(binclf_curves) @@ -177,6 +306,15 @@ def aupimo_scores( # noqa: D103 binclf_algorithm=binclf_algorithm, shared_fpr_metric=shared_fpr_metric, ) + try: + binclf_curve_numpy._validate_threshs(threshs) # noqa: SLF001 + _validate_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) + _validate_image_classes(image_classes) + _validate_per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) + + except ValueError as ex: + msg = "Cannot compute AUPIMO because the PIMO curves are invalid." 
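+        # a ValueError here means `pimo_curves()` returned inconsistent outputs, hence the re-raise as RuntimeError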
+ raise RuntimeError(msg) from ex fpr_lower_bound, fpr_upper_bound = fpr_bounds @@ -307,7 +445,7 @@ def thresh_at_shared_fpr_level(threshs: ndarray, shared_fpr: ndarray, fpr_level: [2] the actual shared FPR value at the returned threshold """ binclf_curve_numpy._validate_threshs(threshs) # noqa: SLF001 - # TODO(jpcbertoldo): validate shared_fpr # noqa: TD003 + _validate_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) _joint_validate_threshs_shared_fpr(threshs, shared_fpr) _validate.rate(fpr_level, zero_ok=True, one_ok=True) From 403b4ae41d7894ad73d789094452bfa3de86da97 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:20:21 +0100 Subject: [PATCH 17/57] result objects to from dict and tests Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 53 +++++++++++++ tests/unit/metrics/per_image/test_pimo.py | 92 ++++++++++++++++++----- 2 files changed, 126 insertions(+), 19 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 9de4559768..06c5328741 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -73,6 +73,7 @@ def _validate_aupimos(aupimos: Tensor) -> None: # =========================================== RESULT OBJECT =========================================== +# TODO(jpcbertoldo): add image file path to `PIMOResult` # noqa: TD003 # TODO(jpcbertoldo): missing docstring for `PIMOResult` # noqa: TD003 @dataclass class PIMOResult: # noqa: D101 @@ -145,7 +146,28 @@ def thresh_at(self, fpr_level: float) -> tuple[int, float, float]: fpr_level, ) + def to_dict(self) -> dict[str, Tensor | str]: + """Return a dictionary with the result object's attributes.""" + return { + "shared_fpr_metric": self.shared_fpr_metric, + "threshs": self.threshs, + "shared_fpr": self.shared_fpr, + "per_image_tprs": self.per_image_tprs, + } + @classmethod + def from_dict(cls: type[PIMOResult], dic: dict[str, Tensor | str]) -> PIMOResult: + """Return a result object from a dictionary.""" + keys = ["shared_fpr_metric", "threshs", "shared_fpr", "per_image_tprs"] + for key in keys: + if key not in dic: + msg = f"Invalid input dictionary for {cls.__name__} object, missing key: {key}. Must contain: {keys}." 
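+                # fail early with an explicit message instead of letting `cls(**dic)` raise a generic TypeError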
+ raise ValueError(msg) + + return cls(**dic) + + +# TODO(jpcbertoldo): add image file path to `AUPIMOResult` # noqa: TD003 # TODO(jpcbertoldo): missing docstring for `AUPIMOResult` # noqa: TD003 # TODO(jpcbertoldo): change `aucs` in the paper supp mat to `aupimos` # noqa: TD003 @dataclass @@ -207,6 +229,37 @@ def __post_init__(self) -> None: ) raise ValueError(msg) + def to_dict(self) -> dict[str, Tensor | str | float | int]: + """Return a dictionary with the result object's attributes.""" + return { + "shared_fpr_metric": self.shared_fpr_metric, + "fpr_lower_bound": self.fpr_lower_bound, + "fpr_upper_bound": self.fpr_upper_bound, + "num_threshs": self.num_threshs, + "thresh_lower_bound": self.thresh_lower_bound, + "thresh_upper_bound": self.thresh_upper_bound, + "aupimos": self.aupimos, + } + + @classmethod + def from_dict(cls: type[AUPIMOResult], dic: dict[str, Tensor | str | float | int]) -> AUPIMOResult: + """Return a result object from a dictionary.""" + keys = [ + "shared_fpr_metric", + "fpr_lower_bound", + "fpr_upper_bound", + "num_threshs", + "thresh_lower_bound", + "thresh_upper_bound", + "aupimos", + ] + for key in keys: + if key not in dic: + msg = f"Invalid input dictionary for {cls.__name__} object, missing key: {key}. Must contain: {keys}." + raise ValueError(msg) + + return cls(**dic) # type: ignore[arg-type] + # =========================================== FUNCTIONAL =========================================== diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index d6ecd79941..4ef5135150 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -52,11 +52,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: expected_per_image_tprs = np.stack([expected_tpr_norm, expected_tpr_anom1, expected_tpr_anom2], axis=0) expected_image_classes = np.array([0, 1, 1], dtype=np.int32) - metafunc.parametrize( - argnames=("binclf_algorithm",), - argvalues=[("python",), ("numba",)], - ) - if ( metafunc.function is test_pimo_numpy or metafunc.function is test_pimo @@ -176,6 +171,11 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) + if metafunc.function is test_pimoresult_conversions or metafunc.function is test_aupimoresult_conversions: + anomaly_maps = torch.from_numpy(anomaly_maps) + masks = torch.from_numpy(masks) + metafunc.parametrize(argnames=("anomaly_maps", "masks"), argvalues=[(anomaly_maps, masks)]) + def _do_test_pimo_outputs( threshs: ndarray | Tensor, @@ -226,7 +226,6 @@ def _do_test_pimo_outputs( def test_pimo_numpy( anomaly_maps: ndarray, masks: ndarray, - binclf_algorithm: str, expected_threshs: ndarray, expected_shared_fpr: ndarray, expected_per_image_tprs: ndarray, @@ -239,7 +238,7 @@ def test_pimo_numpy( anomaly_maps, masks, num_threshs=7, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", ) _do_test_pimo_outputs( @@ -257,7 +256,6 @@ def test_pimo_numpy( def test_pimo( anomaly_maps: Tensor, masks: Tensor, - binclf_algorithm: str, expected_threshs: Tensor, expected_shared_fpr: Tensor, expected_per_image_tprs: Tensor, @@ -289,7 +287,7 @@ def do_assertions(pimoresult: PIMOResult) -> None: anomaly_maps, masks, num_threshs=7, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", ) do_assertions(pimoresult) @@ -297,7 +295,7 @@ def do_assertions(pimoresult: PIMOResult) -> None: # metric interface metric = pimo.PIMO( num_threshs=7, - 
binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", ) metric.update(anomaly_maps, masks) @@ -343,7 +341,6 @@ def _do_test_aupimo_outputs( def test_aupimo_values_numpy( anomaly_maps: ndarray, masks: ndarray, - binclf_algorithm: str, fpr_bounds: tuple[float, float], expected_threshs: ndarray, expected_shared_fpr: ndarray, @@ -358,7 +355,7 @@ def test_aupimo_values_numpy( anomaly_maps, masks, num_threshs=7, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", fpr_bounds=fpr_bounds, force=True, @@ -380,7 +377,6 @@ def test_aupimo_values_numpy( def test_aupimo_values( anomaly_maps: ndarray, masks: ndarray, - binclf_algorithm: str, fpr_bounds: tuple[float, float], expected_threshs: ndarray, expected_shared_fpr: ndarray, @@ -428,7 +424,7 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: anomaly_maps, masks, num_threshs=7, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", fpr_bounds=fpr_bounds, force=True, @@ -438,7 +434,7 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: # metric interface metric = pimo.AUPIMO( num_threshs=7, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", fpr_bounds=fpr_bounds, force=True, @@ -451,7 +447,6 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: def test_aupimo_edge( anomaly_maps: ndarray, masks: ndarray, - binclf_algorithm: str, fpr_bounds: tuple[float, float], ) -> None: """Test some edge cases.""" @@ -467,7 +462,7 @@ def test_aupimo_edge( anomaly_maps, masks, num_threshs=10, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", force=False, **fpr_bounds, ) @@ -477,7 +472,7 @@ def test_aupimo_edge( anomaly_maps, masks, num_threshs=10, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", force=True, **fpr_bounds, ) @@ -488,7 +483,66 @@ def test_aupimo_edge( anomaly_maps * rng.uniform(1.0, 1.1, size=anomaly_maps.shape), masks, # num_threshs=, - binclf_algorithm=binclf_algorithm, + binclf_algorithm="numba", force=False, **fpr_bounds, ) + + +def test_pimoresult_conversions( + anomaly_maps: Tensor, + masks: Tensor, +) -> None: + """Test if `PIMOResult` can be converted to other formats and back.""" + from anomalib.metrics.per_image import pimo + from anomalib.metrics.per_image.pimo import PIMOResult + + pimoresult = pimo.pimo_curves( + anomaly_maps, + masks, + num_threshs=7, + binclf_algorithm="numba", + shared_fpr_metric="mean-per-image-fpr", + ) + # convert to dict + dic = pimoresult.to_dict() + assert isinstance(dic, dict) + # convert back to PIMOResult + pimoresult_from_dict = PIMOResult.from_dict(dic) + assert isinstance(pimoresult_from_dict, PIMOResult) + # values should be the same + assert pimoresult_from_dict.shared_fpr_metric == pimoresult.shared_fpr_metric + assert torch.allclose(pimoresult_from_dict.threshs, pimoresult.threshs) + assert torch.allclose(pimoresult_from_dict.shared_fpr, pimoresult.shared_fpr) + assert torch.allclose(pimoresult_from_dict.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True) + + +def test_aupimoresult_conversions( + anomaly_maps: Tensor, + masks: Tensor, +) -> None: + """Test if `AUPIMOResult` can be converted to other formats and back.""" + from anomalib.metrics.per_image import pimo + from anomalib.metrics.per_image.pimo import AUPIMOResult + + _, aupimoresult = pimo.aupimo_scores( + anomaly_maps, 
+ masks, + num_threshs=7, + binclf_algorithm="numba", + shared_fpr_metric="mean-per-image-fpr", + fpr_bounds=(1e-5, 1e-4), + force=True, + ) + # convert to dict + dic = aupimoresult.to_dict() + assert isinstance(dic, dict) + # convert back to AUPIMOResult + aupimoresult_from_dict = AUPIMOResult.from_dict(dic) + assert isinstance(aupimoresult_from_dict, AUPIMOResult) + # values should be the same + assert aupimoresult_from_dict.shared_fpr_metric == aupimoresult.shared_fpr_metric + assert aupimoresult_from_dict.fpr_bounds == aupimoresult.fpr_bounds + assert aupimoresult_from_dict.num_threshs == aupimoresult.num_threshs + assert aupimoresult_from_dict.thresh_bounds == aupimoresult.thresh_bounds + assert torch.allclose(aupimoresult_from_dict.aupimos, aupimoresult.aupimos, equal_nan=True) From b12fb86520d356ac49d2be421f64e2b9f8f5deef Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:49:57 +0100 Subject: [PATCH 18/57] add save and load methods to result objects and test Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 25 ++++++ src/anomalib/metrics/per_image/pimo.py | 91 ++++++++++++++++++++- tests/unit/metrics/per_image/test_pimo.py | 32 ++++++++ 3 files changed, 144 insertions(+), 4 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index a02a8a36f2..ba5feafe63 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -5,6 +5,7 @@ from __future__ import annotations +from pathlib import Path from typing import Any @@ -85,3 +86,27 @@ def rate_range(bounds: tuple[float, float]) -> None: if lower >= upper: msg = f"Expected `bounds[1]` > `bounds[0]`, but got {bounds[1]} <= {bounds[0]}" raise ValueError(msg) + + +def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> None: + if isinstance(file_path, str): + file_path = Path(file_path) + + elif not isinstance(file_path, Path): + msg = f"Expected file path to be a string or pathlib.Path, but got {type(file_path)}" + raise TypeError(msg) + + if file_path.is_dir(): + msg = "Expected file path to be a file, but got a directory." + raise ValueError(msg) + + if must_exist and not file_path.exists(): + msg = f"File does not exist: {file_path}" + raise FileNotFoundError(msg) + + if extension is None: + return + + if file_path.suffix != extension: + msg = f"Expected file path to have extension '{extension}', but got '{file_path.suffix}'" + raise ValueError(msg) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 06c5328741..4a8e60388d 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -10,13 +10,17 @@ """ from __future__ import annotations +import json import warnings from dataclasses import dataclass, field +from pathlib import Path import torch from torch import Tensor from torchmetrics import Metric +from anomalib.data.utils.image import duplicate_filename + from . import _validate, binclf_curve_numpy, pimo_numpy from .binclf_curve_numpy import Algorithm as BinclfAlgorithm from .pimo_numpy import SharedFPRMetric @@ -166,6 +170,38 @@ def from_dict(cls: type[PIMOResult], dic: dict[str, Tensor | str]) -> PIMOResult return cls(**dic) + def save(self, file_path: str | Path) -> None: + """Save to a `.pt` file. + + Args: + file_path: path to the `.pt` file where to save the PIMO result. 
+ - must have a `.pt` extension + - if the file already exists, a numerical suffix is added to the filename + """ + _validate.file_path(file_path, must_exist=False, extension=".pt") + file_path = duplicate_filename(file_path) + payload = self.to_dict() + torch.save(payload, file_path) + + @classmethod + def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: + """Load from a `.pt` file. + + Args: + file_path: path to the `.pt` file where to load the PIMO result. + - must have a `.pt` extension + """ + _validate.file_path(file_path, must_exist=True, extension=".pt") + payload = torch.load(file_path) + if not isinstance(payload, dict): + msg = f"Invalid payload in file {file_path}. Must be a dictionary." + raise TypeError(msg) + try: + return cls.from_dict(payload) + except (TypeError, ValueError) as ex: + msg = f"Invalid payload in file {file_path}." + raise ValueError(msg) from ex + # TODO(jpcbertoldo): add image file path to `AUPIMOResult` # noqa: TD003 # TODO(jpcbertoldo): missing docstring for `AUPIMOResult` # noqa: TD003 @@ -222,6 +258,14 @@ def __post_init__(self) -> None: msg = f"Invalid inputs for {self.__class__.__name__} object." raise ValueError(msg) from ex + if not isinstance(self.thresh_lower_bound, float): + msg = f"Invalid inputs for {self.__class__.__name__} object. `thresh_lower_bound` must be a float." + raise TypeError(msg) + + if not isinstance(self.thresh_upper_bound, float): + msg = f"Invalid inputs for {self.__class__.__name__} object. `thresh_upper_bound` must be a float." + raise TypeError(msg) + if self.thresh_lower_bound >= self.thresh_upper_bound: msg = ( f"Invalid {self.__class__.__name__} object. " @@ -260,6 +304,45 @@ def from_dict(cls: type[AUPIMOResult], dic: dict[str, Tensor | str | float | int return cls(**dic) # type: ignore[arg-type] + def save(self, file_path: str | Path) -> None: + """Save to a `.json` file. + + Args: + file_path: path to the `.json` file where to save the AUPIMO result. + - must have a `.json` extension + - if the file already exists, a numerical suffix is added to the filename + """ + _validate.file_path(file_path, must_exist=False, extension=".json") + file_path = duplicate_filename(file_path) + file_path = Path(file_path) + payload = self.to_dict() + payload = {k: v.numpy().tolist() if isinstance(v, Tensor) else v for k, v in payload.items()} + with file_path.open("w") as f: + json.dump(payload, f, indent=4) + + @classmethod + def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: + """Load from a `.json` file. + + Args: + file_path: path to the `.json` file where to load the AUPIMO result. + - must have a `.json` extension + """ + _validate.file_path(file_path, must_exist=True, extension=".json") + file_path = Path(file_path) + with file_path.open("r") as f: + payload = json.load(f) + if not isinstance(payload, dict): + file_path = str(file_path) + msg = f"Invalid payload in file {file_path}. Must be a dictionary." + raise TypeError(msg) + payload["aupimos"] = torch.tensor(payload["aupimos"], dtype=torch.float64) + try: + return cls.from_dict(payload) + except (TypeError, ValueError) as ex: + msg = f"Invalid payload in file {file_path}." 
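+            # chain the original validation error as the cause so the offending field can be traced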
+ raise ValueError(msg) from ex + # =========================================== FUNCTIONAL =========================================== @@ -363,11 +446,11 @@ def aupimo_scores( # noqa: D103 pimoresult, AUPIMOResult( shared_fpr_metric=shared_fpr_metric, - fpr_lower_bound=(fpr_lower_bound), - fpr_upper_bound=(fpr_upper_bound), + fpr_lower_bound=fpr_lower_bound, + fpr_upper_bound=fpr_upper_bound, num_threshs=num_threshs, - thresh_lower_bound=thresh_lower_bound, - thresh_upper_bound=thresh_upper_bound, + thresh_lower_bound=float(thresh_lower_bound), + thresh_upper_bound=float(thresh_upper_bound), aupimos=aupimos, ), ) diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index 4ef5135150..7a8eeb19ac 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -1,5 +1,8 @@ """Test `anomalib.metrics.per_image.pimo_numpy`.""" +import tempfile +from pathlib import Path + import numpy as np import pytest import torch @@ -497,6 +500,7 @@ def test_pimoresult_conversions( from anomalib.metrics.per_image import pimo from anomalib.metrics.per_image.pimo import PIMOResult + # object -> dict -> object pimoresult = pimo.pimo_curves( anomaly_maps, masks, @@ -516,6 +520,19 @@ def test_pimoresult_conversions( assert torch.allclose(pimoresult_from_dict.shared_fpr, pimoresult.shared_fpr) assert torch.allclose(pimoresult_from_dict.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True) + # object -> file -> object + with tempfile.TemporaryDirectory() as tmpdir: + file_path = Path(tmpdir) / "pimo.pt" + pimoresult.save(str(file_path)) + assert file_path.exists() + pimoresult_from_load = PIMOResult.load(str(file_path)) + assert isinstance(pimoresult_from_load, PIMOResult) + # values should be the same + assert pimoresult_from_load.shared_fpr_metric == pimoresult.shared_fpr_metric + assert torch.allclose(pimoresult_from_load.threshs, pimoresult.threshs) + assert torch.allclose(pimoresult_from_load.shared_fpr, pimoresult.shared_fpr) + assert torch.allclose(pimoresult_from_load.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True) + def test_aupimoresult_conversions( anomaly_maps: Tensor, @@ -525,6 +542,7 @@ def test_aupimoresult_conversions( from anomalib.metrics.per_image import pimo from anomalib.metrics.per_image.pimo import AUPIMOResult + # object -> dict -> object _, aupimoresult = pimo.aupimo_scores( anomaly_maps, masks, @@ -546,3 +564,17 @@ def test_aupimoresult_conversions( assert aupimoresult_from_dict.num_threshs == aupimoresult.num_threshs assert aupimoresult_from_dict.thresh_bounds == aupimoresult.thresh_bounds assert torch.allclose(aupimoresult_from_dict.aupimos, aupimoresult.aupimos, equal_nan=True) + + # object -> file -> object + with tempfile.TemporaryDirectory() as tmpdir: + file_path = Path(tmpdir) / "aupimo.json" + aupimoresult.save(str(file_path)) + assert file_path.exists() + aupimoresult_from_load = AUPIMOResult.load(str(file_path)) + assert isinstance(aupimoresult_from_load, AUPIMOResult) + # values should be the same + assert aupimoresult_from_load.shared_fpr_metric == aupimoresult.shared_fpr_metric + assert aupimoresult_from_load.fpr_bounds == aupimoresult.fpr_bounds + assert aupimoresult_from_load.num_threshs == aupimoresult.num_threshs + assert aupimoresult_from_load.thresh_bounds == aupimoresult.thresh_bounds + assert torch.allclose(aupimoresult_from_load.aupimos, aupimoresult.aupimos, equal_nan=True) From b7e5439348a7fd27d6a6ca767b03eaa951493f3c Mon Sep 17 00:00:00 2001 From: jpcbertoldo 
<24547377+jpcbertoldo@users.noreply.github.com> Date: Sun, 24 Dec 2023 14:08:46 +0100 Subject: [PATCH 19/57] refactor validations and minor changes Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 287 +++++++++++++++++- .../metrics/per_image/binclf_curve.py | 4 +- .../metrics/per_image/binclf_curve_numpy.py | 150 +-------- src/anomalib/metrics/per_image/pimo.py | 52 ++-- src/anomalib/metrics/per_image/pimo_numpy.py | 160 ++-------- 5 files changed, 332 insertions(+), 321 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index ba5feafe63..2a6a574cd8 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -8,6 +8,9 @@ from pathlib import Path from typing import Any +import numpy as np +from numpy import ndarray + def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401 """Validate that `tensor` is a `torch.Tensor`.""" @@ -21,13 +24,13 @@ def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401 def num_threshs(num_threshs: int) -> None: - """Validate that `num_threshs` is a positive integer >= 2.""" + """Validate the number of thresholds is a positive integer >= 2.""" if not isinstance(num_threshs, int): - msg = f"Expected `num_threshs` to be an integer, but got {type(num_threshs)}" + msg = f"Expected the number of thresholds to be an integer, but got {type(num_threshs)}" raise TypeError(msg) if num_threshs < 2: - msg = f"If argument `num_threshs` is an integer, expected it to be larger than 1, but got {num_threshs}" + msg = f"Expected the number of thresholds to be larger than 1, but got {num_threshs}" raise ValueError(msg) @@ -53,7 +56,7 @@ def rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: raise TypeError(msg) if rate < 0.0 or rate > 1.0: - msg = f"Rate `{rate}` is not a valid because it must be in [0, 1]." + msg = f"Expected rate to be in [0, 1], but got {rate}." raise ValueError(msg) if not zero_ok and rate == 0.0: @@ -66,17 +69,17 @@ def rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: def rate_range(bounds: tuple[float, float]) -> None: - """Validates the range of rates within `bounds`. + """Validates the range of rates within the bounds. Args: bounds (tuple[float, float]): The lower and upper bounds of the rates. """ if not isinstance(bounds, tuple): - msg = f"Expected `bounds` to be a tuple, but got {type(bounds)}" + msg = f"Expected the bounds to be a tuple, but got {type(bounds)}" raise TypeError(msg) if len(bounds) != 2: - msg = f"Expected `bounds` to be a tuple of length 2, but got {len(bounds)}" + msg = f"Expected the bounds to be a tuple of length 2, but got {len(bounds)}" raise ValueError(msg) lower, upper = bounds @@ -84,11 +87,18 @@ def rate_range(bounds: tuple[float, float]) -> None: rate(upper, zero_ok=False, one_ok=True) if lower >= upper: - msg = f"Expected `bounds[1]` > `bounds[0]`, but got {bounds[1]} <= {bounds[0]}" + msg = f"Expected the upper bound to be larger than the lower bound, but got {upper=} <= {lower=}" raise ValueError(msg) def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> None: + """Validate the given path is a file (optionally) with the expected extension. + + Args: + file_path (str | Path): The file path to validate. + must_exist (bool): Flag indicating whether the file must exist. + extension (str | None): The expected file extension, eg. .png, .jpg, etc. 
If `None`, no validation is performed. + """ if isinstance(file_path, str): file_path = Path(file_path) @@ -110,3 +120,264 @@ def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> if file_path.suffix != extension: msg = f"Expected file path to have extension '{extension}', but got '{file_path.suffix}'" raise ValueError(msg) + + +def threshs(threshs: ndarray) -> None: + """Validate that the thresholds are valid and monotonically increasing.""" + if not isinstance(threshs, ndarray): + msg = f"Expected thresholds to be an ndarray, but got {type(threshs)}" + raise TypeError(msg) + + if threshs.ndim != 1: + msg = f"Expected thresholds to be 1D, but got {threshs.ndim}" + raise ValueError(msg) + + if threshs.dtype.kind != "f": + msg = f"Expected thresholds to be of float type, but got ndarray with dtype {threshs.dtype}" + raise TypeError(msg) + + # make sure they are strictly increasing + if not np.all(np.diff(threshs) > 0): + msg = "Expected thresholds to be strictly increasing, but it is not." + raise ValueError(msg) + + +def thresh_bounds(thresh_bounds: tuple[float, float]) -> None: + if not isinstance(thresh_bounds, tuple): + msg = f"Expected threshold bounds to be a tuple, but got {type(thresh_bounds)}." + raise TypeError(msg) + + if len(thresh_bounds) != 2: + msg = f"Expected threshold bounds to be a tuple of length 2, but got {len(thresh_bounds)}." + raise ValueError(msg) + + lower, upper = thresh_bounds + + if not isinstance(lower, float): + msg = f"Expected lower threshold bound to be a float, but got {type(lower)}." + raise TypeError(msg) + + if not isinstance(upper, float): + msg = f"Expected upper threshold bound to be a float, but got {type(upper)}." + raise TypeError(msg) + + if upper <= lower: + msg = f"Expected the upper bound to be greater than the lower bound, but got {upper} <= {lower}." 
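+        # equal bounds are rejected too: a linspace over a degenerate interval would repeat a single threshold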
+ raise ValueError(msg) + + +def anomaly_maps(anomaly_maps: ndarray) -> None: + if not isinstance(anomaly_maps, ndarray): + msg = f"Expected anomaly maps to be an ndarray, but got {type(anomaly_maps)}" + raise TypeError(msg) + + if anomaly_maps.ndim != 3: + msg = f"Expected anomaly maps have 3 dimensions (N, H, W), but got {anomaly_maps.ndim} dimensions" + raise ValueError(msg) + + if anomaly_maps.dtype.kind != "f": + msg = ( + "Expected anomaly maps to be an floating ndarray with anomaly scores," + f" but got ndarray with dtype {anomaly_maps.dtype}" + ) + raise TypeError(msg) + + +def masks(masks: ndarray) -> None: + if not isinstance(masks, ndarray): + msg = f"Expected masks to be an ndarray, but got {type(masks)}" + raise TypeError(msg) + + if masks.ndim != 3: + msg = f"Expected masks have 3 dimensions (N, H, W), but got {masks.ndim} dimensions" + raise ValueError(msg) + + if masks.dtype.kind == "b": + pass + + elif masks.dtype.kind in ("i", "u"): + masks_unique_vals = np.unique(masks) + if np.any((masks_unique_vals != 0) & (masks_unique_vals != 1)): + msg = ( + "Expected masks to be a *binary* ndarray with ground truth labels, " + f"but got ndarray with unique values {sorted(masks_unique_vals)}" + ) + raise ValueError(msg) + + else: + msg = ( + "Expected masks to be an integer or boolean ndarray with ground truth labels, " + f"but got ndarray with dtype {masks.dtype}" + ) + raise TypeError(msg) + + +def binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None: + if not isinstance(binclf_curves, ndarray): + msg = f"Expected binclf curves to be an ndarray, but got {type(binclf_curves)}" + raise TypeError(msg) + + if binclf_curves.ndim != 4: + msg = f"Expected binclf curves to be 4D, but got {binclf_curves.ndim}D" + raise ValueError(msg) + + if binclf_curves.shape[-2:] != (2, 2): + msg = f"Expected binclf curves to have shape (..., 2, 2), but got {binclf_curves.shape}" + raise ValueError(msg) + + if binclf_curves.dtype != np.int64: + msg = f"Expected binclf curves to have dtype int64, but got {binclf_curves.dtype}." + raise TypeError(msg) + + if (binclf_curves < 0).any(): + msg = "Expected binclf curves to have non-negative values, but got negative values." + raise ValueError(msg) + + neg = binclf_curves[:, :, 0, :].sum(axis=-1) # (num_images, num_threshs) + + if (neg != neg[:, :1]).any(): + msg = "Expected binclf curves to have the same number of negatives per image for every thresh." + raise ValueError(msg) + + pos = binclf_curves[:, :, 1, :].sum(axis=-1) # (num_images, num_threshs) + + if (pos != pos[:, :1]).any(): + msg = "Expected binclf curves to have the same number of positives per image for every thresh." + raise ValueError(msg) + + if valid_threshs is None: + return + + if binclf_curves.shape[1] != valid_threshs.shape[0]: + msg = ( + "Expected the binclf curves to have as many confusion matrices as the thresholds sequence, " + f"but got {binclf_curves.shape[1]} and {valid_threshs.shape[0]}" + ) + raise RuntimeError(msg) + + +def image_classes(image_classes: ndarray) -> None: + if not isinstance(image_classes, ndarray): + msg = f"Expected image classes to be an ndarray, but got {type(image_classes)}." + raise TypeError(msg) + + if image_classes.ndim != 1: + msg = f"Expected image classes to be 1D, but got {image_classes.ndim}D." 
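+        # image classes hold one binary label per image, so the expected shape is (N,)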
+ raise ValueError(msg) + + if image_classes.dtype.kind == "b": + pass + elif image_classes.dtype.kind in ("i", "u"): + unique_vals = np.unique(image_classes) + if np.any((unique_vals != 0) & (unique_vals != 1)): + msg = ( + "Expected image classes to be a *binary* ndarray with ground truth labels, " + f"but got ndarray with unique values {sorted(unique_vals)}" + ) + raise ValueError(msg) + else: + msg = ( + "Expected image classes to be an integer or boolean ndarray with ground truth labels, " + f"but got ndarray with dtype {image_classes.dtype}" + ) + raise TypeError(msg) + + +def rates(rates: ndarray, nan_allowed: bool) -> None: + if not isinstance(rates, ndarray): + msg = f"Expected rates to be an ndarray, but got {type(rates)}." + raise TypeError(msg) + + if rates.ndim != 1: + msg = f"Expected rates to be 1D, but got {rates.ndim}D." + raise ValueError(msg) + + if rates.dtype.kind != "f": + msg = f"Expected rates to have dtype of float type, but got {rates.dtype}." + raise ValueError(msg) + + isnan_mask = np.isnan(rates) + if nan_allowed: + # if they are all nan, then there is nothing to validate + if isnan_mask.all(): + return + valid_values = rates[~isnan_mask] + elif isnan_mask.any(): + msg = "Expected rates to not contain NaN values, but got NaN values." + raise ValueError(msg) + else: + valid_values = rates + + if (valid_values < 0).any(): + msg = "Expected rates to have values in the interval [0, 1], but got values < 0." + raise ValueError(msg) + + if (valid_values > 1).any(): + msg = "Expected rates to have values in the interval [0, 1], but got values > 1." + raise ValueError(msg) + + +def rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None: + rates(rate_curve, nan_allowed=nan_allowed) + + diffs = np.diff(rate_curve) + diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs + + if decreasing and (diffs_valid > 0).any(): + msg = "Expected rate curve to be monotonically decreasing, but got non-monotonically decreasing values." + raise ValueError(msg) + + if not decreasing and (diffs_valid < 0).any(): + msg = "Expected rate curve to be monotonically increasing, but got non-monotonically increasing values." + raise ValueError(msg) + + +def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool) -> None: + if not isinstance(rate_curves, ndarray): + msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}." + raise TypeError(msg) + + if rate_curves.ndim != 2: + msg = f"Expected per-image rate curves to be 2D, but got {rate_curves.ndim}D." + raise ValueError(msg) + + if rate_curves.dtype.kind != "f": + msg = f"Expected per-image rate curves to have dtype of float type, but got {rate_curves.dtype}." + raise ValueError(msg) + + isnan_mask = np.isnan(rate_curves) + if nan_allowed: + # if they are all nan, then there is nothing to validate + if isnan_mask.all(): + return + valid_values = rate_curves[~isnan_mask] + elif isnan_mask.any(): + msg = "Expected per-image rate curves to not contain NaN values, but got NaN values." + raise ValueError(msg) + else: + valid_values = rate_curves + + if (valid_values < 0).any(): + msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values < 0." + raise ValueError(msg) + + if (valid_values > 1).any(): + msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values > 1." 
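+        # rates are probabilities (e.g. TPR/FPR), so every non-NaN entry must lie in [0, 1]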
+ raise ValueError(msg) + + diffs = np.diff(rate_curves, axis=1) + diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs + + if decreasing and (diffs_valid > 0).any(): + msg = ( + "Expected per-image rate curves to be monotonically decreasing, " + "but got non-monotonically decreasing values." + ) + raise ValueError(msg) + + if not decreasing and (diffs_valid < 0).any(): + msg = ( + "Expected per-image rate curves to be monotonically increasing, " + "but got non-monotonically increasing values." + ) + raise ValueError(msg) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index c6077fef9a..f23f28530e 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -22,14 +22,14 @@ def _validate_threshs(threshs: Tensor) -> None: _validate.is_tensor(threshs, argname="threshs") - binclf_curve_numpy._validate_threshs(threshs.numpy()) # noqa: SLF001 + _validate.threshs(threshs.numpy()) def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None: _validate.is_tensor(binclf_curves, argname="binclf_curves") if valid_threshs is not None: _validate_threshs(valid_threshs) - binclf_curve_numpy._validate_binclf_curves( # noqa: SLF001 + _validate.binclf_curves( binclf_curves.detach().cpu().numpy(), valid_threshs=valid_threshs.numpy() if valid_threshs is not None else None, ) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 87db5c3f9b..83d394a089 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -93,138 +93,6 @@ def _validate_gts_batch(gts_batch: ndarray) -> None: raise ValueError(msg) -def _validate_threshs(threshs: ndarray) -> None: - if not isinstance(threshs, ndarray): - msg = f"Expected `threshs` to be an ndarray, but got {type(threshs)}" - raise TypeError(msg) - - if threshs.ndim != 1: - msg = f"Expected `threshs` to be 1D, but got {threshs.ndim}" - raise ValueError(msg) - - if threshs.dtype.kind != "f": - msg = ( - "Expected `threshs` to be an floating ndarray with anomaly scores," - f" but got ndarray with dtype {threshs.dtype}" - ) - raise TypeError(msg) - - # make sure they are strictly increasing - if any(thresh <= prev_th for prev_th, thresh in itertools.pairwise(threshs)): - msg = "Expected `threshs` to be strictly increasing, but it is not." 
- raise ValueError(msg) - - -def _validate_thresh_bounds(thresh_bounds: tuple[float, float]) -> None: - if not isinstance(thresh_bounds, tuple): - msg = f"Expected `thresh_bounds` to be a tuple, but got {type(thresh_bounds)}" - raise TypeError(msg) - - if len(thresh_bounds) != 2: - msg = f"Expected `thresh_bounds` to be a tuple of length 2, but got {len(thresh_bounds)}" - raise ValueError(msg) - - lower, upper = thresh_bounds - - if not isinstance(lower, float) or not isinstance(upper, float): - msg = f"Expected `thresh_bounds` to be a tuple of floats, but got {type(lower)} and {type(upper)}" - raise TypeError(msg) - - if lower >= upper: - msg = f"Expected `thresh_bounds[1]` > `thresh_bounds[0]`, but got {thresh_bounds[1]} <= {thresh_bounds[0]}" - raise ValueError(msg) - - -def _validate_anomaly_maps(anomaly_maps: ndarray) -> None: - if not isinstance(anomaly_maps, ndarray): - msg = f"Expected `anomaly_maps` to be an ndarray, but got {type(anomaly_maps)}" - raise TypeError(msg) - - if anomaly_maps.ndim != 3: - msg = f"Expected `anomaly_maps` have 3 dimensions (N, H, W), but got {anomaly_maps.ndim} dimensions" - raise ValueError(msg) - - if anomaly_maps.dtype.kind != "f": - msg = ( - "Expected `anomaly_maps` to be an floating ndarray with anomaly scores," - f" but got ndarray with dtype {anomaly_maps.dtype}" - ) - raise TypeError(msg) - - -def _validate_masks(masks: ndarray) -> None: - if not isinstance(masks, ndarray): - msg = f"Expected `masks` to be an ndarray, but got {type(masks)}" - raise TypeError(msg) - - if masks.ndim != 3: - msg = f"Expected `masks` have 3 dimensions (N, H, W), but got {masks.ndim} dimensions" - raise ValueError(msg) - - if masks.dtype.kind == "b": - pass - - elif masks.dtype.kind in ("i", "u"): - masks_unique_vals = np.unique(masks) - if np.any((masks_unique_vals != 0) & (masks_unique_vals != 1)): - msg = ( - "Expected `masks` to be a *binary* ndarray with ground truth labels, " - f"but got ndarray with unique values {sorted(masks_unique_vals)}" - ) - raise ValueError(msg) - - else: - msg = ( - "Expected `masks` to be an integer or boolean ndarray with ground truth labels, " - f"but got ndarray with dtype {masks.dtype}" - ) - raise TypeError(msg) - - -def _validate_binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None: - if not isinstance(binclf_curves, ndarray): - msg = f"Expected `binclf_curves` to be an ndarray, but got {type(binclf_curves)}" - raise TypeError(msg) - - if binclf_curves.ndim != 4: - msg = f"Expected `binclf_curves` to be 4D, but got {binclf_curves.ndim}D" - raise ValueError(msg) - - if binclf_curves.shape[-2:] != (2, 2): - msg = f"Expected `binclf_curves` to have shape (..., 2, 2), but got {binclf_curves.shape}" - raise ValueError(msg) - - if binclf_curves.dtype != np.int64: - msg = f"Expected `binclf_curves` to have dtype int64, but got {binclf_curves.dtype}." - raise TypeError(msg) - - if (binclf_curves < 0).any(): - msg = "Expected `binclf_curves` to have non-negative values, but got negative values." - raise ValueError(msg) - - neg = binclf_curves[:, :, 0, :].sum(axis=-1) # (num_images, num_threshs) - - if (neg != neg[:, :1]).any(): - msg = "Expected `binclf_curves` to have the same number of negatives per image for every thresh." - raise ValueError(msg) - - pos = binclf_curves[:, :, 1, :].sum(axis=-1) # (num_images, num_threshs) - - if (pos != pos[:, :1]).any(): - msg = "Expected `binclf_curves` to have the same number of positives per image for every thresh." 
- raise ValueError(msg) - - if valid_threshs is None: - return - - if binclf_curves.shape[1] != valid_threshs.shape[0]: - msg = ( - "Expected `binclf_curves` to have the same number of thresholds as `threshs`, " - f"but got {binclf_curves.shape[1]} and {valid_threshs.shape[0]}" - ) - raise RuntimeError(msg) - - # =========================================== PYTHON VERSION =========================================== @@ -360,7 +228,7 @@ def binclf_multiple_curves( _validate_scores_batch(scores_batch) _validate_gts_batch(gts_batch) _validate.same_shape(scores_batch, gts_batch) - _validate_threshs(threshs) + _validate.threshs(threshs) if algorithm == Algorithm.PYTHON: return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) @@ -386,9 +254,9 @@ def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> nda # this operation can be a bit expensive thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) try: - _validate_thresh_bounds(thresh_bounds) + _validate.thresh_bounds(thresh_bounds) except ValueError as ex: - msg = "Invalid `thresh_bounds` computed from `anomaly_maps`." + msg = f"Invalid threshold bounds computed from the given anomaly maps. Cause: {ex}" raise ValueError(msg) from ex return np.linspace(thresh_low, thresh_high, num_threshs, dtype=anomaly_maps.dtype) @@ -447,15 +315,15 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. """ Algorithm.validate(algorithm) - _validate_anomaly_maps(anomaly_maps) - _validate_masks(masks) + _validate.anomaly_maps(anomaly_maps) + _validate.masks(masks) _validate.same_shape(anomaly_maps, masks) threshs: ndarray if threshs_choice == ThreshsChoice.GIVEN: assert threshs_given is not None - _validate_threshs(threshs_given) + _validate.threshs(threshs_given) if num_threshs is not None: logger.warning( f"Argument `num_threshs` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", @@ -487,9 +355,9 @@ def per_image_binclf_curve( num_images = anomaly_maps.shape[0] try: - _validate_binclf_curves(binclf_curves, valid_threshs=threshs) + _validate.binclf_curves(binclf_curves, valid_threshs=threshs) - # these two validations cannot be done in `_validate_binclf_curves` because it does not have access to the + # these two validations cannot be done in `_validate.binclf_curves` because it does not have access to the # original shapes of `anomaly_maps` if binclf_curves.shape[0] != num_images: msg = ( @@ -499,7 +367,7 @@ def per_image_binclf_curve( raise RuntimeError(msg) except (TypeError, ValueError) as ex: - msg = "Invalid `binclf_curves` was computed." + msg = f"Invalid `binclf_curves` was computed. Cause: {ex}" raise RuntimeError(msg) from ex return threshs, binclf_curves diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 4a8e60388d..29a27859db 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -21,43 +21,53 @@ from anomalib.data.utils.image import duplicate_filename -from . import _validate, binclf_curve_numpy, pimo_numpy +from . 
import _validate, pimo_numpy from .binclf_curve_numpy import Algorithm as BinclfAlgorithm from .pimo_numpy import SharedFPRMetric # =========================================== ARGS VALIDATION =========================================== +def _images_classes_from_masks(masks: Tensor) -> Tensor: + masks = torch.concat(masks, dim=0) + device = masks.device + image_classes = pimo_numpy._images_classes_from_masks(masks.numpy()) # noqa: SLF001 + return torch.from_numpy(image_classes, device=device) + + +# =========================================== ARGS VALIDATION =========================================== + + def _validate_anomaly_maps(anomaly_maps: Tensor) -> None: _validate.is_tensor(anomaly_maps, argname="anomaly_maps") - binclf_curve_numpy._validate_anomaly_maps(anomaly_maps.numpy()) # noqa: SLF001 + _validate.anomaly_maps(anomaly_maps.numpy()) def _validate_masks(masks: Tensor) -> None: _validate.is_tensor(masks, argname="masks") - binclf_curve_numpy._validate_masks(masks.numpy()) # noqa: SLF001 + _validate.masks(masks.numpy()) def _validate_threshs(threshs: Tensor) -> None: _validate.is_tensor(threshs, argname="threshs") - binclf_curve_numpy._validate_threshs(threshs.numpy()) # noqa: SLF001 + _validate.threshs(threshs.numpy()) def _validate_shared_fpr(shared_fpr: Tensor, nan_allowed: bool = False, decreasing: bool = True) -> None: _validate.is_tensor(shared_fpr, argname="shared_fpr") - pimo_numpy._validate_rate_curve(shared_fpr.numpy(), nan_allowed=nan_allowed, decreasing=decreasing) # noqa: SLF001 + _validate.rate_curve(shared_fpr.numpy(), nan_allowed=nan_allowed, decreasing=decreasing) def _validate_image_classes(image_classes: Tensor) -> None: _validate.is_tensor(image_classes, argname="image_classes") - pimo_numpy._validate_image_classes(image_classes.numpy()) # noqa: SLF001 + _validate.image_classes(image_classes.numpy()) def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> None: _validate.is_tensor(per_image_tprs, argname="per_image_tprs") _validate_image_classes(image_classes) - pimo_numpy._validate_per_image_rate_curves( # noqa: SLF001 + _validate.per_image_rate_curves( per_image_tprs[image_classes == 1].numpy(), nan_allowed=False, decreasing=True, @@ -71,7 +81,7 @@ def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> N def _validate_aupimos(aupimos: Tensor) -> None: _validate.is_tensor(aupimos, argname="aupimos") - pimo_numpy._validate_rates(aupimos.numpy(), nan_allowed=True) # noqa: SLF001 + _validate.rates(aupimos.numpy(), nan_allowed=True) # =========================================== RESULT OBJECT =========================================== @@ -113,7 +123,7 @@ def __post_init__(self) -> None: _validate_per_image_tprs(self.per_image_tprs, self.image_classes) except (TypeError, ValueError) as ex: - msg = f"Invalid inputs for {self.__class__.__name__} object." + msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." raise ValueError(msg) from ex if self.threshs.shape != self.shared_fpr.shape: @@ -199,7 +209,7 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: try: return cls.from_dict(payload) except (TypeError, ValueError) as ex: - msg = f"Invalid payload in file {file_path}." + msg = f"Invalid payload in file {file_path}. Cause: {ex}." 
raise ValueError(msg) from ex @@ -253,26 +263,12 @@ def __post_init__(self) -> None: _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) _validate.num_threshs(self.num_threshs) _validate_aupimos(self.aupimos) + _validate.thresh_bounds((self.thresh_lower_bound, self.thresh_upper_bound)) except (TypeError, ValueError) as ex: - msg = f"Invalid inputs for {self.__class__.__name__} object." + msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." raise ValueError(msg) from ex - if not isinstance(self.thresh_lower_bound, float): - msg = f"Invalid inputs for {self.__class__.__name__} object. `thresh_lower_bound` must be a float." - raise TypeError(msg) - - if not isinstance(self.thresh_upper_bound, float): - msg = f"Invalid inputs for {self.__class__.__name__} object. `thresh_upper_bound` must be a float." - raise TypeError(msg) - - if self.thresh_lower_bound >= self.thresh_upper_bound: - msg = ( - f"Invalid {self.__class__.__name__} object. " - f"thresh_lower_bound={self.thresh_lower_bound} >= thresh_upper_bound={self.thresh_upper_bound}." - ) - raise ValueError(msg) - def to_dict(self) -> dict[str, Tensor | str | float | int]: """Return a dictionary with the result object's attributes.""" return { @@ -340,7 +336,7 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: try: return cls.from_dict(payload) except (TypeError, ValueError) as ex: - msg = f"Invalid payload in file {file_path}." + msg = f"Invalid payload in file {file_path}. Cause: {ex}." raise ValueError(msg) from ex @@ -485,7 +481,7 @@ def num_images(self) -> int: @property def image_classes(self) -> Tensor: """Image classes (0: normal, 1: anomalous).""" - return pimo_numpy._images_classes_from_masks(torch.concat(self.masks, dim=0).cpu().numpy()) # noqa: SLF001 + return _images_classes_from_masks(self.masks) def __init__( self, diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index ef49e828fa..94229c18e9 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -58,40 +58,16 @@ def validate(metric: str) -> None: raise ValueError(msg) -# =========================================== ARGS VALIDATION =========================================== +# =========================================== AUX =========================================== def _images_classes_from_masks(masks: ndarray) -> ndarray: """Deduce the image classes from the masks.""" - binclf_curve_numpy._validate_masks(masks) # noqa: SLF001 + _validate.masks(masks) return (masks == 1).any(axis=(1, 2)).astype(np.int32) -def _validate_image_classes(image_classes: ndarray) -> None: - if not isinstance(image_classes, ndarray): - msg = f"Expected image classes to be an ndarray, but got {type(image_classes)}." - raise TypeError(msg) - - if image_classes.ndim != 1: - msg = f"Expected image classes to be 1D, but got {image_classes.ndim}D." 
- raise ValueError(msg) - - if image_classes.dtype.kind == "b": - pass - elif image_classes.dtype.kind in ("i", "u"): - unique_vals = np.unique(image_classes) - if np.any((unique_vals != 0) & (unique_vals != 1)): - msg = ( - "Expected image classes to be a *binary* ndarray with ground truth labels, " - f"but got ndarray with unique values {sorted(unique_vals)}" - ) - raise ValueError(msg) - else: - msg = ( - "Expected image classes to be an integer or boolean ndarray with ground truth labels, " - f"but got ndarray with dtype {image_classes.dtype}" - ) - raise TypeError(msg) +# =========================================== ARGS VALIDATION =========================================== def _validate_at_least_one_anomalous_image(masks: ndarray) -> None: @@ -108,102 +84,11 @@ def _validate_at_least_one_normal_image(masks: ndarray) -> None: raise ValueError(msg) -def _validate_rates(rates: ndarray, nan_allowed: bool) -> None: - if not isinstance(rates, ndarray): - msg = f"Expected rates to be an ndarray, but got {type(rates)}." - raise TypeError(msg) - - if rates.ndim != 1: - msg = f"Expected rates to be 1D, but got {rates.ndim}D." - raise ValueError(msg) - - if rates.dtype.kind != "f": - msg = f"Expected rates to have dtype of float type, but got {rates.dtype}." - raise ValueError(msg) - - isnan_mask = np.isnan(rates) - if nan_allowed: - # if they are all nan, then there is nothing to validate - if isnan_mask.all(): - return - valid_values = rates[~isnan_mask] - elif isnan_mask.any(): - msg = "Expected rates to not contain NaN values, but got NaN values." - raise ValueError(msg) - else: - valid_values = rates - - if (valid_values < 0).any(): - msg = "Expected rates to have values in the interval [0, 1], but got values < 0." - raise ValueError(msg) - - if (valid_values > 1).any(): - msg = "Expected rates to have values in the interval [0, 1], but got values > 1." - raise ValueError(msg) - - -def _validate_rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None: - _validate_rates(rate_curve, nan_allowed=nan_allowed) - - diffs = np.diff(rate_curve) - diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs - - if decreasing and (diffs_valid > 0).any(): - msg = "Expected rate curve to be monotonically decreasing, but got non-monotonically decreasing values." - raise ValueError(msg) - - if not decreasing and (diffs_valid < 0).any(): - msg = "Expected rate curve to be monotonically increasing, but got non-monotonically increasing values." - raise ValueError(msg) - - -def _validate_per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool) -> None: - if not isinstance(rate_curves, ndarray): - msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}." - raise TypeError(msg) - - if rate_curves.ndim != 2: - msg = f"Expected per-image rate curves to be 2D, but got {rate_curves.ndim}D." - raise ValueError(msg) - - if rate_curves.dtype.kind != "f": - msg = f"Expected per-image rate curves to have dtype of float type, but got {rate_curves.dtype}." - raise ValueError(msg) - - isnan_mask = np.isnan(rate_curves) - if nan_allowed: - # if they are all nan, then there is nothing to validate - if isnan_mask.all(): - return - valid_values = rate_curves[~isnan_mask] - elif isnan_mask.any(): - msg = "Expected per-image rate curves to not contain NaN values, but got NaN values." 
- raise ValueError(msg) - else: - valid_values = rate_curves - - if (valid_values < 0).any(): - msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values < 0." - raise ValueError(msg) - - if (valid_values > 1).any(): - msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values > 1." - raise ValueError(msg) - - diffs = np.diff(rate_curves, axis=1) - diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs - - if decreasing and (diffs_valid > 0).any(): - msg = ( - "Expected per-image rate curves to be monotonically decreasing, " - "but got non-monotonically decreasing values." - ) - raise ValueError(msg) - - if not decreasing and (diffs_valid < 0).any(): +def _joint_validate_threshs_shared_fpr(threshs: ndarray, shared_fpr: ndarray) -> None: + if threshs.shape[0] != shared_fpr.shape[0]: msg = ( - "Expected per-image rate curves to be monotonically increasing, " - "but got non-monotonically increasing values." + "Expected `threshs` and `shared_fpr` to have the same number of elements, " + f"but got {threshs.shape[0]} != {shared_fpr.shape[0]}" ) raise ValueError(msg) @@ -222,8 +107,8 @@ def pimo_curves( # noqa: D103 BinclfAlgorithm.validate(binclf_algorithm) SharedFPRMetric.validate(shared_fpr_metric) _validate.num_threshs(num_threshs) - binclf_curve_numpy._validate_anomaly_maps(anomaly_maps) # noqa: SLF001 - binclf_curve_numpy._validate_masks(masks) # noqa: SLF001 + _validate.anomaly_maps(anomaly_maps) + _validate.masks(masks) _validate.same_shape(anomaly_maps, masks) _validate_at_least_one_anomalous_image(masks) _validate_at_least_one_normal_image(masks) @@ -255,9 +140,9 @@ def pimo_curves( # noqa: D103 # shape -> (N, K) per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) try: - _validate_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) + _validate.per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) except ValueError as ex: - msg = "Cannot compute PIMO because the per-image FPR curves from normal images are invalid." + msg = f"Cannot compute PIMO because the per-image FPR curves from normal images are invalid. Cause: {ex}" raise RuntimeError(msg) from ex # shape -> (K,) @@ -274,15 +159,6 @@ def pimo_curves( # noqa: D103 return threshs, shared_fpr, per_image_tprs, image_classes -def _joint_validate_threshs_shared_fpr(threshs: ndarray, shared_fpr: ndarray) -> None: - if threshs.shape[0] != shared_fpr.shape[0]: - msg = ( - "Expected `threshs` and `shared_fpr` to have the same number of elements, " - f"but got {threshs.shape[0]} != {shared_fpr.shape[0]}" - ) - raise ValueError(msg) - - # =========================================== AUPIMO =========================================== @@ -307,13 +183,13 @@ def aupimo_scores( # noqa: D103 shared_fpr_metric=shared_fpr_metric, ) try: - binclf_curve_numpy._validate_threshs(threshs) # noqa: SLF001 - _validate_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) - _validate_image_classes(image_classes) - _validate_per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) + _validate.threshs(threshs) + _validate.rate_curve(shared_fpr, nan_allowed=False, decreasing=True) + _validate.image_classes(image_classes) + _validate.per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) except ValueError as ex: - msg = "Cannot compute AUPIMO because the PIMO curves are invalid." 
+ msg = f"Cannot compute AUPIMO because the PIMO curves are invalid. Cause: {ex}" raise RuntimeError(msg) from ex fpr_lower_bound, fpr_upper_bound = fpr_bounds @@ -444,8 +320,8 @@ def thresh_at_shared_fpr_level(threshs: ndarray, shared_fpr: ndarray, fpr_level: [1] threshold [2] the actual shared FPR value at the returned threshold """ - binclf_curve_numpy._validate_threshs(threshs) # noqa: SLF001 - _validate_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) + _validate.threshs(threshs) + _validate.rate_curve(shared_fpr, nan_allowed=False, decreasing=True) _joint_validate_threshs_shared_fpr(threshs, shared_fpr) _validate.rate(fpr_level, zero_ok=True, one_ok=True) From 3808de890118442855bb9a44267e337b62fdd1f8 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sun, 24 Dec 2023 15:36:18 +0100 Subject: [PATCH 20/57] test result objects' properties Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- tests/unit/metrics/per_image/test_pimo.py | 26 +++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index 7a8eeb19ac..d070a627e7 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -174,7 +174,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ], ) - if metafunc.function is test_pimoresult_conversions or metafunc.function is test_aupimoresult_conversions: + if metafunc.function is test_pimoresult_object or metafunc.function is test_aupimoresult_object: anomaly_maps = torch.from_numpy(anomaly_maps) masks = torch.from_numpy(masks) metafunc.parametrize(argnames=("anomaly_maps", "masks"), argvalues=[(anomaly_maps, masks)]) @@ -492,7 +492,7 @@ def test_aupimo_edge( ) -def test_pimoresult_conversions( +def test_pimoresult_object( anomaly_maps: Tensor, masks: Tensor, ) -> None: @@ -500,7 +500,6 @@ def test_pimoresult_conversions( from anomalib.metrics.per_image import pimo from anomalib.metrics.per_image.pimo import PIMOResult - # object -> dict -> object pimoresult = pimo.pimo_curves( anomaly_maps, masks, @@ -508,10 +507,14 @@ def test_pimoresult_conversions( binclf_algorithm="numba", shared_fpr_metric="mean-per-image-fpr", ) - # convert to dict + + _ = pimoresult.num_threshs + _ = pimoresult.num_images + _ = pimoresult.image_classes + + # object -> dict -> object dic = pimoresult.to_dict() assert isinstance(dic, dict) - # convert back to PIMOResult pimoresult_from_dict = PIMOResult.from_dict(dic) assert isinstance(pimoresult_from_dict, PIMOResult) # values should be the same @@ -534,7 +537,7 @@ def test_pimoresult_conversions( assert torch.allclose(pimoresult_from_load.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True) -def test_aupimoresult_conversions( +def test_aupimoresult_object( anomaly_maps: Tensor, masks: Tensor, ) -> None: @@ -542,7 +545,6 @@ def test_aupimoresult_conversions( from anomalib.metrics.per_image import pimo from anomalib.metrics.per_image.pimo import AUPIMOResult - # object -> dict -> object _, aupimoresult = pimo.aupimo_scores( anomaly_maps, masks, @@ -552,10 +554,16 @@ def test_aupimoresult_conversions( fpr_bounds=(1e-5, 1e-4), force=True, ) - # convert to dict + + # call properties + _ = aupimoresult.num_images + _ = aupimoresult.image_classes + _ = aupimoresult.fpr_bounds + _ = aupimoresult.thresh_bounds + + # object -> dict -> object dic = aupimoresult.to_dict() assert isinstance(dic, dict) - # convert 
back to AUPIMOResult aupimoresult_from_dict = AUPIMOResult.from_dict(dic) assert isinstance(aupimoresult_from_dict, AUPIMOResult) # values should be the same From dfa8dc3be5434ee981268dd73613a1d7c9acb05f Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sun, 24 Dec 2023 15:58:05 +0100 Subject: [PATCH 21/57] minor refactors Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 66 +++++++++++++++++++------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 29a27859db..d6be2f27a0 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -269,6 +269,50 @@ def __post_init__(self) -> None: msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." raise ValueError(msg) from ex + @classmethod + def from_pimoresult( + cls: type[AUPIMOResult], + pimoresult: PIMOResult, + fpr_bounds: tuple[float, float], + aupimos: Tensor, + ) -> AUPIMOResult: + """Return an AUPIMO result object from a PIMO result object. + + Args: + pimoresult: PIMO result object + fpr_bounds: lower and upper bounds of the FPR integration range + aupimos: AUPIMO scores + """ + if pimoresult.per_image_tprs.shape[0] != aupimos.shape[0]: + msg = ( + f"Invalid {cls.__name__} object. Attributes have inconsistent shapes: " + f"there are {pimoresult.per_image_tprs.shape[0]} PIMO curves but {aupimos.shape[0]} AUPIMO scores." + ) + raise ValueError(msg) + + if not torch.isnan(aupimos[pimoresult.image_classes == 0]).all(): + msg = "Expected all normal images to have NaN AUPIMOs, but some have non-NaN values." + raise ValueError(msg) + + if torch.isnan(aupimos[pimoresult.image_classes == 1]).any(): + msg = "Expected all anomalous images to have valid AUPIMOs (not nan), but some have NaN values." 
+ raise ValueError(msg) + + fpr_lower_bound, fpr_upper_bound = fpr_bounds + # recall: fpr upper/lower bounds are the same as the thresh lower/upper bounds + _, thresh_lower_bound, __ = pimoresult.thresh_at(fpr_upper_bound) + _, thresh_upper_bound, __ = pimoresult.thresh_at(fpr_lower_bound) + # `_` is the threshold's index, `__` is the actual fpr value + return cls( + shared_fpr_metric=pimoresult.shared_fpr_metric, + fpr_lower_bound=fpr_lower_bound, + fpr_upper_bound=fpr_upper_bound, + num_threshs=pimoresult.num_threshs, + thresh_lower_bound=float(thresh_lower_bound), + thresh_upper_bound=float(thresh_upper_bound), + aupimos=aupimos, + ) + def to_dict(self) -> dict[str, Tensor | str | float | int]: """Return a dictionary with the result object's attributes.""" return { @@ -312,7 +356,8 @@ def save(self, file_path: str | Path) -> None: file_path = duplicate_filename(file_path) file_path = Path(file_path) payload = self.to_dict() - payload = {k: v.numpy().tolist() if isinstance(v, Tensor) else v for k, v in payload.items()} + aupimos: Tensor = payload["aupimos"] + payload["aupimos"] = aupimos.numpy().tolist() with file_path.open("w") as f: json.dump(payload, f, indent=4) @@ -433,23 +478,12 @@ def aupimo_scores( # noqa: D103 shared_fpr=shared_fpr, per_image_tprs=per_image_tprs, ) - fpr_lower_bound, fpr_upper_bound = fpr_bounds - # recall: fpr upper/lower bounds are the same as the thresh lower/upper bounds - # `_` is the threshold's index, `__` is the actual fpr value - _, thresh_lower_bound, __ = pimoresult.thresh_at(fpr_upper_bound) - _, thresh_upper_bound, __ = pimoresult.thresh_at(fpr_lower_bound) - return ( + aupimoresult = AUPIMOResult.from_pimoresult( pimoresult, - AUPIMOResult( - shared_fpr_metric=shared_fpr_metric, - fpr_lower_bound=fpr_lower_bound, - fpr_upper_bound=fpr_upper_bound, - num_threshs=num_threshs, - thresh_lower_bound=float(thresh_lower_bound), - thresh_upper_bound=float(thresh_upper_bound), - aupimos=aupimos, - ), + fpr_bounds=fpr_bounds, + aupimos=aupimos, ) + return pimoresult, aupimoresult # =========================================== TORCHMETRICS =========================================== From 2cefa2c0ac0a04755043d830ecd91b8e707826c8 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Sun, 24 Dec 2023 16:36:07 +0100 Subject: [PATCH 22/57] add missing docstrings Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 5 +- .../metrics/per_image/_binclf_curve_numba.py | 2 + src/anomalib/metrics/per_image/_validate.py | 2 + .../metrics/per_image/binclf_curve.py | 2 + .../metrics/per_image/binclf_curve_numpy.py | 2 + src/anomalib/metrics/per_image/pimo.py | 208 +++++++++++++++--- src/anomalib/metrics/per_image/pimo_numpy.py | 72 +++++- 7 files changed, 259 insertions(+), 34 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 9a6ef065d3..d69f0702fe 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -1,4 +1,7 @@ -"""Per-Image Metrics.""" +"""Per-Image Metrics. 
+ +TODO(jpcbertoldo): add formalities (license header, author) +""" from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr from .binclf_curve_numpy import Algorithm as BinClfAlgorithm diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index 60b9336f74..914480d77f 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -1,6 +1,8 @@ """Binary classification matrix curve (NUMBA implementation of low level functions). See docstring of `binclf_curve` or `binclf_curve_numpy` for more details. + +TODO(jpcbertoldo): add formalities (license header, author) """ import numba diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 2a6a574cd8..d24316dc78 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -1,6 +1,8 @@ """Utils for validating arguments and results. `torch` is imported in the functions that use it, so this module can be used in numpy-standalone mode. + +TODO(jpcbertoldo): add formalities (license header, author) """ from __future__ import annotations diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index f23f28530e..2c6c555e76 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -7,6 +7,8 @@ Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. + +TODO(jpcbertoldo): add formalities (license header, author) """ from __future__ import annotations diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 83d394a089..8155e94607 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -3,6 +3,8 @@ A binary classification (binclf) matrix (TP, FP, FN, TN) is evaluated at multiple thresholds. The thresholds are shared by all instances/images, but their binclf are computed independently for each instance/image. + +TODO(jpcbertoldo): add formalities (license header, author) """ import itertools diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index d6be2f27a0..c3ffc1600c 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -3,10 +3,14 @@ This module implements torch interfaces to access the numpy code in `pimo_numpy.py`. Check its docstring for more details. -Tensors are build with `torch.from_numpy` and so the returned tensors will share the same memory as the numpy arrays. - Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. 
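+
+Example (an illustrative sketch; the tensors and values are hypothetical):
+
+    >>> import torch
+    >>> from anomalib.metrics.per_image import pimo
+    >>> anomaly_maps = torch.rand(4, 32, 32)  # (N, H, W) float anomaly scores
+    >>> masks = (torch.rand(4, 32, 32) > 0.5).int()  # (N, H, W) binary ground truth masks
+    >>> masks[0] = 0  # at least one normal (and one anomalous) image is required
+    >>> pimoresult = pimo.pimo_curves(anomaly_maps, masks, num_threshs=1_000)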
+ +TODO(jpcbertoldo): add ref to paper to all functions +TODO(jpcbertoldo): add link to the tutorial notebooks +TODO(jpcbertoldo): add image file path to `PIMOResult` and `AUPIMOResult` and change save/load methods +TODO(jpcbertoldo): change `aucs` in the paper supp mat to `aupimos` +TODO(jpcbertoldo): add formalities (license header, author) """ from __future__ import annotations @@ -87,10 +91,25 @@ def _validate_aupimos(aupimos: Tensor) -> None: # =========================================== RESULT OBJECT =========================================== -# TODO(jpcbertoldo): add image file path to `PIMOResult` # noqa: TD003 -# TODO(jpcbertoldo): missing docstring for `PIMOResult` # noqa: TD003 @dataclass -class PIMOResult: # noqa: D101 +class PIMOResult: + """Per-Image Overlap (PIMO, pronounced pee-mo) curve. + + This interface gathers the PIMO curve data and metadata and provides several utility methods. + + Notation: + - N: number of images + - K: number of thresholds + - FPR: False Positive Rate + - TPR: True Positive Rate + + Attributes: + shared_fpr_metric (str): [metadata] shared FPR metric used to compute the PIMO curve + threshs (Tensor): sequence of K (monotonically increasing) thresholds used to compute the PIMO curve + shared_fpr (Tensor): K values of the shared FPR metric at the corresponding thresholds + per_image_tprs (Tensor): for each of the N images, the K values of in-image TPR at the corresponding thresholds + """ + # metadata shared_fpr_metric: str @@ -213,11 +232,26 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: raise ValueError(msg) from ex -# TODO(jpcbertoldo): add image file path to `AUPIMOResult` # noqa: TD003 -# TODO(jpcbertoldo): missing docstring for `AUPIMOResult` # noqa: TD003 -# TODO(jpcbertoldo): change `aucs` in the paper supp mat to `aupimos` # noqa: TD003 @dataclass -class AUPIMOResult: # noqa: D101 +class AUPIMOResult: + """Area Under the Per-Image Overlap (AUPIMO, pronounced a-u-pee-mo) curve. + + This interface gathers the AUPIMO data and metadata and provides several utility methods. + + Notation: + - N: number of images + - K: number of thresholds + + Attributes: + shared_fpr_metric (str): [metadata] shared FPR metric used to compute the PIMO curve + fpr_lower_bound (float): [metadata] LOWER bound of the FPR integration range + fpr_upper_bound (float): [metadata] UPPER bound of the FPR integration range + num_threshs (int): [metadata] number of thresholds used to compute the PIMO curve (K) + thresh_lower_bound (float): LOWER threshold bound --> corresponds to the UPPER FPR bound + thresh_upper_bound (float): UPPER threshold bound --> corresponds to the LOWER FPR bound + aupimos (Tensor): N values of AUPIMO scores + """ + # metadata shared_fpr_metric: str fpr_lower_bound: float @@ -388,14 +422,29 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: # =========================================== FUNCTIONAL =========================================== -# TODO(jpcbertoldo): missing docstring for `pimo` # noqa: TD003 -def pimo_curves( # noqa: D103 +def pimo_curves( anomaly_maps: Tensor, masks: Tensor, num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> PIMOResult: + """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. + + This torch interface is a wrapper around the numpy code. + The tensors are converted to numpy arrays and then passed and validated in the numpy code. 
+    The results are converted back to tensors and wrapped in a dataclass object.
+
+    Refer to `pimo_numpy.pimo_curves()` and `PIMOResult` (their docstrings below).
+
+    pimo_numpy.pimo_curves.__doc__
+    ==============================
+    {docstring_pimo_curves}
+
+    PIMOResult.__doc__
+    ==================
+    {docstring_pimoresult}
+    """
     _validate.is_tensor(anomaly_maps, argname="anomaly_maps")
     anomaly_maps_array = anomaly_maps.detach().cpu().numpy()
@@ -431,8 +480,14 @@
 )
 
 
-# TODO(jpcbertoldo): missing docstring for `aupimo`  # noqa: TD003
-def aupimo_scores(  # noqa: D103
+# append the docstring
+pimo_curves.__doc__ = pimo_curves.__doc__.format(  # type: ignore[union-attr]
+    docstring_pimo_curves=pimo_numpy.pimo_curves.__doc__,
+    docstring_pimoresult=PIMOResult.__doc__,
+)
+
+
+def aupimo_scores(
     anomaly_maps: Tensor,
     masks: Tensor,
     num_threshs: int = 300_000,
     binclf_algorithm: str = BinclfAlgorithm.NUMBA,
     shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
     fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
     force: bool = False,
 ) -> tuple[PIMOResult, AUPIMOResult]:
+    """Compute the PIMO curves and their Area Under the Curve (i.e. AUPIMO) scores.
+
+    This torch interface is a wrapper around the numpy code.
+    The tensors are converted to numpy arrays and then passed and validated in the numpy code.
+    The results are converted back to tensors and wrapped in a dataclass object.
+
+    Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult` (their docstrings below).
+
+    pimo_numpy.aupimo_scores.__doc__
+    =================================
+    {docstring_aupimo_scores}
+
+    PIMOResult.__doc__
+    ==================
+    {docstring_pimoresult}
+
+    AUPIMOResult.__doc__
+    ====================
+    {docstring_aupimoresult}
+    """
     _validate.is_tensor(anomaly_maps, argname="anomaly_maps")
     anomaly_maps_array = anomaly_maps.detach().cpu().numpy()
@@ -486,11 +561,35 @@
     return pimoresult, aupimoresult
 
 
+# append the docstrings
+aupimo_scores.__doc__ = aupimo_scores.__doc__.format(  # type: ignore[union-attr]
+    docstring_aupimo_scores=pimo_numpy.aupimo_scores.__doc__,
+    docstring_pimoresult=PIMOResult.__doc__,
+    docstring_aupimoresult=AUPIMOResult.__doc__,
+)
+
+
 # =========================================== TORCHMETRICS ===========================================
 
 
-# TODO(jpcbertoldo): missing docstring for `PIMO`  # noqa: TD003
-class PIMO(Metric):  # noqa: D101
+class PIMO(Metric):
+    """Per-Image Overlap (PIMO) curve.
+
+    This torchmetrics interface is a wrapper around the functional interface, which is a wrapper around the numpy code.
+    The tensors are converted to numpy arrays and then passed and validated in the numpy code.
+    The results are converted back to tensors and wrapped in a dataclass object.
+
+    Refer to `pimo_numpy.pimo_curves()` and `PIMOResult` (their docstrings below).
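+
+    Example (an illustrative sketch; shapes and values are hypothetical):
+
+        >>> import torch
+        >>> metric = PIMO(num_threshs=1_000)
+        >>> anomaly_maps = torch.rand(4, 32, 32)  # (N, H, W) float anomaly scores
+        >>> masks = (torch.rand(4, 32, 32) > 0.5).int()  # (N, H, W) binary ground truth
+        >>> masks[0] = 0  # at least one normal (and one anomalous) image is required
+        >>> metric.update(anomaly_maps, masks)
+        >>> pimoresult = metric.compute()  # a `PIMOResult` dataclass object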
+
+    pimo_numpy.pimo_curves.__doc__
+    ==============================
+    {docstring_pimo_curves}
+
+    PIMOResult.__doc__
+    ==================
+    {docstring_pimoresult}
+    """
+
     is_differentiable: bool = False
     higher_is_better: bool | None = None
     full_state_update: bool = False
@@ -503,7 +602,7 @@
     masks: list[Tensor]
 
     @property
-    def is_empty(self) -> bool:
+    def _is_empty(self) -> bool:
         """Return True if the metric has not been updated yet."""
        return len(self.anomaly_maps) == 0
 
@@ -523,8 +622,13 @@
         binclf_algorithm: str = BinclfAlgorithm.NUMBA,
         shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
     ) -> None:
-        """Per-Image Overlap (PIMO) curve."""
-        # TODO(jpcbertoldo): docstring of `PIMO.__init__()`  # noqa: TD003
+        """Per-Image Overlap (PIMO) curve.
+
+        Args:
+            num_threshs: number of thresholds used to compute the PIMO curve
+            binclf_algorithm: algorithm to compute the binary classification curve
+            shared_fpr_metric: metric to compute the shared FPR curve
+        """
         super().__init__()
 
         warnings.warn(
@@ -551,7 +655,7 @@
         self.add_state("masks", default=[], dist_reduce_fx="cat")
 
     def update(self, anomaly_maps: Tensor, masks: Tensor) -> None:
-        """Update list of anomaly maps and masks.
+        """Update lists of anomaly maps and masks.
 
         Args:
             anomaly_maps (Tensor): predictions of the model (ndim == 2, float)
@@ -563,9 +667,15 @@
         self.anomaly_maps.append(anomaly_maps)
         self.masks.append(masks)
 
-    # TODO(jpcbertoldo): missing docstring for `PIMO.compute`  # noqa: TD003
-    def compute(self) -> PIMOResult:  # noqa: D102
-        if self.is_empty:
+    def compute(self) -> PIMOResult:
+        """Compute the PIMO curves.
+
+        Call the functional interface `pimo_curves()`, which is a wrapper around the numpy code.
+
+        Returns:
+            PIMOResult: PIMO curves dataclass object. See `PIMOResult` for details.
+        """
+        if self._is_empty:
             msg = "No anomaly maps and masks have been added yet. Please call `update()` first."
             raise RuntimeError(msg)
         anomaly_maps = torch.concat(self.anomaly_maps, dim=0)
@@ -579,10 +689,33 @@
         )
 
 
+# append the docstrings
+PIMO.__doc__ = PIMO.__doc__.format(  # type: ignore[union-attr]
+    docstring_pimo_curves=pimo_numpy.pimo_curves.__doc__,
+    docstring_pimoresult=PIMOResult.__doc__,
+)
+
+
 class AUPIMO(PIMO):
     """Area Under the Per-Image Overlap (PIMO) curve.
 
-    TODO(jpcbertoldo): docstring of `AUPIMO`  # noqa: DAR101
+    This torchmetrics interface is a wrapper around the functional interface, which is a wrapper around the numpy code.
+    The tensors are converted to numpy arrays and then passed and validated in the numpy code.
+    The results are converted back to tensors and wrapped in a dataclass object.
+
+    Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult` (their docstrings below).
+
+    pimo_numpy.aupimo_scores.__doc__
+    =================================
+    {docstring_aupimo_scores}
+
+    PIMOResult.__doc__
+    ==================
+    {docstring_pimoresult}
+
+    AUPIMOResult.__doc__
+    ====================
+    {docstring_aupimoresult}
     """
 
     fpr_bounds: tuple[float, float]
@@ -636,7 +769,12 @@
     ) -> None:
         """Area Under the Per-Image Overlap (PIMO) curve.
-        TODO(jpcbertoldo): docstring of `AUPIMO.__init__()`  # noqa: DAR101
+        Args:
+            num_threshs: [passed to parent `PIMO`] number of thresholds used to compute the PIMO curve
+            binclf_algorithm: [passed to parent `PIMO`] algorithm to compute the binary classification curve
+            shared_fpr_metric: [passed to parent `PIMO`] metric to compute the shared FPR curve
+            fpr_bounds: lower and upper bounds of the FPR integration range
+            force: if True, force the computation of the AUPIMO scores even in bad conditions (e.g. few points)
         """
         super().__init__(
             num_threshs=num_threshs,
@@ -652,8 +790,18 @@
         self.force = force
 
     def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:  # type: ignore[override]
-        """TODO(jpcbertoldo): docstring of `AUPIMO.compute()`."""  # noqa: D402
-        if self.is_empty:
+        """Compute the PIMO curves and their Area Under the Curve (AUPIMO) scores.
+
+        Call the functional interface `aupimo_scores()`, which is a wrapper around the numpy code.
+
+        Args:
+            force: if given (not None), override the `force` attribute.
+
+        Returns:
+            tuple[PIMOResult, AUPIMOResult]: PIMO curves and AUPIMO scores dataclass objects.
+                See `PIMOResult` and `AUPIMOResult` for details.
+        """
+        if self._is_empty:
             msg = "No anomaly maps and masks have been added yet. Please call `update()` first."
             raise RuntimeError(msg)
         anomaly_maps = torch.concat(self.anomaly_maps, dim=0)
@@ -668,3 +816,11 @@
             fpr_bounds=self.fpr_bounds,
             force=force,
         )
+
+
+# append the docstrings
+AUPIMO.__doc__ = AUPIMO.__doc__.format(  # type: ignore[union-attr]
+    docstring_aupimo_scores=pimo_numpy.aupimo_scores.__doc__,
+    docstring_pimoresult=PIMOResult.__doc__,
+    docstring_aupimoresult=AUPIMOResult.__doc__,
+)
diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py
index 94229c18e9..d490c64691 100644
--- a/src/anomalib/metrics/per_image/pimo_numpy.py
+++ b/src/anomalib/metrics/per_image/pimo_numpy.py
@@ -2,8 +2,8 @@
 
 # PIMO
 
-PIMO is a measure of average True Positive Rate (TPR), on each image, across multiple anomaly score thresholds.
-The anomaly score thresholds are indexed by an False Positive Rate (FPR) measure on the normal images.
+PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds.
+The anomaly score thresholds are indexed by a (shared) value of a False Positive Rate (FPR) measure on the normal images.
 
 Each *anomalous* image has its own curve such that the X-axis is shared by all of them.
 
@@ -22,7 +22,9 @@
 
 `AUPIMO` is the area under each `PIMO` curve with bounded integration range in terms of shared FPR.
 
-TODO(jpcbertoldo): add ref to paper
+TODO(jpcbertoldo): add ref to paper to all functions
+TODO(jpcbertoldo): add link to the tutorial notebooks
+TODO(jpcbertoldo): add formalities (license header, author)
 """
 
 import logging
@@ -96,14 +98,41 @@ def _joint_validate_threshs_shared_fpr(threshs: ndarray, shared_fpr: ndarray) ->
 
 # =========================================== PIMO ===========================================
 
 
-# TODO(jpcbertoldo): missing docstring for `pimo`  # noqa: TD003
-def pimo_curves(  # noqa: D103
+def pimo_curves(
     anomaly_maps: ndarray,
     masks: ndarray,
     num_threshs: int,
     binclf_algorithm: str = BinclfAlgorithm.NUMBA,
     shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR,
 ) -> tuple[ndarray, ndarray, ndarray, ndarray]:
+    """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves.
+ + PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds. + The anomaly score thresholds are indexed by a (cross-image shared) value of False Positive Rate (FPR) measure on + the normal images. + + See the module's docstring for more details. + + Args' notation: + N: number of images + H: image height + W: image width + K: number of thresholds + + Args: + anomaly_maps: floating point anomaly score maps of shape (N, H, W) + masks: binary (bool or int) ground truth masks of shape (N, H, W) + num_threshs: number of thresholds to compute (K) + binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis + + Returns: + tuple[ndarray, ndarray, ndarray, ndarray]: + [0] thresholds of shape (K,) in ascending order + [1] shared FPR values of shape (K,) in descending order (indices correspond to the thresholds) + [2] per-image TPR curves of shape (N, K), axis 1 in descending order (indices correspond to the thresholds) + [3] image classes of shape (N,) with values 0 (normal) or 1 (anomalous) + """ BinclfAlgorithm.validate(binclf_algorithm) SharedFPRMetric.validate(shared_fpr_metric) _validate.num_threshs(num_threshs) @@ -162,8 +191,7 @@ def pimo_curves( # noqa: D103 # =========================================== AUPIMO =========================================== -# TODO(jpcbertoldo): missing docstring for `aupimo` # noqa: TD003 -def aupimo_scores( # noqa: D103 +def aupimo_scores( anomaly_maps: ndarray, masks: ndarray, num_threshs: int = 300_000, @@ -172,6 +200,36 @@ def aupimo_scores( # noqa: D103 fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]: + """Compute the PIMO curves and their Area Under the Curve (i.e. AUPIMO) scores. + + Scores are computed from the integration of the PIMO curves within the given FPR bounds, then normalized to [0, 1]. + It can be thought of as the average TPR of the PIMO curves within the given FPR bounds. + + See `pimo_curves()` and the module's docstring for more details. 
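+
+    Example (an illustrative sketch, not a verbatim snippet from the test suite; shapes, seed and
+    `force=True`, which only tolerates this toy setup, are hypothetical choices):
+
+        >>> import numpy as np
+        >>> rng = np.random.default_rng(42)
+        >>> anomaly_maps = rng.random((4, 128, 128)).astype(np.float32)  # (N, H, W) anomaly scores
+        >>> masks = (rng.random((4, 128, 128)) > 0.5).astype(np.int32)  # (N, H, W) binary ground truth
+        >>> masks[0] = 0  # at least one normal (and one anomalous) image is required
+        >>> threshs, shared_fpr, per_image_tprs, image_classes, aupimos = aupimo_scores(
+        ...     anomaly_maps, masks, num_threshs=30_000, force=True,
+        ... )
+        >>> aupimos.shape  # one score per image; NaN for normal images
+        (4,)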
+
+    Args' notation:
+        N: number of images
+        H: image height
+        W: image width
+        K: number of thresholds
+
+    Args:
+        anomaly_maps: floating point anomaly score maps of shape (N, H, W)
+        masks: binary (bool or int) ground truth masks of shape (N, H, W)
+        num_threshs: number of thresholds to compute (K)
+        binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
+        shared_fpr_metric: metric to compute the shared FPR axis
+        fpr_bounds: lower and upper bounds of the FPR integration range
+        force: whether to force the computation despite bad conditions
+
+    Returns:
+        tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
+            [0] thresholds of shape (K,) in ascending order
+            [1] shared FPR values of shape (K,) in descending order (indices correspond to the thresholds)
+            [2] per-image TPR curves of shape (N, K), axis 1 in descending order (indices correspond to the thresholds)
+            [3] image classes of shape (N,) with values 0 (normal) or 1 (anomalous)
+            [4] AUPIMO scores of shape (N,) in [0, 1]
+    """
     _validate.rate_range(fpr_bounds)
     # other validations are done in the `pimo` function
 

From adc14fd79257eada31770ff3c953ad199b325dfd Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Sun, 24 Dec 2023 18:09:30 +0100
Subject: [PATCH 23/57] minor vocabulary fix for consistency

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_image/_validate.py  | 18 +++++++++---------
 src/anomalib/metrics/per_image/pimo.py       |  2 +-
 src/anomalib/metrics/per_image/pimo_numpy.py |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py
index d24316dc78..3d6ff94b3a 100644
--- a/src/anomalib/metrics/per_image/_validate.py
+++ b/src/anomalib/metrics/per_image/_validate.py
@@ -258,19 +258,19 @@ def binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None
         raise RuntimeError(msg)
 
 
-def image_classes(image_classes: ndarray) -> None:
-    if not isinstance(image_classes, ndarray):
-        msg = f"Expected image classes to be an ndarray, but got {type(image_classes)}."
+def images_classes(images_classes: ndarray) -> None:
+    if not isinstance(images_classes, ndarray):
+        msg = f"Expected image classes to be an ndarray, but got {type(images_classes)}."
         raise TypeError(msg)
 
-    if image_classes.ndim != 1:
-        msg = f"Expected image classes to be 1D, but got {image_classes.ndim}D."
+    if images_classes.ndim != 1:
+        msg = f"Expected image classes to be 1D, but got {images_classes.ndim}D."
raise ValueError(msg) - if image_classes.dtype.kind == "b": + if images_classes.dtype.kind == "b": pass - elif image_classes.dtype.kind in ("i", "u"): - unique_vals = np.unique(image_classes) + elif images_classes.dtype.kind in ("i", "u"): + unique_vals = np.unique(images_classes) if np.any((unique_vals != 0) & (unique_vals != 1)): msg = ( "Expected image classes to be a *binary* ndarray with ground truth labels, " @@ -280,7 +280,7 @@ def image_classes(image_classes: ndarray) -> None: else: msg = ( "Expected image classes to be an integer or boolean ndarray with ground truth labels, " - f"but got ndarray with dtype {image_classes.dtype}" + f"but got ndarray with dtype {images_classes.dtype}" ) raise TypeError(msg) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index c3ffc1600c..dba87725e6 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -64,7 +64,7 @@ def _validate_shared_fpr(shared_fpr: Tensor, nan_allowed: bool = False, decreasi def _validate_image_classes(image_classes: Tensor) -> None: _validate.is_tensor(image_classes, argname="image_classes") - _validate.image_classes(image_classes.numpy()) + _validate.images_classes(image_classes.numpy()) def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> None: diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index d490c64691..c8695629cf 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -243,7 +243,7 @@ def aupimo_scores( try: _validate.threshs(threshs) _validate.rate_curve(shared_fpr, nan_allowed=False, decreasing=True) - _validate.image_classes(image_classes) + _validate.images_classes(image_classes) _validate.per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) except ValueError as ex: From c30c4eafb528f2db24bcfb1fcc7dee8c2b7e4278 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:18:59 +0100 Subject: [PATCH 24/57] add per image scores statistics and test it Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 5 + src/anomalib/metrics/per_image/pimo.py | 34 ++- src/anomalib/metrics/per_image/utils.py | 47 ++++ src/anomalib/metrics/per_image/utils_numpy.py | 247 ++++++++++++++++++ tests/unit/metrics/per_image/test_pimo.py | 8 + tests/unit/metrics/per_image/test_utils.py | 118 +++++++++ 6 files changed, 458 insertions(+), 1 deletion(-) create mode 100644 src/anomalib/metrics/per_image/utils.py create mode 100644 src/anomalib/metrics/per_image/utils_numpy.py create mode 100644 tests/unit/metrics/per_image/test_utils.py diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index d69f0702fe..b40ddd9f01 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -8,11 +8,15 @@ from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves from .pimo_numpy import SharedFPRMetric +from .utils import per_image_scores_stats +from .utils_numpy import StatsOutliersPolicy, StatsRepeatedPolicy __all__ = [ # constants "BinClfAlgorithm", "BinclfThreshsChoice", + "StatsOutliersPolicy", + "StatsRepeatedPolicy", "SharedFPRMetric", # result classes "PIMOResult", @@ -23,6 
+27,7 @@
     "per_image_tpr",
     "pimo_curves",
     "aupimo_scores",
+    "per_image_scores_stats",
     # torchmetrics interfaces
     "PIMO",
     "AUPIMO",
diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py
index dba87725e6..51f28c62f0 100644
--- a/src/anomalib/metrics/per_image/pimo.py
+++ b/src/anomalib/metrics/per_image/pimo.py
@@ -25,9 +25,10 @@
 
 from anomalib.data.utils.image import duplicate_filename
 
-from . import _validate, pimo_numpy
+from . import _validate, pimo_numpy, utils
 from .binclf_curve_numpy import Algorithm as BinclfAlgorithm
 from .pimo_numpy import SharedFPRMetric
+from .utils import StatsOutliersPolicy, StatsRepeatedPolicy
 
 # =========================================== ARGS VALIDATION ===========================================
 
@@ -418,6 +419,37 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult:
             msg = f"Invalid payload in file {file_path}. Cause: {ex}."
             raise ValueError(msg) from ex
 
+    def stats(
+        self,
+        outliers_policy: str | None = StatsOutliersPolicy.NONE,
+        repeated_policy: str | None = StatsRepeatedPolicy.AVOID,
+        repeated_replacement_atol: float = 1e-2,
+    ) -> list[dict[str, str | int | float]]:
+        """Return the AUPIMO statistics.
+
+        See `anomalib.metrics.per_image.per_image_scores_stats` for details (its docstring below).
+
+        Returns:
+            list[dict[str, str | int | float]]: AUPIMO statistics
+
+        `anomalib.metrics.per_image.per_image_scores_stats`.__doc__
+        ============================================================
+        {docstring_per_image_scores_stats}
+        """
+        return utils.per_image_scores_stats(
+            self.aupimos,
+            self.image_classes,
+            only_class=1,
+            outliers_policy=outliers_policy,
+            repeated_policy=repeated_policy,
+            repeated_replacement_atol=repeated_replacement_atol,
+        )
+
+
+# the placeholder lives in the `stats` method's docstring, so format that one (the class docstring has none)
+AUPIMOResult.stats.__doc__ = AUPIMOResult.stats.__doc__.format(  # type: ignore[union-attr]
+    docstring_per_image_scores_stats=utils.per_image_scores_stats.__doc__,
+)
+
 
 # =========================================== FUNCTIONAL ===========================================
 
diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py
new file mode 100644
index 0000000000..2223b0d7ab
--- /dev/null
+++ b/src/anomalib/metrics/per_image/utils.py
@@ -0,0 +1,47 @@
+"""Torch-oriented interfaces for `utils_numpy.py`."""
+from torch import Tensor
+
+from . import _validate, utils_numpy
+from .utils_numpy import StatsOutliersPolicy, StatsRepeatedPolicy
+
+
+def per_image_scores_stats(
+    per_image_scores: Tensor,
+    images_classes: Tensor | None = None,
+    only_class: int | None = None,
+    outliers_policy: str | None = StatsOutliersPolicy.NONE,
+    repeated_policy: str | None = StatsRepeatedPolicy.AVOID,
+    repeated_replacement_atol: float = 1e-2,
+) -> list[dict[str, str | int | float]]:
+    """Torch-oriented interface for `per_image_scores_stats`. See its description for more details (below).
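+
+    Example (an illustrative sketch; the scores are hypothetical):
+
+        >>> import torch
+        >>> scores = torch.randn(100)  # e.g. per-image AUPIMO scores
+        >>> images_classes = (torch.arange(100) % 2).long()  # 0: normal, 1: anomalous
+        >>> stats = per_image_scores_stats(scores, images_classes, only_class=1)
+        >>> len(stats)  # whislo, q1, med, mean, q3, whishi (outliers excluded by default)
+        6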
+
+    Numpy version docstring
+    =======================
+
+    {docstring}
+    """
+    _validate.is_tensor(per_image_scores, "per_image_scores")
+    per_image_scores_array = per_image_scores.detach().cpu().numpy()
+
+    if images_classes is not None:
+        _validate.is_tensor(images_classes, "images_classes")
+        images_classes_array = images_classes.detach().cpu().numpy()
+
+    else:
+        images_classes_array = None
+
+    # other validations happen inside `utils_numpy.per_image_scores_stats`
+
+    return utils_numpy.per_image_scores_stats(
+        per_image_scores_array,
+        images_classes_array,
+        only_class=only_class,
+        outliers_policy=outliers_policy,
+        repeated_policy=repeated_policy,
+        repeated_replacement_atol=repeated_replacement_atol,
+    )
+
+
+per_image_scores_stats.__doc__ = per_image_scores_stats.__doc__.format(  # type: ignore[union-attr]
+    docstring=utils_numpy.per_image_scores_stats.__doc__,
+)
diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py
new file mode 100644
index 0000000000..3f6e44379b
--- /dev/null
+++ b/src/anomalib/metrics/per_image/utils_numpy.py
@@ -0,0 +1,247 @@
+"""Utility functions for per-image metrics.
+
+TODO(jpcbertoldo): add formalities (license header, author)
+"""
+
+from typing import ClassVar
+
+import matplotlib as mpl
+import numpy as np
+from numpy import ndarray
+
+from . import _validate
+
+# =========================================== CONSTANTS ===========================================
+
+
+class StatsOutliersPolicy:
+    """How to handle outliers in per-image metrics boxplots. Use them? Only high? Only low? Both?
+
+    Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away
+    from the Q1 and Q3 quartiles (respectively low and high outliers). The IQR is the difference between Q3 and Q1.
+
+    None | "none": do not include outliers.
+    "hi": only include high outliers.
+    "lo": only include low outliers.
+    "both": include both high and low outliers.
+    """
+
+    NONE: ClassVar[str] = "none"
+    HI: ClassVar[str] = "hi"
+    LO: ClassVar[str] = "lo"
+    BOTH: ClassVar[str] = "both"
+
+    POLICIES: ClassVar[tuple[str | None, ...]] = (None, NONE, HI, LO, BOTH)
+
+    @staticmethod
+    def validate(policy: str | None) -> None:
+        """Validate the argument `policy`."""
+        if policy not in StatsOutliersPolicy.POLICIES:
+            msg = f"Invalid `policy`. Expected one of {StatsOutliersPolicy.POLICIES}, but got {policy}."
+            raise ValueError(msg)
+
+
+class StatsRepeatedPolicy:
+    """How to handle repeated values in per-image metrics boxplots (two stats with same value). Avoid them?
+
+    None | "none": do not avoid repeated values, so several stats can have the same value and image index.
+    "avoid": if a stat would reuse an image already selected by another stat, another image, with the
+        nearest score, is selected instead.
+    """
+
+    NONE: ClassVar[str] = "none"
+    AVOID: ClassVar[str] = "avoid"
+
+    POLICIES: ClassVar[tuple[str | None, ...]] = (None, NONE, AVOID)
+
+    @staticmethod
+    def validate(policy: str | None) -> None:
+        """Validate the argument `policy`."""
+        if policy not in StatsRepeatedPolicy.POLICIES:
+            msg = f"Invalid `policy`. Expected one of {StatsRepeatedPolicy.POLICIES}, but got {policy}."
+            raise ValueError(msg)
+
+
+# =========================================== ARGS VALIDATION ===========================================
+def _validate_image_class(image_class: int) -> None:
+    if not isinstance(image_class, int):
+        msg = f"Expected image class to be an int (0 for 'normal', 1 for 'anomalous'), but got {type(image_class)}."
+        raise TypeError(msg)
+
+    if image_class not in (0, 1):
+        msg = f"Expected image class to be either 0 for 'normal' or 1 for 'anomalous', but got {image_class}."
+        raise ValueError(msg)
+
+
+def _validate_per_image_scores(per_image_scores: ndarray) -> None:
+    if not isinstance(per_image_scores, ndarray):
+        msg = f"Expected per-image scores to be a numpy array, but got {type(per_image_scores)}."
+        raise TypeError(msg)
+
+    if per_image_scores.ndim != 1:
+        msg = f"Expected per-image scores to be 1D, but got {per_image_scores.ndim}D."
+        raise ValueError(msg)
+
+
+# =========================================== FUNCTIONS ===========================================
+
+
+def per_image_scores_stats(
+    per_image_scores: ndarray,
+    images_classes: ndarray | None = None,
+    only_class: int | None = None,
+    outliers_policy: str | None = StatsOutliersPolicy.NONE,
+    repeated_policy: str | None = StatsRepeatedPolicy.AVOID,
+    repeated_replacement_atol: float = 1e-2,
+) -> list[dict[str, str | int | float]]:
+    """Compute statistics of per-image scores (based on a boxplot's statistics).
+
+    This function uses `matplotlib.cbook.boxplot_stats`, which is the same function used by `matplotlib.pyplot.boxplot`.
+
+    ** OUTLIERS **
+    Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away
+    from the Q1 and Q3 quartiles (respectively low and high outliers). The IQR is the difference between Q3 and Q1.
+
+    Outliers are handled according to `outliers_policy`:
+        - None | "none": do not include outliers.
+        - "hi": only include high outliers.
+        - "lo": only include low outliers.
+        - "both": include both high and low outliers.
+
+    ** IMAGE INDEX **
+    Each statistic is associated with the image whose score is the closest to the statistic's value.
+
+    ** REPEATED VALUES **
+    It is possible that two stats have the same value (e.g. the median and the 25th percentile can be the same).
+    Such cases are handled according to `repeated_policy`:
+        - None | "none": do not address the issue, so several stats can have the same value and image index.
+        - "avoid": avoid repeated values by iteratively looking for other images with similar score, whose score
+            must be within `repeated_replacement_atol` (absolute tolerance) of the repeated value.
+
+    Args:
+        per_image_scores (ndarray): 1D ndarray of per-image scores.
+        images_classes (ndarray | None):
+            Used to filter statistics to only one class. If None, all images are considered.
+            If given, 1D ndarray of binary image classes (0 for 'normal', 1 for 'anomalous'). Defaults to None.
+        only_class (int | None):
+            Only used if `images_classes` is not None.
+            If not None, only compute statistics for images of the given class.
+            `None` means both image classes are used.
+            Defaults to None.
+        outliers_policy (str | None): How to handle outliers stats (use them?). See `StatsOutliersPolicy`.
+            Defaults to 'none'.
+        repeated_policy (str | None): How to handle repeated values in boxplot stats (two stats with same value).
+            See `StatsRepeatedPolicy`. Defaults to 'avoid'.
+        repeated_replacement_atol (float): Absolute tolerance used to replace repeated values. Only used if
+            `repeated_policy` is not None (or 'none').
Defaults to 1e-2 (1%). + + Returns: + list[dict[str, str | int | float]]: List of boxplot statistics. + + Each dictionary has the following keys: + - 'stat_name': Name of the statistic. Possible values: + - 'mean': Mean of the scores. + - 'med': Median of the scores. + - 'q1': 25th percentile of the scores. + - 'q3': 75th percentile of the scores. + - 'whishi': Upper whisker value. + - 'whislo': Lower whisker value. + - 'outlo_i': low outlier value; `i` is a unique index for each low outlier. + - 'outhi_j': high outlier value; `j` is a unique index for each high outlier. + - 'stat_value': Value of the statistic (same units as `values`). + - 'image_idx': Index of the image in `per_image_scores` whose score is the closest to the statistic's value. + - 'score': The score of the image at index `image_idx` (not necessarily the same as `stat_value`). + + The list is sorted by increasing `stat_value`. + """ + StatsOutliersPolicy.validate(outliers_policy) + StatsRepeatedPolicy.validate(repeated_policy) + _validate_per_image_scores(per_image_scores) + + # restrain the images to the class `only_class` if given, else use all images + if images_classes is None: + images_selection_mask = np.ones_like(per_image_scores, dtype=bool) + + elif only_class is not None: + _validate.images_classes(images_classes) + _validate.same_shape(per_image_scores, images_classes) + _validate_image_class(only_class) + images_selection_mask = images_classes == only_class + + else: + images_selection_mask = np.ones_like(per_image_scores, dtype=bool) + + # indexes in `per_image_scores` are referred to as `candidate_idx` + # while the indexes in the original array are referred to as `image_idx` + # - `candidate_idx` works for `per_image_scores` and `candidate2image_idx` (see below) + # - `image_idx` works for `images_classes` and `images_idxs_selected` + per_image_scores = per_image_scores[images_selection_mask] + # converts `candidate_idx` to `image_idx` + candidate2image_idx = np.nonzero(images_selection_mask)[0] + + # function used in `matplotlib.boxplot` + boxplot_stats = mpl.cbook.boxplot_stats(per_image_scores)[0] # [0] is for the only boxplot + + # remove unnecessary keys + boxplot_stats = {name: value for name, value in boxplot_stats.items() if name not in ("iqr", "cilo", "cihi")} + + # unroll `fliers` (outliers), remove unnecessary ones according to `outliers_policy`, + # then add them to `boxplot_stats` with unique keys + outliers = boxplot_stats.pop("fliers") + outliers_lo = outliers[outliers < boxplot_stats["med"]] + outliers_hi = outliers[outliers > boxplot_stats["med"]] + + if outliers_policy in (StatsOutliersPolicy.HI, StatsOutliersPolicy.BOTH): + boxplot_stats = { + **boxplot_stats, + **{f"outhi_{idx:06}": value for idx, value in enumerate(outliers_hi)}, + } + + if outliers_policy in (StatsOutliersPolicy.LO, StatsOutliersPolicy.BOTH): + boxplot_stats = { + **boxplot_stats, + **{f"outlo_{idx:06}": value for idx, value in enumerate(outliers_lo)}, + } + + # state variables for the stateful function `append_record` below + images_idxs_selected: set[int] = set() + records: list[dict[str, str | int | float]] = [] + + def append_record(stat_name: str, stat_value: float) -> None: + candidates_sorted = np.abs(per_image_scores - stat_value).argsort() + candidate_idx = candidates_sorted[0] + image_idx = candidate2image_idx[candidate_idx] + + # handle repeated values + if image_idx not in images_idxs_selected or repeated_policy is None: + pass + + elif repeated_policy == StatsRepeatedPolicy.AVOID: + for other_candidate_idx 
in candidates_sorted: + other_candidate_image_idx = candidate2image_idx[other_candidate_idx] + if other_candidate_image_idx in images_idxs_selected: + continue + # if the code reaches here, it means that `other_candidate_image_idx` is not in `images_idxs_selected` + # i.e. this image has not been selected yet, so it can be used + other_candidate_score = per_image_scores[other_candidate_idx] + # if the other candidate is not too far from the value, use it + # note that the first choice has not changed, so if no other is selected in the loop + # it will be the first choice + if np.isclose(other_candidate_score, stat_value, atol=repeated_replacement_atol): + candidate_idx = other_candidate_idx + image_idx = other_candidate_image_idx + break + + images_idxs_selected.add(image_idx) + records.append( + { + "stat_name": stat_name, + "stat_value": float(stat_value), + "image_idx": int(image_idx), + "score": float(per_image_scores[candidate_idx]), + }, + ) + + # loop over the stats from the lowest to the highest value + for stat, val in sorted(boxplot_stats.items(), key=lambda x: x[1]): + append_record(stat, val) + return sorted(records, key=lambda r: r["score"]) diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index d070a627e7..a179f0c430 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -586,3 +586,11 @@ def test_aupimoresult_object( assert aupimoresult_from_load.num_threshs == aupimoresult.num_threshs assert aupimoresult_from_load.thresh_bounds == aupimoresult.thresh_bounds assert torch.allclose(aupimoresult_from_load.aupimos, aupimoresult.aupimos, equal_nan=True) + + # statistics + stats = aupimoresult.stats() + assert len(stats) == 6 + from .test_utils import assert_statsdict_stuff + + for statdic in stats: + assert_statsdict_stuff(statdic, 2) diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py new file mode 100644 index 0000000000..63a09a7f42 --- /dev/null +++ b/tests/unit/metrics/per_image/test_utils.py @@ -0,0 +1,118 @@ +"""Test `utils.py`.""" + +import numpy as np +import torch + + +def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: + """Assert stuff about a `statdic`.""" + assert "stat_name" in statdic + stat_name = statdic["stat_name"] + assert stat_name in ("mean", "med", "q1", "q3", "whishi", "whislo") or stat_name.startswith( + ("outlo_", "outhi_"), + ) + assert "stat_value" in statdic + assert "image_idx" in statdic + image_idx = statdic["image_idx"] + assert 0 <= image_idx <= max_image_idx + + +def test_per_image_scores_stats() -> None: + """Test `per_image_scores_boxplot_stats`.""" + from anomalib.metrics.per_image import ( + StatsOutliersPolicy, + StatsRepeatedPolicy, + per_image_scores_stats, + ) + + gen = torch.Generator().manual_seed(42) + num_scores = 201 + scores = torch.randn(num_scores, generator=gen) + + stats = per_image_scores_stats(scores) + assert len(stats) == 6 + for statdic in stats: + assert_statsdict_stuff(statdic, num_scores - 1) + + classes = (torch.arange(num_scores) % 3 == 0).to(torch.long) + stats = per_image_scores_stats(scores, classes, only_class=None) + assert len(stats) == 6 + stats = per_image_scores_stats(scores, classes, only_class=0) + assert len(stats) == 6 + stats = per_image_scores_stats(scores, classes, only_class=1) + assert len(stats) == 6 + + stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.BOTH) + assert len(stats) == 6 + stats = 
per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.LO) + assert len(stats) == 6 + stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.HI) + assert len(stats) == 6 + stats = per_image_scores_stats(scores, outliers_policy=StatsOutliersPolicy.NONE) + assert len(stats) == 6 + + # force repeated values + scores = torch.round(scores * 10) / 10 + stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.AVOID) + assert len(stats) == 6 + stats = per_image_scores_stats( + scores, + classes, + repeated_policy=StatsRepeatedPolicy.AVOID, + repeated_replacement_atol=1e-1, + ) + assert len(stats) == 6 + stats = per_image_scores_stats(scores, repeated_policy=StatsRepeatedPolicy.NONE) + assert len(stats) == 6 + + +def test_per_image_scores_stats_specific_values() -> None: + """Test `per_image_scores_boxplot_stats` with specific values.""" + from anomalib.metrics.per_image import per_image_scores_stats + + scores = torch.concatenate( + [ + # whislo = min value is 0.0 + torch.tensor([0.0]), + torch.zeros(98), + # q1 value is 0.0 + torch.tensor([0.0]), + torch.linspace(0.01, 0.29, 98), + # med value is 0.3 + torch.tensor([0.3]), + torch.linspace(0.31, 0.69, 99), + # q3 value is 0.7 + torch.tensor([0.7]), + torch.linspace(0.71, 0.99, 99), + # whishi = max value is 1.0 + torch.tensor([1.0]), + ], + ) + + stats = per_image_scores_stats(scores) + assert len(stats) == 6 + + statdict_whislo = stats[0] + statdict_q1 = stats[1] + statdict_med = stats[2] + statdict_mean = stats[3] + statdict_q3 = stats[4] + statdict_whishi = stats[5] + + assert statdict_whislo["stat_name"] == "whislo" + assert np.isclose(statdict_whislo["stat_value"], 0.0) + + assert statdict_q1["stat_name"] == "q1" + assert np.isclose(statdict_q1["stat_value"], 0.0, atol=1e-2) + + assert statdict_med["stat_name"] == "med" + assert np.isclose(statdict_med["stat_value"], 0.3, atol=1e-2) + + assert statdict_mean["stat_name"] == "mean" + assert np.isclose(statdict_mean["stat_value"], 0.3762, atol=1e-2) + + assert statdict_q3["stat_name"] == "q3" + assert np.isclose(statdict_q3["stat_value"], 0.7, atol=1e-2) + + assert statdict_whishi["stat_name"] == "whishi" + assert statdict_whishi["stat_value"] == 1.0 From 408fb2bdd45b79424fb2a942b53b93af02475d15 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 26 Dec 2023 12:25:36 +0100 Subject: [PATCH 25/57] refactor constants notation Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 9 +++--- .../metrics/per_image/binclf_curve.py | 6 ++-- .../metrics/per_image/binclf_curve_numpy.py | 32 +++++++++---------- src/anomalib/metrics/per_image/pimo.py | 20 ++++++------ src/anomalib/metrics/per_image/pimo_numpy.py | 17 +++++----- 5 files changed, 41 insertions(+), 43 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index b40ddd9f01..98fc2a50f8 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -4,20 +4,19 @@ """ from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr -from .binclf_curve_numpy import Algorithm as BinClfAlgorithm -from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice +from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves -from .pimo_numpy import SharedFPRMetric +from 
.pimo_numpy import PIMOSharedFPRMetric from .utils import per_image_scores_stats from .utils_numpy import StatsOutliersPolicy, StatsRepeatedPolicy __all__ = [ # constants - "BinClfAlgorithm", + "BinclfAlgorithm", "BinclfThreshsChoice", "StatsOutliersPolicy", "StatsRepeatedPolicy", - "SharedFPRMetric", + "PIMOSharedFPRMetric", # result classes "PIMOResult", "AUPIMOResult", diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 2c6c555e76..adaece314f 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -17,7 +17,7 @@ from torch import Tensor from . import _validate, binclf_curve_numpy -from .binclf_curve_numpy import Algorithm, ThreshsChoice +from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice # =========================================== ARGS VALIDATION =========================================== @@ -43,8 +43,8 @@ def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None def per_image_binclf_curve( anomaly_maps: Tensor, masks: Tensor, - algorithm: str = Algorithm.NUMBA, - threshs_choice: str = ThreshsChoice.MINMAX_LINSPACE, + algorithm: str = BinclfAlgorithm.NUMBA, + threshs_choice: str = BinclfThreshsChoice.MINMAX_LINSPACE, threshs_given: Tensor | None = None, num_threshs: int | None = None, ) -> tuple[Tensor, Tensor]: diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 8155e94607..912741d0cc 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -31,7 +31,7 @@ @dataclass -class Algorithm: +class BinclfAlgorithm: """Algorithm to use.""" PYTHON: ClassVar[str] = "python" @@ -41,13 +41,13 @@ class Algorithm: @staticmethod def validate(algorithm: str) -> None: """Validate `algorithm` argument.""" - if algorithm not in Algorithm.ALGORITHMS: - msg = f"Expected `algorithm` to be one of {Algorithm.ALGORITHMS}, but got {algorithm}" + if algorithm not in BinclfAlgorithm.ALGORITHMS: + msg = f"Expected `algorithm` to be one of {BinclfAlgorithm.ALGORITHMS}, but got {algorithm}" raise ValueError(msg) @dataclass -class ThreshsChoice: +class BinclfThreshsChoice: """Sequence of thresholds to use.""" GIVEN: ClassVar[str] = "given" @@ -186,7 +186,7 @@ def binclf_multiple_curves( scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray, - algorithm: str = Algorithm.NUMBA, + algorithm: str = BinclfAlgorithm.NUMBA, ) -> ndarray: """Multiple binary classification matrix (per-instance scope) at each threshold (shared). @@ -226,16 +226,16 @@ def binclf_multiple_curves( Thresholds are sorted in ascending order. """ - Algorithm.validate(algorithm) + BinclfAlgorithm.validate(algorithm) _validate_scores_batch(scores_batch) _validate_gts_batch(gts_batch) _validate.same_shape(scores_batch, gts_batch) _validate.threshs(threshs) - if algorithm == Algorithm.PYTHON: + if algorithm == BinclfAlgorithm.PYTHON: return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) - if algorithm == Algorithm.NUMBA: + if algorithm == BinclfAlgorithm.NUMBA: if not HAS_NUMBA: logger.warning( "Algorithm 'numba' was selected, but numba is not installed. 
Fallback to 'python' algorithm.", @@ -243,7 +243,7 @@ def binclf_multiple_curves( return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) - msg = f"Expected `algorithm` to be one of {Algorithm.ALGORITHMS}, but got {algorithm}" + msg = f"Expected `algorithm` to be one of {BinclfAlgorithm.ALGORITHMS}, but got {algorithm}" raise NotImplementedError(msg) @@ -266,8 +266,8 @@ def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> nda def per_image_binclf_curve( anomaly_maps: ndarray, masks: ndarray, - algorithm: str = Algorithm.NUMBA, - threshs_choice: str = ThreshsChoice.MINMAX_LINSPACE, + algorithm: str = BinclfAlgorithm.NUMBA, + threshs_choice: str = BinclfThreshsChoice.MINMAX_LINSPACE, threshs_given: ndarray | None = None, num_threshs: int | None = None, ) -> tuple[ndarray, ndarray]: @@ -316,14 +316,14 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. """ - Algorithm.validate(algorithm) + BinclfAlgorithm.validate(algorithm) _validate.anomaly_maps(anomaly_maps) _validate.masks(masks) _validate.same_shape(anomaly_maps, masks) threshs: ndarray - if threshs_choice == ThreshsChoice.GIVEN: + if threshs_choice == BinclfThreshsChoice.GIVEN: assert threshs_given is not None _validate.threshs(threshs_given) if num_threshs is not None: @@ -332,7 +332,7 @@ def per_image_binclf_curve( ) threshs = threshs_given.astype(anomaly_maps.dtype) - elif threshs_choice == ThreshsChoice.MINMAX_LINSPACE: + elif threshs_choice == BinclfThreshsChoice.MINMAX_LINSPACE: assert num_threshs is not None if threshs_given is not None: logger.warning( @@ -341,11 +341,11 @@ def per_image_binclf_curve( # `num_threshs` is validated in the function below threshs = _get_threshs_minmax_linspace(anomaly_maps, num_threshs) - elif threshs_choice == ThreshsChoice.MEAN_FPR_OPTIMIZED: + elif threshs_choice == BinclfThreshsChoice.MEAN_FPR_OPTIMIZED: raise NotImplementedError(f"TODO implement {threshs_choice}") # noqa: EM102 else: - msg = f"Expected `threshs_choice` to be one of {ThreshsChoice.CHOICES}, but got {threshs_choice}" + msg = f"Expected `threshs_choice` to be one of {BinclfThreshsChoice.CHOICES}, but got {threshs_choice}" raise NotImplementedError(msg) # keep the batch dimension and flatten the rest diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 51f28c62f0..63ef58f382 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -26,8 +26,8 @@ from anomalib.data.utils.image import duplicate_filename from . 
import _validate, pimo_numpy, utils -from .binclf_curve_numpy import Algorithm as BinclfAlgorithm -from .pimo_numpy import SharedFPRMetric +from .binclf_curve_numpy import BinclfAlgorithm +from .pimo_numpy import PIMOSharedFPRMetric from .utils import StatsOutliersPolicy, StatsRepeatedPolicy # =========================================== ARGS VALIDATION =========================================== @@ -137,7 +137,7 @@ def image_classes(self) -> Tensor: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - SharedFPRMetric.validate(self.shared_fpr_metric) + PIMOSharedFPRMetric.validate(self.shared_fpr_metric) _validate_threshs(self.threshs) _validate_shared_fpr(self.shared_fpr, nan_allowed=False) _validate_per_image_tprs(self.per_image_tprs, self.image_classes) @@ -294,7 +294,7 @@ def thresh_bounds(self) -> tuple[float, float]: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - SharedFPRMetric.validate(self.shared_fpr_metric) + PIMOSharedFPRMetric.validate(self.shared_fpr_metric) _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) _validate.num_threshs(self.num_threshs) _validate_aupimos(self.aupimos) @@ -459,7 +459,7 @@ def pimo_curves( masks: Tensor, num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> PIMOResult: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -524,7 +524,7 @@ def aupimo_scores( masks: Tensor, num_threshs: int = 300_000, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[PIMOResult, AUPIMOResult]: @@ -652,7 +652,7 @@ def __init__( self, num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> None: """Per-Image Overlap (PIMO) curve. @@ -680,8 +680,8 @@ def __init__( BinclfAlgorithm.validate(binclf_algorithm) self.binclf_algorithm = binclf_algorithm - SharedFPRMetric.validate(shared_fpr_metric) - self.shared_fpr_metric = SharedFPRMetric.MEAN_PERIMAGE_FPR + PIMOSharedFPRMetric.validate(shared_fpr_metric) + self.shared_fpr_metric = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR self.add_state("anomaly_maps", default=[], dist_reduce_fx="cat") self.add_state("masks", default=[], dist_reduce_fx="cat") @@ -795,7 +795,7 @@ def __init__( self, num_threshs: int = 300_000, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> None: diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index c8695629cf..a29ee0bd44 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -36,8 +36,7 @@ from numpy import ndarray from . 
import _validate, binclf_curve_numpy -from .binclf_curve_numpy import Algorithm as BinclfAlgorithm -from .binclf_curve_numpy import ThreshsChoice as BinclfThreshsChoice +from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice logger = logging.getLogger(__name__) @@ -45,7 +44,7 @@ @dataclass -class SharedFPRMetric: +class PIMOSharedFPRMetric: """Shared FPR metric (x-axis of the PIMO curve).""" MEAN_PERIMAGE_FPR: ClassVar[str] = "mean-per-image-fpr" @@ -55,8 +54,8 @@ class SharedFPRMetric: @staticmethod def validate(metric: str) -> None: """Validate the argument `metric`.""" - if metric not in SharedFPRMetric.METRICS: - msg = f"Invalid `metric`. Expected one of {SharedFPRMetric.METRICS}, but got {metric} instead." + if metric not in PIMOSharedFPRMetric.METRICS: + msg = f"Invalid `metric`. Expected one of {PIMOSharedFPRMetric.METRICS}, but got {metric} instead." raise ValueError(msg) @@ -103,7 +102,7 @@ def pimo_curves( masks: ndarray, num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> tuple[ndarray, ndarray, ndarray, ndarray]: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -134,7 +133,7 @@ def pimo_curves( [3] image classes of shape (N,) with values 0 (normal) or 1 (anomalous) """ BinclfAlgorithm.validate(binclf_algorithm) - SharedFPRMetric.validate(shared_fpr_metric) + PIMOSharedFPRMetric.validate(shared_fpr_metric) _validate.num_threshs(num_threshs) _validate.anomaly_maps(anomaly_maps) _validate.masks(masks) @@ -165,7 +164,7 @@ def pimo_curves( ) shared_fpr: ndarray - if shared_fpr_metric == SharedFPRMetric.MEAN_PERIMAGE_FPR: + if shared_fpr_metric == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR: # shape -> (N, K) per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) try: @@ -196,7 +195,7 @@ def aupimo_scores( masks: ndarray, num_threshs: int = 300_000, binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = SharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]: From 43c6eb27675e92ecb787b12c5c7cf5c43c310221 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 26 Dec 2023 19:43:57 +0100 Subject: [PATCH 26/57] add stats tests and test it Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 7 +- src/anomalib/metrics/per_image/utils.py | 195 +++++++++++++++ src/anomalib/metrics/per_image/utils_numpy.py | 226 ++++++++++++++++++ tests/unit/metrics/per_image/test_utils.py | 155 ++++++++++++ 4 files changed, 581 insertions(+), 2 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 98fc2a50f8..328eed7fd3 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -7,7 +7,7 @@ from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves from .pimo_numpy import PIMOSharedFPRMetric -from .utils import per_image_scores_stats +from .utils import compare_models_pairwise_ttest, compare_models_pairwise_wilcoxon, per_image_scores_stats from .utils_numpy import StatsOutliersPolicy, 
StatsRepeatedPolicy __all__ = [ @@ -26,8 +26,11 @@ "per_image_tpr", "pimo_curves", "aupimo_scores", - "per_image_scores_stats", # torchmetrics interfaces "PIMO", "AUPIMO", + # utils + "compare_models_pairwise_ttest", + "compare_models_pairwise_wilcoxon", + "per_image_scores_stats", ] diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 2223b0d7ab..ddb6545dc8 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -1,9 +1,18 @@ """Torch-oriented interfaces for `utils.py`.""" +from __future__ import annotations + +from collections import OrderedDict +from typing import TYPE_CHECKING + +import torch from torch import Tensor from . import _validate, utils_numpy from .utils_numpy import StatsOutliersPolicy, StatsRepeatedPolicy +if TYPE_CHECKING: + from .pimo import AUPIMOResult + def per_image_scores_stats( per_image_scores: Tensor, @@ -45,3 +54,189 @@ def per_image_scores_stats( per_image_scores_stats.__doc__ = per_image_scores_stats.__doc__.format( # type: ignore[union-attr] docstring=utils_numpy.per_image_scores_stats.__doc__, ) + + +def _validate_scores_per_model( # noqa: C901 + scores_per_model: dict[str, Tensor] + | OrderedDict[str, Tensor] + | dict[str, AUPIMOResult] + | OrderedDict[str, AUPIMOResult], +) -> None: + # it has to be imported here to avoid circular imports + from .pimo import AUPIMOResult + + if not isinstance(scores_per_model, dict | OrderedDict): + msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." + raise TypeError(msg) + + if len(scores_per_model) < 2: + msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}." + raise ValueError(msg) + + first_key_value_tensor = None + + for model_name, scores in scores_per_model.items(): + if not isinstance(model_name, str): + msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}." + raise TypeError(msg) + + if isinstance(scores, AUPIMOResult): + scores_tensor = scores.aupimos + elif isinstance(scores, Tensor): + scores_tensor = scores + else: + msg = f"Expected scores to be a Tensor or AUPIMOResult, but got {type(scores)} for model {model_name}." + raise TypeError(msg) + + if scores_tensor.ndim != 1: + msg = f"Expected scores to be 1D Tensor, but got {scores_tensor.ndim}D for model {model_name}." + raise ValueError(msg) + + num_valid_scores = scores_tensor[~torch.isnan(scores_tensor)].numel() + + if num_valid_scores < 2: + msg = f"Expected at least 2 scores, but got {num_valid_scores} for model {model_name}." + raise ValueError(msg) + + if first_key_value_tensor is None: + first_key_value_tensor = (model_name, scores, scores_tensor) + continue + + first_model_name, first_scores, first_scores_tensor = first_key_value_tensor + + # must have the same type + # test using `isinstance` to avoid issues with subclasses + if isinstance(scores, Tensor) != isinstance(first_scores, Tensor): + msg = ( + "Expected scores to have the same type, " + f"but got ({model_name}) {type(scores)} != {type(first_scores)} ({first_model_name})." + ) + raise TypeError(msg) + + # same shape + if scores_tensor.shape != first_scores_tensor.shape: + msg = ( + "Expected scores to have the same shape, " + f"but got ({model_name}) {scores_tensor.shape} != {first_scores_tensor.shape} ({first_model_name})." 
+            )
+            raise ValueError(msg)
+
+        # `nan` at the same indices
+        if (torch.isnan(scores_tensor) != torch.isnan(first_scores_tensor)).any():
+            msg = (
+                "Expected `nan` values, if any, to be at the same indices, "
+                f"but there are differences between models {model_name} and {first_model_name}."
+            )
+            raise ValueError(msg)
+
+        if isinstance(scores, Tensor):
+            continue
+
+        # check that the metadata is the same, so they can be compared indeed
+
+        if scores.shared_fpr_metric != first_scores.shared_fpr_metric:
+            msg = (
+                "Expected scores to have the same shared FPR metric, "
+                f"but got ({model_name}) {scores.shared_fpr_metric} != "
+                f"{first_scores.shared_fpr_metric} ({first_model_name})."
+            )
+            raise ValueError(msg)
+
+        if scores.fpr_bounds != first_scores.fpr_bounds:
+            msg = (
+                "Expected scores to have the same FPR bounds, "
+                f"but got ({model_name}) {scores.fpr_bounds} != {first_scores.fpr_bounds} ({first_model_name})."
+            )
+            raise ValueError(msg)
+
+
+def compare_models_pairwise_ttest(
+    scores_per_model: dict[str, Tensor]
+    | OrderedDict[str, Tensor]
+    | dict[str, AUPIMOResult]
+    | OrderedDict[str, AUPIMOResult],
+    alternative: str,
+    higher_is_better: bool,
+) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]:
+    """Torch-oriented interface for `compare_models_pairwise_ttest`. See its description for more details (below).
+
+    Numpy version docstring
+    =======================
+
+    {docstring}
+    """
+    _validate_scores_per_model(scores_per_model)
+    scores_per_model_items = [
+        (
+            model_name,
+            (scores if isinstance(scores, Tensor) else scores.aupimos).detach().cpu().numpy(),
+        )
+        for model_name, scores in scores_per_model.items()
+    ]
+    cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict
+    scores_per_model_with_arrays = cls(scores_per_model_items)
+
+    return utils_numpy.compare_models_pairwise_ttest(scores_per_model_with_arrays, alternative, higher_is_better)
+
+
+compare_models_pairwise_ttest.__doc__ = compare_models_pairwise_ttest.__doc__.format(  # type: ignore[union-attr]
+    docstring=utils_numpy.compare_models_pairwise_ttest.__doc__,
+)
+
+
+def compare_models_pairwise_wilcoxon(
+    scores_per_model: dict[str, Tensor]
+    | OrderedDict[str, Tensor]
+    | dict[str, AUPIMOResult]
+    | OrderedDict[str, AUPIMOResult],
+    alternative: str,
+    higher_is_better: bool,
+) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]:
+    """Torch-oriented interface for `compare_models_pairwise_wilcoxon`. See its description for more details (below).
+ + Numpy version docstring + ======================= + + {docstring} + """ + _validate_scores_per_model(scores_per_model) + scores_per_model_items = [ + ( + model_name, + (scores if isinstance(scores, Tensor) else scores.aupimos).detach().cpu().numpy(), + ) + for model_name, scores in scores_per_model.items() + ] + cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict + scores_per_model_with_arrays = cls(scores_per_model_items) + + return utils_numpy.compare_models_pairwise_wilcoxon(scores_per_model_with_arrays, alternative, higher_is_better) + + +compare_models_pairwise_wilcoxon.__doc__ = compare_models_pairwise_wilcoxon.__doc__.format( # type: ignore[union-attr] + docstring=utils_numpy.compare_models_pairwise_wilcoxon.__doc__, +) + + +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# NEXT +# GET RID OF THIS ERROR FOR NOT DEFINING AUPIMORESULT +# MAKE THE FORMATING OF STATS TEST LIKE BEFORE diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 3f6e44379b..f506c0c7b3 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -2,11 +2,16 @@ TODO(jpcbertoldo): add formalities (license header, author) """ +from __future__ import annotations +import itertools +from collections import OrderedDict from typing import ClassVar import matplotlib as mpl import numpy as np +import scipy +import scipy.stats from numpy import ndarray from . import _validate @@ -62,6 +67,26 @@ def validate(policy: str | None) -> None: raise ValueError(msg) +class StatsAlternativeHypothesis: + """Alternative hypothesis for the statistical tests used to compare per-image metrics.""" + + TWO_SIDED: ClassVar[str] = "two-sided" + LESS: ClassVar[str] = "less" + GREATER: ClassVar[str] = "greater" + + ALTERNATIVES: ClassVar[tuple[str, ...]] = (TWO_SIDED, LESS, GREATER) + + @staticmethod + def validate(alternative: str) -> None: + """Validate the argument `alternative`.""" + if alternative not in StatsAlternativeHypothesis.ALTERNATIVES: + msg = ( + "Invalid `alternative`. " + f"Expected one of {StatsAlternativeHypothesis.ALTERNATIVES}, but got {alternative}." + ) + raise ValueError(msg) + + # =========================================== ARGS VALIDATION =========================================== def _validate_image_class(image_class: int) -> None: if not isinstance(image_class, int): @@ -83,6 +108,59 @@ def _validate_per_image_scores(per_image_scores: ndarray) -> None: raise ValueError(msg) +def _validate_scores_per_model(scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray]) -> None: + if not isinstance(scores_per_model, dict | OrderedDict): + msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." + raise TypeError(msg) + + if len(scores_per_model) < 2: + msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}." + raise ValueError(msg) + + first_key_value = None + + for model_name, scores in scores_per_model.items(): + if not isinstance(model_name, str): + msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}." + raise TypeError(msg) + + if not isinstance(scores, ndarray): + msg = f"Expected scores to be a numpy array, but got {type(scores)} for model {model_name}." 
+ raise TypeError(msg) + + if scores.ndim != 1: + msg = f"Expected scores to be 1D, but got {scores.ndim}D for model {model_name}." + raise ValueError(msg) + + num_valid_scores = scores[~np.isnan(scores)].shape[0] + + if num_valid_scores < 2: + msg = f"Expected at least 2 scores, but got {num_valid_scores} for model {model_name}." + raise ValueError(msg) + + if first_key_value is None: + first_key_value = (model_name, scores) + continue + + first_model_name, first_scores = first_key_value + + # same shape + if scores.shape != first_scores.shape: + msg = ( + "Expected scores to have the same shape, " + f"but got ({model_name}) {scores.shape} != {first_scores.shape} ({first_model_name})." + ) + raise ValueError(msg) + + # `nan` at the same indices + if (np.isnan(scores) != np.isnan(first_scores)).any(): + msg = ( + "Expected `nan` values, if any, to be at the same indices, " + f"but there are differences between models {model_name} and {first_model_name}." + ) + raise ValueError(msg) + + # =========================================== FUNCTIONS =========================================== @@ -245,3 +323,151 @@ def append_record(stat_name: str, stat_value: float) -> None: for stat, val in sorted(boxplot_stats.items(), key=lambda x: x[1]): append_record(stat, val) return sorted(records, key=lambda r: r["score"]) + + +def compare_models_pairwise_ttest( + scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray], + alternative: str, + higher_is_better: bool, +) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: + """Compare all pairs of models using the paired t-test (parametric). + + If an ordered dictionary is given, the models are sorted by the order of the dictionary. + Otherwise, the models are sorted by average SCORE. + + Each comparison of two models is a paired t-test (null hypothesis is that they are equal). + + Args: + scores_per_model: Dictionary of models and their per-image scores. + key: model name + value: tensor of shape (num_images,). All `nan` values must be at the same positions. + higher_is_better: Whether higher values of the metric are better. Defaults to True. + alternative: Alternative hypothesis for the statistical tests. See `StatsAlternativeHypothesis`. + + Returns: + (models_ordered, test_results): + - models_ordered: List of models ordered (by the user or by average score, see above). + - confidences: Dictionary of confidence values for each pair of models. + For all pairs of indices i and j from 0 to n-1, where `n` is the number of models and i != j: + - key: (models_ordered[i], models_ordered[j]) + - value: confidence on the alternative hypothesis. + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: + - if `less`: model[i] < model[j] + - if `greater`: model[i] > model[j] + - if `two-sided`: model[i] != model[j] + on average. 
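+
+    Example (an illustrative sketch; the scores and model names below are made up):
+
+        import numpy as np
+
+        from anomalib.metrics.per_image.utils_numpy import compare_models_pairwise_ttest
+
+        scores_per_model = {
+            "model_a": np.array([0.81, 0.75, 0.90, 0.64]),  # made-up per-image scores
+            "model_b": np.array([0.78, 0.70, 0.85, 0.61]),
+        }
+        models_ordered, confidences = compare_models_pairwise_ttest(
+            scores_per_model,
+            alternative="greater",
+            higher_is_better=True,
+        )
+        # `dict` input -> models_ordered == ("model_a", "model_b"), sorted by average score
+        # confidences[("model_a", "model_b")]: confidence that model_a > model_b on average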
+ """ + _validate_scores_per_model(scores_per_model) + StatsAlternativeHypothesis.validate(alternative) + + # remove nan values; list of items keeps the order of the OrderedDict + scores_per_model_nonan_items = [ + (model_name, scores[~np.isnan(scores)]) for model_name, scores in scores_per_model.items() + ] + + # sort models by average value if not an ordered dictionary + # position 0 is assumed the best model + if isinstance(scores_per_model, OrderedDict): + scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) + else: + scores_per_model_nonan = OrderedDict( + sorted(scores_per_model_nonan_items, key=lambda kv: kv[1].mean(), reverse=higher_is_better), + ) + + models_ordered = tuple(scores_per_model_nonan.keys()) + models_pairs = list(itertools.permutations(models_ordered, 2)) + confidences: dict[tuple[str, str], float] = {} + for model_i, model_j in models_pairs: + values_i = scores_per_model_nonan[model_i] + values_j = scores_per_model_nonan[model_j] + pvalue = scipy.stats.ttest_rel( + values_i, + values_j, + alternative=alternative, + ).pvalue + confidences[(model_i, model_j)] = 1.0 - float(pvalue) + + return models_ordered, confidences + + +def compare_models_pairwise_wilcoxon( + scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray], + alternative: str, + higher_is_better: bool, + atol: float | None = 1e-3, +) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: + """Compare all pairs of models using the Wilcoxon signed-rank test (non-parametric). + + If an ordered dictionary is given, the models are sorted by the order of the dictionary. + Otherwise, the models are sorted by average RANK. + + Each comparison of two models is a Wilcoxon signed-rank test (null hypothesis is that they are equal). + + This is like the non-parametric version of the paired t-test. + + Args: + scores_per_model: Dictionary of models and their per-image scores. + key: model name + value: tensor of shape (num_images,). All `nan` values must be at the same positions. + higher_is_better: Whether higher values of the metric are better. Defaults to True. + alternative: Alternative hypothesis for the statistical tests. See `StatsAlternativeHypothesis`. + atol: Absolute tolerance used to consider two scores as equal. Defaults to 1e-3 (0.1%). + When doing a paired test, if the difference between two scores is below `atol`, the difference is + truncated to 0. If `atol` is None, no truncation is done. + + Returns: + (models_ordered, test_results): + - models_ordered: List of models ordered (by the user or by average score, see above). + - confidences: Dictionary of confidence values for each pair of models. + For all pairs of indices i and j from 0 to n-1, where `n` is the number of models and i != j: + - key: (models_ordered[i], models_ordered[j]) + - value: confidence on the alternative hypothesis. + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: + - if `less`: model[i] < model[j] + - if `greater`: model[i] > model[j] + - if `two-sided`: model[i] != model[j] + on average in terms of ranks (not scores). 
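+
+    Example (an illustrative sketch; the scores and model names below are made up):
+
+        import numpy as np
+
+        from anomalib.metrics.per_image.utils_numpy import compare_models_pairwise_wilcoxon
+
+        scores_per_model = {
+            "model_a": np.array([0.81, 0.75, 0.90, 0.64]),  # made-up per-image scores
+            "model_b": np.array([0.78, 0.70, 0.85, 0.61]),
+        }
+        models_ordered, confidences = compare_models_pairwise_wilcoxon(
+            scores_per_model,
+            alternative="greater",
+            higher_is_better=True,
+        )
+        # every paired difference exceeds the default `atol` (1e-3), so none is truncated to 0
+        # confidences[("model_a", "model_b")]: confidence that model_a ranks above model_b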
+ """ + _validate_scores_per_model(scores_per_model) + StatsAlternativeHypothesis.validate(alternative) + + # remove nan values; list of items keeps the order of the OrderedDict + scores_per_model_nonan_items = [ + (model_name, scores[~np.isnan(scores)]) for model_name, scores in scores_per_model.items() + ] + + # sort models by average value if not an ordered dictionary + # position 0 is assumed the best model + if isinstance(scores_per_model, OrderedDict): + scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items) + else: + # these average ranks will NOT consider `atol` because we want to rank the models anyway + scores_nonan = np.stack([v for _, v in scores_per_model_nonan_items], axis=0) + avg_ranks = scipy.stats.rankdata( + -scores_nonan if higher_is_better else scores_nonan, + method="average", + axis=0, + ).mean(axis=1) + argsort_avg_ranks = avg_ranks.argsort() # ascending order, lower score is better + scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items[idx] for idx in argsort_avg_ranks) + + models_ordered = tuple(scores_per_model_nonan.keys()) + models_pairs = list(itertools.permutations(models_ordered, 2)) + confidences: dict[tuple[str, str], float] = {} + for model_i, model_j in models_pairs: + values_i = scores_per_model_nonan[model_i] + values_j = scores_per_model_nonan[model_j] + diff = values_i - values_j + + if atol is not None: + # make the difference null if below the tolerance + diff[np.abs(diff) <= atol] = 0.0 + + # extreme case + if (diff == 0).all(): # noqa: SIM108 + pvalue = 1.0 + else: + pvalue = scipy.stats.wilcoxon(diff, alternative=alternative).pvalue + confidences[(model_i, model_j)] = 1.0 - float(pvalue) + + return models_ordered, confidences diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index 63a09a7f42..ec5d1a8a80 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -1,7 +1,73 @@ """Test `utils.py`.""" +from collections import OrderedDict + import numpy as np +import pytest import torch +from torch import Tensor + +from anomalib.metrics.per_image import AUPIMOResult, PIMOSharedFPRMetric + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + """Generate test cases.""" + num_images = 100 + # avg is 0.8 + aucs1 = 0.8 * torch.ones(num_images) + # avg ~ 0.7 + aucs2 = torch.linspace(0.6, 0.8, num_images) + # avg ~ 0.6 + aucs3 = torch.sin(torch.linspace(0, torch.pi, num_images)).clip(0, 1) + + def get_similar(aucs: Tensor) -> Tensor: + # a multiplicative factor oscilating around 99.5% and 100.5% signal + # to provoke the non-parametric plot to show + # the two are within the tolerance + factor = 1 - 0.005 * torch.sin(torch.linspace(0, 2 * torch.pi, len(aucs))) + aucs_bis = (aucs * factor).clip(0, 1) + aucs_bis[torch.isnan(aucs)] = torch.nan + return aucs_bis + + if ( + metafunc.function is test_compare_models_pairwise_ttest + or metafunc.function is test_compare_models_pairwise_wilcoxon + ): + mock_aupimoresult_stuff = { + "shared_fpr_metric": PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + "fpr_lower_bound": 1e-5, + "fpr_upper_bound": 1e-4, + "num_threshs": 1_000, + "thresh_lower_bound": 1.0, + "thresh_upper_bound": 2.0, + } + metafunc.parametrize( + ("scores_per_model",), + [ + ({"a": aucs1, "b": aucs2},), + ({"a": aucs1, "b": get_similar(aucs1)},), + ({"a": aucs1, "b": aucs2, "c": aucs3},), + (OrderedDict([("c", aucs1), ("b", aucs2), ("a", aucs3)]),), + ( + { + "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": 
aucs1}), + "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), + "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), + }, + ), + ], + ) + metafunc.parametrize( + ("alternative", "higher_is_better"), + [ + ("two-sided", True), + ("two-sided", False), + ("less", False), + ("greater", True), + # not considering the case (less, true) and (greater, false) because it will break + # some assumptions in the assertions but they are possible + ], + ) def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: @@ -116,3 +182,92 @@ def test_per_image_scores_stats_specific_values() -> None: assert statdict_whishi["stat_name"] == "whishi" assert statdict_whishi["stat_value"] == 1.0 + + +def test_compare_models_pairwise_ttest(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: + """Test `compare_models_pairwise_ttest`.""" + from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_ttest + + models_ordered, confidences = compare_models_pairwise_ttest( + scores_per_model, + alternative=alternative, + higher_is_better=higher_is_better, + ) + assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) + + diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) + assert len(diff) == 0 + + if isinstance(scores_per_model, OrderedDict): + assert models_ordered == tuple(scores_per_model.keys()) + + elif len(scores_per_model) == 2: + assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) + + elif len(scores_per_model) == 3: + assert models_ordered == (("a", "b", "c") if higher_is_better else ("c", "b", "a")) + + if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): + return + + def copy_and_add_nan(scores: Tensor) -> Tensor: + scores = scores.clone() + scores[5:] = torch.nan + return scores + + # removing samples should reduce the confidences + scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) + scores_per_model["b"] = copy_and_add_nan(scores_per_model["b"]) + if "c" in scores_per_model: + scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) + + compare_models_pairwise_ttest( + scores_per_model, + alternative=alternative, + higher_is_better=higher_is_better, + ) + + +def test_compare_models_pairwise_wilcoxon(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: + """Test `compare_models_pairwise_wilcoxon`.""" + from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_wilcoxon + + models_ordered, confidences = compare_models_pairwise_wilcoxon( + scores_per_model, + alternative=alternative, + higher_is_better=higher_is_better, + ) + assert len(confidences) == (len(models_ordered) * (len(models_ordered) - 1)) + + diff = set(scores_per_model.keys()).symmetric_difference(set(models_ordered)) + assert len(diff) == 0 + + if isinstance(scores_per_model, OrderedDict): + assert models_ordered == tuple(scores_per_model.keys()) + + elif len(scores_per_model) == 2: + assert models_ordered == (("a", "b") if higher_is_better else ("b", "a")) + + elif len(scores_per_model) == 3: + # this one is not trivial without looking at the data, so no assertions + pass + + if isinstance(next(iter(scores_per_model.values())), AUPIMOResult): + return + + def copy_and_add_nan(scores: Tensor) -> Tensor: + scores = scores.clone() + scores[5:] = torch.nan + return scores + + # removing samples should reduce the confidences + scores_per_model["a"] = copy_and_add_nan(scores_per_model["a"]) + scores_per_model["b"] = 
copy_and_add_nan(scores_per_model["b"]) + if "c" in scores_per_model: + scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) + + compare_models_pairwise_wilcoxon( + scores_per_model, + alternative=alternative, + higher_is_better=higher_is_better, + ) From 980c972a5931440495e0cfa42c5a6ef690c2bae5 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 27 Dec 2023 01:31:09 +0100 Subject: [PATCH 27/57] change the meaning of AUPIMO.num_thresh Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 31 +++++++++++++++----- src/anomalib/metrics/per_image/pimo_numpy.py | 7 +++-- tests/unit/metrics/per_image/test_pimo.py | 7 +++-- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 63ef58f382..0fac13455a 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -239,18 +239,15 @@ class AUPIMOResult: This interface gathers the AUPIMO data and metadata and provides several utility methods. - Notation: - - N: number of images - - K: number of thresholds - Attributes: shared_fpr_metric (str): [metadata] shared FPR metric used to compute the PIMO curve fpr_lower_bound (float): [metadata] LOWER bound of the FPR integration range fpr_upper_bound (float): [metadata] UPPER bound of the FPR integration range - num_threshs (int): [metadata] number of thresholds used to compute the PIMO curve (K) + num_threshs (int): [metadata] number of thresholds used to effectively compute AUPIMO; + should not be confused with the number of thresholds used to compute the PIMO curve thresh_lower_bound (float): LOWER threshold bound --> corresponds to the UPPER FPR bound thresh_upper_bound (float): UPPER threshold bound --> corresponds to the LOWER FPR bound - aupimos (Tensor): N values of AUPIMO scores + aupimos (Tensor): values of AUPIMO scores (1 per image) """ # metadata @@ -269,6 +266,16 @@ def num_images(self) -> int: """Number of images.""" return self.aupimos.shape[0] + @property + def num_normal_images(self) -> int: + """Number of normal images.""" + return int((self.image_classes == 0).sum()) + + @property + def num_anomalous_images(self) -> int: + """Number of anomalous images.""" + return int((self.image_classes == 1).sum()) + @property def image_classes(self) -> Tensor: """Image classes (0: normal, 1: anomalous).""" @@ -296,6 +303,7 @@ def __post_init__(self) -> None: try: PIMOSharedFPRMetric.validate(self.shared_fpr_metric) _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) + # TODO(jpcbertoldo): warn when it's too low (use parameters from the numpy code) # noqa: TD003 _validate.num_threshs(self.num_threshs) _validate_aupimos(self.aupimos) _validate.thresh_bounds((self.thresh_lower_bound, self.thresh_upper_bound)) @@ -309,6 +317,7 @@ def from_pimoresult( cls: type[AUPIMOResult], pimoresult: PIMOResult, fpr_bounds: tuple[float, float], + num_threshs_auc: int, aupimos: Tensor, ) -> AUPIMOResult: """Return an AUPIMO result object from a PIMO result object. @@ -316,6 +325,8 @@ def from_pimoresult( Args: pimoresult: PIMO result object fpr_bounds: lower and upper bounds of the FPR integration range + num_threshs_auc: number of thresholds used to effectively compute AUPIMO; + NOT the number of thresholds used to compute the PIMO curve! 
aupimos: AUPIMO scores """ if pimoresult.per_image_tprs.shape[0] != aupimos.shape[0]: @@ -342,7 +353,7 @@ def from_pimoresult( shared_fpr_metric=pimoresult.shared_fpr_metric, fpr_lower_bound=fpr_lower_bound, fpr_upper_bound=fpr_upper_bound, - num_threshs=pimoresult.num_threshs, + num_threshs=num_threshs_auc, thresh_lower_bound=float(thresh_lower_bound), thresh_upper_bound=float(thresh_upper_bound), aupimos=aupimos, @@ -556,7 +567,7 @@ def aupimo_scores( # other validations are done in the numpy code - threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array = pimo_numpy.aupimo_scores( + threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array, num_threshs_auc = pimo_numpy.aupimo_scores( anomaly_maps_array, masks_array, num_threshs, @@ -588,6 +599,10 @@ def aupimo_scores( aupimoresult = AUPIMOResult.from_pimoresult( pimoresult, fpr_bounds=fpr_bounds, + # not `num_threshs`! + # `num_threshs` is the number of thresholds used to compute the PIMO curve + # this is the number of thresholds used to compute the AUPIMO integral + num_threshs_auc=num_threshs_auc, aupimos=aupimos, ) return pimoresult, aupimoresult diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index a29ee0bd44..fc40094057 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -198,7 +198,7 @@ def aupimo_scores( shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, -) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray]: +) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, int]: """Compute the PIMO curves and their Area Under the Curve (i.e. AUPIMO) scores. Scores are computed from the integration of the PIMO curves within the given FPR bounds, then normalized to [0, 1]. 
@@ -228,6 +228,7 @@ def aupimo_scores( [2] per-image TPR curves of shape (N, K), axis 1 in descending order (indices correspond to the thresholds) [3] image classes of shape (N,) with values 0 (normal) or 1 (anomalous) [4] AUPIMO scores of shape (N,) in [0, 1] + [5] number of points used in the AUC integration """ _validate.rate_range(fpr_bounds) @@ -323,7 +324,7 @@ def aupimo_scores( shared_fpr_bounded_log = shared_fpr_bounded_log[~invalid_shared_fpr] per_image_tprs_bounded = per_image_tprs_bounded[:, ~invalid_shared_fpr] - num_points_integral = shared_fpr_bounded_log.shape[0] + num_points_integral = int(shared_fpr_bounded_log.shape[0]) if num_points_integral <= 30: msg = ( @@ -352,7 +353,7 @@ def aupimo_scores( normalization_factor = aupimo_normalizing_factor(fpr_bounds) aucs = (aucs / normalization_factor).clip(0, 1) - return threshs, shared_fpr, per_image_tprs, image_classes, aucs + return threshs, shared_fpr, per_image_tprs, image_classes, aucs, num_points_integral # =========================================== AUX =========================================== diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index a179f0c430..b3a86bb0d2 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -354,7 +354,7 @@ def test_aupimo_values_numpy( """Test if `aupimo()` returns the expected values.""" from anomalib.metrics.per_image import pimo_numpy - threshs, shared_fpr, per_image_tprs, image_classes, aupimos = pimo_numpy.aupimo_scores( + threshs, shared_fpr, per_image_tprs, image_classes, aupimos, _ = pimo_numpy.aupimo_scores( anomaly_maps, masks, num_threshs=7, @@ -396,7 +396,10 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" assert aupimoresult.shared_fpr_metric == "mean-per-image-fpr" assert aupimoresult.fpr_bounds == fpr_bounds - assert aupimoresult.num_threshs == 7 + # recall: this one is not the same as the number of thresholds in the curve + # this is the number of thresholds used to compute the integral in `aupimo()` + # always less because of the integration bounds + assert aupimoresult.num_threshs < 7 # test data # from pimo result From a052f6abe33a21d1e36e1c937ef79385d90a24a3 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 28 Dec 2023 14:35:35 +0100 Subject: [PATCH 28/57] interface to format pairwise test results Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 8 +- src/anomalib/metrics/per_image/utils.py | 165 +++++++++++++++--- src/anomalib/metrics/per_image/utils_numpy.py | 44 +++-- tests/unit/metrics/per_image/test_utils.py | 95 ++++++---- 4 files changed, 244 insertions(+), 68 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 328eed7fd3..40be18986e 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -7,7 +7,12 @@ from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves from .pimo_numpy import PIMOSharedFPRMetric -from .utils import compare_models_pairwise_ttest, compare_models_pairwise_wilcoxon, per_image_scores_stats +from .utils import ( + compare_models_pairwise_ttest, + compare_models_pairwise_wilcoxon, + format_pairwise_tests_results, + 
per_image_scores_stats,
+)
 from .utils_numpy import StatsOutliersPolicy, StatsRepeatedPolicy
 
 __all__ = [
@@ -32,5 +37,6 @@
     # utils
     "compare_models_pairwise_ttest",
     "compare_models_pairwise_wilcoxon",
+    "format_pairwise_tests_results",
     "per_image_scores_stats",
 ]
diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py
index ddb6545dc8..e5608a4eae 100644
--- a/src/anomalib/metrics/per_image/utils.py
+++ b/src/anomalib/metrics/per_image/utils.py
@@ -2,9 +2,12 @@
 from __future__ import annotations
 
 from collections import OrderedDict
+from copy import deepcopy
 from typing import TYPE_CHECKING
 
+import pandas as pd
 import torch
+from pandas import DataFrame
 from torch import Tensor
 
 from . import _validate, utils_numpy
@@ -14,6 +17,81 @@
     from .pimo import AUPIMOResult
 
 
+# =========================================== ARGS VALIDATION ===========================================
+
+
+def _validate_models_ordered(models_ordered: tuple[str, ...]) -> None:
+    if not isinstance(models_ordered, tuple):
+        msg = f"Expected models ordered to be a tuple, but got {type(models_ordered)}."
+        raise TypeError(msg)
+
+    if len(models_ordered) < 2:
+        msg = f"Expected models ordered to have at least 2 models, but got {len(models_ordered)}."
+        raise ValueError(msg)
+
+    for model_name in models_ordered:
+        if not isinstance(model_name, str):
+            msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}."
+            raise TypeError(msg)
+
+        if model_name == "":
+            msg = "Expected model name to be non-empty, but got empty string."
+            raise ValueError(msg)
+
+    num_redundant_models = len(models_ordered) - len(set(models_ordered))
+    if num_redundant_models > 0:
+        msg = f"Expected models ordered to have unique models, but got {num_redundant_models} redundant models."
+        raise ValueError(msg)
+
+
+def _validate_confidences(confidences: dict[tuple[str, str], float]) -> None:
+    if not isinstance(confidences, dict):
+        msg = f"Expected confidences to be a dict, but got {type(confidences)}."
+        raise TypeError(msg)
+
+    for (model1, model2), confidence in confidences.items():
+        if not isinstance(model1, str):
+            msg = f"Expected model name to be a string, but got {type(model1)} for model {model1}."
+            raise TypeError(msg)
+
+        if not isinstance(model2, str):
+            msg = f"Expected model name to be a string, but got {type(model2)} for model {model2}."
+            raise TypeError(msg)
+
+        if not isinstance(confidence, float):
+            msg = f"Expected confidence to be a float, but got {type(confidence)} for models {model1} and {model2}."
+            raise TypeError(msg)
+
+        if not (0 <= confidence <= 1):
+            msg = f"Expected confidence to be between 0 and 1, but got {confidence} for models {model1} and {model2}."
+            raise ValueError(msg)
+
+
+def _joint_validate_models_ordered_and_confidences(
+    models_ordered: tuple[str, ...],
+    confidences: dict[tuple[str, str], float],
+) -> None:
+    num_models = len(models_ordered)
+    expected_num_pairs = num_models * (num_models - 1)
+
+    if len(confidences) != expected_num_pairs:
+        msg = f"Expected {expected_num_pairs} pairs of models, but got {len(confidences)} pairs of models."
+        raise ValueError(msg)
+
+    models_in_confidences = {model for pair_models in confidences for model in pair_models}
+
+    diff = set(models_ordered).symmetric_difference(models_in_confidences)
+    if len(diff) > 0:
+        msg = (
+            "Expected models in confidences to be the same as models ordered, but got models missing in one "
+            f"of them: {diff}."
+        )
+        raise ValueError(msg)
+
+
+# =========================================== FUNCTIONS ===========================================
+
+
 def per_image_scores_stats(
     per_image_scores: Tensor,
     images_classes: Tensor | None = None,
@@ -218,25 +296,68 @@ def compare_models_pairwise_wilcoxon(
 )
 
 
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# NEXT
-# GET RID OF THIS ERROR FOR NOT DEFINING AUPIMORESULT
-# MAKE THE FORMATING OF STATS TEST LIKE BEFORE
+def format_pairwise_tests_results(
+    models_ordered: tuple[str, ...],
+    confidences: dict[tuple[str, str], float],
+    model1_as_column: bool = True,
+    left_to_right: bool = False,
+    top_to_bottom: bool = False,
+) -> DataFrame:
+    """Format the results of pairwise tests into a square dataframe.
+
+    The confidence values refer to the confidence level (in [0, 1]) on the alternative hypothesis,
+    which is formulated as "`model1` <alternative> `model2`", where `<alternative>` can be '<', '>', or '!='.
+
+    HOW TO READ THE DATAFRAME
+    =========================
+    There are 6 possible ways to read the dataframe, depending on the values of `model1_as_column` and `alternative`
+    (from the pairwise test function that generated `confidences`).
+
+    *column* and *row* below refer to a generic column and row value (model names) in the dataframe.
+
+    if (
+        model1_as_column == True and alternative == 'less'
+        or model1_as_column == False and alternative == 'greater'
+    )
+        read: "column < row"
+        equivalently: "row > column"
+
+    elif (
+        model1_as_column == True and alternative == 'greater'
+        or model1_as_column == False and alternative == 'less'
+    )
+        read: "column > row"
+        equivalently: "row < column"
+
+    else:  # alternative == 'two-sided'
+        read: "column != row"
+        equivalently: "row != column"
+
+    Args:
+        models_ordered: The models ordered in a meaningful way, generally from best to worst when automatically ordered.
+        confidences: The confidence on the alternative hypothesis, as returned by the pairwise test function.
+        model1_as_column: Whether to put `model1` as column or row in the dataframe.
+        left_to_right: Whether to order the columns from best to worst model as left to right.
+        top_to_bottom: Whether to order the rows from best to worst model as top to bottom.
+            Default column/row ordering is from worst to best model (left to right, top to bottom),
+            so the upper left corner is the worst model compared to itself, and the bottom right corner is the best
+            model compared to itself.
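+
+    Example (illustrative only; hypothetical model names and made-up confidence values):
+
+        from anomalib.metrics.per_image import format_pairwise_tests_results
+
+        models_ordered = ("best_model", "worst_model")
+        confidences = {
+            ("best_model", "worst_model"): 0.95,  # made-up value
+            ("worst_model", "best_model"): 0.05,  # made-up value
+        }
+        confdf = format_pairwise_tests_results(models_ordered, confidences)
+        # square dataframe with the models as both index and columns; the diagonal
+        # (a model compared to itself) is NaN; e.g. if `confidences` came from a test
+        # with alternative='greater', each cell reads "column > row" (with the defaults)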
+ + """ + _validate_models_ordered(models_ordered) + _validate_confidences(confidences) + _joint_validate_models_ordered_and_confidences(models_ordered, confidences) + confidences = deepcopy(confidences) + confidences.update({(model, model): torch.nan for model in models_ordered}) + # `df` stands for `dataframe` + confdf = pd.DataFrame(confidences, index=["confidence"]).T + confdf.index.names = ["model1", "model2"] + confdf = confdf.reset_index() + confdf["model1"] = pd.Categorical(confdf["model1"], categories=models_ordered, ordered=True) + confdf["model2"] = pd.Categorical(confdf["model2"], categories=models_ordered, ordered=True) + # df at this point: 3 columns: model1, model2, confidence + index_model, column_model = ("model2", "model1") if model1_as_column else ("model1", "model2") + confdf = confdf.pivot_table(index=index_model, columns=column_model, values="confidence", dropna=False, sort=False) + # now it is a square dataframe with models as index and columns, and confidence as values + confdf = confdf.sort_index(axis=0, ascending=top_to_bottom) + return confdf.sort_index(axis=1, ascending=left_to_right) diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index f506c0c7b3..f39a40151e 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -338,24 +338,34 @@ def compare_models_pairwise_ttest( Each comparison of two models is a paired t-test (null hypothesis is that they are equal). Args: - scores_per_model: Dictionary of models and their per-image scores. + scores_per_model: Dictionary of `n` models and their per-image scores. key: model name value: tensor of shape (num_images,). All `nan` values must be at the same positions. - higher_is_better: Whether higher values of the metric are better. Defaults to True. - alternative: Alternative hypothesis for the statistical tests. See `StatsAlternativeHypothesis`. + higher_is_better: Whether higher values of score are better or worse. Defaults to True. + alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section. + Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. Returns: (models_ordered, test_results): - - models_ordered: List of models ordered (by the user or by average score, see above). + - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). + + Automatic sorting is by average score from best to worst model. + Depending on `higher_is_better`, this corresponds to: + - `higher_is_better=True` ==> descending score order + - `higher_is_better=False` ==> ascending score order + along the indices from 0 to `n-1`. + - confidences: Dictionary of confidence values for each pair of models. - For all pairs of indices i and j from 0 to n-1, where `n` is the number of models and i != j: + + For all pairs of indices i and j from 0 to `n-1` such that i != j: - key: (models_ordered[i], models_ordered[j]) - value: confidence on the alternative hypothesis. + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - if `less`: model[i] < model[j] - if `greater`: model[i] > model[j] - if `two-sided`: model[i] != model[j] - on average. + in termos of average score. """ _validate_scores_per_model(scores_per_model) StatsAlternativeHypothesis.validate(alternative) @@ -406,27 +416,34 @@ def compare_models_pairwise_wilcoxon( This is like the non-parametric version of the paired t-test. 
     Args:
-        scores_per_model: Dictionary of models and their per-image scores.
+        scores_per_model: Dictionary of `n` models and their per-image scores.
             key: model name
             value: tensor of shape (num_images,). All `nan` values must be at the same positions.
-        higher_is_better: Whether higher values of the metric are better. Defaults to True.
-        alternative: Alternative hypothesis for the statistical tests. See `StatsAlternativeHypothesis`.
+        higher_is_better: Whether higher scores are better (True) or worse (False). Defaults to True.
+        alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section.
+            Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`.
         atol: Absolute tolerance used to consider two scores as equal. Defaults to 1e-3 (0.1%).
             When doing a paired test, if the difference between two scores is below `atol`, the difference is
             truncated to 0. If `atol` is None, no truncation is done.
 
     Returns:
         (models_ordered, test_results):
-        - models_ordered: List of models ordered (by the user or by average score, see above).
+        - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input).
+
+            Automatic sorting is from "best to worst" model, which corresponds to ascending average rank
+            along the indices from 0 to `n-1`.
+
         - confidences: Dictionary of confidence values for each pair of models.
-            For all pairs of indices i and j from 0 to n-1, where `n` is the number of models and i != j:
+
+            For all pairs of indices i and j from 0 to `n-1` such that i != j:
             - key: (models_ordered[i], models_ordered[j])
             - value: confidence on the alternative hypothesis.
+
+            For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is:
                 - if `less`: model[i] < model[j]
                 - if `greater`: model[i] > model[j]
                 - if `two-sided`: model[i] != model[j]
-            on average in terms of ranks (not scores).
+            in terms of average ranks (not scores!).
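+
+    Example:
+        An illustrative sketch (not a doctest); the scores below are made up.
+
+        >>> import numpy as np
+        >>> scores_per_model = {
+        ...     "model_a": np.array([0.84, 0.66, 0.95, 0.91]),
+        ...     "model_b": np.array([0.71, 0.53, 0.92, 0.85]),
+        ... }
+        >>> models_ordered, confidences = compare_models_pairwise_wilcoxon(
+        ...     scores_per_model, alternative="greater", higher_is_better=True,
+        ... )
+        >>> # expected: models_ordered == ("model_a", "model_b") because "model_a" scores
+        >>> # higher on every image, i.e. it has the better (lower) average rank
+        >>> # `confidences` has one entry per ordered pair: ("model_a", "model_b") and ("model_b", "model_a")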
""" _validate_scores_per_model(scores_per_model) StatsAlternativeHypothesis.validate(alternative) @@ -448,7 +465,8 @@ def compare_models_pairwise_wilcoxon( method="average", axis=0, ).mean(axis=1) - argsort_avg_ranks = avg_ranks.argsort() # ascending order, lower score is better + # ascending order, lower score is better --> best to worst model + argsort_avg_ranks = avg_ranks.argsort() scores_per_model_nonan = OrderedDict(scores_per_model_nonan_items[idx] for idx in argsort_avg_ranks) models_ordered = tuple(scores_per_model_nonan.keys()) diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index ec5d1a8a80..a5a92c4cb6 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -20,43 +20,32 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: # avg ~ 0.6 aucs3 = torch.sin(torch.linspace(0, torch.pi, num_images)).clip(0, 1) - def get_similar(aucs: Tensor) -> Tensor: - # a multiplicative factor oscilating around 99.5% and 100.5% signal - # to provoke the non-parametric plot to show - # the two are within the tolerance - factor = 1 - 0.005 * torch.sin(torch.linspace(0, 2 * torch.pi, len(aucs))) - aucs_bis = (aucs * factor).clip(0, 1) - aucs_bis[torch.isnan(aucs)] = torch.nan - return aucs_bis + mock_aupimoresult_stuff = { + "shared_fpr_metric": PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + "fpr_lower_bound": 1e-5, + "fpr_upper_bound": 1e-4, + "num_threshs": 1_000, + "thresh_lower_bound": 1.0, + "thresh_upper_bound": 2.0, + } + scores_per_model_dicts = [ + ({"a": aucs1, "b": aucs2},), + ({"a": aucs1, "b": aucs2, "c": aucs3},), + (OrderedDict([("c", aucs1), ("b", aucs2), ("a", aucs3)]),), + ( + { + "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), + "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), + "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), + }, + ), + ] if ( metafunc.function is test_compare_models_pairwise_ttest or metafunc.function is test_compare_models_pairwise_wilcoxon ): - mock_aupimoresult_stuff = { - "shared_fpr_metric": PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, - "fpr_lower_bound": 1e-5, - "fpr_upper_bound": 1e-4, - "num_threshs": 1_000, - "thresh_lower_bound": 1.0, - "thresh_upper_bound": 2.0, - } - metafunc.parametrize( - ("scores_per_model",), - [ - ({"a": aucs1, "b": aucs2},), - ({"a": aucs1, "b": get_similar(aucs1)},), - ({"a": aucs1, "b": aucs2, "c": aucs3},), - (OrderedDict([("c", aucs1), ("b", aucs2), ("a", aucs3)]),), - ( - { - "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1}), - "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2}), - "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), - }, - ), - ], - ) + metafunc.parametrize(("scores_per_model",), scores_per_model_dicts) metafunc.parametrize( ("alternative", "higher_is_better"), [ @@ -69,6 +58,9 @@ def get_similar(aucs: Tensor) -> Tensor: ], ) + if metafunc.function is test_format_pairwise_tests_results: + metafunc.parametrize(("scores_per_model",), scores_per_model_dicts[:3]) + def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: """Assert stuff about a `statdic`.""" @@ -271,3 +263,42 @@ def copy_and_add_nan(scores: Tensor) -> Tensor: alternative=alternative, higher_is_better=higher_is_better, ) + + +def test_format_pairwise_tests_results(scores_per_model: dict) -> None: + """Test `format_pairwise_tests_results`.""" + from anomalib.metrics.per_image import ( + compare_models_pairwise_ttest, + 
compare_models_pairwise_wilcoxon,
+        format_pairwise_tests_results,
+    )
+
+    models_ordered, confidences = compare_models_pairwise_wilcoxon(
+        scores_per_model,
+        alternative="greater",
+        higher_is_better=True,
+    )
+    confidence_df = format_pairwise_tests_results(
+        models_ordered,
+        confidences,
+        model1_as_column=True,
+        left_to_right=True,
+        top_to_bottom=True,
+    )
+    assert tuple(confidence_df.columns.tolist()) == models_ordered
+    assert tuple(confidence_df.index.tolist()) == models_ordered
+
+    models_ordered, confidences = compare_models_pairwise_ttest(
+        scores_per_model,
+        alternative="greater",
+        higher_is_better=True,
+    )
+    confidence_df = format_pairwise_tests_results(
+        models_ordered,
+        confidences,
+        model1_as_column=True,
+        left_to_right=True,
+        top_to_bottom=True,
+    )
+    assert tuple(confidence_df.columns.tolist()) == models_ordered
+    assert tuple(confidence_df.index.tolist()) == models_ordered

From dabba4a6df5ec95fd6bf570b51b0f1e980bd8f49 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 28 Dec 2023 14:44:32 +0100
Subject: [PATCH 29/57] improve doc

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_image/__init__.py    |  4 +--
 src/anomalib/metrics/per_image/utils.py       | 10 +++---
 src/anomalib/metrics/per_image/utils_numpy.py | 31 ++++++++++++++-----
 tests/unit/metrics/per_image/test_utils.py    | 10 +++---
 4 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py
index 40be18986e..dbc0f6d1a6 100644
--- a/src/anomalib/metrics/per_image/__init__.py
+++ b/src/anomalib/metrics/per_image/__init__.py
@@ -8,7 +8,7 @@
 from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves
 from .pimo_numpy import PIMOSharedFPRMetric
 from .utils import (
-    compare_models_pairwise_ttest,
+    compare_models_pairwise_ttest_rel,
     compare_models_pairwise_wilcoxon,
     format_pairwise_tests_results,
     per_image_scores_stats,
@@ -35,7 +35,7 @@
     "PIMO",
     "AUPIMO",
     # utils
-    "compare_models_pairwise_ttest",
+    "compare_models_pairwise_ttest_rel",
     "compare_models_pairwise_wilcoxon",
     "format_pairwise_tests_results",
     "per_image_scores_stats",
diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py
index e5608a4eae..c6a0278a8f 100644
--- a/src/anomalib/metrics/per_image/utils.py
+++ b/src/anomalib/metrics/per_image/utils.py
@@ -228,7 +228,7 @@ def _validate_scores_per_model(  # noqa: C901
         raise ValueError(msg)
 
 
-def compare_models_pairwise_ttest(
+def compare_models_pairwise_ttest_rel(
     scores_per_model: dict[str, Tensor]
     | OrderedDict[str, Tensor]
     | dict[str, AUPIMOResult]
@@ -236,7 +236,7 @@ def compare_models_pairwise_ttest(
     alternative: str,
    higher_is_better: bool,
 ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]:
-    """Torch-oriented interface for `compare_models_pairwise_ttest`. See its description for more details (below).
+    """Torch-oriented interface for `compare_models_pairwise_ttest_rel`. See its description for more details (below).
Numpy version docstring ======================= @@ -254,11 +254,11 @@ def compare_models_pairwise_ttest( cls = OrderedDict if isinstance(scores_per_model, OrderedDict) else dict scores_per_model_with_arrays = cls(scores_per_model_items) - return utils_numpy.compare_models_pairwise_ttest(scores_per_model_with_arrays, alternative, higher_is_better) + return utils_numpy.compare_models_pairwise_ttest_rel(scores_per_model_with_arrays, alternative, higher_is_better) -compare_models_pairwise_ttest.__doc__ = compare_models_pairwise_ttest.__doc__.format( # type: ignore[union-attr] - docstring=utils_numpy.compare_models_pairwise_ttest.__doc__, +compare_models_pairwise_ttest_rel.__doc__ = compare_models_pairwise_ttest_rel.__doc__.format( # type: ignore[union-attr] + docstring=utils_numpy.compare_models_pairwise_ttest_rel.__doc__, ) diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index f39a40151e..7252989c3a 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -174,6 +174,9 @@ def per_image_scores_stats( ) -> list[dict[str, str | int | float]]: """Compute statistics of per-image scores (based on a boxplot's statistics). + For a single per-image metric collection (1 model, 1 dataset), compute statistics (based on a boxplot) + and find the closest image to each statistic. + This function uses `matplotlib.cbook.boxplot_stats`, which is the same function used by `matplotlib.pyplot.boxplot`. ** OUTLIERS ** @@ -325,18 +328,25 @@ def append_record(stat_name: str, stat_value: float) -> None: return sorted(records, key=lambda r: r["score"]) -def compare_models_pairwise_ttest( +def compare_models_pairwise_ttest_rel( scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray], alternative: str, higher_is_better: bool, ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: - """Compare all pairs of models using the paired t-test (parametric). + """Compare all pairs of models using the paired t-test on two related samples (parametric). + + This is a test for the null hypothesis that two repeated samples have identical average (expected) values. + In fact, it tests whether the average of the differences between the two samples is significantly different from 0. + + Refs: + - `scipy.stats.ttest_rel`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html + - Wikipedia page: https://en.wikipedia.org/wiki/Student's_t-test#Dependent_t-test_for_paired_samples + + === If an ordered dictionary is given, the models are sorted by the order of the dictionary. Otherwise, the models are sorted by average SCORE. - Each comparison of two models is a paired t-test (null hypothesis is that they are equal). - Args: scores_per_model: Dictionary of `n` models and their per-image scores. key: model name @@ -408,13 +418,20 @@ def compare_models_pairwise_wilcoxon( ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: """Compare all pairs of models using the Wilcoxon signed-rank test (non-parametric). - If an ordered dictionary is given, the models are sorted by the order of the dictionary. - Otherwise, the models are sorted by average RANK. - Each comparison of two models is a Wilcoxon signed-rank test (null hypothesis is that they are equal). + It tests whether the distribution of the differences of scores is symmetric about zero in a non-parametric way. This is like the non-parametric version of the paired t-test. 
+ Refs: + - `scipy.stats.wilcoxon`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html#scipy.stats.wilcoxon + - Wikipedia page: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test + + === + + If an ordered dictionary is given, the models are sorted by the order of the dictionary. + Otherwise, the models are sorted by average RANK. + Args: scores_per_model: Dictionary of `n` models and their per-image scores. key: model name diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index a5a92c4cb6..a7b42a2a27 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -178,9 +178,9 @@ def test_per_image_scores_stats_specific_values() -> None: def test_compare_models_pairwise_ttest(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: """Test `compare_models_pairwise_ttest`.""" - from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_ttest + from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_ttest_rel - models_ordered, confidences = compare_models_pairwise_ttest( + models_ordered, confidences = compare_models_pairwise_ttest_rel( scores_per_model, alternative=alternative, higher_is_better=higher_is_better, @@ -213,7 +213,7 @@ def copy_and_add_nan(scores: Tensor) -> Tensor: if "c" in scores_per_model: scores_per_model["c"] = copy_and_add_nan(scores_per_model["c"]) - compare_models_pairwise_ttest( + compare_models_pairwise_ttest_rel( scores_per_model, alternative=alternative, higher_is_better=higher_is_better, @@ -268,7 +268,7 @@ def copy_and_add_nan(scores: Tensor) -> Tensor: def test_format_pairwise_tests_results(scores_per_model: dict) -> None: """Test `format_pairwise_tests_results`.""" from anomalib.metrics.per_image import ( - compare_models_pairwise_ttest, + compare_models_pairwise_ttest_rel, compare_models_pairwise_wilcoxon, format_pairwise_tests_results, ) @@ -288,7 +288,7 @@ def test_format_pairwise_tests_results(scores_per_model: dict) -> None: assert tuple(confidence_df.columns.tolist()) == models_ordered assert tuple(confidence_df.index.tolist()) == models_ordered - models_ordered, confidences = compare_models_pairwise_ttest( + models_ordered, confidences = compare_models_pairwise_ttest_rel( scores_per_model, alternative="greater", higher_is_better=True, From 215847b41f748d92f58a0ef14f8239b4fd492d09 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 28 Dec 2023 19:07:07 +0100 Subject: [PATCH 30/57] add optional `paths` to result objects and some minor fixes and refactors Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 38 +++- src/anomalib/metrics/per_image/pimo.py | 185 +++++++++++++------- src/anomalib/metrics/per_image/utils.py | 160 ++++++++++------- tests/unit/metrics/per_image/test_pimo.py | 13 ++ tests/unit/metrics/per_image/test_utils.py | 8 + 5 files changed, 282 insertions(+), 122 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 3d6ff94b3a..d9a1000db2 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -93,13 +93,14 @@ def rate_range(bounds: tuple[float, float]) -> None: raise ValueError(msg) -def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> None: +def file_path(file_path: str | Path, must_exist: 
bool, extension: str | None, pathlib_ok: bool) -> None: """Validate the given path is a file (optionally) with the expected extension. Args: file_path (str | Path): The file path to validate. must_exist (bool): Flag indicating whether the file must exist. extension (str | None): The expected file extension, eg. .png, .jpg, etc. If `None`, no validation is performed. + pathlib_ok (bool): Flag indicating whether `pathlib.Path` is allowed; if False, only `str` paths are allowed. """ if isinstance(file_path, str): file_path = Path(file_path) @@ -108,6 +109,11 @@ def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> msg = f"Expected file path to be a string or pathlib.Path, but got {type(file_path)}" raise TypeError(msg) + # if it's here, then it's a `pathlib.Path` + elif not pathlib_ok: + msg = f"Only `str` paths are allowed, but got {type(file_path)}" + raise TypeError(msg) + if file_path.is_dir(): msg = "Expected file path to be a file, but got a directory." raise ValueError(msg) @@ -124,6 +130,31 @@ def file_path(file_path: str | Path, must_exist: bool, extension: str | None) -> raise ValueError(msg) +def file_paths(file_paths: list[str | Path], must_exist: bool, extension: str | None, pathlib_ok: bool) -> None: + """Validate the given paths are files (optionally) with the expected extension. + + Args: + file_paths (list[str | Path]): The file paths to validate. + must_exist (bool): Flag indicating whether the files must exist. + extension (str | None): The expected file extension, eg. .png, .jpg, etc. If `None`, no validation is performed. + pathlib_ok (bool): Flag indicating whether `pathlib.Path` is allowed; if False, only `str` paths are allowed. + """ + if not isinstance(file_paths, list): + msg = f"Expected paths to be a list, but got {type(file_paths)}." + raise TypeError(msg) + + for idx, path in enumerate(file_paths): + try: + msg = f"Invalid path at index {idx}: {path}" + file_path(path, must_exist=must_exist, extension=extension, pathlib_ok=pathlib_ok) + + except TypeError as ex: # noqa: PERF203 + raise TypeError(msg) from ex + + except ValueError as ex: + raise ValueError(msg) from ex + + def threshs(threshs: ndarray) -> None: """Validate that the thresholds are valid and monotonically increasing.""" if not isinstance(threshs, ndarray): @@ -334,7 +365,7 @@ def rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None raise ValueError(msg) -def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool) -> None: +def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool | None) -> None: if not isinstance(rate_curves, ndarray): msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}." raise TypeError(msg) @@ -367,6 +398,9 @@ def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: b msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values > 1." 
         raise ValueError(msg)
 
 
+def file_paths(file_paths: list[str | Path], must_exist: bool, extension: str | None, pathlib_ok: bool) -> None:
+    """Validate the given paths are files (optionally) with the expected extension.
+
+    Args:
+        file_paths (list[str | Path]): The file paths to validate.
+        must_exist (bool): Flag indicating whether the files must exist.
+        extension (str | None): The expected file extension, e.g. .png, .jpg, etc. If `None`, no validation is performed.
+        pathlib_ok (bool): Flag indicating whether `pathlib.Path` is allowed; if False, only `str` paths are allowed.
+    """
+    if not isinstance(file_paths, list):
+        msg = f"Expected paths to be a list, but got {type(file_paths)}."
+        raise TypeError(msg)
+
+    for idx, path in enumerate(file_paths):
+        try:
+            msg = f"Invalid path at index {idx}: {path}"
+            file_path(path, must_exist=must_exist, extension=extension, pathlib_ok=pathlib_ok)
+
+        except TypeError as ex:  # noqa: PERF203
+            raise TypeError(msg) from ex
+
+        except ValueError as ex:
+            raise ValueError(msg) from ex
+
+
 def threshs(threshs: ndarray) -> None:
     """Validate that the thresholds are valid and monotonically increasing."""
     if not isinstance(threshs, ndarray):
@@ -334,7 +365,7 @@ def rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None
     raise ValueError(msg)
 
 
-def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool) -> None:
+def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool | None) -> None:
     if not isinstance(rate_curves, ndarray):
         msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}."
         raise TypeError(msg)
@@ -367,6 +398,9 @@ def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: b
     msg = "Expected per-image rate curves to have values in the interval [0, 1], but got values > 1."
@@ -109,6 +142,7 @@ class PIMOResult: threshs (Tensor): sequence of K (monotonically increasing) thresholds used to compute the PIMO curve shared_fpr (Tensor): K values of the shared FPR metric at the corresponding thresholds per_image_tprs (Tensor): for each of the N images, the K values of in-image TPR at the corresponding thresholds + paths (list[str]) (optional): [metadata] paths to the source images to which the PIMO curves correspond """ # metadata @@ -119,6 +153,9 @@ class PIMOResult: shared_fpr: Tensor = field(repr=False) # shape => (K,) per_image_tprs: Tensor = field(repr=False) # shape => (N, K) + # optional metadata + paths: list[str] | None = field(repr=False, default=None) + @property def num_threshs(self) -> int: """Number of thresholds.""" @@ -131,34 +168,40 @@ def num_images(self) -> int: @property def image_classes(self) -> Tensor: - """Image classes (0: normal, 1: anomalous).""" - return (self.per_image_tprs.flatten(1) == 1).any(dim=1).to(torch.int32) + """Image classes (0: normal, 1: anomalous). + + Deduced from the per-image TPRs. + If any TPR value is not NaN, the image is considered anomalous. + """ + return (~torch.isnan(self.per_image_tprs)).any(dim=1).to(torch.int32) def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - PIMOSharedFPRMetric.validate(self.shared_fpr_metric) _validate_threshs(self.threshs) _validate_shared_fpr(self.shared_fpr, nan_allowed=False) _validate_per_image_tprs(self.per_image_tprs, self.image_classes) + if self.paths is not None: + _validate_source_images_paths(self.paths, expected_num_paths=self.per_image_tprs.shape[0]) + except (TypeError, ValueError) as ex: msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." - raise ValueError(msg) from ex + raise TypeError(msg) from ex if self.threshs.shape != self.shared_fpr.shape: msg = ( f"Invalid {self.__class__.__name__} object. Attributes have inconsistent shapes: " - f"threshs.shape={self.threshs.shape} != shared_fpr.shape={self.shared_fpr.shape}." + f"{self.threshs.shape=} != {self.shared_fpr.shape=}." ) - raise ValueError(msg) + raise TypeError(msg) if self.threshs.shape[0] != self.per_image_tprs.shape[1]: msg = ( f"Invalid {self.__class__.__name__} object. Attributes have inconsistent shapes: " - f"threshs.shape[0]={self.threshs.shape[0]} != per_image_tprs.shape[1]={self.per_image_tprs.shape[1]}." + f"{self.threshs.shape[0]=} != {self.per_image_tprs.shape[1]=}." ) - raise ValueError(msg) + raise TypeError(msg) def thresh_at(self, fpr_level: float) -> tuple[int, float, float]: """Return the threshold at the given shared FPR. @@ -182,33 +225,34 @@ def thresh_at(self, fpr_level: float) -> tuple[int, float, float]: def to_dict(self) -> dict[str, Tensor | str]: """Return a dictionary with the result object's attributes.""" - return { + dic = { "shared_fpr_metric": self.shared_fpr_metric, "threshs": self.threshs, "shared_fpr": self.shared_fpr, "per_image_tprs": self.per_image_tprs, } + if self.paths is not None: + dic["paths"] = self.paths + return dic @classmethod - def from_dict(cls: type[PIMOResult], dic: dict[str, Tensor | str]) -> PIMOResult: + def from_dict(cls: type[PIMOResult], dic: dict[str, Tensor | str | list[str]]) -> PIMOResult: """Return a result object from a dictionary.""" - keys = ["shared_fpr_metric", "threshs", "shared_fpr", "per_image_tprs"] - for key in keys: - if key not in dic: - msg = f"Invalid input dictionary for {cls.__name__} object, missing key: {key}. Must contain: {keys}." 
- raise ValueError(msg) + try: + return cls(**dic) # type: ignore[arg-type] - return cls(**dic) + except TypeError as ex: + msg = f"Invalid input dictionary for {cls.__name__} object. Cause: {ex}." + raise TypeError(msg) from ex def save(self, file_path: str | Path) -> None: """Save to a `.pt` file. Args: file_path: path to the `.pt` file where to save the PIMO result. - - must have a `.pt` extension - - if the file already exists, a numerical suffix is added to the filename + If the file already exists, a numerical suffix is added to the filename. """ - _validate.file_path(file_path, must_exist=False, extension=".pt") + _validate.file_path(file_path, must_exist=False, extension=".pt", pathlib_ok=True) file_path = duplicate_filename(file_path) payload = self.to_dict() torch.save(payload, file_path) @@ -219,21 +263,20 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: Args: file_path: path to the `.pt` file where to load the PIMO result. - - must have a `.pt` extension """ - _validate.file_path(file_path, must_exist=True, extension=".pt") + _validate.file_path(file_path, must_exist=True, extension=".pt", pathlib_ok=True) payload = torch.load(file_path) if not isinstance(payload, dict): - msg = f"Invalid payload in file {file_path}. Must be a dictionary." + msg = f"Invalid content in file {file_path}. Must be a dictionary." raise TypeError(msg) try: return cls.from_dict(payload) - except (TypeError, ValueError) as ex: - msg = f"Invalid payload in file {file_path}. Cause: {ex}." - raise ValueError(msg) from ex + except TypeError as ex: + msg = f"Invalid content in file {file_path}. Cause: {ex}." + raise TypeError(msg) from ex -@dataclass +@dataclass(frozen=True) class AUPIMOResult: """Area Under the Per-Image Overlap (AUPIMO, pronounced a-u-pee-mo) curve. @@ -261,6 +304,9 @@ class AUPIMOResult: thresh_upper_bound: float = field(repr=False) aupimos: Tensor = field(repr=False) # shape => (N,) + # optional metadata + paths: list[str] | None = field(repr=False, default=None) + @property def num_images(self) -> int: """Number of images.""" @@ -301,16 +347,18 @@ def thresh_bounds(self) -> tuple[float, float]: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - PIMOSharedFPRMetric.validate(self.shared_fpr_metric) _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) # TODO(jpcbertoldo): warn when it's too low (use parameters from the numpy code) # noqa: TD003 _validate.num_threshs(self.num_threshs) _validate_aupimos(self.aupimos) _validate.thresh_bounds((self.thresh_lower_bound, self.thresh_upper_bound)) + if self.paths is not None: + _validate_source_images_paths(self.paths, expected_num_paths=self.aupimos.shape[0]) + except (TypeError, ValueError) as ex: msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." - raise ValueError(msg) from ex + raise TypeError(msg) from ex @classmethod def from_pimoresult( @@ -319,6 +367,7 @@ def from_pimoresult( fpr_bounds: tuple[float, float], num_threshs_auc: int, aupimos: Tensor, + paths: list[str] | None = None, ) -> AUPIMOResult: """Return an AUPIMO result object from a PIMO result object. @@ -328,21 +377,28 @@ def from_pimoresult( num_threshs_auc: number of thresholds used to effectively compute AUPIMO; NOT the number of thresholds used to compute the PIMO curve! aupimos: AUPIMO scores + paths: paths to the source images to which the AUPIMO scores correspond. 
""" if pimoresult.per_image_tprs.shape[0] != aupimos.shape[0]: msg = ( f"Invalid {cls.__name__} object. Attributes have inconsistent shapes: " f"there are {pimoresult.per_image_tprs.shape[0]} PIMO curves but {aupimos.shape[0]} AUPIMO scores." ) - raise ValueError(msg) + raise TypeError(msg) if not torch.isnan(aupimos[pimoresult.image_classes == 0]).all(): msg = "Expected all normal images to have NaN AUPIMOs, but some have non-NaN values." - raise ValueError(msg) + raise TypeError(msg) if torch.isnan(aupimos[pimoresult.image_classes == 1]).any(): msg = "Expected all anomalous images to have valid AUPIMOs (not nan), but some have NaN values." - raise ValueError(msg) + raise TypeError(msg) + + if pimoresult.paths is not None: + paths = pimoresult.paths + + elif paths is not None: + _validate_source_images_paths(paths, expected_num_paths=pimoresult.num_images) fpr_lower_bound, fpr_upper_bound = fpr_bounds # recall: fpr upper/lower bounds are the same as the thresh lower/upper bounds @@ -357,11 +413,12 @@ def from_pimoresult( thresh_lower_bound=float(thresh_lower_bound), thresh_upper_bound=float(thresh_upper_bound), aupimos=aupimos, + paths=paths, ) def to_dict(self) -> dict[str, Tensor | str | float | int]: """Return a dictionary with the result object's attributes.""" - return { + dic = { "shared_fpr_metric": self.shared_fpr_metric, "fpr_lower_bound": self.fpr_lower_bound, "fpr_upper_bound": self.fpr_upper_bound, @@ -370,35 +427,28 @@ def to_dict(self) -> dict[str, Tensor | str | float | int]: "thresh_upper_bound": self.thresh_upper_bound, "aupimos": self.aupimos, } + if self.paths is not None: + dic["paths"] = self.paths + return dic @classmethod - def from_dict(cls: type[AUPIMOResult], dic: dict[str, Tensor | str | float | int]) -> AUPIMOResult: + def from_dict(cls: type[AUPIMOResult], dic: dict[str, Tensor | str | float | int | list[str]]) -> AUPIMOResult: """Return a result object from a dictionary.""" - keys = [ - "shared_fpr_metric", - "fpr_lower_bound", - "fpr_upper_bound", - "num_threshs", - "thresh_lower_bound", - "thresh_upper_bound", - "aupimos", - ] - for key in keys: - if key not in dic: - msg = f"Invalid input dictionary for {cls.__name__} object, missing key: {key}. Must contain: {keys}." - raise ValueError(msg) - - return cls(**dic) # type: ignore[arg-type] + try: + return cls(**dic) # type: ignore[arg-type] + + except TypeError as ex: + msg = f"Invalid input dictionary for {cls.__name__} object. Cause: {ex}." + raise TypeError(msg) from ex def save(self, file_path: str | Path) -> None: """Save to a `.json` file. Args: file_path: path to the `.json` file where to save the AUPIMO result. - - must have a `.json` extension - - if the file already exists, a numerical suffix is added to the filename + If the file already exists, a numerical suffix is added to the filename. """ - _validate.file_path(file_path, must_exist=False, extension=".json") + _validate.file_path(file_path, must_exist=False, extension=".json", pathlib_ok=True) file_path = duplicate_filename(file_path) file_path = Path(file_path) payload = self.to_dict() @@ -413,9 +463,8 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: Args: file_path: path to the `.json` file where to load the AUPIMO result. 
- - must have a `.json` extension """ - _validate.file_path(file_path, must_exist=True, extension=".json") + _validate.file_path(file_path, must_exist=True, extension=".json", pathlib_ok=True) file_path = Path(file_path) with file_path.open("r") as f: payload = json.load(f) @@ -428,7 +477,7 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: return cls.from_dict(payload) except (TypeError, ValueError) as ex: msg = f"Invalid payload in file {file_path}. Cause: {ex}." - raise ValueError(msg) from ex + raise TypeError(msg) from ex def stats( self, @@ -471,6 +520,7 @@ def pimo_curves( num_threshs: int, binclf_algorithm: str = BinclfAlgorithm.NUMBA, shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + paths: list[str] | None = None, ) -> PIMOResult: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -480,6 +530,9 @@ def pimo_curves( Refer to `pimo_numpy.pimo_curves()` and `PIMOResult` (their docstrings below). + Args (extra): + paths: paths to the source images to which the PIMO curves correspond. + pimo_numpy.pimo_curves.__doc__ ============================== {docstring_pimo_curves} @@ -488,12 +541,15 @@ def pimo_curves( ================== {docstring_pimoresult} """ - _validate.is_tensor(anomaly_maps, argname="anomaly_maps") + _validate_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - _validate.is_tensor(masks, argname="masks") + _validate_masks(masks) masks_array = masks.detach().cpu().numpy() + if paths is not None: + _validate_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) + # other validations are done in the numpy code threshs_array, shared_fpr_array, per_image_tprs_array, _ = pimo_numpy.pimo_curves( anomaly_maps_array, @@ -520,6 +576,7 @@ def pimo_curves( threshs=threshs, shared_fpr=shared_fpr, per_image_tprs=per_image_tprs, + paths=paths, ) @@ -538,6 +595,7 @@ def aupimo_scores( shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, + paths: list[str] | None = None, ) -> tuple[PIMOResult, AUPIMOResult]: """Compute the PIMO curves and their Area Under the Curve (i.e. AUPIMO) scores. @@ -547,6 +605,9 @@ def aupimo_scores( Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult` (their docstrings below). + Args (extra): + paths: paths to the source images to which the AUPIMO scores correspond. 
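+
+    Example:
+        An illustrative sketch (not a doctest); shapes, values, and paths are made up.
+
+        >>> import torch
+        >>> anomaly_maps = torch.rand(3, 256, 256)  # (N, H, W) anomaly score maps
+        >>> masks = (torch.rand(3, 256, 256) > 0.5).long()  # (N, H, W) binary ground truth
+        >>> pimoresult, aupimoresult = aupimo_scores(
+        ...     anomaly_maps,
+        ...     masks,
+        ...     num_threshs=30_000,
+        ...     paths=["images/000.png", "images/001.png", "images/002.png"],
+        ...     force=True,  # tolerate the coarse threshold grid of this toy example
+        ... )
+        >>> # `aupimoresult.aupimos` holds one score per image (NaN for normal images),
+        >>> # and `aupimoresult.paths` records which image each score refers to
+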
+ pimo_numpy.aupimo_scores.__doc__ ================================= {docstring_aupimo_scores} @@ -559,12 +620,15 @@ def aupimo_scores( ==================== {docstring_aupimoresult} """ - _validate.is_tensor(anomaly_maps, argname="anomaly_maps") + _validate_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - _validate.is_tensor(masks, argname="masks") + _validate_masks(masks) masks_array = masks.detach().cpu().numpy() + if paths is not None: + _validate_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) + # other validations are done in the numpy code threshs_array, shared_fpr_array, per_image_tprs_array, _, aupimos_array, num_threshs_auc = pimo_numpy.aupimo_scores( @@ -595,6 +659,7 @@ def aupimo_scores( threshs=threshs, shared_fpr=shared_fpr, per_image_tprs=per_image_tprs, + paths=paths, ) aupimoresult = AUPIMOResult.from_pimoresult( pimoresult, diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index c6a0278a8f..0f0e3a1723 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -1,6 +1,8 @@ """Torch-oriented interfaces for `utils.py`.""" from __future__ import annotations +import logging +import warnings from collections import OrderedDict from copy import deepcopy from typing import TYPE_CHECKING @@ -17,6 +19,8 @@ from .pimo import AUPIMOResult +logger = logging.getLogger(__name__) + # =========================================== ARGS VALIDATION =========================================== @@ -134,99 +138,135 @@ def per_image_scores_stats( ) -def _validate_scores_per_model( # noqa: C901 - scores_per_model: dict[str, Tensor] - | OrderedDict[str, Tensor] - | dict[str, AUPIMOResult] - | OrderedDict[str, AUPIMOResult], -) -> None: - # it has to be imported here to avoid circular imports - from .pimo import AUPIMOResult - - if not isinstance(scores_per_model, dict | OrderedDict): - msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." - raise TypeError(msg) - - if len(scores_per_model) < 2: - msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}." - raise ValueError(msg) - - first_key_value_tensor = None +def _validate_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor]) -> None: + first_key_value = None for model_name, scores in scores_per_model.items(): - if not isinstance(model_name, str): - msg = f"Expected model name to be a string, but got {type(model_name)} for model {model_name}." - raise TypeError(msg) - - if isinstance(scores, AUPIMOResult): - scores_tensor = scores.aupimos - elif isinstance(scores, Tensor): - scores_tensor = scores - else: - msg = f"Expected scores to be a Tensor or AUPIMOResult, but got {type(scores)} for model {model_name}." - raise TypeError(msg) - - if scores_tensor.ndim != 1: - msg = f"Expected scores to be 1D Tensor, but got {scores_tensor.ndim}D for model {model_name}." + if scores.ndim != 1: + msg = f"Expected scores to be 1D, but got {scores.ndim}D for model {model_name}." raise ValueError(msg) - num_valid_scores = scores_tensor[~torch.isnan(scores_tensor)].numel() + num_valid_scores = scores[~torch.isnan(scores)].numel() - if num_valid_scores < 2: - msg = f"Expected at least 2 scores, but got {num_valid_scores} for model {model_name}." + if num_valid_scores < 1: + msg = f"Expected at least 1 non-nan score, but got {num_valid_scores} for model {model_name}." 
raise ValueError(msg) - if first_key_value_tensor is None: - first_key_value_tensor = (model_name, scores, scores_tensor) + if first_key_value is None: + first_key_value = (model_name, scores) continue - first_model_name, first_scores, first_scores_tensor = first_key_value_tensor - - # must have the same type - # test using `isinstance` to avoid issues with subclasses - if isinstance(scores, Tensor) != isinstance(first_scores, Tensor): - msg = ( - "Expected scores to have the same type, " - f"but got ({model_name}) {type(scores)} != {type(first_scores)} ({first_model_name})." - ) - raise TypeError(msg) + first_model_name, first_scores = first_key_value # same shape - if scores_tensor.shape != first_scores_tensor.shape: + if scores.shape[0] != first_scores.shape[0]: msg = ( - "Expected scores to have the same shape, " - f"but got ({model_name}) {scores_tensor.shape} != {first_scores_tensor.shape} ({first_model_name})." + "Expected scores to have the same number of scores, " + f"but got ({model_name}) {scores.shape[0]} != {first_scores.shape[0]} ({first_model_name})." ) raise ValueError(msg) # `nan` at the same indices - if (torch.isnan(scores_tensor) != torch.isnan(first_scores_tensor)).any(): + if (torch.isnan(scores) != torch.isnan(first_scores)).any(): msg = ( "Expected `nan` values, if any, to be at the same indices, " f"but there are differences between models {model_name} and {first_model_name}." ) raise ValueError(msg) - if isinstance(scores, Tensor): + +def _validate_scores_per_model_aupimoresult( + scores_per_model: dict[str, AUPIMOResult] | OrderedDict[str, AUPIMOResult], + missing_paths_ok: bool, +) -> None: + first_key_value = None + + for model_name, aupimoresult in scores_per_model.items(): + if first_key_value is None: + first_key_value = (model_name, aupimoresult) continue - # check that the metadata is the same, so they can be compared indeed + first_model_name, first_aupimoresult = first_key_value - if scores.shared_fpr_metric != first_scores.shared_fpr_metric: + # check that the metadata is the same, so they can be compared indeed + if aupimoresult.shared_fpr_metric != first_aupimoresult.shared_fpr_metric: msg = ( - "Expected scores to have the same shared FPR metric, " - f"but got ({model_name}) {scores.shared_fpr_metric} != " - f"{first_scores.shared_fpr_metric} ({first_model_name})." + "Expected AUPIMOResult objects in scores per model to have the same shared FPR metric, " + f"but got ({model_name}) {aupimoresult.shared_fpr_metric} != " + f"{first_aupimoresult.shared_fpr_metric} ({first_model_name})." ) raise ValueError(msg) - if scores.fpr_bounds != first_scores.fpr_bounds: + if aupimoresult.fpr_bounds != first_aupimoresult.fpr_bounds: msg = ( - "Expected scores to have the same FPR bounds, " - f"but got ({model_name}) {scores.fpr_bounds} != {first_scores.fpr_bounds} ({first_model_name})." + "Expected AUPIMOResult objects in scores per model to have the same FPR bounds, " + f"but got ({model_name}) {aupimoresult.fpr_bounds} != " + f"{first_aupimoresult.fpr_bounds} ({first_model_name})." ) raise ValueError(msg) + available_paths = [tuple(scores.paths) for scores in scores_per_model.values() if scores.paths is not None] + + if len(set(available_paths)) > 1: + msg = ( + "Expected AUPIMOResult objects in scores per model to have the same paths, " + "but got different paths for different models." + ) + raise ValueError(msg) + + if len(available_paths) != len(scores_per_model): + msg = "Some models have paths, while others are missing them." 
+        if missing_paths_ok:
+            warnings.warn(msg, UserWarning, stacklevel=3)
+            logger.warning(msg)
+        else:
+            raise ValueError(msg)
+
+
+def _validate_scores_per_model(
+    scores_per_model: dict[str, Tensor]
+    | OrderedDict[str, Tensor]
+    | dict[str, AUPIMOResult]
+    | OrderedDict[str, AUPIMOResult],
+) -> None:
+    # it has to be imported here to avoid circular imports
+    from .pimo import AUPIMOResult
+
+    if not isinstance(scores_per_model, dict | OrderedDict):
+        msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}."
+        raise TypeError(msg)
+
+    if len(scores_per_model) < 2:
+        msg = f"Expected scores per model to have at least 2 models, but got {len(scores_per_model)}."
+        raise ValueError(msg)
+
+    if not all(isinstance(model_name, str) for model_name in scores_per_model):
+        msg = "Expected scores per model to have model names (strings) as keys."
+        raise TypeError(msg)
+
+    first_instance = next(iter(scores_per_model.values()))
+
+    if (
+        isinstance(first_instance, Tensor)
+        and any(not isinstance(scores, Tensor) for scores in scores_per_model.values())
+    ) or (
+        isinstance(first_instance, AUPIMOResult)
+        and any(not isinstance(scores, AUPIMOResult) for scores in scores_per_model.values())
+    ):
+        msg = (
+            "Values in the scores per model dict must have the same type for values (Tensor or AUPIMOResult), "
+            "but more than one type was found."
+        )
+        raise TypeError(msg)
+
+    if isinstance(first_instance, Tensor):
+        _validate_scores_per_model_tensor(scores_per_model)
+        return
+
+    _validate_scores_per_model_tensor(
+        {model_name: scores.aupimos for model_name, scores in scores_per_model.items()},
+    )
+
+    _validate_scores_per_model_aupimoresult(scores_per_model, missing_paths_ok=True)
+
 
 def compare_models_pairwise_ttest_rel(
     scores_per_model: dict[str, Tensor]
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
index b3a86bb0d2..bbcd4aa797 100644
--- a/tests/unit/metrics/per_image/test_pimo.py
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -178,6 +178,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
     anomaly_maps = torch.from_numpy(anomaly_maps)
     masks = torch.from_numpy(masks)
     metafunc.parametrize(argnames=("anomaly_maps", "masks"), argvalues=[(anomaly_maps, masks)])
+    metafunc.parametrize(argnames=("paths",), argvalues=[(None,), (["/path/to/a", "/path/to/b", "/path/to/c"],)])
 
 
 def _do_test_pimo_outputs(
@@ -498,17 +499,23 @@ def test_aupimo_edge(
 def test_pimoresult_object(
     anomaly_maps: Tensor,
     masks: Tensor,
+    paths: list[str] | None,
 ) -> None:
     """Test if `PIMOResult` can be converted to other formats and back."""
     from anomalib.metrics.per_image import pimo
     from anomalib.metrics.per_image.pimo import PIMOResult
 
+    optional_kwargs = {}
+    if paths is not None:
+        optional_kwargs["paths"] = paths
+
     pimoresult = pimo.pimo_curves(
         anomaly_maps,
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
         shared_fpr_metric="mean-per-image-fpr",
shared_fpr_metric="mean-per-image-fpr", fpr_bounds=(1e-5, 1e-4), force=True, + **optional_kwargs, ) # call properties diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index a7b42a2a27..b6fcad4b5b 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -28,6 +28,7 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: "thresh_lower_bound": 1.0, "thresh_upper_bound": 2.0, } + fake_paths = [f"/path/to/file_{i}" for i in range(num_images)] scores_per_model_dicts = [ ({"a": aucs1, "b": aucs2},), ({"a": aucs1, "b": aucs2, "c": aucs3},), @@ -39,6 +40,13 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3}), }, ), + ( + { + "a": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs1, "paths": fake_paths}), + "b": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs2, "paths": fake_paths}), + "c": AUPIMOResult(**{**mock_aupimoresult_stuff, "aupimos": aucs3, "paths": fake_paths}), + }, + ), ] if ( From a6404fcfa0dcac31b9c3d0264b127bf873c65250 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:57:28 +0100 Subject: [PATCH 31/57] remove frozen from dataclasses and some done todos Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 0869cbd5ff..686d01f58f 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -8,8 +8,6 @@ TODO(jpcbertoldo): add ref to paper to all functions TODO(jpcbertoldo): add link to the tutorial notebooks -TODO(jpcbertoldo): add image file path to `PIMOResult` and `AUPIMOResult` and change save/load methods -TODO(jpcbertoldo): change `aucs` in the paper supp mat to `aupimos` TODO(jpcbertoldo): add formalities (license header, author) """ from __future__ import annotations @@ -125,7 +123,7 @@ def _validate_source_images_paths(paths: Sequence[str], expected_num_paths: int # =========================================== RESULT OBJECT =========================================== -@dataclass(frozen=True) +@dataclass class PIMOResult: """Per-Image Overlap (PIMO, pronounced pee-mo) curve. @@ -276,7 +274,7 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: raise TypeError(msg) from ex -@dataclass(frozen=True) +@dataclass class AUPIMOResult: """Area Under the Per-Image Overlap (AUPIMO, pronounced a-u-pee-mo) curve. 
From 14c97fa2841e1e175e634c9b4fb45bbbbf4bf9db Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Thu, 4 Jan 2024 11:11:02 +0100 Subject: [PATCH 32/57] review headers Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 5 ++++- src/anomalib/metrics/per_image/_binclf_curve_numba.py | 5 ++++- src/anomalib/metrics/per_image/_validate.py | 5 ++++- src/anomalib/metrics/per_image/binclf_curve.py | 5 ++++- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 5 ++++- src/anomalib/metrics/per_image/pimo.py | 8 +++++--- src/anomalib/metrics/per_image/pimo_numpy.py | 7 ++++--- src/anomalib/metrics/per_image/utils_numpy.py | 6 +++++- 8 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index dbc0f6d1a6..f32f7f3e34 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -1,8 +1,11 @@ """Per-Image Metrics. -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index 914480d77f..951223d109 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -2,9 +2,12 @@ See docstring of `binclf_curve` or `binclf_curve_numpy` for more details. -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import numba import numpy as np from numpy import ndarray diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index d9a1000db2..34996df752 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -2,9 +2,12 @@ `torch` is imported in the functions that use it, so this module can be used in numpy-standalone mode. -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations from pathlib import Path diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index adaece314f..4a7b9daa62 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -8,9 +8,12 @@ Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. 
-TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations import torch diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 912741d0cc..2472ded352 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -4,9 +4,12 @@ The thresholds are shared by all instances/images, but their binclf are computed independently for each instance/image. -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import itertools import logging from dataclasses import dataclass diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 686d01f58f..f467984405 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -6,10 +6,12 @@ Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. -TODO(jpcbertoldo): add ref to paper to all functions -TODO(jpcbertoldo): add link to the tutorial notebooks -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations import json diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index fc40094057..15ca1a2d49 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -22,11 +22,12 @@ `AUPIMO` is the area under each `PIMO` curve with bounded integration range in terms of shared FPR. -TODO(jpcbertoldo): add ref to paper to all functions -TODO(jpcbertoldo): add link to the tutorial notebooks -TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import logging import warnings from dataclasses import dataclass diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 7252989c3a..989bdbe1d0 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -1,7 +1,11 @@ """Utility functions for per-image metrics. 
-TODO(jpcbertoldo): add formalities (license header, author) +author: jpcbertoldo """ + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations import itertools From 5bc3b2b0433f0e12c4e37dde9e4141647714a20e Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:19:02 +0100 Subject: [PATCH 33/57] doc modifs Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_binclf_curve_numba.py | 8 ++++---- src/anomalib/metrics/per_image/binclf_curve.py | 2 +- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index 951223d109..fe004185c8 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -15,11 +15,11 @@ @numba.jit(nopython=True) def binclf_one_curve_numba(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray: - """ONE binary classification matrix at each threshold (NUMBA implementation). + """One binary classification matrix at each threshold (NUMBA implementation). This does the same as `_binclf_one_curve_python` but with numba using just-in-time compilation. - ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. + Note: VALIDATION IS NOT DONE HERE! Make sure to validate the arguments before calling this function. Args: scores (ndarray): Anomaly scores (D,). @@ -86,12 +86,12 @@ def binclf_one_curve_numba(scores: ndarray, gts: ndarray, threshs: ndarray) -> n @numba.jit(nopython=True, parallel=True) def binclf_multiple_curves_numba(scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray) -> ndarray: - """MULTIPLE binary classification matrix at each threshold (NUMBA implementation). + """Multiple binary classification matrix at each threshold (NUMBA implementation). This does the same as `_binclf_multiple_curves_python` but with numba, using parallelization and just-in-time compilation. - ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. + Note: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. Args: scores_batch (ndarray): Anomaly scores (N, D,). diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 4a7b9daa62..8c814da59f 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -53,7 +53,7 @@ def per_image_binclf_curve( ) -> tuple[Tensor, Tensor]: """Compute the binary classification matrix of each image in the batch for multiple thresholds (shared). - ATTENTION: tensors are converted to numpy arrays and then converted back to tensors (same device as `anomaly_maps`). + Note: tensors are converted to numpy arrays and then converted back to tensors (same device as `anomaly_maps`). 
Args: anomaly_maps (Tensor): Anomaly score maps of shape (N, H, W [, D, ...]) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 2472ded352..e3abb2f0e1 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -102,7 +102,7 @@ def _validate_gts_batch(gts_batch: ndarray) -> None: def _binclf_one_curve_python(scores: ndarray, gts: ndarray, threshs: ndarray) -> ndarray: - """ONE binary classification matrix at each threshold (PYTHON implementation). + """One binary classification matrix at each threshold (PYTHON implementation). In the case where the thresholds are given (i.e. not considering all possible thresholds based on the scores), this weird-looking function is faster than the two options in `torchmetrics` on the CPU: @@ -111,7 +111,7 @@ def _binclf_one_curve_python(scores: ndarray, gts: ndarray, threshs: ndarray) -> (both in module `torchmetrics.functional.classification.precision_recall_curve` in `torchmetrics==1.1.0`). - ATTENTION: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. + Note: VALIDATION IS NOT DONE HERE. Make sure to validate the arguments before calling this function. Args: scores (ndarray): Anomaly scores (D,). @@ -178,7 +178,7 @@ def func(score) -> bool: # noqa: ANN001 _binclf_multiple_curves_python = np.vectorize(_binclf_one_curve_python, signature="(n),(n),(k)->(k,2,2)") _binclf_multiple_curves_python.__doc__ = """ -MULTIPLE binary classification matrix at each threshold (PYTHON implementation). +Multiple binary classification matrix at each threshold (PYTHON implementation). vectorized version of `_binclf_one_curve_python` (see above) """ From b8e0ddf55b385427c87022082fa1b9fa93f9a9cb Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:23:24 +0100 Subject: [PATCH 34/57] refactor `score_less_than_thresh` in `_binclf_one_curve_python` Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index e3abb2f0e1..690e9b3f4c 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -13,6 +13,7 @@ import itertools import logging from dataclasses import dataclass +from functools import partial from typing import ClassVar import numpy as np @@ -140,24 +141,21 @@ def _binclf_one_curve_python(scores: ndarray, gts: ndarray, threshs: ndarray) -> num_neg = current_count_fp = scores_negatives.size fps = np.empty((num_th,), dtype=np.int64) - def score_less_than_thresh(thresh): # noqa: ANN001, ANN202 - def func(score) -> bool: # noqa: ANN001 - return score < thresh - - return func + def score_less_than_thresh(score: float, thresh: float) -> bool: + return score < thresh # it will progressively drop the scores that are below the current thresh for thresh_idx, thresh in enumerate(threshs): # UPDATE POSITIVES # < becasue it is the same as ~(>=) - num_drop = sum(1 for _ in itertools.takewhile(score_less_than_thresh(thresh), scores_positives)) + num_drop = sum(1 for _ in itertools.takewhile(partial(score_less_than_thresh, thresh=thresh), scores_positives)) scores_positives = scores_positives[num_drop:] 
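+        # the `num_drop` positive scores below `thresh` stop being predicted as positive, so they
+        # move from TP to FN at this threshold; illustrative: with sorted
+        # scores_positives = [.1, .3, .8] and thresh = .4, num_drop == 2 and only .8 remains a TP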
current_count_tp -= num_drop tps[thresh_idx] = current_count_tp # UPDATE NEGATIVES # same with the negatives - num_drop = sum(1 for _ in itertools.takewhile(score_less_than_thresh(thresh), scores_negatives)) + num_drop = sum(1 for _ in itertools.takewhile(partial(score_less_than_thresh, thresh=thresh), scores_negatives)) scores_negatives = scores_negatives[num_drop:] current_count_fp -= num_drop fps[thresh_idx] = current_count_fp From 943c1a78bd213265c5bd6c13b56b90989900eae7 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Mon, 22 Jan 2024 17:50:19 +0100 Subject: [PATCH 35/57] correct license comments Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 8 +++++++- src/anomalib/metrics/per_image/_binclf_curve_numba.py | 8 +++++++- src/anomalib/metrics/per_image/_validate.py | 8 +++++++- src/anomalib/metrics/per_image/binclf_curve.py | 8 +++++++- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 8 +++++++- src/anomalib/metrics/per_image/pimo.py | 8 +++++++- src/anomalib/metrics/per_image/pimo_numpy.py | 8 +++++++- src/anomalib/metrics/per_image/utils.py | 10 ++++++++++ src/anomalib/metrics/per_image/utils_numpy.py | 8 +++++++- 9 files changed, 66 insertions(+), 8 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index f32f7f3e34..11c23fba24 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -3,7 +3,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index fe004185c8..3efe04b52c 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -5,7 +5,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import numba diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 34996df752..28fbe6114c 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -5,7 +5,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 8c814da59f..6805baf1e8 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -11,7 +11,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo 
+# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 690e9b3f4c..ffdd9e753d 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -7,7 +7,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import itertools diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index f467984405..d8305d8bba 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -9,7 +9,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 15ca1a2d49..a9a0109566 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -25,7 +25,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import logging diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 0f0e3a1723..aeab08a55f 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -1,4 +1,14 @@ """Torch-oriented interfaces for `utils.py`.""" + +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations import logging diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 989bdbe1d0..4ef627ec22 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -3,7 +3,13 @@ author: jpcbertoldo """ -# Copyright (C) 2022 Intel Corporation +# Original Code +# Copyright (c) 2024 @jpcbertoldo +# https://github.com/jpcbertoldo/aupimo +# SPDX-License-Identifier: MIT +# +# Modified +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations From 2e565d1e52fd1c584b20ffbc74ea3a5fbb9fe3b9 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Fri, 9 Feb 2024 16:02:59 +0100 Subject: [PATCH 36/57] fix doc Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/anomalib/metrics/__init__.py b/src/anomalib/metrics/__init__.py index 4c3eafa811..a47680f676 100644 --- 
a/src/anomalib/metrics/__init__.py +++ b/src/anomalib/metrics/__init__.py @@ -11,6 +11,7 @@ import torchmetrics from omegaconf import DictConfig, ListConfig +from . import per_image from .anomaly_score_distribution import AnomalyScoreDistribution from .aupr import AUPR from .aupro import AUPRO @@ -19,6 +20,7 @@ from .f1_max import F1Max from .f1_score import F1Score from .min_max import MinMax +from .per_image import AUPIMO, PIMO, aupimo_scores, pimo_curves from .precision_recall_curve import BinaryPrecisionRecallCurve from .pro import PRO from .threshold import F1AdaptiveThreshold, ManualThreshold @@ -35,6 +37,11 @@ "ManualThreshold", "MinMax", "PRO", + "per_image", + "pimo_curves", + "aupimo_scores", + "PIMO", + "AUPIMO", ] logger = logging.getLogger(__name__) From 103b6dbbddf3b071fb0feaa35f4f7a04bf463a80 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 11:42:12 +0200 Subject: [PATCH 37/57] numba as extra requirement Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 97ffc68e07..a9916e9657 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,7 +81,8 @@ test = [ "coverage[toml]", "tox", ] -full = ["anomalib[core,openvino,loggers,notebooks]"] +extra = ["numba>=0.58.1"] +full = ["anomalib[core,openvino,loggers,notebooks,extra]"] dev = ["anomalib[full,docs,test]"] [project.scripts] From 08b85ef460073c68afae25772769219059b031a1 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 11:48:20 +0200 Subject: [PATCH 38/57] refactor copyrights from jpcbertoldo Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 2 -- src/anomalib/metrics/per_image/_binclf_curve_numba.py | 2 -- src/anomalib/metrics/per_image/_validate.py | 2 -- src/anomalib/metrics/per_image/binclf_curve.py | 2 -- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 2 -- src/anomalib/metrics/per_image/pimo.py | 2 -- src/anomalib/metrics/per_image/pimo_numpy.py | 2 -- src/anomalib/metrics/per_image/utils.py | 7 ++++--- src/anomalib/metrics/per_image/utils_numpy.py | 2 -- tests/unit/metrics/per_image/__init__.py | 5 ++++- tests/unit/metrics/per_image/test_binclf_curve.py | 5 ++++- tests/unit/metrics/per_image/test_pimo.py | 5 ++++- tests/unit/metrics/per_image/test_utils.py | 5 ++++- 13 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 11c23fba24..406da86931 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -4,9 +4,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index 3efe04b52c..1446c50c64 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -6,9 +6,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git 
a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 28fbe6114c..7db1dffb82 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -6,9 +6,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 6805baf1e8..2051ffbc20 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -12,9 +12,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index ffdd9e753d..f5373f68b7 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -8,9 +8,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index d8305d8bba..25ebf63bbe 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -10,9 +10,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index a9a0109566..be334f3679 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -26,9 +26,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index aeab08a55f..d32b498b0c 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -1,9 +1,10 @@ -"""Torch-oriented interfaces for `utils.py`.""" +"""Torch-oriented interfaces for `utils.py`. + +author: jpcbertoldo +""" # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 4ef627ec22..ecfcea3aef 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -4,9 +4,7 @@ """ # Original Code -# Copyright (c) 2024 @jpcbertoldo # https://github.com/jpcbertoldo/aupimo -# SPDX-License-Identifier: MIT # # Modified # Copyright (C) 2024 Intel Corporation diff --git a/tests/unit/metrics/per_image/__init__.py b/tests/unit/metrics/per_image/__init__.py index b925ba9aa3..6c2c8af91d 100644 --- a/tests/unit/metrics/per_image/__init__.py +++ b/tests/unit/metrics/per_image/__init__.py @@ -1 +1,4 @@ -"""Per-Image Metrics Tests.""" +"""Per-Image Metrics Tests. 
+ +author: jpcbertoldo +""" diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 5ea2a658d0..589bd9c27d 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -1,4 +1,7 @@ -"""Tests for per-image binary classification curves using numpy and numba versions.""" +"""Tests for per-image binary classification curves using numpy and numba versions. + +author: jpcbertoldo +""" # ruff: noqa: SLF001, PT011 import numpy as np diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index bbcd4aa797..d3617fb24b 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -1,4 +1,7 @@ -"""Test `anomalib.metrics.per_image.pimo_numpy`.""" +"""Test `anomalib.metrics.per_image.pimo_numpy`. + +author: jpcbertoldo +""" import tempfile from pathlib import Path diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index b6fcad4b5b..3f531ed288 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -1,4 +1,7 @@ -"""Test `utils.py`.""" +"""Test `utils.py`. + +author: jpcbertoldo +""" from collections import OrderedDict From 616576824df247f065b11865a90634d4b7fb150b Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 11:52:23 +0200 Subject: [PATCH 39/57] remove from __future__ import annotations Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 2 -- src/anomalib/metrics/per_image/binclf_curve.py | 2 -- src/anomalib/metrics/per_image/pimo.py | 2 -- src/anomalib/metrics/per_image/utils.py | 2 -- src/anomalib/metrics/per_image/utils_numpy.py | 2 -- 5 files changed, 10 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 7db1dffb82..04bb7b8bf7 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -12,8 +12,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - from pathlib import Path from typing import Any diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 2051ffbc20..5dfa80f950 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -18,8 +18,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - import torch from torch import Tensor diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 25ebf63bbe..09f823f8fd 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -16,8 +16,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - import json import warnings from dataclasses import dataclass, field diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index d32b498b0c..3a35ed7aa9 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -10,8 +10,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations 
- import logging import warnings from collections import OrderedDict diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index ecfcea3aef..73da53a82a 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -10,8 +10,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - import itertools from collections import OrderedDict from typing import ClassVar From fbdf8b601fd39a749410770d8e977fe39c9a1991 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 12:04:59 +0200 Subject: [PATCH 40/57] refactor validations names Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 43 ++++--- .../metrics/per_image/binclf_curve.py | 14 +-- .../metrics/per_image/binclf_curve_numpy.py | 26 ++-- src/anomalib/metrics/per_image/pimo.py | 88 +++++++------- src/anomalib/metrics/per_image/pimo_numpy.py | 40 +++--- src/anomalib/metrics/per_image/utils.py | 115 +++++++++--------- src/anomalib/metrics/per_image/utils_numpy.py | 18 +-- 7 files changed, 174 insertions(+), 170 deletions(-) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 04bb7b8bf7..15cba7d129 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -30,7 +30,7 @@ def is_tensor(tensor: Any, argname: str | None = None) -> None: # noqa: ANN401 raise TypeError(msg) -def num_threshs(num_threshs: int) -> None: +def is_num_threshs_gte2(num_threshs: int) -> None: """Validate the number of thresholds is a positive integer >= 2.""" if not isinstance(num_threshs, int): msg = f"Expected the number of thresholds to be an integer, but got {type(num_threshs)}" @@ -41,7 +41,7 @@ def num_threshs(num_threshs: int) -> None: raise ValueError(msg) -def same_shape(*args) -> None: +def is_same_shape(*args) -> None: """Works both for tensors and ndarrays.""" assert len(args) > 0 shapes = sorted({tuple(arg.shape) for arg in args}) @@ -50,7 +50,7 @@ def same_shape(*args) -> None: raise ValueError(msg) -def rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: +def is_rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: """Validates a rate parameter. Args: @@ -75,7 +75,7 @@ def rate(rate: float | int, zero_ok: bool, one_ok: bool) -> None: raise ValueError(msg) -def rate_range(bounds: tuple[float, float]) -> None: +def is_rate_range(bounds: tuple[float, float]) -> None: """Validates the range of rates within the bounds. Args: @@ -90,15 +90,15 @@ def rate_range(bounds: tuple[float, float]) -> None: raise ValueError(msg) lower, upper = bounds - rate(lower, zero_ok=False, one_ok=False) - rate(upper, zero_ok=False, one_ok=True) + is_rate(lower, zero_ok=False, one_ok=False) + is_rate(upper, zero_ok=False, one_ok=True) if lower >= upper: msg = f"Expected the upper bound to be larger than the lower bound, but got {upper=} <= {lower=}" raise ValueError(msg) -def file_path(file_path: str | Path, must_exist: bool, extension: str | None, pathlib_ok: bool) -> None: +def is_file_path(file_path: str | Path, must_exist: bool, extension: str | None, pathlib_ok: bool) -> None: """Validate the given path is a file (optionally) with the expected extension. 
Args: @@ -135,7 +135,12 @@ def file_path(file_path: str | Path, must_exist: bool, extension: str | None, pa raise ValueError(msg) -def file_paths(file_paths: list[str | Path], must_exist: bool, extension: str | None, pathlib_ok: bool) -> None: +def is_list_of_file_path( + file_paths: list[str | Path], + must_exist: bool, + extension: str | None, + pathlib_ok: bool, +) -> None: """Validate the given paths are files (optionally) with the expected extension. Args: @@ -151,7 +156,7 @@ def file_paths(file_paths: list[str | Path], must_exist: bool, extension: str | for idx, path in enumerate(file_paths): try: msg = f"Invalid path at index {idx}: {path}" - file_path(path, must_exist=must_exist, extension=extension, pathlib_ok=pathlib_ok) + is_file_path(path, must_exist=must_exist, extension=extension, pathlib_ok=pathlib_ok) except TypeError as ex: # noqa: PERF203 raise TypeError(msg) from ex @@ -160,7 +165,7 @@ def file_paths(file_paths: list[str | Path], must_exist: bool, extension: str | raise ValueError(msg) from ex -def threshs(threshs: ndarray) -> None: +def is_threshs(threshs: ndarray) -> None: """Validate that the thresholds are valid and monotonically increasing.""" if not isinstance(threshs, ndarray): msg = f"Expected thresholds to be an ndarray, but got {type(threshs)}" @@ -180,7 +185,7 @@ def threshs(threshs: ndarray) -> None: raise ValueError(msg) -def thresh_bounds(thresh_bounds: tuple[float, float]) -> None: +def is_thresh_bounds(thresh_bounds: tuple[float, float]) -> None: if not isinstance(thresh_bounds, tuple): msg = f"Expected threshold bounds to be a tuple, but got {type(thresh_bounds)}." raise TypeError(msg) @@ -204,7 +209,7 @@ def thresh_bounds(thresh_bounds: tuple[float, float]) -> None: raise ValueError(msg) -def anomaly_maps(anomaly_maps: ndarray) -> None: +def is_anomaly_maps(anomaly_maps: ndarray) -> None: if not isinstance(anomaly_maps, ndarray): msg = f"Expected anomaly maps to be an ndarray, but got {type(anomaly_maps)}" raise TypeError(msg) @@ -221,7 +226,7 @@ def anomaly_maps(anomaly_maps: ndarray) -> None: raise TypeError(msg) -def masks(masks: ndarray) -> None: +def is_masks(masks: ndarray) -> None: if not isinstance(masks, ndarray): msg = f"Expected masks to be an ndarray, but got {type(masks)}" raise TypeError(msg) @@ -250,7 +255,7 @@ def masks(masks: ndarray) -> None: raise TypeError(msg) -def binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None: +def is_binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None: if not isinstance(binclf_curves, ndarray): msg = f"Expected binclf curves to be an ndarray, but got {type(binclf_curves)}" raise TypeError(msg) @@ -294,7 +299,7 @@ def binclf_curves(binclf_curves: ndarray, valid_threshs: ndarray | None) -> None raise RuntimeError(msg) -def images_classes(images_classes: ndarray) -> None: +def is_images_classes(images_classes: ndarray) -> None: if not isinstance(images_classes, ndarray): msg = f"Expected image classes to be an ndarray, but got {type(images_classes)}." raise TypeError(msg) @@ -321,7 +326,7 @@ def images_classes(images_classes: ndarray) -> None: raise TypeError(msg) -def rates(rates: ndarray, nan_allowed: bool) -> None: +def is_rates(rates: ndarray, nan_allowed: bool) -> None: if not isinstance(rates, ndarray): msg = f"Expected rates to be an ndarray, but got {type(rates)}." 
raise TypeError(msg) @@ -355,8 +360,8 @@ def rates(rates: ndarray, nan_allowed: bool) -> None: raise ValueError(msg) -def rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None: - rates(rate_curve, nan_allowed=nan_allowed) +def is_rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None: + is_rates(rate_curve, nan_allowed=nan_allowed) diffs = np.diff(rate_curve) diffs_valid = diffs[~np.isnan(diffs)] if nan_allowed else diffs @@ -370,7 +375,7 @@ def rate_curve(rate_curve: ndarray, nan_allowed: bool, decreasing: bool) -> None raise ValueError(msg) -def per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool | None) -> None: +def is_per_image_rate_curves(rate_curves: ndarray, nan_allowed: bool, decreasing: bool | None) -> None: if not isinstance(rate_curves, ndarray): msg = f"Expected per-image rate curves to be an ndarray, but got {type(rate_curves)}." raise TypeError(msg) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 5dfa80f950..b0420cf02a 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -27,16 +27,16 @@ # =========================================== ARGS VALIDATION =========================================== -def _validate_threshs(threshs: Tensor) -> None: +def _validate_is_threshs(threshs: Tensor) -> None: _validate.is_tensor(threshs, argname="threshs") - _validate.threshs(threshs.numpy()) + _validate.is_threshs(threshs.numpy()) -def _validate_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None: +def _validate_is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | None = None) -> None: _validate.is_tensor(binclf_curves, argname="binclf_curves") if valid_threshs is not None: - _validate_threshs(valid_threshs) - _validate.binclf_curves( + _validate_is_threshs(valid_threshs) + _validate.is_binclf_curves( binclf_curves.detach().cpu().numpy(), valid_threshs=valid_threshs.numpy() if valid_threshs is not None else None, ) @@ -146,7 +146,7 @@ def per_image_tpr(binclf_curves: Tensor) -> Tensor: Thresholds are sorted in ascending order, so TPR is in descending order. """ - _validate_binclf_curves(binclf_curves) + _validate_is_binclf_curves(binclf_curves) binclf_curves_array = binclf_curves.detach().cpu().numpy() tprs_array = binclf_curve_numpy.per_image_tpr(binclf_curves_array) return torch.from_numpy(tprs_array).to(binclf_curves.device) @@ -168,7 +168,7 @@ def per_image_fpr(binclf_curves: Tensor) -> Tensor: Thresholds are sorted in ascending order, so FPR is in descending order. 
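 
     (At each threshold, FPR = FP / (FP + TN), i.e. the fraction of normal (negative) pixels
     scored at or above that threshold, hence the decrease as the threshold grows.)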
""" - _validate_binclf_curves(binclf_curves) + _validate_is_binclf_curves(binclf_curves) binclf_curves_array = binclf_curves.detach().cpu().numpy() fprs_array = binclf_curve_numpy.per_image_fpr(binclf_curves_array) return torch.from_numpy(fprs_array).to(binclf_curves.device) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index f5373f68b7..55fcbd398c 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -67,7 +67,7 @@ class BinclfThreshsChoice: # =========================================== ARGS VALIDATION =========================================== -def _validate_scores_batch(scores_batch: ndarray) -> None: +def _validate_is_scores_batch(scores_batch: ndarray) -> None: """scores_batch (ndarray): floating (N, D).""" if not isinstance(scores_batch, ndarray): msg = f"Expected `scores_batch` to be an ndarray, but got {type(scores_batch)}" @@ -85,7 +85,7 @@ def _validate_scores_batch(scores_batch: ndarray) -> None: raise ValueError(msg) -def _validate_gts_batch(gts_batch: ndarray) -> None: +def _validate_is_gts_batch(gts_batch: ndarray) -> None: """gts_batch (ndarray): boolean (N, D).""" if not isinstance(gts_batch, ndarray): msg = f"Expected `gts_batch` to be an ndarray, but got {type(gts_batch)}" @@ -232,10 +232,10 @@ def binclf_multiple_curves( Thresholds are sorted in ascending order. """ BinclfAlgorithm.validate(algorithm) - _validate_scores_batch(scores_batch) - _validate_gts_batch(gts_batch) - _validate.same_shape(scores_batch, gts_batch) - _validate.threshs(threshs) + _validate_is_scores_batch(scores_batch) + _validate_is_gts_batch(gts_batch) + _validate.is_same_shape(scores_batch, gts_batch) + _validate.is_threshs(threshs) if algorithm == BinclfAlgorithm.PYTHON: return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) @@ -257,11 +257,11 @@ def binclf_multiple_curves( def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> ndarray: """Get thresholds linearly spaced between the min and max of the anomaly maps.""" - _validate.num_threshs(num_threshs) + _validate.is_num_threshs_gte2(num_threshs) # this operation can be a bit expensive thresh_low, thresh_high = thresh_bounds = (anomaly_maps.min().item(), anomaly_maps.max().item()) try: - _validate.thresh_bounds(thresh_bounds) + _validate.is_thresh_bounds(thresh_bounds) except ValueError as ex: msg = f"Invalid threshold bounds computed from the given anomaly maps. Cause: {ex}" raise ValueError(msg) from ex @@ -322,15 +322,15 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. 
""" BinclfAlgorithm.validate(algorithm) - _validate.anomaly_maps(anomaly_maps) - _validate.masks(masks) - _validate.same_shape(anomaly_maps, masks) + _validate.is_anomaly_maps(anomaly_maps) + _validate.is_masks(masks) + _validate.is_same_shape(anomaly_maps, masks) threshs: ndarray if threshs_choice == BinclfThreshsChoice.GIVEN: assert threshs_given is not None - _validate.threshs(threshs_given) + _validate.is_threshs(threshs_given) if num_threshs is not None: logger.warning( f"Argument `num_threshs` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", @@ -362,7 +362,7 @@ def per_image_binclf_curve( num_images = anomaly_maps.shape[0] try: - _validate.binclf_curves(binclf_curves, valid_threshs=threshs) + _validate.is_binclf_curves(binclf_curves, valid_threshs=threshs) # these two validations cannot be done in `_validate.binclf_curves` because it does not have access to the # original shapes of `anomaly_maps` diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 09f823f8fd..d550c748f6 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -36,7 +36,7 @@ if TYPE_CHECKING: from collections.abc import Sequence -# =========================================== ARGS VALIDATION =========================================== +# =========================================== AUX =========================================== def _images_classes_from_masks(masks: Tensor) -> Tensor: @@ -49,44 +49,44 @@ def _images_classes_from_masks(masks: Tensor) -> Tensor: # =========================================== ARGS VALIDATION =========================================== -def _validate_anomaly_maps(anomaly_maps: Tensor) -> None: +def _validate_is_anomaly_maps(anomaly_maps: Tensor) -> None: _validate.is_tensor(anomaly_maps, argname="anomaly_maps") - _validate.anomaly_maps(anomaly_maps.numpy()) + _validate.is_anomaly_maps(anomaly_maps.numpy()) -def _validate_masks(masks: Tensor) -> None: +def _validate_is_masks(masks: Tensor) -> None: _validate.is_tensor(masks, argname="masks") - _validate.masks(masks.numpy()) + _validate.is_masks(masks.numpy()) -def _validate_threshs(threshs: Tensor) -> None: +def _validate_is_threshs(threshs: Tensor) -> None: _validate.is_tensor(threshs, argname="threshs") - _validate.threshs(threshs.numpy()) + _validate.is_threshs(threshs.numpy()) -def _validate_shared_fpr(shared_fpr: Tensor, nan_allowed: bool = False, decreasing: bool = True) -> None: +def _validate_is_shared_fpr(shared_fpr: Tensor, nan_allowed: bool = False, decreasing: bool = True) -> None: _validate.is_tensor(shared_fpr, argname="shared_fpr") - _validate.rate_curve(shared_fpr.numpy(), nan_allowed=nan_allowed, decreasing=decreasing) + _validate.is_rate_curve(shared_fpr.numpy(), nan_allowed=nan_allowed, decreasing=decreasing) -def _validate_image_classes(image_classes: Tensor) -> None: +def _validate_is_image_classes(image_classes: Tensor) -> None: _validate.is_tensor(image_classes, argname="image_classes") - _validate.images_classes(image_classes.numpy()) + _validate.is_images_classes(image_classes.numpy()) -def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> None: - _validate_image_classes(image_classes) +def _validate_is_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> None: + _validate_is_image_classes(image_classes) _validate.is_tensor(per_image_tprs, argname="per_image_tprs") # general validations - _validate.per_image_rate_curves( + _validate.is_per_image_rate_curves( 
per_image_tprs.numpy(), nan_allowed=True, # normal images have NaN TPRs decreasing=None, # not checked here ) # specific to anomalous images - _validate.per_image_rate_curves( + _validate.is_per_image_rate_curves( per_image_tprs[image_classes == 1].numpy(), nan_allowed=False, decreasing=True, @@ -99,13 +99,13 @@ def _validate_per_image_tprs(per_image_tprs: Tensor, image_classes: Tensor) -> N raise ValueError(msg) -def _validate_aupimos(aupimos: Tensor) -> None: +def _validate_is_aupimos(aupimos: Tensor) -> None: _validate.is_tensor(aupimos, argname="aupimos") - _validate.rates(aupimos.numpy(), nan_allowed=True) + _validate.is_rates(aupimos.numpy(), nan_allowed=True) -def _validate_source_images_paths(paths: Sequence[str], expected_num_paths: int | None) -> None: - _validate.file_paths( +def _validate_is_source_images_paths(paths: Sequence[str], expected_num_paths: int | None) -> None: + _validate.is_list_of_file_path( paths, # type: ignore[arg-type] # not necessary to exist because the metric can be computed # directly from the anomaly maps and masks, without the images @@ -180,12 +180,12 @@ def image_classes(self) -> Tensor: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - _validate_threshs(self.threshs) - _validate_shared_fpr(self.shared_fpr, nan_allowed=False) - _validate_per_image_tprs(self.per_image_tprs, self.image_classes) + _validate_is_threshs(self.threshs) + _validate_is_shared_fpr(self.shared_fpr, nan_allowed=False) + _validate_is_per_image_tprs(self.per_image_tprs, self.image_classes) if self.paths is not None: - _validate_source_images_paths(self.paths, expected_num_paths=self.per_image_tprs.shape[0]) + _validate_is_source_images_paths(self.paths, expected_num_paths=self.per_image_tprs.shape[0]) except (TypeError, ValueError) as ex: msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." @@ -254,7 +254,7 @@ def save(self, file_path: str | Path) -> None: file_path: path to the `.pt` file where to save the PIMO result. If the file already exists, a numerical suffix is added to the filename. """ - _validate.file_path(file_path, must_exist=False, extension=".pt", pathlib_ok=True) + _validate.is_file_path(file_path, must_exist=False, extension=".pt", pathlib_ok=True) file_path = duplicate_filename(file_path) payload = self.to_dict() torch.save(payload, file_path) @@ -266,7 +266,7 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: Args: file_path: path to the `.pt` file where to load the PIMO result. """ - _validate.file_path(file_path, must_exist=True, extension=".pt", pathlib_ok=True) + _validate.is_file_path(file_path, must_exist=True, extension=".pt", pathlib_ok=True) payload = torch.load(file_path) if not isinstance(payload, dict): msg = f"Invalid content in file {file_path}. Must be a dictionary." 
@@ -349,14 +349,14 @@ def thresh_bounds(self) -> tuple[float, float]: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - _validate.rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) + _validate.is_rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) # TODO(jpcbertoldo): warn when it's too low (use parameters from the numpy code) # noqa: TD003 - _validate.num_threshs(self.num_threshs) - _validate_aupimos(self.aupimos) - _validate.thresh_bounds((self.thresh_lower_bound, self.thresh_upper_bound)) + _validate.is_num_threshs_gte2(self.num_threshs) + _validate_is_aupimos(self.aupimos) + _validate.is_thresh_bounds((self.thresh_lower_bound, self.thresh_upper_bound)) if self.paths is not None: - _validate_source_images_paths(self.paths, expected_num_paths=self.aupimos.shape[0]) + _validate_is_source_images_paths(self.paths, expected_num_paths=self.aupimos.shape[0]) except (TypeError, ValueError) as ex: msg = f"Invalid inputs for {self.__class__.__name__} object. Cause: {ex}." @@ -400,7 +400,7 @@ def from_pimoresult( paths = pimoresult.paths elif paths is not None: - _validate_source_images_paths(paths, expected_num_paths=pimoresult.num_images) + _validate_is_source_images_paths(paths, expected_num_paths=pimoresult.num_images) fpr_lower_bound, fpr_upper_bound = fpr_bounds # recall: fpr upper/lower bounds are the same as the thresh lower/upper bounds @@ -450,7 +450,7 @@ def save(self, file_path: str | Path) -> None: file_path: path to the `.json` file where to save the AUPIMO result. If the file already exists, a numerical suffix is added to the filename. """ - _validate.file_path(file_path, must_exist=False, extension=".json", pathlib_ok=True) + _validate.is_file_path(file_path, must_exist=False, extension=".json", pathlib_ok=True) file_path = duplicate_filename(file_path) file_path = Path(file_path) payload = self.to_dict() @@ -466,7 +466,7 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: Args: file_path: path to the `.json` file where to load the AUPIMO result. 
""" - _validate.file_path(file_path, must_exist=True, extension=".json", pathlib_ok=True) + _validate.is_file_path(file_path, must_exist=True, extension=".json", pathlib_ok=True) file_path = Path(file_path) with file_path.open("r") as f: payload = json.load(f) @@ -543,14 +543,14 @@ def pimo_curves( ================== {docstring_pimoresult} """ - _validate_anomaly_maps(anomaly_maps) + _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - _validate_masks(masks) + _validate_is_masks(masks) masks_array = masks.detach().cpu().numpy() if paths is not None: - _validate_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) + _validate_is_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) # other validations are done in the numpy code threshs_array, shared_fpr_array, per_image_tprs_array, _ = pimo_numpy.pimo_curves( @@ -622,14 +622,14 @@ def aupimo_scores( ==================== {docstring_aupimoresult} """ - _validate_anomaly_maps(anomaly_maps) + _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - _validate_masks(masks) + _validate_is_masks(masks) masks_array = masks.detach().cpu().numpy() if paths is not None: - _validate_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) + _validate_is_source_images_paths(paths, expected_num_paths=anomaly_maps.shape[0]) # other validations are done in the numpy code @@ -755,7 +755,7 @@ def __init__( # the options below are, redundantly, validated here to avoid reaching # an error later in the execution - _validate.num_threshs(num_threshs) + _validate.is_num_threshs_gte2(num_threshs) self.num_threshs = num_threshs # validate binclf_algorithm and shared_fpr_metric @@ -775,9 +775,9 @@ def update(self, anomaly_maps: Tensor, masks: Tensor) -> None: anomaly_maps (Tensor): predictions of the model (ndim == 2, float) masks (Tensor): ground truth masks (ndim == 2, binary) """ - _validate_anomaly_maps(anomaly_maps) - _validate_masks(masks) - _validate.same_shape(anomaly_maps, masks) + _validate_is_anomaly_maps(anomaly_maps) + _validate_is_masks(masks) + _validate.is_same_shape(anomaly_maps, masks) self.anomaly_maps.append(anomaly_maps) self.masks.append(masks) @@ -898,7 +898,7 @@ def __init__( # other validations are done in PIMO.__init__() - _validate.rate_range(fpr_bounds) + _validate.is_rate_range(fpr_bounds) self.fpr_bounds = fpr_bounds self.force = force diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index be334f3679..89c1943f24 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -69,21 +69,21 @@ def validate(metric: str) -> None: def _images_classes_from_masks(masks: ndarray) -> ndarray: """Deduce the image classes from the masks.""" - _validate.masks(masks) + _validate.is_masks(masks) return (masks == 1).any(axis=(1, 2)).astype(np.int32) # =========================================== ARGS VALIDATION =========================================== -def _validate_at_least_one_anomalous_image(masks: ndarray) -> None: +def _validate_has_at_least_one_anomalous_image(masks: ndarray) -> None: image_classes = _images_classes_from_masks(masks) if (image_classes == 1).sum() == 0: msg = "Expected at least one ANOMALOUS image, but found none." 
raise ValueError(msg) -def _validate_at_least_one_normal_image(masks: ndarray) -> None: +def _validate_has_at_least_one_normal_image(masks: ndarray) -> None: image_classes = _images_classes_from_masks(masks) if (image_classes == 0).sum() == 0: msg = "Expected at least one NORMAL image, but found none." @@ -139,12 +139,12 @@ def pimo_curves( """ BinclfAlgorithm.validate(binclf_algorithm) PIMOSharedFPRMetric.validate(shared_fpr_metric) - _validate.num_threshs(num_threshs) - _validate.anomaly_maps(anomaly_maps) - _validate.masks(masks) - _validate.same_shape(anomaly_maps, masks) - _validate_at_least_one_anomalous_image(masks) - _validate_at_least_one_normal_image(masks) + _validate.is_num_threshs_gte2(num_threshs) + _validate.is_anomaly_maps(anomaly_maps) + _validate.is_masks(masks) + _validate.is_same_shape(anomaly_maps, masks) + _validate_has_at_least_one_anomalous_image(masks) + _validate_has_at_least_one_normal_image(masks) image_classes = _images_classes_from_masks(masks) @@ -173,7 +173,7 @@ def pimo_curves( # shape -> (N, K) per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) try: - _validate.per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) + _validate.is_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True) except ValueError as ex: msg = f"Cannot compute PIMO because the per-image FPR curves from normal images are invalid. Cause: {ex}" raise RuntimeError(msg) from ex @@ -235,7 +235,7 @@ def aupimo_scores( [4] AUPIMO scores of shape (N,) in [0, 1] [5] number of points used in the AUC integration """ - _validate.rate_range(fpr_bounds) + _validate.is_rate_range(fpr_bounds) # other validations are done in the `pimo` function threshs, shared_fpr, per_image_tprs, image_classes = pimo_curves( @@ -246,10 +246,10 @@ def aupimo_scores( shared_fpr_metric=shared_fpr_metric, ) try: - _validate.threshs(threshs) - _validate.rate_curve(shared_fpr, nan_allowed=False, decreasing=True) - _validate.images_classes(image_classes) - _validate.per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) + _validate.is_threshs(threshs) + _validate.is_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) + _validate.is_images_classes(image_classes) + _validate.is_per_image_rate_curves(per_image_tprs[image_classes == 1], nan_allowed=False, decreasing=True) except ValueError as ex: msg = f"Cannot compute AUPIMO because the PIMO curves are invalid. Cause: {ex}" @@ -383,10 +383,10 @@ def thresh_at_shared_fpr_level(threshs: ndarray, shared_fpr: ndarray, fpr_level: [1] threshold [2] the actual shared FPR value at the returned threshold """ - _validate.threshs(threshs) - _validate.rate_curve(shared_fpr, nan_allowed=False, decreasing=True) + _validate.is_threshs(threshs) + _validate.is_rate_curve(shared_fpr, nan_allowed=False, decreasing=True) _joint_validate_threshs_shared_fpr(threshs, shared_fpr) - _validate.rate(fpr_level, zero_ok=True, one_ok=True) + _validate.is_rate(fpr_level, zero_ok=True, one_ok=True) shared_fpr_min, shared_fpr_max = shared_fpr.min(), shared_fpr.max() @@ -433,7 +433,7 @@ def aupimo_normalizing_factor(fpr_bounds: tuple[float, float]) -> float: Returns: float: the normalization factor (>0). """ - _validate.rate_range(fpr_bounds) + _validate.is_rate_range(fpr_bounds) fpr_lower_bound, fpr_upper_bound = fpr_bounds # the log's base must be the same as the one used in the integration! 
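     # e.g. with fpr_bounds = (1e-5, 1e-4), the factor is log(1e-4 / 1e-5) = log(10) ~= 2.303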
return float(np.log(fpr_upper_bound / fpr_lower_bound)) @@ -453,7 +453,7 @@ def aupimo_random_model_score(fpr_bounds: tuple[float, float]) -> float: Returns: float: the AUPIMO score. """ - _validate.rate_range(fpr_bounds) + _validate.is_rate_range(fpr_bounds) fpr_lower_bound, fpr_upper_bound = fpr_bounds integral_value = fpr_upper_bound - fpr_lower_bound return float(integral_value / aupimo_normalizing_factor(fpr_bounds)) diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 3a35ed7aa9..1790afccc8 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -11,7 +11,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import warnings from collections import OrderedDict from copy import deepcopy from typing import TYPE_CHECKING @@ -33,7 +32,7 @@ # =========================================== ARGS VALIDATION =========================================== -def _validate_models_ordered(models_ordered: tuple[str, ...]) -> None: +def _validate_is_models_ordered(models_ordered: tuple[str, ...]) -> None: if not isinstance(models_ordered, tuple): msg = f"Expected models ordered to be a tuple, but got {type(models_ordered)}." raise TypeError(msg) @@ -57,7 +56,7 @@ def _validate_models_ordered(models_ordered: tuple[str, ...]) -> None: raise ValueError(msg) -def _validate_confidences(confidences: dict[tuple[str, str], float]) -> None: +def _validate_is_confidences(confidences: dict[tuple[str, str], float]) -> None: if not isinstance(confidences, dict): msg = f"Expected confidences to be a dict, but got {type(confidences)}." raise TypeError(msg) @@ -102,52 +101,7 @@ def _joint_validate_models_ordered_and_confidences( raise ValueError(msg) -# =========================================== FUNCTIONS =========================================== - - -def per_image_scores_stats( - per_image_scores: Tensor, - images_classes: Tensor | None = None, - only_class: int | None = None, - outliers_policy: str | None = StatsOutliersPolicy.NONE, - repeated_policy: str | None = StatsRepeatedPolicy.AVOID, - repeated_replacement_atol: float = 1e-2, -) -> list[dict[str, str | int | float]]: - """Torch-oriented interface for `per_image_scores_stats`. See its dscription for more details (below). 
-
-    Numpy version docstring
-    =======================
-
-    {docstring}
-    """
-    _validate.is_tensor(per_image_scores, "per_image_scores")
-    per_image_scores_array = per_image_scores.detach().cpu().numpy()
-
-    if images_classes is not None:
-        _validate.is_tensor(images_classes, "images_classes")
-        images_classes_array = images_classes.detach().cpu().numpy()
-
-    else:
-        images_classes_array = None
-
-    # other validations happen inside `utils_numpy.per_image_scores_stats`
-
-    return utils_numpy.per_image_scores_stats(
-        per_image_scores_array,
-        images_classes_array,
-        only_class=only_class,
-        outliers_policy=outliers_policy,
-        repeated_policy=repeated_policy,
-        repeated_replacement_atol=repeated_replacement_atol,
-    )
-
-
-per_image_scores_stats.__doc__ = per_image_scores_stats.__doc__.format(  # type: ignore[union-attr]
-    docstring=utils_numpy.per_image_scores_stats.__doc__,
-)
-
-
-def _validate_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor]) -> None:
+def _validate_is_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor]) -> None:
     first_key_value = None
 
     for model_name, scores in scores_per_model.items():
@@ -184,7 +138,7 @@ def _validate_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | Orde
     raise ValueError(msg)
 
 
-def _validate_scores_per_model_aupimoresult(
+def _validate_is_scores_per_model_aupimoresult(
     scores_per_model: dict[str, AUPIMOResult] | OrderedDict[str, AUPIMOResult],
     missing_paths_ok: bool,
 ) -> None:
@@ -230,7 +184,7 @@ def _validate_scores_per_model_aupimoresult(
     logger.warning(msg)
 
 
-def _validate_scores_per_model(
+def _validate_is_scores_per_model(
     scores_per_model: dict[str, Tensor]
     | OrderedDict[str, Tensor]
     | dict[str, AUPIMOResult]
@@ -267,14 +221,59 @@ def _validate_scores_per_model(
     raise TypeError(msg)
 
     if isinstance(first_instance, Tensor):
-        _validate_scores_per_model_tensor(scores_per_model)
+        _validate_is_scores_per_model_tensor(scores_per_model)
         return
 
-    _validate_scores_per_model_tensor(
+    _validate_is_scores_per_model_tensor(
         {model_name: scores.aupimos for model_name, scores in scores_per_model.items()},
     )
-    _validate_scores_per_model_aupimoresult(scores_per_model, missing_paths_ok=True)
+    _validate_is_scores_per_model_aupimoresult(scores_per_model, missing_paths_ok=True)
+
+
+# =========================================== FUNCTIONS ===========================================
+
+
+def per_image_scores_stats(
+    per_image_scores: Tensor,
+    images_classes: Tensor | None = None,
+    only_class: int | None = None,
+    outliers_policy: str | None = StatsOutliersPolicy.NONE,
+    repeated_policy: str | None = StatsRepeatedPolicy.AVOID,
+    repeated_replacement_atol: float = 1e-2,
+) -> list[dict[str, str | int | float]]:
+    """Torch-oriented interface for `per_image_scores_stats`. See its description for more details (below).
+ + Numpy version docstring + ======================= + + {docstring} + """ + _validate.is_tensor(per_image_scores, "per_image_scores") + per_image_scores_array = per_image_scores.detach().cpu().numpy() + + if images_classes is not None: + _validate.is_tensor(images_classes, "images_classes") + images_classes_array = images_classes.detach().cpu().numpy() + + else: + images_classes_array = None + + # other validations happen inside `utils_numpy.per_image_scores_stats` + + return utils_numpy.per_image_scores_stats( + per_image_scores_array, + images_classes_array, + only_class=only_class, + outliers_policy=outliers_policy, + repeated_policy=repeated_policy, + repeated_replacement_atol=repeated_replacement_atol, + ) + + +per_image_scores_stats.__doc__ = per_image_scores_stats.__doc__.format( # type: ignore[union-attr] + docstring=utils_numpy.per_image_scores_stats.__doc__, +) def compare_models_pairwise_ttest_rel( @@ -292,7 +291,7 @@ def compare_models_pairwise_ttest_rel( {docstring} """ - _validate_scores_per_model(scores_per_model) + _validate_is_scores_per_model(scores_per_model) scores_per_model_items = [ ( model_name, @@ -326,7 +325,7 @@ def compare_models_pairwise_wilcoxon( {docstring} """ - _validate_scores_per_model(scores_per_model) + _validate_is_scores_per_model(scores_per_model) scores_per_model_items = [ ( model_name, @@ -393,8 +392,8 @@ def format_pairwise_tests_results( model compared to itself. """ - _validate_models_ordered(models_ordered) - _validate_confidences(confidences) + _validate_is_models_ordered(models_ordered) + _validate_is_confidences(confidences) _joint_validate_models_ordered_and_confidences(models_ordered, confidences) confidences = deepcopy(confidences) confidences.update({(model, model): torch.nan for model in models_ordered}) diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 73da53a82a..ebadacb045 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -94,7 +94,7 @@ def validate(alternative: str) -> None: # =========================================== ARGS VALIDATION =========================================== -def _validate_image_class(image_class: int) -> None: +def _validate_is_image_class(image_class: int) -> None: if not isinstance(image_class, int): msg = f"Expected image class to be an int (0 for 'normal', 1 for 'anomalous'), but got {type(image_class)}." raise TypeError(msg) @@ -104,7 +104,7 @@ def _validate_image_class(image_class: int) -> None: raise ValueError(msg) -def _validate_per_image_scores(per_image_scores: ndarray) -> None: +def _validate_is_per_image_scores(per_image_scores: ndarray) -> None: if not isinstance(per_image_scores, ndarray): msg = f"Expected per-image scores to be a numpy array, but got {type(per_image_scores)}." raise TypeError(msg) @@ -114,7 +114,7 @@ def _validate_per_image_scores(per_image_scores: ndarray) -> None: raise ValueError(msg) -def _validate_scores_per_model(scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray]) -> None: +def _validate_is_scores_per_model(scores_per_model: dict[str, ndarray] | OrderedDict[str, ndarray]) -> None: if not isinstance(scores_per_model, dict | OrderedDict): msg = f"Expected scores per model to be a dictionary or ordered dictionary, but got {type(scores_per_model)}." 
        raise TypeError(msg)
@@ -242,16 +242,16 @@ def per_image_scores_stats(
     """
     StatsOutliersPolicy.validate(outliers_policy)
     StatsRepeatedPolicy.validate(repeated_policy)
-    _validate_per_image_scores(per_image_scores)
+    _validate_is_per_image_scores(per_image_scores)
 
     # restrict the images to the class `only_class` if given, else use all images
     if images_classes is None:
         images_selection_mask = np.ones_like(per_image_scores, dtype=bool)
 
     elif only_class is not None:
-        _validate.images_classes(images_classes)
-        _validate.same_shape(per_image_scores, images_classes)
-        _validate_image_class(only_class)
+        _validate.is_images_classes(images_classes)
+        _validate.is_same_shape(per_image_scores, images_classes)
+        _validate_is_image_class(only_class)
         images_selection_mask = images_classes == only_class
 
     else:
@@ -383,7 +383,7 @@ def compare_models_pairwise_ttest_rel(
 
         - if `two-sided`: model[i] != model[j] in terms of average score.
     """
-    _validate_scores_per_model(scores_per_model)
+    _validate_is_scores_per_model(scores_per_model)
     StatsAlternativeHypothesis.validate(alternative)
 
     # remove nan values; list of items keeps the order of the OrderedDict
@@ -468,7 +468,7 @@ def compare_models_pairwise_wilcoxon(
 
         - if `two-sided`: model[i] != model[j] in terms of average ranks (not scores!).
     """
-    _validate_scores_per_model(scores_per_model)
+    _validate_is_scores_per_model(scores_per_model)
     StatsAlternativeHypothesis.validate(alternative)
 
     # remove nan values; list of items keeps the order of the OrderedDict

From f55890491728194f6820a26fc7a0ee3d76d8cbcf Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Tue, 28 May 2024 13:22:40 +0200
Subject: [PATCH 41/57] dedupe file path validation

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/data/utils/path.py             | 15 ++++-
 src/anomalib/metrics/per_image/_validate.py | 68 ---------------------
 src/anomalib/metrics/per_image/pimo.py      | 43 ++++++++-----
 tests/unit/data/utils/test_path.py          |  5 ++
 4 files changed, 47 insertions(+), 84 deletions(-)

diff --git a/src/anomalib/data/utils/path.py b/src/anomalib/data/utils/path.py
index ca0435be41..80c73a0f68 100644
--- a/src/anomalib/data/utils/path.py
+++ b/src/anomalib/data/utils/path.py
@@ -3,7 +3,6 @@
 # Copyright (C) 2022-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-
 import os
 import re
 from enum import Enum
@@ -143,13 +142,20 @@ def contains_non_printable_characters(path: str | Path) -> bool:
     return not printable_pattern.match(str(path))
 
 
-def validate_path(path: str | Path, base_dir: str | Path | None = None, should_exist: bool = True) -> Path:
+def validate_path(
+    path: str | Path,
+    base_dir: str | Path | None = None,
+    should_exist: bool = True,
+    accepted_extensions: tuple[str, ...] | None = None,
+) -> Path:
     """Validate the path.
 
     Args:
         path (str | Path): Path to validate.
        base_dir (str | Path): Base directory to restrict file access.
        should_exist (bool): If True, raise an exception if the path does not exist.
+        accepted_extensions (tuple[str, ...] | None): Accepted extensions for the path. An exception is raised if the
+            path does not have one of the accepted extensions. If None, no check is performed. Defaults to None.
 
     Returns:
         Path: Validated path.
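 
     Examples:
         A sketch of the new argument (paths are illustrative; the pre-existing checks on
         `base_dir`, existence, and permissions still apply before the extension check below):
 
         >>> validate_path("results/pimo.pt", should_exist=False, accepted_extensions=(".pt",))   # suffix accepted
         >>> validate_path("results/pimo.txt", should_exist=False, accepted_extensions=(".pt",))  # raises ValueError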
@@ -214,6 +220,11 @@ def validate_path(path: str | Path, base_dir: str | Path | None = None, should_e msg = f"Read or execute permissions denied for the path: {path}" raise PermissionError(msg) + # Check if the path has one of the accepted extensions + if accepted_extensions is not None and path.suffix not in accepted_extensions: + msg = f"Path extension is not accepted. Accepted extensions: {accepted_extensions}. Path: {path}" + raise ValueError(msg) + return path diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 15cba7d129..bbe1a6bd21 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -12,7 +12,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from pathlib import Path from typing import Any import numpy as np @@ -98,73 +97,6 @@ def is_rate_range(bounds: tuple[float, float]) -> None: raise ValueError(msg) -def is_file_path(file_path: str | Path, must_exist: bool, extension: str | None, pathlib_ok: bool) -> None: - """Validate the given path is a file (optionally) with the expected extension. - - Args: - file_path (str | Path): The file path to validate. - must_exist (bool): Flag indicating whether the file must exist. - extension (str | None): The expected file extension, eg. .png, .jpg, etc. If `None`, no validation is performed. - pathlib_ok (bool): Flag indicating whether `pathlib.Path` is allowed; if False, only `str` paths are allowed. - """ - if isinstance(file_path, str): - file_path = Path(file_path) - - elif not isinstance(file_path, Path): - msg = f"Expected file path to be a string or pathlib.Path, but got {type(file_path)}" - raise TypeError(msg) - - # if it's here, then it's a `pathlib.Path` - elif not pathlib_ok: - msg = f"Only `str` paths are allowed, but got {type(file_path)}" - raise TypeError(msg) - - if file_path.is_dir(): - msg = "Expected file path to be a file, but got a directory." - raise ValueError(msg) - - if must_exist and not file_path.exists(): - msg = f"File does not exist: {file_path}" - raise FileNotFoundError(msg) - - if extension is None: - return - - if file_path.suffix != extension: - msg = f"Expected file path to have extension '{extension}', but got '{file_path.suffix}'" - raise ValueError(msg) - - -def is_list_of_file_path( - file_paths: list[str | Path], - must_exist: bool, - extension: str | None, - pathlib_ok: bool, -) -> None: - """Validate the given paths are files (optionally) with the expected extension. - - Args: - file_paths (list[str | Path]): The file paths to validate. - must_exist (bool): Flag indicating whether the files must exist. - extension (str | None): The expected file extension, eg. .png, .jpg, etc. If `None`, no validation is performed. - pathlib_ok (bool): Flag indicating whether `pathlib.Path` is allowed; if False, only `str` paths are allowed. - """ - if not isinstance(file_paths, list): - msg = f"Expected paths to be a list, but got {type(file_paths)}." 
- raise TypeError(msg) - - for idx, path in enumerate(file_paths): - try: - msg = f"Invalid path at index {idx}: {path}" - is_file_path(path, must_exist=must_exist, extension=extension, pathlib_ok=pathlib_ok) - - except TypeError as ex: # noqa: PERF203 - raise TypeError(msg) from ex - - except ValueError as ex: - raise ValueError(msg) from ex - - def is_threshs(threshs: ndarray) -> None: """Validate that the thresholds are valid and monotonically increasing.""" if not isinstance(threshs, ndarray): diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index d550c748f6..d97e825f43 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -27,6 +27,7 @@ from torchmetrics import Metric from anomalib.data.utils.image import duplicate_filename +from anomalib.data.utils.path import validate_path from . import _validate, pimo_numpy, utils from .binclf_curve_numpy import BinclfAlgorithm @@ -105,16 +106,30 @@ def _validate_is_aupimos(aupimos: Tensor) -> None: def _validate_is_source_images_paths(paths: Sequence[str], expected_num_paths: int | None) -> None: - _validate.is_list_of_file_path( - paths, # type: ignore[arg-type] - # not necessary to exist because the metric can be computed - # directly from the anomaly maps and masks, without the images - must_exist=False, - # this will eventually be serialized to a file, so we don't want pathlib objects keep it simple - pathlib_ok=False, - # not enforcing the image type (e.g. png, jpg, etc.) - extension=None, - ) + if not isinstance(paths, list): + msg = f"Expected paths to be a list, but got {type(paths)}." + raise TypeError(msg) + + for idx, path in enumerate(paths): + try: + msg = f"Invalid path at index {idx}: {path}" + validate_path( + path, + # not necessary to exist because the metric can be computed + # directly from the anomaly maps and masks, without the images + should_exist=False, + ) + + except TypeError as ex: + raise TypeError(msg) from ex + + except ValueError as ex: + raise ValueError(msg) from ex + + if not isinstance(path, str): + # this will eventually be serialized to a file, so we don't want pathlib objects keep it simple + msg = f"Expected path to be a string, but got {type(path)}." + raise TypeError(msg) if expected_num_paths is None: return @@ -254,7 +269,7 @@ def save(self, file_path: str | Path) -> None: file_path: path to the `.pt` file where to save the PIMO result. If the file already exists, a numerical suffix is added to the filename. """ - _validate.is_file_path(file_path, must_exist=False, extension=".pt", pathlib_ok=True) + validate_path(file_path, should_exist=False, accepted_extensions=(".pt",)) file_path = duplicate_filename(file_path) payload = self.to_dict() torch.save(payload, file_path) @@ -266,7 +281,7 @@ def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: Args: file_path: path to the `.pt` file where to load the PIMO result. """ - _validate.is_file_path(file_path, must_exist=True, extension=".pt", pathlib_ok=True) + validate_path(file_path, accepted_extensions=(".pt",)) payload = torch.load(file_path) if not isinstance(payload, dict): msg = f"Invalid content in file {file_path}. Must be a dictionary." @@ -450,7 +465,7 @@ def save(self, file_path: str | Path) -> None: file_path: path to the `.json` file where to save the AUPIMO result. If the file already exists, a numerical suffix is added to the filename. 
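        Sketch of the round trip this validates (editorial illustration; assumes an `AUPIMOResult`
        instance `aupimoresult` computed beforehand, e.g. by `aupimo_scores()`):

            aupimoresult.save("aupimo.json")            # suffix checked against (".json",)
            loaded = AUPIMOResult.load("aupimo.json")   # load additionally requires the file to exist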
""" - _validate.is_file_path(file_path, must_exist=False, extension=".json", pathlib_ok=True) + validate_path(file_path, should_exist=False, accepted_extensions=(".json",)) file_path = duplicate_filename(file_path) file_path = Path(file_path) payload = self.to_dict() @@ -466,7 +481,7 @@ def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: Args: file_path: path to the `.json` file where to load the AUPIMO result. """ - _validate.is_file_path(file_path, must_exist=True, extension=".json", pathlib_ok=True) + validate_path(file_path, accepted_extensions=(".json",)) file_path = Path(file_path) with file_path.open("r") as f: payload = json.load(f) diff --git a/tests/unit/data/utils/test_path.py b/tests/unit/data/utils/test_path.py index 2230157079..9b3aaa0886 100644 --- a/tests/unit/data/utils/test_path.py +++ b/tests/unit/data/utils/test_path.py @@ -67,3 +67,8 @@ def test_no_read_execute_permission(self) -> None: Path(tmp_dir).chmod(0o222) # Remove read and execute permission with pytest.raises(PermissionError, match=r"Read or execute permissions denied for the path:*"): validate_path(tmp_dir, base_dir=Path(tmp_dir)) + + def test_file_wrongsuffix(self) -> None: + """Test ``validate_path`` raises ValueError for a file with wrong suffix.""" + with pytest.raises(ValueError, match="Path extension is not accepted."): + validate_path("file.png", should_exist=False, accepted_extensions=(".json", ".txt")) From 581b35b74b7343afe4027201db45e9e21876ecf8 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 14:05:59 +0200 Subject: [PATCH 42/57] fix tests Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 17 +++++++---------- src/anomalib/metrics/per_image/utils.py | 15 ++++++++------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index d97e825f43..08d6e88db0 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -18,9 +18,9 @@ import json import warnings +from collections.abc import Sequence from dataclasses import dataclass, field from pathlib import Path -from typing import TYPE_CHECKING import torch from torch import Tensor @@ -34,9 +34,6 @@ from .pimo_numpy import PIMOSharedFPRMetric from .utils import StatsOutliersPolicy, StatsRepeatedPolicy -if TYPE_CHECKING: - from collections.abc import Sequence - # =========================================== AUX =========================================== @@ -253,7 +250,7 @@ def to_dict(self) -> dict[str, Tensor | str]: return dic @classmethod - def from_dict(cls: type[PIMOResult], dic: dict[str, Tensor | str | list[str]]) -> PIMOResult: + def from_dict(cls: type["PIMOResult"], dic: dict[str, Tensor | str | list[str]]) -> "PIMOResult": """Return a result object from a dictionary.""" try: return cls(**dic) # type: ignore[arg-type] @@ -275,7 +272,7 @@ def save(self, file_path: str | Path) -> None: torch.save(payload, file_path) @classmethod - def load(cls: type[PIMOResult], file_path: str | Path) -> PIMOResult: + def load(cls: type["PIMOResult"], file_path: str | Path) -> "PIMOResult": """Load from a `.pt` file. 
Args: @@ -379,13 +376,13 @@ def __post_init__(self) -> None: @classmethod def from_pimoresult( - cls: type[AUPIMOResult], + cls: type["AUPIMOResult"], pimoresult: PIMOResult, fpr_bounds: tuple[float, float], num_threshs_auc: int, aupimos: Tensor, paths: list[str] | None = None, - ) -> AUPIMOResult: + ) -> "AUPIMOResult": """Return an AUPIMO result object from a PIMO result object. Args: @@ -449,7 +446,7 @@ def to_dict(self) -> dict[str, Tensor | str | float | int]: return dic @classmethod - def from_dict(cls: type[AUPIMOResult], dic: dict[str, Tensor | str | float | int | list[str]]) -> AUPIMOResult: + def from_dict(cls: type["AUPIMOResult"], dic: dict[str, Tensor | str | float | int | list[str]]) -> "AUPIMOResult": """Return a result object from a dictionary.""" try: return cls(**dic) # type: ignore[arg-type] @@ -475,7 +472,7 @@ def save(self, file_path: str | Path) -> None: json.dump(payload, f, indent=4) @classmethod - def load(cls: type[AUPIMOResult], file_path: str | Path) -> AUPIMOResult: + def load(cls: type["AUPIMOResult"], file_path: str | Path) -> "AUPIMOResult": """Load from a `.json` file. Args: diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 1790afccc8..cd7e1cbf01 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -11,6 +11,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging +import warnings from collections import OrderedDict from copy import deepcopy from typing import TYPE_CHECKING @@ -139,7 +140,7 @@ def _validate_is_scores_per_model_tensor(scores_per_model: dict[str, Tensor] | O def _validate_is_scores_per_model_aupimoresult( - scores_per_model: dict[str, AUPIMOResult] | OrderedDict[str, AUPIMOResult], + scores_per_model: dict[str, "AUPIMOResult"] | OrderedDict[str, "AUPIMOResult"], missing_paths_ok: bool, ) -> None: first_key_value = None @@ -187,8 +188,8 @@ def _validate_is_scores_per_model_aupimoresult( def _validate_is_scores_per_model( scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor] - | dict[str, AUPIMOResult] - | OrderedDict[str, AUPIMOResult], + | dict[str, "AUPIMOResult"] + | OrderedDict[str, "AUPIMOResult"], ) -> None: # it has to be imported here to avoid circular imports from .pimo import AUPIMOResult @@ -279,8 +280,8 @@ def per_image_scores_stats( def compare_models_pairwise_ttest_rel( scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor] - | dict[str, AUPIMOResult] - | OrderedDict[str, AUPIMOResult], + | dict[str, "AUPIMOResult"] + | OrderedDict[str, "AUPIMOResult"], alternative: str, higher_is_better: bool, ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: @@ -313,8 +314,8 @@ def compare_models_pairwise_ttest_rel( def compare_models_pairwise_wilcoxon( scores_per_model: dict[str, Tensor] | OrderedDict[str, Tensor] - | dict[str, AUPIMOResult] - | OrderedDict[str, AUPIMOResult], + | dict[str, "AUPIMOResult"] + | OrderedDict[str, "AUPIMOResult"], alternative: str, higher_is_better: bool, ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]: From 5837c0d835751f2ca129d90ecd4539d31ad1971f Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 14:17:59 +0200 Subject: [PATCH 43/57] Add todo Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/_validate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index 
bbe1a6bd21..fba8037e84 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -2,6 +2,9 @@ `torch` is imported in the functions that use it, so this module can be used in numpy-standalone mode. +TODO(jpcbertoldo): Move validations to a common place and reuse them across the codebase. +https://github.com/openvinotoolkit/anomalib/issues/2093 + author: jpcbertoldo """ From 68a30aa25256eaca9b50122adfb51c6fc6b3b38e Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 15:11:25 +0200 Subject: [PATCH 44/57] refactor enums Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- .../metrics/per_image/binclf_curve.py | 4 +- .../metrics/per_image/binclf_curve_numpy.py | 67 ++++++---------- src/anomalib/metrics/per_image/pimo.py | 27 +++---- src/anomalib/metrics/per_image/pimo_numpy.py | 32 +++----- src/anomalib/metrics/per_image/utils_numpy.py | 80 +++++++------------ 5 files changed, 79 insertions(+), 131 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index b0420cf02a..55d8287515 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -48,8 +48,8 @@ def _validate_is_binclf_curves(binclf_curves: Tensor, valid_threshs: Tensor | No def per_image_binclf_curve( anomaly_maps: Tensor, masks: Tensor, - algorithm: str = BinclfAlgorithm.NUMBA, - threshs_choice: str = BinclfThreshsChoice.MINMAX_LINSPACE, + algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + threshs_choice: BinclfThreshsChoice | str = BinclfThreshsChoice.MINMAX_LINSPACE, threshs_given: Tensor | None = None, num_threshs: int | None = None, ) -> tuple[Tensor, Tensor]: diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 55fcbd398c..bb37808fda 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -16,9 +16,8 @@ import itertools import logging -from dataclasses import dataclass +from enum import Enum from functools import partial -from typing import ClassVar import numpy as np from numpy import ndarray @@ -38,30 +37,19 @@ # =========================================== CONSTANTS =========================================== -@dataclass -class BinclfAlgorithm: - """Algorithm to use.""" +class BinclfAlgorithm(Enum): + """Algorithm to use (relates to the low-level implementation).""" - PYTHON: ClassVar[str] = "python" - NUMBA: ClassVar[str] = "numba" - ALGORITHMS: ClassVar[tuple[str, ...]] = (PYTHON, NUMBA) + PYTHON: str = "python" + NUMBA: str = "numba" - @staticmethod - def validate(algorithm: str) -> None: - """Validate `algorithm` argument.""" - if algorithm not in BinclfAlgorithm.ALGORITHMS: - msg = f"Expected `algorithm` to be one of {BinclfAlgorithm.ALGORITHMS}, but got {algorithm}" - raise ValueError(msg) - -@dataclass -class BinclfThreshsChoice: +class BinclfThreshsChoice(Enum): """Sequence of thresholds to use.""" - GIVEN: ClassVar[str] = "given" - MINMAX_LINSPACE: ClassVar[str] = "minmax-linspace" - MEAN_FPR_OPTIMIZED: ClassVar[str] = "mean-fpr-optimized" - CHOICES: ClassVar[tuple[str, ...]] = (GIVEN, MINMAX_LINSPACE, MEAN_FPR_OPTIMIZED) + GIVEN: str = "given" + MINMAX_LINSPACE: str = "minmax-linspace" + MEAN_FPR_OPTIMIZED: str = "mean-fpr-optimized" # =========================================== ARGS VALIDATION 
=========================================== @@ -191,7 +179,7 @@ def binclf_multiple_curves( scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray, - algorithm: str = BinclfAlgorithm.NUMBA, + algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, ) -> ndarray: """Multiple binary classification matrix (per-instance scope) at each threshold (shared). @@ -231,25 +219,22 @@ def binclf_multiple_curves( Thresholds are sorted in ascending order. """ - BinclfAlgorithm.validate(algorithm) + BinclfAlgorithm(algorithm) _validate_is_scores_batch(scores_batch) _validate_is_gts_batch(gts_batch) _validate.is_same_shape(scores_batch, gts_batch) _validate.is_threshs(threshs) - if algorithm == BinclfAlgorithm.PYTHON: - return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) + if BinclfAlgorithm(algorithm) == BinclfAlgorithm.NUMBA: + if HAS_NUMBA: + return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) - if algorithm == BinclfAlgorithm.NUMBA: - if not HAS_NUMBA: - logger.warning( - "Algorithm 'numba' was selected, but numba is not installed. Fallback to 'python' algorithm.", - ) - return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) - return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) + logger.warning( + f"Algorithm '{BinclfAlgorithm.NUMBA.value}' was selected, but numba is not installed. " + f"Falling back to '{BinclfAlgorithm.PYTHON.value}' implementation.", + ) - msg = f"Expected `algorithm` to be one of {BinclfAlgorithm.ALGORITHMS}, but got {algorithm}" - raise NotImplementedError(msg) + return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) # ========================================= PER-IMAGE BINCLF CURVE ========================================= @@ -271,8 +256,8 @@ def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> nda def per_image_binclf_curve( anomaly_maps: ndarray, masks: ndarray, - algorithm: str = BinclfAlgorithm.NUMBA, - threshs_choice: str = BinclfThreshsChoice.MINMAX_LINSPACE, + algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + threshs_choice: BinclfThreshsChoice | str = BinclfThreshsChoice.MINMAX_LINSPACE, threshs_given: ndarray | None = None, num_threshs: int | None = None, ) -> tuple[ndarray, ndarray]: @@ -321,14 +306,14 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. 
""" - BinclfAlgorithm.validate(algorithm) + BinclfAlgorithm(algorithm) _validate.is_anomaly_maps(anomaly_maps) _validate.is_masks(masks) _validate.is_same_shape(anomaly_maps, masks) threshs: ndarray - if threshs_choice == BinclfThreshsChoice.GIVEN: + if BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.GIVEN: assert threshs_given is not None _validate.is_threshs(threshs_given) if num_threshs is not None: @@ -337,7 +322,7 @@ def per_image_binclf_curve( ) threshs = threshs_given.astype(anomaly_maps.dtype) - elif threshs_choice == BinclfThreshsChoice.MINMAX_LINSPACE: + elif BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.MINMAX_LINSPACE: assert num_threshs is not None if threshs_given is not None: logger.warning( @@ -346,11 +331,11 @@ def per_image_binclf_curve( # `num_threshs` is validated in the function below threshs = _get_threshs_minmax_linspace(anomaly_maps, num_threshs) - elif threshs_choice == BinclfThreshsChoice.MEAN_FPR_OPTIMIZED: + elif BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.MEAN_FPR_OPTIMIZED: raise NotImplementedError(f"TODO implement {threshs_choice}") # noqa: EM102 else: - msg = f"Expected `threshs_choice` to be one of {BinclfThreshsChoice.CHOICES}, but got {threshs_choice}" + msg = f"Expected `threshs_choice` to be from {list(BinclfThreshsChoice.__members__)}, but got {threshs_choice}" raise NotImplementedError(msg) # keep the batch dimension and flatten the rest diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 08d6e88db0..3cd5315091 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -195,6 +195,7 @@ def __post_init__(self) -> None: _validate_is_threshs(self.threshs) _validate_is_shared_fpr(self.shared_fpr, nan_allowed=False) _validate_is_per_image_tprs(self.per_image_tprs, self.image_classes) + self.shared_fpr_metric = PIMOSharedFPRMetric(self.shared_fpr_metric).value if self.paths is not None: _validate_is_source_images_paths(self.paths, expected_num_paths=self.per_image_tprs.shape[0]) @@ -361,6 +362,7 @@ def thresh_bounds(self) -> tuple[float, float]: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: + self.shared_fpr_metric = PIMOSharedFPRMetric(self.shared_fpr_metric).value _validate.is_rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) # TODO(jpcbertoldo): warn when it's too low (use parameters from the numpy code) # noqa: TD003 _validate.is_num_threshs_gte2(self.num_threshs) @@ -532,8 +534,8 @@ def pimo_curves( anomaly_maps: Tensor, masks: Tensor, num_threshs: int, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, paths: list[str] | None = None, ) -> PIMOResult: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. 
@@ -605,8 +607,8 @@ def aupimo_scores( anomaly_maps: Tensor, masks: Tensor, num_threshs: int = 300_000, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, paths: list[str] | None = None, @@ -745,8 +747,8 @@ def image_classes(self) -> Tensor: def __init__( self, num_threshs: int, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, ) -> None: """Per-Image Overlap (PIMO) curve. @@ -770,12 +772,9 @@ def __init__( _validate.is_num_threshs_gte2(num_threshs) self.num_threshs = num_threshs - # validate binclf_algorithm and shared_fpr_metric - BinclfAlgorithm.validate(binclf_algorithm) - self.binclf_algorithm = binclf_algorithm - - PIMOSharedFPRMetric.validate(shared_fpr_metric) - self.shared_fpr_metric = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR + # validate binclf_algorithm and get string + self.binclf_algorithm = BinclfAlgorithm(binclf_algorithm).value + self.shared_fpr_metric = PIMOSharedFPRMetric(shared_fpr_metric).value self.add_state("anomaly_maps", default=[], dist_reduce_fx="cat") self.add_state("masks", default=[], dist_reduce_fx="cat") @@ -888,8 +887,8 @@ def __repr__(self) -> str: def __init__( self, num_threshs: int = 300_000, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> None: diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 89c1943f24..997b5f4c11 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -34,8 +34,7 @@ import logging import warnings -from dataclasses import dataclass -from typing import ClassVar +from enum import Enum import numpy as np from numpy import ndarray @@ -48,20 +47,10 @@ # =========================================== CONSTANTS =========================================== -@dataclass -class PIMOSharedFPRMetric: +class PIMOSharedFPRMetric(Enum): """Shared FPR metric (x-axis of the PIMO curve).""" - MEAN_PERIMAGE_FPR: ClassVar[str] = "mean-per-image-fpr" - - METRICS: ClassVar[tuple[str, ...]] = (MEAN_PERIMAGE_FPR,) - - @staticmethod - def validate(metric: str) -> None: - """Validate the argument `metric`.""" - if metric not in PIMOSharedFPRMetric.METRICS: - msg = f"Invalid `metric`. Expected one of {PIMOSharedFPRMetric.METRICS}, but got {metric} instead." 
- raise ValueError(msg) + MEAN_PERIMAGE_FPR: str = "mean-per-image-fpr" # =========================================== AUX =========================================== @@ -106,8 +95,8 @@ def pimo_curves( anomaly_maps: ndarray, masks: ndarray, num_threshs: int, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, ) -> tuple[ndarray, ndarray, ndarray, ndarray]: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -137,8 +126,9 @@ def pimo_curves( [2] per-image TPR curves of shape (N, K), axis 1 in descending order (indices correspond to the thresholds) [3] image classes of shape (N,) with values 0 (normal) or 1 (anomalous) """ - BinclfAlgorithm.validate(binclf_algorithm) - PIMOSharedFPRMetric.validate(shared_fpr_metric) + # validate the strings are valid + BinclfAlgorithm(binclf_algorithm) + PIMOSharedFPRMetric(shared_fpr_metric) _validate.is_num_threshs_gte2(num_threshs) _validate.is_anomaly_maps(anomaly_maps) _validate.is_masks(masks) @@ -169,7 +159,7 @@ def pimo_curves( ) shared_fpr: ndarray - if shared_fpr_metric == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR: + if PIMOSharedFPRMetric(shared_fpr_metric) == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR: # shape -> (N, K) per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) try: @@ -199,8 +189,8 @@ def aupimo_scores( anomaly_maps: ndarray, masks: ndarray, num_threshs: int = 300_000, - binclf_algorithm: str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, int]: diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index ebadacb045..08f70e6683 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -12,7 +12,7 @@ import itertools from collections import OrderedDict -from typing import ClassVar +from enum import Enum import matplotlib as mpl import numpy as np @@ -25,7 +25,7 @@ # =========================================== CONSTANTS =========================================== -class StatsOutliersPolicy: +class StatsOutliersPolicy(Enum): """How to handle outliers in per-image metrics boxplots. Use them? Only high? Only low? Both? Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away @@ -37,22 +37,13 @@ class StatsOutliersPolicy: "both": include both high and low outliers. """ - NONE: ClassVar[str] = "none" - HI: ClassVar[str] = "hi" - LO: ClassVar[str] = "lo" - BOTH: ClassVar[str] = "both" + NONE: str = "none" + HI: str = "hi" + LO: str = "lo" + BOTH: str = "both" - POLICIES: ClassVar[tuple[str | None, ...]] = (None, NONE, HI, LO, BOTH) - @staticmethod - def validate(policy: str | None) -> None: - """Validate the argument `policy`.""" - if policy not in StatsOutliersPolicy.POLICIES: - msg = f"Invalid `policy`. Expected one of {StatsOutliersPolicy.POLICIES}, but got {policy}." 
- raise ValueError(msg) - - -class StatsRepeatedPolicy: +class StatsRepeatedPolicy(Enum): """How to handle repeated values in per-image metrics boxplots (two stats with same value). Avoid them? None | "none": do not avoid repeated values, so several stats can have the same value and image index. @@ -60,37 +51,16 @@ class StatsRepeatedPolicy: with the nearest score, is selected. """ - NONE: ClassVar[str] = "none" - AVOID: ClassVar[str] = "avoid" - - POLICIES: ClassVar[tuple[str | None, ...]] = (None, NONE, AVOID) - - @staticmethod - def validate(policy: str | None) -> None: - """Validate the argument `policy`.""" - if policy not in StatsRepeatedPolicy.POLICIES: - msg = f"Invalid `policy`. Expected one of {StatsRepeatedPolicy.POLICIES}, but got {policy}." - raise ValueError(msg) + NONE: str = "none" + AVOID: str = "avoid" -class StatsAlternativeHypothesis: +class StatsAlternativeHypothesis(Enum): """Alternative hypothesis for the statistical tests used to compare per-image metrics.""" - TWO_SIDED: ClassVar[str] = "two-sided" - LESS: ClassVar[str] = "less" - GREATER: ClassVar[str] = "greater" - - ALTERNATIVES: ClassVar[tuple[str, ...]] = (TWO_SIDED, LESS, GREATER) - - @staticmethod - def validate(alternative: str) -> None: - """Validate the argument `alternative`.""" - if alternative not in StatsAlternativeHypothesis.ALTERNATIVES: - msg = ( - "Invalid `alternative`. " - f"Expected one of {StatsAlternativeHypothesis.ALTERNATIVES}, but got {alternative}." - ) - raise ValueError(msg) + TWO_SIDED: str = "two-sided" + LESS: str = "less" + GREATER: str = "greater" # =========================================== ARGS VALIDATION =========================================== @@ -174,8 +144,8 @@ def per_image_scores_stats( per_image_scores: ndarray, images_classes: ndarray | None = None, only_class: int | None = None, - outliers_policy: str | None = StatsOutliersPolicy.NONE, - repeated_policy: str | None = StatsRepeatedPolicy.AVOID, + outliers_policy: StatsOutliersPolicy | str | None = StatsOutliersPolicy.NONE, + repeated_policy: StatsRepeatedPolicy | str | None = StatsRepeatedPolicy.AVOID, repeated_replacement_atol: float = 1e-2, ) -> list[dict[str, str | int | float]]: """Compute statistics of per-image scores (based on a boxplot's statistics). @@ -240,8 +210,8 @@ def per_image_scores_stats( The list is sorted by increasing `stat_value`. 
""" - StatsOutliersPolicy.validate(outliers_policy) - StatsRepeatedPolicy.validate(repeated_policy) + outliers_policy = StatsOutliersPolicy(outliers_policy) + repeated_policy = StatsRepeatedPolicy(repeated_policy) _validate_is_per_image_scores(per_image_scores) # restrain the images to the class `only_class` if given, else use all images @@ -277,13 +247,13 @@ def per_image_scores_stats( outliers_lo = outliers[outliers < boxplot_stats["med"]] outliers_hi = outliers[outliers > boxplot_stats["med"]] - if outliers_policy in (StatsOutliersPolicy.HI, StatsOutliersPolicy.BOTH): + if StatsOutliersPolicy(outliers_policy) in (StatsOutliersPolicy.HI, StatsOutliersPolicy.BOTH): boxplot_stats = { **boxplot_stats, **{f"outhi_{idx:06}": value for idx, value in enumerate(outliers_hi)}, } - if outliers_policy in (StatsOutliersPolicy.LO, StatsOutliersPolicy.BOTH): + if StatsOutliersPolicy(outliers_policy) in (StatsOutliersPolicy.LO, StatsOutliersPolicy.BOTH): boxplot_stats = { **boxplot_stats, **{f"outlo_{idx:06}": value for idx, value in enumerate(outliers_lo)}, @@ -299,10 +269,14 @@ def append_record(stat_name: str, stat_value: float) -> None: image_idx = candidate2image_idx[candidate_idx] # handle repeated values - if image_idx not in images_idxs_selected or repeated_policy is None: + if ( + image_idx not in images_idxs_selected + or repeated_policy is None + or StatsRepeatedPolicy(repeated_policy) == StatsRepeatedPolicy.NONE + ): pass - elif repeated_policy == StatsRepeatedPolicy.AVOID: + elif StatsRepeatedPolicy(repeated_policy) == StatsRepeatedPolicy.AVOID: for other_candidate_idx in candidates_sorted: other_candidate_image_idx = candidate2image_idx[other_candidate_idx] if other_candidate_image_idx in images_idxs_selected: @@ -384,7 +358,7 @@ def compare_models_pairwise_ttest_rel( in termos of average score. """ _validate_is_scores_per_model(scores_per_model) - StatsAlternativeHypothesis.validate(alternative) + StatsAlternativeHypothesis(alternative) # remove nan values; list of items keeps the order of the OrderedDict scores_per_model_nonan_items = [ @@ -469,7 +443,7 @@ def compare_models_pairwise_wilcoxon( in terms of average ranks (not scores!). 
""" _validate_is_scores_per_model(scores_per_model) - StatsAlternativeHypothesis.validate(alternative) + StatsAlternativeHypothesis(alternative) # remove nan values; list of items keeps the order of the OrderedDict scores_per_model_nonan_items = [ From d4071ad86c0e7d1ad606b4f0e188db83c448c76e Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 15:19:06 +0200 Subject: [PATCH 45/57] only logger.warning Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 8 +++---- src/anomalib/metrics/per_image/pimo_numpy.py | 22 +++++++------------- src/anomalib/metrics/per_image/utils.py | 7 +++---- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 3cd5315091..d7d5885d14 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -17,7 +17,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -import warnings +import logging from collections.abc import Sequence from dataclasses import dataclass, field from pathlib import Path @@ -34,6 +34,8 @@ from .pimo_numpy import PIMOSharedFPRMetric from .utils import StatsOutliersPolicy, StatsRepeatedPolicy +logger = logging.getLogger(__name__) + # =========================================== AUX =========================================== @@ -759,11 +761,9 @@ def __init__( """ super().__init__() - warnings.warn( + logger.warning( f"Metric `{self.__class__.__name__}` will save all targets and predictions in buffer." " For large datasets this may lead to large memory footprint.", - UserWarning, - stacklevel=1, ) # the options below are, redundantly, validated here to avoid reaching diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 997b5f4c11..81094f3dcc 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -33,7 +33,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import warnings from enum import Enum import numpy as np @@ -260,18 +259,16 @@ def aupimo_scores( ) if not np.isclose(fpr_lower_bound_defacto, fpr_lower_bound, rtol=(rtol := 1e-2)): - msg = ( + logger.warning( "The lower bound of the shared FPR integration range is not exactly achieved. " - f"Expected {fpr_lower_bound} but got {fpr_lower_bound_defacto}, which is not within {rtol=}." + f"Expected {fpr_lower_bound} but got {fpr_lower_bound_defacto}, which is not within {rtol=}.", ) - warnings.warn(msg, RuntimeWarning, stacklevel=1) if not np.isclose(fpr_upper_bound_defacto, fpr_upper_bound, rtol=rtol): - msg = ( + logger.warning = ( "The upper bound of the shared FPR integration range is not exactly achieved. " f"Expected {fpr_upper_bound} but got {fpr_upper_bound_defacto}, which is not within {rtol=}." ) - warnings.warn(msg, RuntimeWarning, stacklevel=1) # reminder: fpr lower/upper bound is threshold upper/lower bound (reversed) thresh_lower_bound_idx = fpr_upper_bound_thresh_idx @@ -308,12 +305,10 @@ def aupimo_scores( raise RuntimeError(msg) if invalid_shared_fpr.any(): - msg = ( + logger.warning( "Some values in the shared fpr integration range are nan. " - "The AUPIMO will be computed without these values." 
+ "The AUPIMO will be computed without these values.", ) - warnings.warn(msg, RuntimeWarning, stacklevel=1) - logger.warning(msg) # get rid of nan values by removing them from the integration range shared_fpr_bounded_log = shared_fpr_bounded_log[~invalid_shared_fpr] @@ -330,17 +325,14 @@ def aupimo_scores( if not force: raise RuntimeError(msg) msg += " Computation was forced!" - warnings.warn(msg, RuntimeWarning, stacklevel=1) logger.warning(msg) if num_points_integral < 300: - msg = ( + logger.warning( "The AUPIMO may be inaccurate because the shared fpr integration range doesnt have enough points. " f"Found {num_points_integral} points in the integration range. " - "Try increasing `num_threshs`." + "Try increasing `num_threshs`.", ) - warnings.warn(msg, RuntimeWarning, stacklevel=1) - logger.warning(msg) aucs: ndarray = np.trapz(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1) diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index cd7e1cbf01..dba232de6c 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -11,7 +11,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import warnings from collections import OrderedDict from copy import deepcopy from typing import TYPE_CHECKING @@ -180,9 +179,9 @@ def _validate_is_scores_per_model_aupimoresult( if len(available_paths) != len(scores_per_model): msg = "Some models have paths, while others are missing them." - if missing_paths_ok: - warnings.warn(msg, UserWarning, stacklevel=3) - logger.warning(msg) + if not missing_paths_ok: + raise ValueError(msg) + logger.warning(msg) def _validate_is_scores_per_model( From 2f65040d5893ba51f670b77dd654fba00f204754 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 15:35:52 +0200 Subject: [PATCH 46/57] refactor test imports Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/__init__.py | 8 +++++ .../metrics/per_image/binclf_curve_numpy.py | 9 ++---- .../metrics/per_image/test_binclf_curve.py | 31 ++++++------------- tests/unit/metrics/per_image/test_pimo.py | 21 +++---------- tests/unit/metrics/per_image/test_utils.py | 30 ++++++------------ 5 files changed, 34 insertions(+), 65 deletions(-) diff --git a/src/anomalib/__init__.py b/src/anomalib/__init__.py index 711eb023e9..ccefe58ffc 100644 --- a/src/anomalib/__init__.py +++ b/src/anomalib/__init__.py @@ -8,6 +8,14 @@ __version__ = "1.1.0dev" +try: + import numba # noqa: F401 +except ImportError: + HAS_NUMBA = False +else: + HAS_NUMBA = True + + class LearningType(str, Enum): """Learning type defining how the model learns from the dataset samples.""" diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index bb37808fda..a7de6331a8 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -22,12 +22,9 @@ import numpy as np from numpy import ndarray -try: - import numba # noqa: F401 -except ImportError: - HAS_NUMBA = False -else: - HAS_NUMBA = True +from anomalib import HAS_NUMBA + +if HAS_NUMBA: from . import _binclf_curve_numba from . 
import _validate diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 589bd9c27d..3a78d82237 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -7,9 +7,14 @@ import numpy as np import pytest import torch +from anomalib import HAS_NUMBA +from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy from numpy import ndarray from torch import Tensor +if HAS_NUMBA: + from anomalib.metrics.per_image import _binclf_curve_numba + def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: """Generate test cases.""" @@ -336,8 +341,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: def test__binclf_one_curve_python(pred: ndarray, gt: ndarray, threshs: ndarray, expected: ndarray) -> None: """Test if `_binclf_one_curve_python()` returns the expected values.""" - from anomalib.metrics.per_image import binclf_curve_numpy - computed = binclf_curve_numpy._binclf_one_curve_python(pred, gt, threshs) assert computed.shape == (threshs.size, 2, 2) assert (computed == expected).all() @@ -350,8 +353,6 @@ def test__binclf_multiple_curves_python( expecteds: ndarray, ) -> None: """Test if `_binclf_multiple_curves_python()` returns the expected values.""" - from anomalib.metrics.per_image import binclf_curve_numpy - computed = binclf_curve_numpy._binclf_multiple_curves_python(preds, gts, threshs) assert computed.shape == (preds.shape[0], threshs.size, 2, 2) assert (computed == expecteds).all() @@ -363,8 +364,8 @@ def test__binclf_multiple_curves_python( def test__binclf_one_curve_numba(pred: ndarray, gt: ndarray, threshs: ndarray, expected: ndarray) -> None: """Test if `_binclf_one_curve_numba()` returns the expected values.""" - from anomalib.metrics.per_image import _binclf_curve_numba - + if not HAS_NUMBA: + pytest.skip("Numba is not available.") computed = _binclf_curve_numba.binclf_one_curve_numba(pred, gt, threshs) assert computed.shape == (threshs.size, 2, 2) assert (computed == expected).all() @@ -372,8 +373,8 @@ def test__binclf_one_curve_numba(pred: ndarray, gt: ndarray, threshs: ndarray, e def test__binclf_multiple_curves_numba(preds: ndarray, gts: ndarray, threshs: ndarray, expecteds: ndarray) -> None: """Test if `_binclf_multiple_curves_python()` returns the expected values.""" - from anomalib.metrics.per_image import _binclf_curve_numba - + if not HAS_NUMBA: + pytest.skip("Numba is not available.") computed = _binclf_curve_numba.binclf_multiple_curves_numba(preds, gts, threshs) assert computed.shape == (preds.shape[0], threshs.size, 2, 2) assert (computed == expecteds).all() @@ -391,8 +392,6 @@ def test_binclf_multiple_curves( algorithm: str, ) -> None: """Test if `binclf_multiple_curves()` returns the expected values.""" - from anomalib.metrics.per_image import binclf_curve_numpy - computed = binclf_curve_numpy.binclf_multiple_curves( preds, gts, @@ -421,8 +420,6 @@ def test_binclf_multiple_curves( def test_binclf_multiple_curves_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `_binclf_multiple_curves_python()` raises the expected errors.""" - from anomalib.metrics.per_image import binclf_curve_numpy - with pytest.raises(exception): binclf_curve_numpy.binclf_multiple_curves(*args, **kwargs) @@ -438,8 +435,6 @@ def test_per_image_binclf_curve_numpy( expected_binclf_curves: ndarray, ) -> None: """Test if `per_image_binclf_curve()` returns the expected values.""" - from anomalib.metrics.per_image import 
binclf_curve_numpy - computed_threshs, computed_binclf_curves = binclf_curve_numpy.per_image_binclf_curve( anomaly_maps, masks, @@ -462,8 +457,6 @@ def test_per_image_binclf_curve_numpy( def test_per_image_binclf_curve_numpy_validations(args: list, kwargs: dict, exception: Exception) -> None: """Test if `per_image_binclf_curve()` raises the expected errors.""" - from anomalib.metrics.per_image import binclf_curve_numpy - with pytest.raises(exception): binclf_curve_numpy.per_image_binclf_curve(*args, **kwargs) @@ -475,8 +468,6 @@ def test_per_image_binclf_curve_numpy_validations_alt(args: list, kwargs: dict, def test_rate_metrics_numpy(binclf_curves: ndarray, expected_fprs: ndarray, expected_tprs: ndarray) -> None: """Test if rate metrics are computed correctly.""" - from anomalib.metrics.per_image import binclf_curve_numpy - tprs = binclf_curve_numpy.per_image_tpr(binclf_curves) fprs = binclf_curve_numpy.per_image_fpr(binclf_curves) @@ -502,8 +493,6 @@ def test_per_image_binclf_curve_torch( expected_binclf_curves: Tensor, ) -> None: """Test if `per_image_binclf_curve()` returns the expected values.""" - from anomalib.metrics.per_image import binclf_curve - computed_threshs, computed_binclf_curves = binclf_curve.per_image_binclf_curve( anomaly_maps, masks, @@ -526,8 +515,6 @@ def test_per_image_binclf_curve_torch( def test_rate_metrics_torch(binclf_curves: Tensor, expected_fprs: Tensor, expected_tprs: Tensor) -> None: """Test if rate metrics are computed correctly.""" - from anomalib.metrics.per_image import binclf_curve - tprs = binclf_curve.per_image_tpr(binclf_curves) fprs = binclf_curve.per_image_fpr(binclf_curves) diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index d3617fb24b..d058951534 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -9,9 +9,13 @@ import numpy as np import pytest import torch +from anomalib.metrics.per_image import pimo, pimo_numpy +from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult from numpy import ndarray from torch import Tensor +from .test_utils import assert_statsdict_stuff + def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: """Generate tests for all functions in this module. 
@@ -239,8 +243,6 @@ def test_pimo_numpy( expected_image_classes: ndarray, ) -> None: """Test if `pimo()` returns the expected values.""" - from anomalib.metrics.per_image import pimo_numpy - threshs, shared_fpr, per_image_tprs, image_classes = pimo_numpy.pimo_curves( anomaly_maps, masks, @@ -269,8 +271,6 @@ def test_pimo( expected_image_classes: Tensor, ) -> None: """Test if `pimo()` returns the expected values.""" - from anomalib.metrics.per_image import pimo - from anomalib.metrics.per_image.pimo import PIMOResult def do_assertions(pimoresult: PIMOResult) -> None: assert pimoresult.shared_fpr_metric == "mean-per-image-fpr" @@ -356,8 +356,6 @@ def test_aupimo_values_numpy( expected_aupimos: ndarray, ) -> None: """Test if `aupimo()` returns the expected values.""" - from anomalib.metrics.per_image import pimo_numpy - threshs, shared_fpr, per_image_tprs, image_classes, aupimos, _ = pimo_numpy.aupimo_scores( anomaly_maps, masks, @@ -392,8 +390,6 @@ def test_aupimo_values( expected_aupimos: ndarray, ) -> None: """Test if `aupimo()` returns the expected values.""" - from anomalib.metrics.per_image import pimo - from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None: # test metadata @@ -460,8 +456,6 @@ def test_aupimo_edge( fpr_bounds: tuple[float, float], ) -> None: """Test some edge cases.""" - from anomalib.metrics.per_image import pimo_numpy - # None is the case of testing the default bounds fpr_bounds = {"fpr_bounds": fpr_bounds, "shared_fpr_metric": "mean-per-image-fpr"} if fpr_bounds is not None else {} @@ -505,9 +499,6 @@ def test_pimoresult_object( paths: list[str] | None, ) -> None: """Test if `PIMOResult` can be converted to other formats and back.""" - from anomalib.metrics.per_image import pimo - from anomalib.metrics.per_image.pimo import PIMOResult - optional_kwargs = {} if paths is not None: optional_kwargs["paths"] = paths @@ -556,9 +547,6 @@ def test_aupimoresult_object( paths: list[str] | None, ) -> None: """Test if `AUPIMOResult` can be converted to other formats and back.""" - from anomalib.metrics.per_image import pimo - from anomalib.metrics.per_image.pimo import AUPIMOResult - optional_kwargs = {} if paths is not None: optional_kwargs["paths"] = paths @@ -609,7 +597,6 @@ def test_aupimoresult_object( # statistics stats = aupimoresult.stats() assert len(stats) == 6 - from .test_utils import assert_statsdict_stuff for statdic in stats: assert_statsdict_stuff(statdic, 2) diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index 3f531ed288..5dd1b0a8f6 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -8,10 +8,18 @@ import numpy as np import pytest import torch +from anomalib.metrics.per_image import ( + AUPIMOResult, + PIMOSharedFPRMetric, + StatsOutliersPolicy, + StatsRepeatedPolicy, + compare_models_pairwise_ttest_rel, + compare_models_pairwise_wilcoxon, + format_pairwise_tests_results, + per_image_scores_stats, +) from torch import Tensor -from anomalib.metrics.per_image import AUPIMOResult, PIMOSharedFPRMetric - def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: """Generate test cases.""" @@ -88,12 +96,6 @@ def assert_statsdict_stuff(statdic: dict, max_image_idx: int) -> None: def test_per_image_scores_stats() -> None: """Test `per_image_scores_boxplot_stats`.""" - from anomalib.metrics.per_image import ( - StatsOutliersPolicy, - StatsRepeatedPolicy, - 
per_image_scores_stats, - ) - gen = torch.Generator().manual_seed(42) num_scores = 201 scores = torch.randn(num_scores, generator=gen) @@ -137,8 +139,6 @@ def test_per_image_scores_stats() -> None: def test_per_image_scores_stats_specific_values() -> None: """Test `per_image_scores_boxplot_stats` with specific values.""" - from anomalib.metrics.per_image import per_image_scores_stats - scores = torch.concatenate( [ # whislo = min value is 0.0 @@ -189,8 +189,6 @@ def test_per_image_scores_stats_specific_values() -> None: def test_compare_models_pairwise_ttest(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: """Test `compare_models_pairwise_ttest`.""" - from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_ttest_rel - models_ordered, confidences = compare_models_pairwise_ttest_rel( scores_per_model, alternative=alternative, @@ -233,8 +231,6 @@ def copy_and_add_nan(scores: Tensor) -> Tensor: def test_compare_models_pairwise_wilcoxon(scores_per_model: dict, alternative: str, higher_is_better: bool) -> None: """Test `compare_models_pairwise_wilcoxon`.""" - from anomalib.metrics.per_image import AUPIMOResult, compare_models_pairwise_wilcoxon - models_ordered, confidences = compare_models_pairwise_wilcoxon( scores_per_model, alternative=alternative, @@ -278,12 +274,6 @@ def copy_and_add_nan(scores: Tensor) -> Tensor: def test_format_pairwise_tests_results(scores_per_model: dict) -> None: """Test `format_pairwise_tests_results`.""" - from anomalib.metrics.per_image import ( - compare_models_pairwise_ttest_rel, - compare_models_pairwise_wilcoxon, - format_pairwise_tests_results, - ) - models_ordered, confidences = compare_models_pairwise_wilcoxon( scores_per_model, alternative="greater", From 0d0863fc6554d6081e6f826b7daba71fa2f44ed0 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Tue, 28 May 2024 15:50:24 +0200 Subject: [PATCH 47/57] refactor docs Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/pimo.py | 89 +-------- src/anomalib/metrics/per_image/utils.py | 176 +++++++++++++++--- src/anomalib/metrics/per_image/utils_numpy.py | 64 +++---- 3 files changed, 186 insertions(+), 143 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index d7d5885d14..6a6f4b452a 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -505,14 +505,10 @@ def stats( ) -> list[dict[str, str | int | float]]: """Return the AUPIMO statistics. - See `anomalib.utils.metrics.per_image.per_image_scores_stats` for details (its docstring below). + See `anomalib.utils.metrics.per_image.per_image_scores_stats` for details. Returns: list[dict[str, str | int | float]]: AUPIMO statistics - - `anomalib.utils.metrics.per_image.per_image_scores_stats`.__doc__ - ================================================================== - {docstring_per_image_scores_stats} """ return utils.per_image_scores_stats( self.aupimos, @@ -524,11 +520,6 @@ def stats( ) -AUPIMOResult.__doc__ = AUPIMOResult.__doc__.format( # type: ignore[union-attr] - docstring_per_image_scores_stats=utils.per_image_scores_stats.__doc__, -) - - # =========================================== FUNCTIONAL =========================================== @@ -546,18 +537,10 @@ def pimo_curves( The tensors are converted to numpy arrays and then passed and validated in the numpy code. 
The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.pimo_curves()` and `PIMOResult` (their docstrings below). + Refer to `pimo_numpy.pimo_curves()` and `PIMOResult`. Args (extra): paths: paths to the source images to which the PIMO curves correspond. - - pimo_numpy.pimo_curves.__doc__ - ============================== - {docstring_pimo_curves} - - PIMOResult.__doc__ - ================== - {docstring_pimoresult} """ _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() @@ -598,13 +581,6 @@ def pimo_curves( ) -# append the docstring -pimo_curves.__doc__ = pimo_curves.__doc__.format( # type: ignore[union-attr] - docstring_pimo_curves=pimo_numpy.pimo_curves.__doc__, - docstring_pimoresult=PIMOResult.__doc__, -) - - def aupimo_scores( anomaly_maps: Tensor, masks: Tensor, @@ -621,22 +597,10 @@ def aupimo_scores( The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult` (their docstrings below). + Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult`. Args (extra): paths: paths to the source images to which the AUPIMO scores correspond. - - pimo_numpy.aupimo_scores.__doc__ - ================================= - {docstring_aupimo_scores} - - PIMOResult.__doc__ - ================== - {docstring_pimoresult} - - AUPIMOResult.__doc__ - ==================== - {docstring_aupimoresult} """ _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() @@ -691,14 +655,6 @@ def aupimo_scores( return pimoresult, aupimoresult -# append the docstrings -aupimo_scores.__doc__ = aupimo_scores.__doc__.format( # type: ignore[union-attr] - docstring_aupimo_scores=pimo_numpy.aupimo_scores.__doc__, - docstring_pimoresult=PIMOResult.__doc__, - docstring_aupimoresult=AUPIMOResult.__doc__, -) - - # =========================================== TORCHMETRICS =========================================== @@ -709,15 +665,7 @@ class PIMO(Metric): The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.pimo_curves()` and `PIMOResult` (their docstrings below). - - pimo_numpy.pimo_curves.__doc__ - ============================== - {docstring_pimo_curves} - - PIMOResult.__doc__ - ================== - {docstring_pimoresult} + Refer to `pimo_numpy.pimo_curves()` and `PIMOResult`. """ is_differentiable: bool = False @@ -814,13 +762,6 @@ def compute(self) -> PIMOResult: ) -# append the docstrings -PIMO.__doc__ = PIMO.__doc__.format( # type: ignore[union-attr] - docstring_pimo_curves=pimo_numpy.pimo_curves.__doc__, - docstring_pimoresult=PIMOResult.__doc__, -) - - class AUPIMO(PIMO): """Area Under the Per-Image Overlap (PIMO) curve. @@ -828,19 +769,7 @@ class AUPIMO(PIMO): The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult` (their docstrings below). 
-
-    pimo_numpy.aupimo_scores.__doc__
-    =================================
-    {docstring_aupimo_scores}
-
-    PIMOResult.__doc__
-    ==================
-    {docstring_pimoresult}
-
-    AUPIMOResult.__doc__
-    ====================
-    {docstring_aupimoresult}
+    Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult`.
     """
 
     fpr_bounds: tuple[float, float]
@@ -941,11 +870,3 @@ def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:
             fpr_bounds=self.fpr_bounds,
             force=force,
         )
-
-
-# append the docstrings
-AUPIMO.__doc__ = AUPIMO.__doc__.format(  # type: ignore[union-attr]
-    docstring_aupimo_scores=pimo_numpy.aupimo_scores.__doc__,
-    docstring_pimoresult=PIMOResult.__doc__,
-    docstring_aupimoresult=AUPIMOResult.__doc__,
-)
diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py
index dba232de6c..80cda3c3c8 100644
--- a/src/anomalib/metrics/per_image/utils.py
+++ b/src/anomalib/metrics/per_image/utils.py
@@ -242,12 +242,69 @@ def per_image_scores_stats(
     repeated_policy: str | None = StatsRepeatedPolicy.AVOID,
     repeated_replacement_atol: float = 1e-2,
 ) -> list[dict[str, str | int | float]]:
-    """Torch-oriented interface for `per_image_scores_stats`. See its dscription for more details (below).
+    """Compute statistics of per-image scores (based on a boxplot's statistics).
 
-    Numpy version docstring
-    =======================
+    ***Torch-oriented interface for `.utils_numpy.per_image_scores_stats`***
 
-    {docstring}
+    For a single per-image metric collection (1 model, 1 dataset), compute statistics (based on a boxplot)
+    and find the closest image to each statistic.
+
+    This function uses `matplotlib.cbook.boxplot_stats`, which is the same function used by `matplotlib.pyplot.boxplot`.
+
+    ** OUTLIERS **
+    Outliers are defined as in a boxplot, i.e. values that are more than 1.5 times the interquartile range (IQR) away
+    from the Q1 and Q3 quartiles (respectively low and high outliers). The IQR is the difference between Q3 and Q1.
+
+    Outliers are handled according to `outliers_policy`:
+    - None | "none": do not include outliers.
+    - "hi": only include high outliers.
+    - "lo": only include low outliers.
+    - "both": include both high and low outliers.
+
+    ** IMAGE INDEX **
+    Each statistic is associated with the image whose score is the closest to the statistic's value.
+
+    ** REPEATED VALUES **
+    It is possible that two stats have the same value (e.g. the median and the 25th percentile can be the same).
+    Such cases are handled according to `repeated_policy`:
+    - None | "none": do not address the issue, so several stats can have the same value and image index.
+    - "avoid": avoid repeated values by iteratively looking for other images with similar score, whose score
+      must be within `repeated_replacement_atol` (absolute tolerance) of the repeated value.
+
+    Args:
+        per_image_scores (Tensor): 1D Tensor of per-image scores.
+        images_classes (Tensor | None):
+            Used to filter statistics to only one class. If None, all images are considered.
+            If given, 1D Tensor of binary image classes (0 for 'normal', 1 for 'anomalous'). Defaults to None.
+        only_class (int | None):
+            Only used if `images_classes` is not None.
+            If not None, only compute statistics for images of the given class.
+            `None` means both image classes are used.
+            Defaults to None.
+        outliers_policy (str | None): How to handle outliers stats (use them?). See `OutliersPolicy`. Defaults to None.
+        repeated_policy (str | None): How to handle repeated values in boxplot stats (two stats with same value).
+            See `StatsRepeatedPolicy`. Defaults to None.
+        repeated_replacement_atol (float): Absolute tolerance used to replace repeated values. Only used if
+            `repeated_policy` is not None (or 'none'). Defaults to 1e-2 (1%).
+
+    Returns:
+        list[dict[str, str | int | float]]: List of boxplot statistics.
+
+        Each dictionary has the following keys:
+            - 'stat_name': Name of the statistic. Possible values:
+                - 'mean': Mean of the scores.
+                - 'med': Median of the scores.
+                - 'q1': 25th percentile of the scores.
+                - 'q3': 75th percentile of the scores.
+                - 'whishi': Upper whisker value.
+                - 'whislo': Lower whisker value.
+                - 'outlo_i': low outlier value; `i` is a unique index for each low outlier.
+                - 'outhi_j': high outlier value; `j` is a unique index for each high outlier.
+            - 'stat_value': Value of the statistic (same units as `values`).
+            - 'image_idx': Index of the image in `per_image_scores` whose score is the closest to the statistic's value.
+            - 'score': The score of the image at index `image_idx` (not necessarily the same as `stat_value`).
+
+        The list is sorted by increasing `stat_value`.
     """
     _validate.is_tensor(per_image_scores, "per_image_scores")
     per_image_scores_array = per_image_scores.detach().cpu().numpy()
@@ -271,11 +328,6 @@ def per_image_scores_stats(
     )
 
 
-per_image_scores_stats.__doc__ = per_image_scores_stats.__doc__.format(  # type: ignore[union-attr]
-    docstring=utils_numpy.per_image_scores_stats.__doc__,
-)
-
-
 def compare_models_pairwise_ttest_rel(
     scores_per_model: dict[str, Tensor]
     | OrderedDict[str, Tensor]
@@ -284,12 +336,51 @@ def compare_models_pairwise_ttest_rel(
     alternative: str,
     higher_is_better: bool,
 ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]:
-    """Torch-oriented interface for `compare_models_pairwise_ttest_rel`. See its dscription for more details (below).
+    """Compare all pairs of models using the paired t-test on two related samples (parametric).
+
+    ***Torch-oriented interface for `.utils_numpy.compare_models_pairwise_ttest_rel`***
+
+    This is a test for the null hypothesis that two repeated samples have identical average (expected) values.
+    In fact, it tests whether the average of the differences between the two samples is significantly different from 0.
+
+    Refs:
+        - `scipy.stats.ttest_rel`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
+        - Wikipedia page: https://en.wikipedia.org/wiki/Student's_t-test#Dependent_t-test_for_paired_samples
 
-    Numpy version docstring
-    =======================
+    ===
 
-    {docstring}
+    If an ordered dictionary is given, the models are sorted by the order of the dictionary.
+    Otherwise, the models are sorted by average SCORE.
+
+    Args:
+        scores_per_model: Dictionary of `n` models and their per-image scores.
+            key: model name
+            value: tensor of shape (num_images,). All `nan` values must be at the same positions.
+        higher_is_better: Whether higher values of score are better (True) or worse (False).
+        alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section.
+            Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`.
+
+    Returns:
+        (models_ordered, test_results):
+            - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input).
+
+                Automatic sorting is by average score from best to worst model.
+                Depending on `higher_is_better`, this corresponds to:
+                    - `higher_is_better=True` ==> descending score order
+                    - `higher_is_better=False` ==> ascending score order
+                along the indices from 0 to `n-1`.
+
+            - confidences: Dictionary of confidence values for each pair of models.
+
+                For all pairs of indices i and j from 0 to `n-1` such that i != j:
+                    - key: (models_ordered[i], models_ordered[j])
+                    - value: confidence on the alternative hypothesis.
+
+                For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is:
+                    - if `less`: model[i] < model[j]
+                    - if `greater`: model[i] > model[j]
+                    - if `two-sided`: model[i] != model[j]
+                in terms of average score.
     """
     _validate_is_scores_per_model(scores_per_model)
     scores_per_model_items = [
@@ -305,11 +396,6 @@ def compare_models_pairwise_ttest_rel(
     return utils_numpy.compare_models_pairwise_ttest_rel(scores_per_model_with_arrays, alternative, higher_is_better)
 
 
-compare_models_pairwise_ttest_rel.__doc__ = compare_models_pairwise_ttest_rel.__doc__.format(  # type: ignore[union-attr]
-    docstring=utils_numpy.compare_models_pairwise_ttest_rel.__doc__,
-)
-
-
 def compare_models_pairwise_wilcoxon(
     scores_per_model: dict[str, Tensor]
     | OrderedDict[str, Tensor]
@@ -318,12 +404,53 @@ def compare_models_pairwise_wilcoxon(
     alternative: str,
     higher_is_better: bool,
 ) -> tuple[tuple[str, ...], dict[tuple[str, str], float]]:
-    """Torch-oriented interface for `compare_models_pairwise_wilcoxon`. See its dscription for more details (below).
+    """Compare all pairs of models using the Wilcoxon signed-rank test (non-parametric).
+
+    ***Torch-oriented interface for `.utils_numpy.compare_models_pairwise_wilcoxon`***
+
+    Each comparison of two models is a Wilcoxon signed-rank test (null hypothesis is that they are equal).
 
-    Numpy version docstring
-    =======================
+    It tests whether the distribution of the differences of scores is symmetric about zero in a non-parametric way.
+    This is like the non-parametric version of the paired t-test.
 
-    {docstring}
+    Refs:
+        - `scipy.stats.wilcoxon`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html#scipy.stats.wilcoxon
+        - Wikipedia page: https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
+
+    ===
+
+    If an ordered dictionary is given, the models are sorted by the order of the dictionary.
+    Otherwise, the models are sorted by average RANK.
+
+    Args:
+        scores_per_model: Dictionary of `n` models and their per-image scores.
+            key: model name
+            value: tensor of shape (num_images,). All `nan` values must be at the same positions.
+        higher_is_better: Whether higher values of score are better (True) or worse (False).
+        alternative: Alternative hypothesis for the statistical tests. See `confidences` in "Returns" section.
+            Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`.
+        atol: Absolute tolerance used to consider two scores as equal. Defaults to 1e-3 (0.1%).
+            When doing a paired test, if the difference between two scores is below `atol`, the difference is
+            truncated to 0. If `atol` is None, no truncation is done.
+
+    Returns:
+        (models_ordered, test_results):
+            - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input).
+
+                Automatic sorting is from "best to worst" model, which corresponds to ascending average rank
+                along the indices from 0 to `n-1`.
+
+            - confidences: Dictionary of confidence values for each pair of models.
+ + For all pairs of indices i and j from 0 to `n-1` such that i != j: + - key: (models_ordered[i], models_ordered[j]) + - value: confidence on the alternative hypothesis. + + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: + - if `less`: model[i] < model[j] + - if `greater`: model[i] > model[j] + - if `two-sided`: model[i] != model[j] + in terms of average ranks (not scores!). """ _validate_is_scores_per_model(scores_per_model) scores_per_model_items = [ @@ -339,11 +466,6 @@ def compare_models_pairwise_wilcoxon( return utils_numpy.compare_models_pairwise_wilcoxon(scores_per_model_with_arrays, alternative, higher_is_better) -compare_models_pairwise_wilcoxon.__doc__ = compare_models_pairwise_wilcoxon.__doc__.format( # type: ignore[union-attr] - docstring=utils_numpy.compare_models_pairwise_wilcoxon.__doc__, -) - - def format_pairwise_tests_results( models_ordered: tuple[str, ...], confidences: dict[tuple[str, str], float], diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 08f70e6683..647080a3d0 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -336,26 +336,26 @@ def compare_models_pairwise_ttest_rel( Valid values are `StatsAlternativeHypothesis.ALTERNATIVES`. Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). - - Automatic sorting is by average score from best to worst model. - Depending on `higher_is_better`, this corresponds to: - - `higher_is_better=True` ==> descending score order - - `higher_is_better=False` ==> ascending score order - along the indices from 0 to `n-1`. - - - confidences: Dictionary of confidence values for each pair of models. - - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. - - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] - in termos of average score. + (models_ordered, test_results): + - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). + + Automatic sorting is by average score from best to worst model. + Depending on `higher_is_better`, this corresponds to: + - `higher_is_better=True` ==> descending score order + - `higher_is_better=False` ==> ascending score order + along the indices from 0 to `n-1`. + + - confidences: Dictionary of confidence values for each pair of models. + + For all pairs of indices i and j from 0 to `n-1` such that i != j: + - key: (models_ordered[i], models_ordered[j]) + - value: confidence on the alternative hypothesis. + + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: + - if `less`: model[i] < model[j] + - if `greater`: model[i] > model[j] + - if `two-sided`: model[i] != model[j] + in termos of average score. """ _validate_is_scores_per_model(scores_per_model) StatsAlternativeHypothesis(alternative) @@ -424,22 +424,22 @@ def compare_models_pairwise_wilcoxon( truncated to 0. If `atol` is None, no truncation is done. Returns: - (models_ordered, test_results): - - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). 
+ (models_ordered, test_results): + - models_ordered: Models sorted by the user (`OrderedDict` input) or automatically (`dict` input). - Automatic sorting is from "best to worst" model, which corresponds to ascending average rank - along the indices from 0 to `n-1`. + Automatic sorting is from "best to worst" model, which corresponds to ascending average rank + along the indices from 0 to `n-1`. - - confidences: Dictionary of confidence values for each pair of models. + - confidences: Dictionary of confidence values for each pair of models. - For all pairs of indices i and j from 0 to `n-1` such that i != j: - - key: (models_ordered[i], models_ordered[j]) - - value: confidence on the alternative hypothesis. + For all pairs of indices i and j from 0 to `n-1` such that i != j: + - key: (models_ordered[i], models_ordered[j]) + - value: confidence on the alternative hypothesis. - For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: - - if `less`: model[i] < model[j] - - if `greater`: model[i] > model[j] - - if `two-sided`: model[i] != model[j] + For models `models_ordered[i]` and `models_ordered[j]`, the alternative hypothesis is: + - if `less`: model[i] < model[j] + - if `greater`: model[i] > model[j] + - if `two-sided`: model[i] != model[j] in terms of average ranks (not scores!). """ _validate_is_scores_per_model(scores_per_model) From fdd797af676a103cb39d57ff838cf28e5b32ed07 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 29 May 2024 17:17:57 +0200 Subject: [PATCH 48/57] refactor some docs Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- .../metrics/per_image/_binclf_curve_numba.py | 6 +- .../metrics/per_image/binclf_curve.py | 3 +- .../metrics/per_image/binclf_curve_numpy.py | 2 +- src/anomalib/metrics/per_image/pimo.py | 137 ++++++++++++++++-- src/anomalib/metrics/per_image/pimo_numpy.py | 29 +--- 5 files changed, 133 insertions(+), 44 deletions(-) diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index 1446c50c64..e763fce826 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -1,6 +1,6 @@ """Binary classification matrix curve (NUMBA implementation of low level functions). -See docstring of `binclf_curve` or `binclf_curve_numpy` for more details. +Details: `.binclf_curve`. author: jpcbertoldo """ @@ -33,7 +33,7 @@ def binclf_one_curve_numba(scores: ndarray, gts: ndarray, threshs: ndarray) -> n Returns: ndarray: Binary classification matrix curve (K, 2, 2) - See docstring of `binclf_multiple_curves` for details. + Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`. """ num_th = len(threshs) @@ -105,7 +105,7 @@ def binclf_multiple_curves_numba(scores_batch: ndarray, gts_batch: ndarray, thre Returns: ndarray: Binary classification matrix curves (N, K, 2, 2) - See docstring of `binclf_multiple_curves` for details. + Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`. """ num_imgs = scores_batch.shape[0] num_th = len(threshs) diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index 55d8287515..fc870946d3 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -1,7 +1,8 @@ """Binary classification curve (torch interface). 
This module implements torch interfaces to access the numpy code in `binclf_curve_numpy.py`.
-Check its docstring for more details.
+
+Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`.
 
 Tensors are build with `torch.from_numpy` and so the returned tensors will share the same memory as the numpy arrays.
 
diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
index a7de6331a8..38e3752885 100644
--- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py
+++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py
@@ -111,7 +111,7 @@ def _binclf_one_curve_python(scores: ndarray, gts: ndarray, threshs: ndarray) ->
     Returns:
         ndarray: Binary classification matrix curve (K, 2, 2)
 
-    See docstring of `binclf_multiple_curves` for details.
+    Details: `anomalib.metrics.per_image.binclf_curve_numpy.binclf_multiple_curves`.
     """
     num_th = len(threshs)
 
diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py
index 6a6f4b452a..31c17d0a7a 100644
--- a/src/anomalib/metrics/per_image/pimo.py
+++ b/src/anomalib/metrics/per_image/pimo.py
@@ -1,7 +1,32 @@
 """Per-Image Overlap curve (PIMO, pronounced pee-mo) and its area under the curve (AUPIMO).
 
+# PIMO
+
+PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds.
+The anomaly score thresholds are indexed by a (shared) value of False Positive Rate (FPR) measure on the normal images.
+
+Each *anomalous* image has its own curve such that the X-axis is shared by all of them.
+
+At a given threshold:
+    X-axis: Shared FPR (may vary)
+        1. Log of the Average of per-image FPR on normal images.
+        SEE NOTE BELOW.
+    Y-axis: per-image TP Rate (TPR), or "Overlap" between the ground truth and the predicted masks.
+
+*** Note about other shared FPR alternatives ***
+The shared FPR metric can be made harder by using the cross-image max (or high-percentile) FPRs instead of the mean.
+Rationale: this will further punish models that have exceptional FPs in normal images.
+So far there is only one shared FPR metric implemented but others will be added in the future.
+
+# AUPIMO
+
+`AUPIMO` is the area under each `PIMO` curve with bounded integration range in terms of shared FPR.
+
+# Disclaimer
+
 This module implements torch interfaces to access the numpy code in `pimo_numpy.py`.
-Check its docstring for more details.
+Tensors are converted to numpy arrays and then passed and validated in the numpy code.
+The results are converted back to tensors and eventually wrapped in a dataclass object.
 
 Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the
 Tensor arguments will be converted to ndarray and then validated.
@@ -223,7 +248,7 @@ def __post_init__(self) -> None:
 
     def thresh_at(self, fpr_level: float) -> tuple[int, float, float]:
         """Return the threshold at the given shared FPR.
 
-        See `anomalib.utils.metrics.per_image.pimo_numpy.thresh_at_shared_fpr_level` for details.
+        See `anomalib.metrics.per_image.pimo_numpy.thresh_at_shared_fpr_level` for details.
 
         Args:
             fpr_level (float): shared FPR level
@@ -505,7 +530,7 @@ def stats(
     ) -> list[dict[str, str | int | float]]:
         """Return the AUPIMO statistics.
 
-        See `anomalib.utils.metrics.per_image.per_image_scores_stats` for details.
+        See `anomalib.metrics.per_image.utils.per_image_scores_stats` for details.
Returns: list[dict[str, str | int | float]]: AUPIMO statistics @@ -537,10 +562,28 @@ def pimo_curves( The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.pimo_curves()` and `PIMOResult`. + PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds. + The anomaly score thresholds are indexed by a (cross-image shared) value of False Positive Rate (FPR) measure on + the normal images. + + Details: `anomalib.metrics.per_image.pimo`. + + Args' notation: + N: number of images + H: image height + W: image width + K: number of thresholds - Args (extra): - paths: paths to the source images to which the PIMO curves correspond. + Args: + anomaly_maps: floating point anomaly score maps of shape (N, H, W) + masks: binary (bool or int) ground truth masks of shape (N, H, W) + num_threshs: number of thresholds to compute (K) + binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis + paths: paths to the source images to which the PIMO curves correspond. Default: None. + + Returns: + PIMOResult: PIMO curves dataclass object. See `PIMOResult` for details. """ _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() @@ -597,10 +640,29 @@ def aupimo_scores( The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult`. - - Args (extra): + Scores are computed from the integration of the PIMO curves within the given FPR bounds, then normalized to [0, 1]. + It can be thought of as the average TPR of the PIMO curves within the given FPR bounds. + + Details: `anomalib.metrics.per_image.pimo`. + + Args' notation: + N: number of images + H: image height + W: image width + K: number of thresholds + + Args: + anomaly_maps: floating point anomaly score maps of shape (N, H, W) + masks: binary (bool or int) ground truth masks of shape (N, H, W) + num_threshs: number of thresholds to compute (K) + binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis + fpr_bounds: lower and upper bounds of the FPR integration range + force: whether to force the computation despite bad conditions paths: paths to the source images to which the AUPIMO scores correspond. + + Returns: + tuple[PIMOResult, AUPIMOResult]: PIMO and AUPIMO results dataclass objects. See `PIMOResult` and `AUPIMOResult`. """ _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() @@ -659,13 +721,35 @@ def aupimo_scores( class PIMO(Metric): - """Per-Image Overlap (PIMO) curve. + """Per-IMage Overlap (PIMO, pronounced pee-mo) curves. This torchmetrics interface is a wrapper around the functional interface, which is a wrapper around the numpy code. The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.pimo_curves()` and `PIMOResult`. + PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds. 
+ The anomaly score thresholds are indexed by a (cross-image shared) value of False Positive Rate (FPR) measure on + the normal images. + + Details: `anomalib.metrics.per_image.pimo`. + + Notation: + N: number of images + H: image height + W: image width + K: number of thresholds + + Attributes: + anomaly_maps: floating point anomaly score maps of shape (N, H, W) + masks: binary (bool or int) ground truth masks of shape (N, H, W) + + Args: + num_threshs: number of thresholds to compute (K) + binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis + + Returns: + PIMOResult: PIMO curves dataclass object. See `PIMOResult` for details. """ is_differentiable: bool = False @@ -703,9 +787,9 @@ def __init__( """Per-Image Overlap (PIMO) curve. Args: - num_threshs: number of thresholds used to compute the PIMO curve - binclf_algorithm: algorithm to compute the binary classification curve - shared_fpr_metric: metric to compute the shared FPR curve + num_threshs: number of thresholds used to compute the PIMO curve (K) + binclf_algorithm: algorithm to compute the binary classification curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis """ super().__init__() @@ -769,7 +853,30 @@ class AUPIMO(PIMO): The tensors are converted to numpy arrays and then passed and validated in the numpy code. The results are converted back to tensors and wrapped in an dataclass object. - Refer to `pimo_numpy.aupimo_scores()`, `PIMOResult` and `AUPIMOResult`. + Scores are computed from the integration of the PIMO curves within the given FPR bounds, then normalized to [0, 1]. + It can be thought of as the average TPR of the PIMO curves within the given FPR bounds. + + Details: `anomalib.metrics.per_image.pimo`. + + Notation: + N: number of images + H: image height + W: image width + K: number of thresholds + + Attributes: + anomaly_maps: floating point anomaly score maps of shape (N, H, W) + masks: binary (bool or int) ground truth masks of shape (N, H, W) + + Args: + num_threshs: number of thresholds to compute (K) + binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) + shared_fpr_metric: metric to compute the shared FPR axis + fpr_bounds: lower and upper bounds of the FPR integration range + force: whether to force the computation despite bad conditions + + Returns: + tuple[PIMOResult, AUPIMOResult]: PIMO and AUPIMO results dataclass objects. See `PIMOResult` and `AUPIMOResult`. """ fpr_bounds: tuple[float, float] diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index 81094f3dcc..a28fe05798 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -1,26 +1,6 @@ """Per-Image Overlap curve (PIMO, pronounced pee-mo) and its area under the curve (AUPIMO). -# PIMO - -PIMO is a curve of True Positive Rate (TPR) values on each image across multiple anomaly score thresholds. -The anomaly score thresholds are indexed by a (shared) valued of False Positive Rate (FPR) measure on the normal images. - -Each *anomalous* image has its own curve such that the X-axis is shared by all of them. - -At a given threshold: - X-axis: Shared FPR (may vary) - 1. Log of the Average of per-image FPR on normal images. - SEE NOTE BELOW. - Y-axis: per-image TP Rate (TPR), or "Overlap" between the ground truth and the predicted masks. 
- -*** Note about other shared FPR alternatives *** -The shared FPR metric can be made harder by using the cross-image max (or high-percentile) FPRs instead of the mean. -Rationale: this will further punish models that have exceptional FPs in normal images. -So far there is only one shared FPR metric implemented but others will be added in the future. - -# AUPIMO - -`AUPIMO` is the area under each `PIMO` curve with bounded integration range in terms of shared FPR. +Details: `anomalib.metrics.per_image.pimo`. author: jpcbertoldo """ @@ -103,7 +83,7 @@ def pimo_curves( The anomaly score thresholds are indexed by a (cross-image shared) value of False Positive Rate (FPR) measure on the normal images. - See the module's docstring for more details. + Details: `anomalib.metrics.per_image.pimo`. Args' notation: N: number of images @@ -168,7 +148,8 @@ def pimo_curves( raise RuntimeError(msg) from ex # shape -> (K,) - # this is the only shared FPR metric implemented so far, see note about shared FPR in the module's docstring + # this is the only shared FPR metric implemented so far, + # see note about shared FPR in Details: `anomalib.metrics.per_image.pimo`. shared_fpr = per_image_fprs_normals.mean(axis=0) else: @@ -198,7 +179,7 @@ def aupimo_scores( Scores are computed from the integration of the PIMO curves within the given FPR bounds, then normalized to [0, 1]. It can be thought of as the average TPR of the PIMO curves within the given FPR bounds. - See `pimo_curves()` and the module's docstring for more details. + Details: `anomalib.metrics.per_image.pimo`. Args' notation: N: number of images From 012e8e2c26051eb3fba8b1a780b2d80a56935b59 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 29 May 2024 18:57:46 +0200 Subject: [PATCH 49/57] correct pre commit errors Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- .../metrics/per_image/binclf_curve_numpy.py | 30 +++++++++++-------- src/anomalib/metrics/per_image/pimo.py | 26 +++++++++------- src/anomalib/metrics/per_image/pimo_numpy.py | 16 +++++----- src/anomalib/metrics/per_image/utils.py | 4 +-- src/anomalib/metrics/per_image/utils_numpy.py | 16 ++++------ .../metrics/per_image/test_binclf_curve.py | 5 ++-- tests/unit/metrics/per_image/test_pimo.py | 5 ++-- tests/unit/metrics/per_image/test_utils.py | 3 +- 8 files changed, 57 insertions(+), 48 deletions(-) diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 38e3752885..93b70754b2 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -176,7 +176,7 @@ def binclf_multiple_curves( scores_batch: ndarray, gts_batch: ndarray, threshs: ndarray, - algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, ) -> ndarray: """Multiple binary classification matrix (per-instance scope) at each threshold (shared). @@ -216,13 +216,13 @@ def binclf_multiple_curves( Thresholds are sorted in ascending order. 
""" - BinclfAlgorithm(algorithm) + algorithm = BinclfAlgorithm(algorithm) _validate_is_scores_batch(scores_batch) _validate_is_gts_batch(gts_batch) _validate.is_same_shape(scores_batch, gts_batch) _validate.is_threshs(threshs) - if BinclfAlgorithm(algorithm) == BinclfAlgorithm.NUMBA: + if algorithm == BinclfAlgorithm.NUMBA: if HAS_NUMBA: return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) @@ -253,8 +253,8 @@ def _get_threshs_minmax_linspace(anomaly_maps: ndarray, num_threshs: int) -> nda def per_image_binclf_curve( anomaly_maps: ndarray, masks: ndarray, - algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - threshs_choice: BinclfThreshsChoice | str = BinclfThreshsChoice.MINMAX_LINSPACE, + algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, + threshs_choice: BinclfThreshsChoice | str = BinclfThreshsChoice.MINMAX_LINSPACE.value, threshs_given: ndarray | None = None, num_threshs: int | None = None, ) -> tuple[ndarray, ndarray]: @@ -304,35 +304,41 @@ def per_image_binclf_curve( Thresholds are sorted in ascending order. """ BinclfAlgorithm(algorithm) + threshs_choice = BinclfThreshsChoice(threshs_choice) _validate.is_anomaly_maps(anomaly_maps) _validate.is_masks(masks) _validate.is_same_shape(anomaly_maps, masks) threshs: ndarray - if BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.GIVEN: + if threshs_choice == BinclfThreshsChoice.GIVEN: assert threshs_given is not None _validate.is_threshs(threshs_given) if num_threshs is not None: logger.warning( - f"Argument `num_threshs` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", + "Argument `num_threshs` was given, " + f"but it is ignored because `threshs_choice` is '{threshs_choice.value}'.", ) threshs = threshs_given.astype(anomaly_maps.dtype) - elif BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.MINMAX_LINSPACE: + elif threshs_choice == BinclfThreshsChoice.MINMAX_LINSPACE: assert num_threshs is not None if threshs_given is not None: logger.warning( - f"Argument `threshs_given` was given, but it is ignored because `threshs_choice` is {threshs_choice}.", + "Argument `threshs_given` was given, " + f"but it is ignored because `threshs_choice` is '{threshs_choice.value}'.", ) # `num_threshs` is validated in the function below threshs = _get_threshs_minmax_linspace(anomaly_maps, num_threshs) - elif BinclfThreshsChoice(threshs_choice) == BinclfThreshsChoice.MEAN_FPR_OPTIMIZED: - raise NotImplementedError(f"TODO implement {threshs_choice}") # noqa: EM102 + elif threshs_choice == BinclfThreshsChoice.MEAN_FPR_OPTIMIZED: + raise NotImplementedError(f"TODO implement {threshs_choice.value}") # noqa: EM102 else: - msg = f"Expected `threshs_choice` to be from {list(BinclfThreshsChoice.__members__)}, but got {threshs_choice}" + msg = ( + f"Expected `threshs_choice` to be from {list(BinclfThreshsChoice.__members__)}," + f" but got '{threshs_choice.value}'" + ) raise NotImplementedError(msg) # keep the batch dimension and flatten the rest diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 31c17d0a7a..3e4052b19c 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -524,8 +524,8 @@ def load(cls: type["AUPIMOResult"], file_path: str | Path) -> "AUPIMOResult": def stats( self, - outliers_policy: str | None = StatsOutliersPolicy.NONE, - repeated_policy: str | None = StatsRepeatedPolicy.AVOID, + outliers_policy: str | StatsOutliersPolicy = StatsOutliersPolicy.NONE.value, 
+ repeated_policy: str | StatsRepeatedPolicy = StatsRepeatedPolicy.AVOID.value, repeated_replacement_atol: float = 1e-2, ) -> list[dict[str, str | int | float]]: """Return the AUPIMO statistics. @@ -552,8 +552,8 @@ def pimo_curves( anomaly_maps: Tensor, masks: Tensor, num_threshs: int, - binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, paths: list[str] | None = None, ) -> PIMOResult: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -616,7 +616,9 @@ def pimo_curves( per_image_tprs = torch.from_numpy(per_image_tprs_array).to(device) return PIMOResult( - shared_fpr_metric=shared_fpr_metric, + shared_fpr_metric=shared_fpr_metric.value + if isinstance(shared_fpr_metric, PIMOSharedFPRMetric) + else shared_fpr_metric, threshs=threshs, shared_fpr=shared_fpr, per_image_tprs=per_image_tprs, @@ -628,8 +630,8 @@ def aupimo_scores( anomaly_maps: Tensor, masks: Tensor, num_threshs: int = 300_000, - binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, paths: list[str] | None = None, @@ -699,7 +701,9 @@ def aupimo_scores( aupimos = torch.from_numpy(aupimos_array).to(device) pimoresult = PIMOResult( - shared_fpr_metric=shared_fpr_metric, + shared_fpr_metric=shared_fpr_metric.value + if isinstance(shared_fpr_metric, PIMOSharedFPRMetric) + else shared_fpr_metric, threshs=threshs, shared_fpr=shared_fpr, per_image_tprs=per_image_tprs, @@ -781,7 +785,7 @@ def image_classes(self) -> Tensor: def __init__( self, num_threshs: int, - binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, ) -> None: """Per-Image Overlap (PIMO) curve. 
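
The convention this commit applies everywhere — parameters typed `Enum | str`, string defaults spelled `Enum.MEMBER.value`, and normalization through the enum constructor — relies on `Enum(...)` accepting both a member and its value. A minimal self-contained sketch of the pattern; the `Algo` enum and `run` function below are illustrative only, not part of anomalib:

    from enum import Enum

    class Algo(Enum):
        NUMBA = "numba"
        PYTHON = "python"

    def run(algorithm: Algo | str = Algo.NUMBA.value) -> str:
        # Algo(member) returns the member unchanged; Algo("numba") looks the
        # member up by value; anything else raises ValueError, so this single
        # call both validates and normalizes the argument.
        algorithm = Algo(algorithm)
        return f"running '{algorithm.value}'"

    assert run() == run(Algo.NUMBA) == run("numba")

This is why the defaults above can be plain strings while the function bodies still compare against enum members.
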
@@ -923,8 +927,8 @@ def __repr__(self) -> str: def __init__( self, num_threshs: int = 300_000, - binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> None: diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index a28fe05798..c743157fa8 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -74,8 +74,8 @@ def pimo_curves( anomaly_maps: ndarray, masks: ndarray, num_threshs: int, - binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, ) -> tuple[ndarray, ndarray, ndarray, ndarray]: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -107,7 +107,7 @@ def pimo_curves( """ # validate the strings are valid BinclfAlgorithm(binclf_algorithm) - PIMOSharedFPRMetric(shared_fpr_metric) + shared_fpr_metric = PIMOSharedFPRMetric(shared_fpr_metric) _validate.is_num_threshs_gte2(num_threshs) _validate.is_anomaly_maps(anomaly_maps) _validate.is_masks(masks) @@ -132,13 +132,13 @@ def pimo_curves( anomaly_maps=anomaly_maps, masks=masks, algorithm=binclf_algorithm, - threshs_choice=BinclfThreshsChoice.GIVEN, + threshs_choice=BinclfThreshsChoice.GIVEN.value, threshs_given=threshs, num_threshs=None, ) shared_fpr: ndarray - if PIMOSharedFPRMetric(shared_fpr_metric) == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR: + if shared_fpr_metric == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR: # shape -> (N, K) per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0]) try: @@ -170,7 +170,7 @@ def aupimo_scores( masks: ndarray, num_threshs: int = 300_000, binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR, + shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, fpr_bounds: tuple[float, float] = (1e-5, 1e-4), force: bool = False, ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, int]: @@ -246,9 +246,9 @@ def aupimo_scores( ) if not np.isclose(fpr_upper_bound_defacto, fpr_upper_bound, rtol=rtol): - logger.warning = ( + logger.warning( "The upper bound of the shared FPR integration range is not exactly achieved. " - f"Expected {fpr_upper_bound} but got {fpr_upper_bound_defacto}, which is not within {rtol=}." 
+ f"Expected {fpr_upper_bound} but got {fpr_upper_bound_defacto}, which is not within {rtol=}.", ) # reminder: fpr lower/upper bound is threshold upper/lower bound (reversed) diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 80cda3c3c8..6925c82ecc 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -238,8 +238,8 @@ def per_image_scores_stats( per_image_scores: Tensor, images_classes: Tensor | None = None, only_class: int | None = None, - outliers_policy: str | None = StatsOutliersPolicy.NONE, - repeated_policy: str | None = StatsRepeatedPolicy.AVOID, + outliers_policy: str | StatsOutliersPolicy = StatsOutliersPolicy.NONE.value, + repeated_policy: str | StatsRepeatedPolicy = StatsRepeatedPolicy.AVOID.value, repeated_replacement_atol: float = 1e-2, ) -> list[dict[str, str | int | float]]: """Compute statistics of per-image scores (based on a boxplot's statistics). diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index 647080a3d0..be18050d08 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -144,8 +144,8 @@ def per_image_scores_stats( per_image_scores: ndarray, images_classes: ndarray | None = None, only_class: int | None = None, - outliers_policy: StatsOutliersPolicy | str | None = StatsOutliersPolicy.NONE, - repeated_policy: StatsRepeatedPolicy | str | None = StatsRepeatedPolicy.AVOID, + outliers_policy: StatsOutliersPolicy | str = StatsOutliersPolicy.NONE.value, + repeated_policy: StatsRepeatedPolicy | str = StatsRepeatedPolicy.AVOID.value, repeated_replacement_atol: float = 1e-2, ) -> list[dict[str, str | int | float]]: """Compute statistics of per-image scores (based on a boxplot's statistics). 
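
Since `per_image_scores_stats` builds on `matplotlib.cbook.boxplot_stats`, it may help to see that helper in isolation. A small sketch with made-up scores; the dictionary keys are matplotlib's, which the docstrings above map to 'mean', 'med', 'q1', 'q3', and the whisker/outlier entries:

    import numpy as np
    from matplotlib import cbook

    per_image_scores = np.array([0.05, 0.61, 0.62, 0.65, 0.66, 0.68, 0.93])
    [bp_stats] = cbook.boxplot_stats(per_image_scores)  # one dict per dataset

    # 'fliers' holds the outliers: values beyond 1.5 * IQR from the quartiles
    print(bp_stats["med"], bp_stats["q1"], bp_stats["q3"])
    print(bp_stats["fliers"])
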
@@ -247,13 +247,13 @@ def per_image_scores_stats( outliers_lo = outliers[outliers < boxplot_stats["med"]] outliers_hi = outliers[outliers > boxplot_stats["med"]] - if StatsOutliersPolicy(outliers_policy) in (StatsOutliersPolicy.HI, StatsOutliersPolicy.BOTH): + if outliers_policy in (StatsOutliersPolicy.HI, StatsOutliersPolicy.BOTH): boxplot_stats = { **boxplot_stats, **{f"outhi_{idx:06}": value for idx, value in enumerate(outliers_hi)}, } - if StatsOutliersPolicy(outliers_policy) in (StatsOutliersPolicy.LO, StatsOutliersPolicy.BOTH): + if outliers_policy in (StatsOutliersPolicy.LO, StatsOutliersPolicy.BOTH): boxplot_stats = { **boxplot_stats, **{f"outlo_{idx:06}": value for idx, value in enumerate(outliers_lo)}, @@ -269,14 +269,10 @@ def append_record(stat_name: str, stat_value: float) -> None: image_idx = candidate2image_idx[candidate_idx] # handle repeated values - if ( - image_idx not in images_idxs_selected - or repeated_policy is None - or StatsRepeatedPolicy(repeated_policy) == StatsRepeatedPolicy.NONE - ): + if image_idx not in images_idxs_selected or repeated_policy == StatsRepeatedPolicy.NONE: pass - elif StatsRepeatedPolicy(repeated_policy) == StatsRepeatedPolicy.AVOID: + elif repeated_policy == StatsRepeatedPolicy.AVOID: for other_candidate_idx in candidates_sorted: other_candidate_image_idx = candidate2image_idx[other_candidate_idx] if other_candidate_image_idx in images_idxs_selected: diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py index 3a78d82237..254f3423f5 100644 --- a/tests/unit/metrics/per_image/test_binclf_curve.py +++ b/tests/unit/metrics/per_image/test_binclf_curve.py @@ -7,11 +7,12 @@ import numpy as np import pytest import torch -from anomalib import HAS_NUMBA -from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy from numpy import ndarray from torch import Tensor +from anomalib import HAS_NUMBA +from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy + if HAS_NUMBA: from anomalib.metrics.per_image import _binclf_curve_numba diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py index d058951534..8de53de29a 100644 --- a/tests/unit/metrics/per_image/test_pimo.py +++ b/tests/unit/metrics/per_image/test_pimo.py @@ -9,11 +9,12 @@ import numpy as np import pytest import torch -from anomalib.metrics.per_image import pimo, pimo_numpy -from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult from numpy import ndarray from torch import Tensor +from anomalib.metrics.per_image import pimo, pimo_numpy +from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult + from .test_utils import assert_statsdict_stuff diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py index 5dd1b0a8f6..d0ceff1860 100644 --- a/tests/unit/metrics/per_image/test_utils.py +++ b/tests/unit/metrics/per_image/test_utils.py @@ -8,6 +8,8 @@ import numpy as np import pytest import torch +from torch import Tensor + from anomalib.metrics.per_image import ( AUPIMOResult, PIMOSharedFPRMetric, @@ -18,7 +20,6 @@ format_pairwise_tests_results, per_image_scores_stats, ) -from torch import Tensor def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: From f11b4a969de83b6d58ed5746c5f20521df6fa242 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:23:20 +0200 Subject: [PATCH 50/57] remove author tag Signed-off-by: jpcbertoldo 
<24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 5 +---- src/anomalib/metrics/per_image/_binclf_curve_numba.py | 2 -- src/anomalib/metrics/per_image/_validate.py | 2 -- src/anomalib/metrics/per_image/binclf_curve.py | 2 -- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 2 -- src/anomalib/metrics/per_image/pimo.py | 2 -- src/anomalib/metrics/per_image/pimo_numpy.py | 2 -- src/anomalib/metrics/per_image/utils.py | 5 +---- src/anomalib/metrics/per_image/utils_numpy.py | 5 +---- 9 files changed, 3 insertions(+), 24 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index 406da86931..d7cc6add94 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -1,7 +1,4 @@ -"""Per-Image Metrics. - -author: jpcbertoldo -""" +"""Per-Image Metrics.""" # Original Code # https://github.com/jpcbertoldo/aupimo diff --git a/src/anomalib/metrics/per_image/_binclf_curve_numba.py b/src/anomalib/metrics/per_image/_binclf_curve_numba.py index e763fce826..3151a2faba 100644 --- a/src/anomalib/metrics/per_image/_binclf_curve_numba.py +++ b/src/anomalib/metrics/per_image/_binclf_curve_numba.py @@ -1,8 +1,6 @@ """Binary classification matrix curve (NUMBA implementation of low level functions). Details: `.binclf_curve`. - -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/_validate.py b/src/anomalib/metrics/per_image/_validate.py index fba8037e84..72f107e21e 100644 --- a/src/anomalib/metrics/per_image/_validate.py +++ b/src/anomalib/metrics/per_image/_validate.py @@ -4,8 +4,6 @@ TODO(jpcbertoldo): Move validations to a common place and reuse them across the codebase. https://github.com/openvinotoolkit/anomalib/issues/2093 - -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/binclf_curve.py b/src/anomalib/metrics/per_image/binclf_curve.py index fc870946d3..1a1b614a68 100644 --- a/src/anomalib/metrics/per_image/binclf_curve.py +++ b/src/anomalib/metrics/per_image/binclf_curve.py @@ -8,8 +8,6 @@ Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. - -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index 93b70754b2..edf9fa9780 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -3,8 +3,6 @@ A binary classification (binclf) matrix (TP, FP, FN, TN) is evaluated at multiple thresholds. The thresholds are shared by all instances/images, but their binclf are computed independently for each instance/image. - -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 3e4052b19c..a397953be0 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -30,8 +30,6 @@ Validations will preferably happen in ndarray so the numpy code can be reused without torch, so often times the Tensor arguments will be converted to ndarray and then validated. 
- -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py index c743157fa8..0eaccd424f 100644 --- a/src/anomalib/metrics/per_image/pimo_numpy.py +++ b/src/anomalib/metrics/per_image/pimo_numpy.py @@ -1,8 +1,6 @@ """Per-Image Overlap curve (PIMO, pronounced pee-mo) and its area under the curve (AUPIMO). Details: `anomalib.metrics.per_image.pimo`. - -author: jpcbertoldo """ # Original Code diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py index 6925c82ecc..581974c0e6 100644 --- a/src/anomalib/metrics/per_image/utils.py +++ b/src/anomalib/metrics/per_image/utils.py @@ -1,7 +1,4 @@ -"""Torch-oriented interfaces for `utils.py`. - -author: jpcbertoldo -""" +"""Torch-oriented interfaces for `utils.py`.""" # Original Code # https://github.com/jpcbertoldo/aupimo diff --git a/src/anomalib/metrics/per_image/utils_numpy.py b/src/anomalib/metrics/per_image/utils_numpy.py index be18050d08..736780831c 100644 --- a/src/anomalib/metrics/per_image/utils_numpy.py +++ b/src/anomalib/metrics/per_image/utils_numpy.py @@ -1,7 +1,4 @@ -"""Utility functions for per-image metrics. - -author: jpcbertoldo -""" +"""Utility functions for per-image metrics.""" # Original Code # https://github.com/jpcbertoldo/aupimo From c92a6a9b5b7c37d2e260450566238f8b75759b9c Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:30:38 +0200 Subject: [PATCH 51/57] add thrid party program Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- third-party-programs.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third-party-programs.txt b/third-party-programs.txt index 3155b2a930..8aff59c810 100644 --- a/third-party-programs.txt +++ b/third-party-programs.txt @@ -42,3 +42,7 @@ terms are listed below. 7. CLIP neural network used for deep feature extraction in AI-VAD model Copyright (c) 2022 @openai, https://github.com/openai/CLIP. SPDX-License-Identifier: MIT + +8. AUPIMO metric implementation is based on the original code + Copyright (c) 2023 @jpcbertoldo, https://github.com/jpcbertoldo/aupimo + SPDX-License-Identifier: MIT \ No newline at end of file From 1cde9ce5b48b140b21a654ec3695acc8597de346 Mon Sep 17 00:00:00 2001 From: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Mon, 8 Jul 2024 15:55:16 +0200 Subject: [PATCH 52/57] Update src/anomalib/metrics/per_image/pimo.py --- src/anomalib/metrics/per_image/pimo.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index a397953be0..4989edb567 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -664,10 +664,7 @@ def aupimo_scores( Returns: tuple[PIMOResult, AUPIMOResult]: PIMO and AUPIMO results dataclass objects. See `PIMOResult` and `AUPIMOResult`. 
""" - _validate_is_anomaly_maps(anomaly_maps) anomaly_maps_array = anomaly_maps.detach().cpu().numpy() - - _validate_is_masks(masks) masks_array = masks.detach().cpu().numpy() if paths is not None: From 0bef47120496d4947217f4d9a94a8e228217c601 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:52:36 +0200 Subject: [PATCH 53/57] move HAS_NUMBA Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/__init__.py | 8 -------- src/anomalib/metrics/per_image/binclf_curve_numpy.py | 11 +++++++++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/anomalib/__init__.py b/src/anomalib/__init__.py index 9d5b1f482d..1b7a30497c 100644 --- a/src/anomalib/__init__.py +++ b/src/anomalib/__init__.py @@ -8,14 +8,6 @@ __version__ = "1.2.0dev" -try: - import numba # noqa: F401 -except ImportError: - HAS_NUMBA = False -else: - HAS_NUMBA = True - - class LearningType(str, Enum): """Learning type defining how the model learns from the dataset samples.""" diff --git a/src/anomalib/metrics/per_image/binclf_curve_numpy.py b/src/anomalib/metrics/per_image/binclf_curve_numpy.py index edf9fa9780..621932baeb 100644 --- a/src/anomalib/metrics/per_image/binclf_curve_numpy.py +++ b/src/anomalib/metrics/per_image/binclf_curve_numpy.py @@ -20,7 +20,13 @@ import numpy as np from numpy import ndarray -from anomalib import HAS_NUMBA +try: + import numba # noqa: F401 +except ImportError: + HAS_NUMBA = False +else: + HAS_NUMBA = True + if HAS_NUMBA: from . import _binclf_curve_numba @@ -225,8 +231,9 @@ def binclf_multiple_curves( return _binclf_curve_numba.binclf_multiple_curves_numba(scores_batch, gts_batch, threshs) logger.warning( - f"Algorithm '{BinclfAlgorithm.NUMBA.value}' was selected, but numba is not installed. " + f"Algorithm '{BinclfAlgorithm.NUMBA.value}' was selected, but Numba is not installed. " f"Falling back to '{BinclfAlgorithm.PYTHON.value}' implementation.", + "Notice that the performance will be slower. 
Consider installing Numba for faster computation.", ) return _binclf_multiple_curves_python(scores_batch, gts_batch, threshs) From 440bf2f5f32c2a9230485a6df11ce3b026efa306 Mon Sep 17 00:00:00 2001 From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:20:54 +0200 Subject: [PATCH 54/57] remove PIMOSharedFPRMetric Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- src/anomalib/metrics/per_image/__init__.py | 2 - src/anomalib/metrics/per_image/pimo.py | 44 +++---------------- src/anomalib/metrics/per_image/pimo_numpy.py | 34 +++++--------- src/anomalib/metrics/per_image/utils.py | 9 ---- tests/unit/metrics/per_image/__init__.py | 10 +++-- .../metrics/per_image/test_binclf_curve.py | 16 ++++--- tests/unit/metrics/per_image/test_pimo.py | 32 +++++--------- tests/unit/metrics/per_image/test_utils.py | 15 ++++--- 8 files changed, 54 insertions(+), 108 deletions(-) diff --git a/src/anomalib/metrics/per_image/__init__.py b/src/anomalib/metrics/per_image/__init__.py index d7cc6add94..2e34372ff7 100644 --- a/src/anomalib/metrics/per_image/__init__.py +++ b/src/anomalib/metrics/per_image/__init__.py @@ -10,7 +10,6 @@ from .binclf_curve import per_image_binclf_curve, per_image_fpr, per_image_tpr from .binclf_curve_numpy import BinclfAlgorithm, BinclfThreshsChoice from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult, aupimo_scores, pimo_curves -from .pimo_numpy import PIMOSharedFPRMetric from .utils import ( compare_models_pairwise_ttest_rel, compare_models_pairwise_wilcoxon, @@ -25,7 +24,6 @@ "BinclfThreshsChoice", "StatsOutliersPolicy", "StatsRepeatedPolicy", - "PIMOSharedFPRMetric", # result classes "PIMOResult", "AUPIMOResult", diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py index 4989edb567..d0b380d36c 100644 --- a/src/anomalib/metrics/per_image/pimo.py +++ b/src/anomalib/metrics/per_image/pimo.py @@ -54,7 +54,6 @@ from . 
import _validate, pimo_numpy, utils from .binclf_curve_numpy import BinclfAlgorithm -from .pimo_numpy import PIMOSharedFPRMetric from .utils import StatsOutliersPolicy, StatsRepeatedPolicy logger = logging.getLogger(__name__) @@ -177,16 +176,12 @@ class PIMOResult: - TPR: True Positive Rate Attributes: - shared_fpr_metric (str): [metadata] shared FPR metric used to compute the PIMO curve threshs (Tensor): sequence of K (monotonically increasing) thresholds used to compute the PIMO curve shared_fpr (Tensor): K values of the shared FPR metric at the corresponding thresholds per_image_tprs (Tensor): for each of the N images, the K values of in-image TPR at the corresponding thresholds paths (list[str]) (optional): [metadata] paths to the source images to which the PIMO curves correspond """ - # metadata - shared_fpr_metric: str - # data threshs: Tensor = field(repr=False) # shape => (K,) shared_fpr: Tensor = field(repr=False) # shape => (K,) @@ -220,7 +215,6 @@ def __post_init__(self) -> None: _validate_is_threshs(self.threshs) _validate_is_shared_fpr(self.shared_fpr, nan_allowed=False) _validate_is_per_image_tprs(self.per_image_tprs, self.image_classes) - self.shared_fpr_metric = PIMOSharedFPRMetric(self.shared_fpr_metric).value if self.paths is not None: _validate_is_source_images_paths(self.paths, expected_num_paths=self.per_image_tprs.shape[0]) @@ -266,7 +260,6 @@ def thresh_at(self, fpr_level: float) -> tuple[int, float, float]: def to_dict(self) -> dict[str, Tensor | str]: """Return a dictionary with the result object's attributes.""" dic = { - "shared_fpr_metric": self.shared_fpr_metric, "threshs": self.threshs, "shared_fpr": self.shared_fpr, "per_image_tprs": self.per_image_tprs, @@ -309,6 +302,9 @@ def load(cls: type["PIMOResult"], file_path: str | Path) -> "PIMOResult": if not isinstance(payload, dict): msg = f"Invalid content in file {file_path}. Must be a dictionary." raise TypeError(msg) + # for compatibility with the original code + if "shared_fpr_metric" in payload: + del payload["shared_fpr_metric"] try: return cls.from_dict(payload) except TypeError as ex: @@ -323,7 +319,6 @@ class AUPIMOResult: This interface gathers the AUPIMO data and metadata and provides several utility methods. 
Attributes: - shared_fpr_metric (str): [metadata] shared FPR metric used to compute the PIMO curve fpr_lower_bound (float): [metadata] LOWER bound of the FPR integration range fpr_upper_bound (float): [metadata] UPPER bound of the FPR integration range num_threshs (int): [metadata] number of thresholds used to effectively compute AUPIMO; @@ -334,7 +329,6 @@ class AUPIMOResult: """ # metadata - shared_fpr_metric: str fpr_lower_bound: float fpr_upper_bound: float num_threshs: int @@ -387,7 +381,6 @@ def thresh_bounds(self) -> tuple[float, float]: def __post_init__(self) -> None: """Validate the inputs for the result object are consistent.""" try: - self.shared_fpr_metric = PIMOSharedFPRMetric(self.shared_fpr_metric).value _validate.is_rate_range((self.fpr_lower_bound, self.fpr_upper_bound)) # TODO(jpcbertoldo): warn when it's too low (use parameters from the numpy code) # noqa: TD003 _validate.is_num_threshs_gte2(self.num_threshs) @@ -447,7 +440,6 @@ def from_pimoresult( _, thresh_upper_bound, __ = pimoresult.thresh_at(fpr_lower_bound) # `_` is the threshold's index, `__` is the actual fpr value return cls( - shared_fpr_metric=pimoresult.shared_fpr_metric, fpr_lower_bound=fpr_lower_bound, fpr_upper_bound=fpr_upper_bound, num_threshs=num_threshs_auc, @@ -460,7 +452,6 @@ def from_pimoresult( def to_dict(self) -> dict[str, Tensor | str | float | int]: """Return a dictionary with the result object's attributes.""" dic = { - "shared_fpr_metric": self.shared_fpr_metric, "fpr_lower_bound": self.fpr_lower_bound, "fpr_upper_bound": self.fpr_upper_bound, "num_threshs": self.num_threshs, @@ -514,6 +505,9 @@ def load(cls: type["AUPIMOResult"], file_path: str | Path) -> "AUPIMOResult": msg = f"Invalid payload in file {file_path}. Must be a dictionary." raise TypeError(msg) payload["aupimos"] = torch.tensor(payload["aupimos"], dtype=torch.float64) + # for compatibility with the original code + if "shared_fpr_metric" in payload: + del payload["shared_fpr_metric"] try: return cls.from_dict(payload) except (TypeError, ValueError) as ex: @@ -551,7 +545,6 @@ def pimo_curves( masks: Tensor, num_threshs: int, binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value, - shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value, paths: list[str] | None = None, ) -> PIMOResult: """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves. @@ -577,7 +570,6 @@ def pimo_curves( masks: binary (bool or int) ground truth masks of shape (N, H, W) num_threshs: number of thresholds to compute (K) binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`) - shared_fpr_metric: metric to compute the shared FPR axis paths: paths to the source images to which the PIMO curves correspond. Default: None. 
@@ -551,7 +545,6 @@ def pimo_curves(
     masks: Tensor,
     num_threshs: int,
     binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
-    shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
     paths: list[str] | None = None,
 ) -> PIMOResult:
     """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves.
@@ -577,7 +570,6 @@ def pimo_curves(
         masks: binary (bool or int) ground truth masks of shape (N, H, W)
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis
         paths: paths to the source images to which the PIMO curves correspond.
             Default: None.

     Returns:
@@ -598,7 +590,6 @@ def pimo_curves(
         masks_array,
         num_threshs,
         binclf_algorithm=binclf_algorithm,
-        shared_fpr_metric=shared_fpr_metric,
     )
     # _ is `image_classes` -- not needed here because it's a property in the result object
@@ -614,9 +605,6 @@ def pimo_curves(
     per_image_tprs = torch.from_numpy(per_image_tprs_array).to(device)

     return PIMOResult(
-        shared_fpr_metric=shared_fpr_metric.value
-        if isinstance(shared_fpr_metric, PIMOSharedFPRMetric)
-        else shared_fpr_metric,
         threshs=threshs,
         shared_fpr=shared_fpr,
         per_image_tprs=per_image_tprs,
@@ -629,7 +617,6 @@ def aupimo_scores(
     masks: Tensor,
     num_threshs: int = 300_000,
     binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
-    shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
     fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
     force: bool = False,
     paths: list[str] | None = None,
@@ -656,7 +643,6 @@ def aupimo_scores(
         masks: binary (bool or int) ground truth masks of shape (N, H, W)
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis
         fpr_bounds: lower and upper bounds of the FPR integration range
         force: whether to force the computation despite bad conditions
         paths: paths to the source images to which the AUPIMO scores correspond.
@@ -677,7 +663,6 @@ def aupimo_scores(
         masks_array,
         num_threshs,
         binclf_algorithm=binclf_algorithm,
-        shared_fpr_metric=shared_fpr_metric,
         fpr_bounds=fpr_bounds,
         force=force,
     )
@@ -696,9 +681,6 @@ def aupimo_scores(
     aupimos = torch.from_numpy(aupimos_array).to(device)

     pimoresult = PIMOResult(
-        shared_fpr_metric=shared_fpr_metric.value
-        if isinstance(shared_fpr_metric, PIMOSharedFPRMetric)
-        else shared_fpr_metric,
         threshs=threshs,
         shared_fpr=shared_fpr,
         per_image_tprs=per_image_tprs,
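A usage sketch of the two functional entry points just simplified, mirroring the calls in the test suite further down. The synthetic maps/masks are hypothetical, and the sketch assumes the functions are reachable as `pimo.pimo_curves` / `pimo.aupimo_scores`; `num_threshs=7` is only viable with `force=True` (far too few thresholds for a trustworthy AUPIMO):

import torch

from anomalib.metrics.per_image import pimo

N, H, W = 4, 32, 32
anomaly_maps = torch.rand(N, H, W)               # continuous anomaly scores, shape (N, H, W)
masks = torch.zeros(N, H, W, dtype=torch.int32)  # binary ground truth, shape (N, H, W)
masks[2:, 8:16, 8:16] = 1                        # images 0-1 normal, images 2-3 anomalous

pimoresult = pimo.pimo_curves(anomaly_maps, masks, num_threshs=7, binclf_algorithm="numba")

pimoresult, aupimoresult = pimo.aupimo_scores(
    anomaly_maps,
    masks,
    num_threshs=7,
    binclf_algorithm="numba",
    fpr_bounds=(1e-5, 1e-4),
    force=True,  # 7 thresholds is far below the 300_000 default
)
aupimoresult.aupimos  # (N,) scores; NaN for the normal images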
@@ -745,7 +727,6 @@ class PIMO(Metric):
     Args:
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis

     Returns:
         PIMOResult: PIMO curves dataclass object. See `PIMOResult` for details.
@@ -757,7 +738,6 @@ class PIMO(Metric):

     num_threshs: int
     binclf_algorithm: str
-    shared_fpr_metric: str

     anomaly_maps: list[Tensor]
     masks: list[Tensor]
@@ -781,14 +761,12 @@ def __init__(
         self,
         num_threshs: int,
         binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
-        shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
     ) -> None:
         """Per-Image Overlap (PIMO) curve.

         Args:
             num_threshs: number of thresholds used to compute the PIMO curve (K)
             binclf_algorithm: algorithm to compute the binary classification curve (see `binclf_curve_numpy.Algorithm`)
-            shared_fpr_metric: metric to compute the shared FPR axis
         """
         super().__init__()
@@ -805,7 +783,6 @@ def __init__(
         # validate binclf_algorithm and get string
         self.binclf_algorithm = BinclfAlgorithm(binclf_algorithm).value
-        self.shared_fpr_metric = PIMOSharedFPRMetric(shared_fpr_metric).value

         self.add_state("anomaly_maps", default=[], dist_reduce_fx="cat")
         self.add_state("masks", default=[], dist_reduce_fx="cat")
@@ -841,7 +818,6 @@ def compute(self) -> PIMOResult:
             masks,
             self.num_threshs,
             binclf_algorithm=self.binclf_algorithm,
-            shared_fpr_metric=self.shared_fpr_metric,
         )
@@ -870,7 +846,6 @@ class AUPIMO(PIMO):
     Args:
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis
         fpr_bounds: lower and upper bounds of the FPR integration range
         force: whether to force the computation despite bad conditions
@@ -915,15 +890,13 @@ def random_model_score(fpr_bounds: tuple[float, float]) -> float:
     def __repr__(self) -> str:
         """Show the metric name and its integration bounds."""
-        metric = self.shared_fpr_metric
         lower, upper = self.fpr_bounds
-        return f"{self.__class__.__name__}({metric} in [{lower:.2g}, {upper:.2g}])"
+        return f"{self.__class__.__name__}([{lower:.2g}, {upper:.2g}])"

     def __init__(
         self,
         num_threshs: int = 300_000,
         binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
-        shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
         fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
         force: bool = False,
     ) -> None:
@@ -932,14 +905,12 @@ def __init__(
         Args:
             num_threshs: [passed to parent `PIMO`] number of thresholds used to compute the PIMO curve
             binclf_algorithm: [passed to parent `PIMO`] algorithm to compute the binary classification curve
-            shared_fpr_metric: [passed to parent `PIMO`] metric to compute the shared FPR curve
             fpr_bounds: lower and upper bounds of the FPR integration range
             force: if True, force the computation of the AUPIMO scores even in bad conditions (e.g. few points)
         """
         super().__init__(
             num_threshs=num_threshs,
             binclf_algorithm=binclf_algorithm,
-            shared_fpr_metric=shared_fpr_metric,
         )

         # other validations are done in PIMO.__init__()
@@ -972,7 +943,6 @@ def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:
             masks,
             self.num_threshs,
             binclf_algorithm=self.binclf_algorithm,
-            shared_fpr_metric=self.shared_fpr_metric,
             fpr_bounds=self.fpr_bounds,
             force=force,
         )
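The same computation through the torchmetrics-style interface above, as a sketch with synthetic batches. At this point in the series `AUPIMO.compute()` still returns the result pair (a later commit below makes an averaged scalar the default):

import torch

from anomalib.metrics.per_image import pimo

metric = pimo.AUPIMO(num_threshs=300_000, binclf_algorithm="numba", fpr_bounds=(1e-5, 1e-4))

for _ in range(3):  # e.g. one `update()` per validation batch
    anomaly_maps = torch.rand(4, 32, 32)
    masks = torch.zeros(4, 32, 32, dtype=torch.int32)
    masks[2:, 8:16, 8:16] = 1  # keep normal AND anomalous images in the accumulated set
    metric.update(anomaly_maps, masks)

pimoresult, aupimoresult = metric.compute()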
diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py
index 0eaccd424f..5cddb5f181 100644
--- a/src/anomalib/metrics/per_image/pimo_numpy.py
+++ b/src/anomalib/metrics/per_image/pimo_numpy.py
@@ -73,7 +73,6 @@ def pimo_curves(
     masks: ndarray,
     num_threshs: int,
     binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
-    shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
 ) -> tuple[ndarray, ndarray, ndarray, ndarray]:
     """Compute the Per-IMage Overlap (PIMO, pronounced pee-mo) curves.
@@ -94,7 +93,6 @@ def pimo_curves(
         masks: binary (bool or int) ground truth masks of shape (N, H, W)
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis

     Returns:
         tuple[ndarray, ndarray, ndarray, ndarray]:
@@ -105,7 +103,6 @@
     """
     # validate the strings are valid
     BinclfAlgorithm(binclf_algorithm)
-    shared_fpr_metric = PIMOSharedFPRMetric(shared_fpr_metric)
     _validate.is_num_threshs_gte2(num_threshs)
     _validate.is_anomaly_maps(anomaly_maps)
     _validate.is_masks(masks)
@@ -136,23 +133,19 @@ def pimo_curves(
     )

     shared_fpr: ndarray
-    if shared_fpr_metric == PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR:
-        # shape -> (N, K)
-        per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0])
-        try:
-            _validate.is_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True)
-        except ValueError as ex:
-            msg = f"Cannot compute PIMO because the per-image FPR curves from normal images are invalid. Cause: {ex}"
-            raise RuntimeError(msg) from ex
-
-        # shape -> (K,)
-        # this is the only shared FPR metric implemented so far,
-        # see note about shared FPR in Details: `anomalib.metrics.per_image.pimo`.
-        shared_fpr = per_image_fprs_normals.mean(axis=0)
+    # mean-per-image-fpr on normal images
+    # shape -> (N, K)
+    per_image_fprs_normals = binclf_curve_numpy.per_image_fpr(binclf_curves[image_classes == 0])
+    try:
+        _validate.is_per_image_rate_curves(per_image_fprs_normals, nan_allowed=False, decreasing=True)
+    except ValueError as ex:
+        msg = f"Cannot compute PIMO because the per-image FPR curves from normal images are invalid. Cause: {ex}"
+        raise RuntimeError(msg) from ex

-    else:
-        msg = f"Shared FPR metric `{shared_fpr_metric}` is not implemented."
-        raise NotImplementedError(msg)
+    # shape -> (K,)
+    # this is the only shared FPR metric implemented so far,
+    # see note about shared FPR in Details: `anomalib.metrics.per_image.pimo`.
+    shared_fpr = per_image_fprs_normals.mean(axis=0)

     # shape -> (N, K)
     per_image_tprs = binclf_curve_numpy.per_image_tpr(binclf_curves)
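What the refactored block above computes, restated as a standalone numpy sketch. The (N, K, 2, 2) confusion-matrix layout with [..., true_class, predicted_class] indexing is an assumption inferred from how `per_image_fpr` is consumed here:

import numpy as np

def mean_perimage_fpr(binclf_curves: np.ndarray, image_classes: np.ndarray) -> np.ndarray:
    """Sketch of the shared FPR axis: mean per-image FPR over the NORMAL images.

    binclf_curves: (N, K, 2, 2) confusion matrices, assumed laid out as
        [..., true_class, predicted_class] with 0 = negative/normal, 1 = positive/anomalous.
    image_classes: (N,) with 0 for normal images, 1 for anomalous images.
    """
    normals = binclf_curves[image_classes == 0]
    tns = normals[..., 0, 0].astype(np.float64)  # true negatives at each threshold
    fps = normals[..., 0, 1].astype(np.float64)  # false positives at each threshold
    per_image_fprs = fps / (fps + tns)           # (N_normal, K), decreasing in the threshold
    return per_image_fprs.mean(axis=0)           # (K,): one shared FPR value per threshold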
@@ -168,7 +161,6 @@ def aupimo_scores(
     masks: ndarray,
     num_threshs: int = 300_000,
     binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA,
-    shared_fpr_metric: PIMOSharedFPRMetric | str = PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR.value,
     fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
     force: bool = False,
 ) -> tuple[ndarray, ndarray, ndarray, ndarray, ndarray, int]:
@@ -190,7 +182,6 @@ def aupimo_scores(
         masks: binary (bool or int) ground truth masks of shape (N, H, W)
         num_threshs: number of thresholds to compute (K)
         binclf_algorithm: algorithm to compute the binary classifier curve (see `binclf_curve_numpy.Algorithm`)
-        shared_fpr_metric: metric to compute the shared FPR axis
         fpr_bounds: lower and upper bounds of the FPR integration range
         force: whether to force the computation despite bad conditions
@@ -211,7 +202,6 @@ def aupimo_scores(
         masks=masks,
         num_threshs=num_threshs,
         binclf_algorithm=binclf_algorithm,
-        shared_fpr_metric=shared_fpr_metric,
     )
     try:
         _validate.is_threshs(threshs)
diff --git a/src/anomalib/metrics/per_image/utils.py b/src/anomalib/metrics/per_image/utils.py
index 581974c0e6..1d47674e2c 100644
--- a/src/anomalib/metrics/per_image/utils.py
+++ b/src/anomalib/metrics/per_image/utils.py
@@ -148,15 +148,6 @@ def _validate_is_scores_per_model_aupimoresult(
     first_model_name, first_aupimoresult = first_key_value

-    # check that the metadata is the same, so they can be compared indeed
-    if aupimoresult.shared_fpr_metric != first_aupimoresult.shared_fpr_metric:
-        msg = (
-            "Expected AUPIMOResult objects in scores per model to have the same shared FPR metric, "
-            f"but got ({model_name}) {aupimoresult.shared_fpr_metric} != "
-            f"{first_aupimoresult.shared_fpr_metric} ({first_model_name})."
-        )
-        raise ValueError(msg)
-
     if aupimoresult.fpr_bounds != first_aupimoresult.fpr_bounds:
         msg = (
             "Expected AUPIMOResult objects in scores per model to have the same FPR bounds, "
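After this change the validator above only has to guarantee that the AUPIMO score sets being compared were computed under the same FPR bounds. The comparison itself (`compare_models_pairwise_ttest_rel`, exercised in the tests below) is built around a related-samples t-test that pairs the two models' scores image by image; a conceptual sketch with hypothetical scores, not the library's actual implementation:

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
# hypothetical per-image AUPIMO scores of two models on the SAME 30 anomalous images
# (the NaN scores of normal images must be dropped before pairing)
aupimos_a = rng.uniform(0.4, 0.9, size=30)
aupimos_b = np.clip(aupimos_a + rng.normal(0.03, 0.05, size=30), 0.0, 1.0)

# paired (related-samples) t-test: is model B better than model A image-wise?
result = stats.ttest_rel(aupimos_b, aupimos_a, alternative="greater")
print(f"p-value: {result.pvalue:.3g}")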
diff --git a/tests/unit/metrics/per_image/__init__.py b/tests/unit/metrics/per_image/__init__.py
index 6c2c8af91d..9773b010c2 100644
--- a/tests/unit/metrics/per_image/__init__.py
+++ b/tests/unit/metrics/per_image/__init__.py
@@ -1,4 +1,8 @@
-"""Per-Image Metrics Tests.
+"""Per-Image Metrics Tests."""

-author: jpcbertoldo
-"""
+# Original Code
+# https://github.com/jpcbertoldo/aupimo
+#
+# Modified
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
\ No newline at end of file
diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py
index 254f3423f5..339ec2b421 100644
--- a/tests/unit/metrics/per_image/test_binclf_curve.py
+++ b/tests/unit/metrics/per_image/test_binclf_curve.py
@@ -1,18 +1,22 @@
-"""Tests for per-image binary classification curves using numpy and numba versions.
+"""Tests for per-image binary classification curves using numpy and numba versions."""
+
+# Original Code
+# https://github.com/jpcbertoldo/aupimo
+#
+# Modified
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0

-author: jpcbertoldo
-"""
 # ruff: noqa: SLF001, PT011

 import numpy as np
 import pytest
 import torch
+from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy
+from anomalib.metrics.per_image.binclf_curve_numpy import HAS_NUMBA
 from numpy import ndarray
 from torch import Tensor

-from anomalib import HAS_NUMBA
-from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy
-
 if HAS_NUMBA:
     from anomalib.metrics.per_image import _binclf_curve_numba
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
index 8de53de29a..f92ab20758 100644
--- a/tests/unit/metrics/per_image/test_pimo.py
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -1,7 +1,11 @@
-"""Test `anomalib.metrics.per_image.pimo_numpy`.
+"""Test `anomalib.metrics.per_image.pimo_numpy`."""

-author: jpcbertoldo
-"""
+# Original Code
+# https://github.com/jpcbertoldo/aupimo
+#
+# Modified
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0

 import tempfile
 from pathlib import Path
@@ -9,11 +13,10 @@
 import numpy as np
 import pytest
 import torch
-from numpy import ndarray
-from torch import Tensor
-
 from anomalib.metrics.per_image import pimo, pimo_numpy
 from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult
+from numpy import ndarray
+from torch import Tensor

 from .test_utils import assert_statsdict_stuff

@@ -249,7 +252,6 @@ def test_pimo_numpy(
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
     )
     _do_test_pimo_outputs(
         threshs,
@@ -274,7 +276,6 @@ def test_pimo(
     """Test if `pimo()` returns the expected values."""

     def do_assertions(pimoresult: PIMOResult) -> None:
-        assert pimoresult.shared_fpr_metric == "mean-per-image-fpr"
         threshs = pimoresult.threshs
         shared_fpr = pimoresult.shared_fpr
         per_image_tprs = pimoresult.per_image_tprs
@@ -296,7 +297,6 @@ def do_assertions(pimoresult: PIMOResult) -> None:
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
     )
     do_assertions(pimoresult)

@@ -304,7 +304,6 @@ def do_assertions(pimoresult: PIMOResult) -> None:
     # metric interface
     metric = pimo.PIMO(
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
     )
     metric.update(anomaly_maps, masks)
     pimoresult = metric.compute()
@@ -362,7 +361,6 @@ def test_aupimo_values_numpy(
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
         fpr_bounds=fpr_bounds,
         force=True,
     )
@@ -394,8 +392,6 @@ def test_aupimo_values(

     def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None:
         # test metadata
-        assert pimoresult.shared_fpr_metric == "mean-per-image-fpr"
-        assert aupimoresult.shared_fpr_metric == "mean-per-image-fpr"
         assert aupimoresult.fpr_bounds == fpr_bounds
         # recall: this one is not the same as the number of thresholds in the curve
         # this is the number of thresholds used to compute the integral in `aupimo()`
@@ -432,7 +428,6 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None:
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
         fpr_bounds=fpr_bounds,
         force=True,
     )
     do_assertions(pimoresult, aupimoresult)
@@ -442,7 +437,6 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None:
     metric = pimo.AUPIMO(
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
         fpr_bounds=fpr_bounds,
         force=True,
     )
@@ -458,7 +452,7 @@ def test_aupimo_edge(
 ) -> None:
     """Test some edge cases."""
     # None is the case of testing the default bounds
-    fpr_bounds = {"fpr_bounds": fpr_bounds, "shared_fpr_metric": "mean-per-image-fpr"} if fpr_bounds is not None else {}
+    fpr_bounds = {"fpr_bounds": fpr_bounds} if fpr_bounds is not None else {}

     # not enough points on the curve
     # 10 threshs / 6 decades = 1.6 threshs per decade < 3
@@ -509,7 +503,6 @@ def test_pimoresult_object(
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
         **optional_kwargs,
     )

@@ -523,7 +516,6 @@ def test_pimoresult_object(
     pimoresult_from_dict = PIMOResult.from_dict(dic)
     assert isinstance(pimoresult_from_dict, PIMOResult)
     # values should be the same
-    assert pimoresult_from_dict.shared_fpr_metric == pimoresult.shared_fpr_metric
     assert torch.allclose(pimoresult_from_dict.threshs, pimoresult.threshs)
     assert torch.allclose(pimoresult_from_dict.shared_fpr, pimoresult.shared_fpr)
     assert torch.allclose(pimoresult_from_dict.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True)
@@ -536,7 +528,6 @@ def test_pimoresult_object(
     pimoresult_from_load = PIMOResult.load(str(file_path))
     assert isinstance(pimoresult_from_load, PIMOResult)
     # values should be the same
-    assert pimoresult_from_load.shared_fpr_metric == pimoresult.shared_fpr_metric
     assert torch.allclose(pimoresult_from_load.threshs, pimoresult.threshs)
     assert torch.allclose(pimoresult_from_load.shared_fpr, pimoresult.shared_fpr)
     assert torch.allclose(pimoresult_from_load.per_image_tprs, pimoresult.per_image_tprs, equal_nan=True)
@@ -557,7 +548,6 @@ def test_aupimoresult_object(
         masks,
         num_threshs=7,
         binclf_algorithm="numba",
-        shared_fpr_metric="mean-per-image-fpr",
         fpr_bounds=(1e-5, 1e-4),
         force=True,
         **optional_kwargs,
     )
@@ -575,7 +565,6 @@ def test_aupimoresult_object(
     aupimoresult_from_dict = AUPIMOResult.from_dict(dic)
     assert isinstance(aupimoresult_from_dict, AUPIMOResult)
     # values should be the same
-    assert aupimoresult_from_dict.shared_fpr_metric == aupimoresult.shared_fpr_metric
     assert aupimoresult_from_dict.fpr_bounds == aupimoresult.fpr_bounds
     assert aupimoresult_from_dict.num_threshs == aupimoresult.num_threshs
     assert aupimoresult_from_dict.thresh_bounds == aupimoresult.thresh_bounds
@@ -589,7 +578,6 @@ def test_aupimoresult_object(
     aupimoresult_from_load = AUPIMOResult.load(str(file_path))
     assert isinstance(aupimoresult_from_load, AUPIMOResult)
     # values should be the same
-    assert aupimoresult_from_load.shared_fpr_metric == aupimoresult.shared_fpr_metric
     assert aupimoresult_from_load.fpr_bounds == aupimoresult.fpr_bounds
     assert aupimoresult_from_load.num_threshs == aupimoresult.num_threshs
     assert aupimoresult_from_load.thresh_bounds == aupimoresult.thresh_bounds
diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py
index d0ceff1860..e0b172f6f4 100644
--- a/tests/unit/metrics/per_image/test_utils.py
+++ b/tests/unit/metrics/per_image/test_utils.py
@@ -1,18 +1,19 @@
-"""Test `utils.py`.
+"""Test `utils.py`."""

-author: jpcbertoldo
-"""
+# Original Code
+# https://github.com/jpcbertoldo/aupimo
+#
+# Modified
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0

 from collections import OrderedDict

 import numpy as np
 import pytest
 import torch
-from torch import Tensor
-
 from anomalib.metrics.per_image import (
     AUPIMOResult,
-    PIMOSharedFPRMetric,
     StatsOutliersPolicy,
     StatsRepeatedPolicy,
     compare_models_pairwise_ttest_rel,
@@ -20,6 +21,7 @@
     format_pairwise_tests_results,
     per_image_scores_stats,
 )
+from torch import Tensor


 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
@@ -33,7 +35,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
     aucs3 = torch.sin(torch.linspace(0, torch.pi, num_images)).clip(0, 1)

     mock_aupimoresult_stuff = {
-        "shared_fpr_metric": PIMOSharedFPRMetric.MEAN_PERIMAGE_FPR,
         "fpr_lower_bound": 1e-5,
         "fpr_upper_bound": 1e-4,
         "num_threshs": 1_000,

From 937a515e053e2921876ed25652d21f04fa36fcee Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:41:03 +0200
Subject: [PATCH 55/57] make torchmetrics compute avg by dft

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_image/pimo.py    | 12 ++++++++++--
 tests/unit/metrics/per_image/test_pimo.py | 12 ++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/anomalib/metrics/per_image/pimo.py b/src/anomalib/metrics/per_image/pimo.py
index d0b380d36c..f29056e973 100644
--- a/src/anomalib/metrics/per_image/pimo.py
+++ b/src/anomalib/metrics/per_image/pimo.py
@@ -854,6 +854,7 @@ class AUPIMO(PIMO):
     """

     fpr_bounds: tuple[float, float]
+    return_average: bool
     force: bool

     @staticmethod
@@ -898,6 +899,7 @@ def __init__(
         num_threshs: int = 300_000,
         binclf_algorithm: BinclfAlgorithm | str = BinclfAlgorithm.NUMBA.value,
         fpr_bounds: tuple[float, float] = (1e-5, 1e-4),
+        return_average: bool = True,
         force: bool = False,
     ) -> None:
         """Area Under the Per-Image Overlap (PIMO) curve.
@@ -906,6 +908,7 @@ def __init__(
             num_threshs: [passed to parent `PIMO`] number of thresholds used to compute the PIMO curve
             binclf_algorithm: [passed to parent `PIMO`] algorithm to compute the binary classification curve
             fpr_bounds: lower and upper bounds of the FPR integration range
+            return_average: if True, return the average AUPIMO score; if False, return all the individual AUPIMO scores
             force: if True, force the computation of the AUPIMO scores even in bad conditions (e.g. few points)
         """
         super().__init__(
@@ -917,7 +920,7 @@ def __init__(
         _validate.is_rate_range(fpr_bounds)
         self.fpr_bounds = fpr_bounds
-
+        self.return_average = return_average
         self.force = force

     def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:
@@ -938,7 +941,7 @@ def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:
         anomaly_maps = torch.concat(self.anomaly_maps, dim=0)
         masks = torch.concat(self.masks, dim=0)
         force = force if force is not None else self.force
-        return aupimo_scores(
+        pimoresult, aupimoresult = aupimo_scores(
             anomaly_maps,
             masks,
             self.num_threshs,
@@ -946,3 +949,8 @@ def compute(self, force: bool | None = None) -> tuple[PIMOResult, AUPIMOResult]:
             fpr_bounds=self.fpr_bounds,
             force=force,
         )
+        if self.return_average:
+            # normal images have NaN AUPIMO scores
+            is_nan = torch.isnan(aupimoresult.aupimos)
+            return aupimoresult.aupimos[~is_nan].mean()
+        return pimoresult, aupimoresult
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
index f92ab20758..42e982c925 100644
--- a/tests/unit/metrics/per_image/test_pimo.py
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -438,12 +438,24 @@ def do_assertions(pimoresult: PIMOResult, aupimoresult: AUPIMOResult) -> None:
         num_threshs=7,
         binclf_algorithm="numba",
         fpr_bounds=fpr_bounds,
+        return_average=False,
         force=True,
     )
     metric.update(anomaly_maps, masks)
     pimoresult_from_metric, aupimoresult_from_metric = metric.compute()
     do_assertions(pimoresult_from_metric, aupimoresult_from_metric)

+    # metric interface
+    metric = pimo.AUPIMO(
+        num_threshs=7,
+        binclf_algorithm="numba",
+        fpr_bounds=fpr_bounds,
+        return_average=True,  # only return the average AUPIMO
+        force=True,
+    )
+    metric.update(anomaly_maps, masks)
+    metric.compute()
+

 def test_aupimo_edge(
     anomaly_maps: ndarray,
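The new default behavior in a sketch: `compute()` now collapses the per-image scores to one scalar, dropping the NaNs that mark normal images, while `return_average=False` restores the result pair (note that the return annotation above still advertises the tuple). The synthetic data is hypothetical:

import torch

from anomalib.metrics.per_image import pimo

anomaly_maps = torch.rand(4, 32, 32)
masks = torch.zeros(4, 32, 32, dtype=torch.int32)
masks[2:, 8:16, 8:16] = 1  # two normal images -> two NaN AUPIMO scores

metric = pimo.AUPIMO(num_threshs=300_000, binclf_algorithm="numba")  # return_average=True by default
metric.update(anomaly_maps, masks)
avg_aupimo = metric.compute()  # scalar Tensor: nanmean-style average over the anomalous images

metric = pimo.AUPIMO(num_threshs=300_000, binclf_algorithm="numba", return_average=False)
metric.update(anomaly_maps, masks)
pimoresult, aupimoresult = metric.compute()  # the full result objects, as before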
From 7c037ec0a6e7f2708f6f5a34c759a58b085a16a7 Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:50:25 +0200
Subject: [PATCH 56/57] pre-commit hooks corrections

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 tests/unit/metrics/per_image/__init__.py          | 2 +-
 tests/unit/metrics/per_image/test_binclf_curve.py | 5 +++--
 tests/unit/metrics/per_image/test_pimo.py         | 5 +++--
 tests/unit/metrics/per_image/test_utils.py        | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/unit/metrics/per_image/__init__.py b/tests/unit/metrics/per_image/__init__.py
index 9773b010c2..555d67a102 100644
--- a/tests/unit/metrics/per_image/__init__.py
+++ b/tests/unit/metrics/per_image/__init__.py
@@ -5,4 +5,4 @@
 #
 # Modified
 # Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
\ No newline at end of file
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/unit/metrics/per_image/test_binclf_curve.py b/tests/unit/metrics/per_image/test_binclf_curve.py
index 339ec2b421..6b0499bf9a 100644
--- a/tests/unit/metrics/per_image/test_binclf_curve.py
+++ b/tests/unit/metrics/per_image/test_binclf_curve.py
@@ -12,11 +12,12 @@
 import numpy as np
 import pytest
 import torch
-from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy
-from anomalib.metrics.per_image.binclf_curve_numpy import HAS_NUMBA
 from numpy import ndarray
 from torch import Tensor

+from anomalib.metrics.per_image import binclf_curve, binclf_curve_numpy
+from anomalib.metrics.per_image.binclf_curve_numpy import HAS_NUMBA
+
 if HAS_NUMBA:
     from anomalib.metrics.per_image import _binclf_curve_numba
diff --git a/tests/unit/metrics/per_image/test_pimo.py b/tests/unit/metrics/per_image/test_pimo.py
index 42e982c925..ce30a13542 100644
--- a/tests/unit/metrics/per_image/test_pimo.py
+++ b/tests/unit/metrics/per_image/test_pimo.py
@@ -13,11 +13,12 @@
 import numpy as np
 import pytest
 import torch
-from anomalib.metrics.per_image import pimo, pimo_numpy
-from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult
 from numpy import ndarray
 from torch import Tensor

+from anomalib.metrics.per_image import pimo, pimo_numpy
+from anomalib.metrics.per_image.pimo import AUPIMOResult, PIMOResult
+
 from .test_utils import assert_statsdict_stuff
diff --git a/tests/unit/metrics/per_image/test_utils.py b/tests/unit/metrics/per_image/test_utils.py
index e0b172f6f4..0e712b6584 100644
--- a/tests/unit/metrics/per_image/test_utils.py
+++ b/tests/unit/metrics/per_image/test_utils.py
@@ -12,6 +12,8 @@
 import numpy as np
 import pytest
 import torch
+from torch import Tensor
+
 from anomalib.metrics.per_image import (
     AUPIMOResult,
     StatsOutliersPolicy,
@@ -21,7 +23,6 @@
     format_pairwise_tests_results,
     per_image_scores_stats,
 )
-from torch import Tensor


 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:

From 27a38dad4a01e7da45dde9d71b2f3dcf6909260b Mon Sep 17 00:00:00 2001
From: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
Date: Thu, 22 Aug 2024 16:20:29 +0200
Subject: [PATCH 57/57] correct numpy.trapezoid

Signed-off-by: jpcbertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 src/anomalib/metrics/per_image/pimo_numpy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anomalib/metrics/per_image/pimo_numpy.py b/src/anomalib/metrics/per_image/pimo_numpy.py
index 5cddb5f181..8b1f56f7ff 100644
--- a/src/anomalib/metrics/per_image/pimo_numpy.py
+++ b/src/anomalib/metrics/per_image/pimo_numpy.py
@@ -303,7 +303,7 @@ def aupimo_scores(
         "Try increasing `num_threshs`.",
     )

-    aucs: ndarray = np.trapz(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1)
+    aucs: ndarray = np.trapezoid(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1)

     # normalize, then clip(0, 1) makes sure that the values are in [0, 1] in case of numerical errors
     normalization_factor = aupimo_normalizing_factor(fpr_bounds)
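The final commit tracks NumPy 2.0, which removed `np.trapz` in favor of `np.trapezoid` (same semantics, SciPy-aligned name), so this module now requires NumPy >= 2.0. A sketch of the integration step itself — TPR integrated against log10 of the shared FPR inside the bounds, then normalized; the arrays and the closed form of the normalizing factor are illustrative assumptions (cf. `aupimo_normalizing_factor`):

import numpy as np

fpr_bounds = (1e-5, 1e-4)

# hypothetical bounded curves: N=3 images, K'=30 thresholds inside the FPR bounds
shared_fpr_bounded = np.geomspace(fpr_bounds[0], fpr_bounds[1], 30)
shared_fpr_bounded_log = np.log10(shared_fpr_bounded)
per_image_tprs_bounded = np.tile(np.linspace(0.2, 0.9, 30), (3, 1))

# area under TPR-vs-log10(FPR), one value per image
aucs = np.trapezoid(per_image_tprs_bounded, x=shared_fpr_bounded_log, axis=1)

# normalize so a perfect model (TPR = 1 over the whole range) scores 1, then clip
normalization_factor = np.log10(fpr_bounds[1]) - np.log10(fpr_bounds[0])  # assumed form
aupimos = np.clip(aucs / normalization_factor, 0.0, 1.0)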