From cfb6832d604797726bece8d84c91823d1992a950 Mon Sep 17 00:00:00 2001
From: Jihyeon Yi <jihyeon.yi@intel.com>
Date: Thu, 21 Dec 2023 18:10:40 +0900
Subject: [PATCH] Handling undefined labels at the annotation statistics
 (#1232)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!-- Contributing guide:
https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md
-->

### Summary
To address issue #1204, modify the `compute_ann_statistics` function to
handle undefined labels (labels that are not present in the dataset's
label categories), and add a corresponding unit test.

<!--
Resolves #111 and #222.
Depends on #1000 (for series of dependent commits).

This PR introduces this capability to make the project better in this
and that.

- Added this feature
- Removed that feature
- Fixed the problem #1234
-->

### How to test
<!-- Describe the testing procedure for reviewers, if changes are
not fully covered by unit tests or manual testing can be complicated.
-->

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```
---
 CHANGELOG.md                             |   2 +
 src/datumaro/components/operations.py    |  24 +-
 tests/requirements.py                    |   3 +
 tests/unit/operations/test_statistics.py | 302 ++++++++++++++++++++++-
 tests/unit/test_ops.py                   | 274 +-------------------
 5 files changed, 327 insertions(+), 278 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9226cf7156..172daabb27 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1221>)
 - Fix Kinetics data format to have media data
   (<https://github.com/openvinotoolkit/datumaro/pull/1223>)
+- Handle undefined labels in the annotation statistics
+  (<https://github.com/openvinotoolkit/datumaro/pull/1232>)
 
 ## 16/11/2023 - Release 1.5.1
 ### Enhancements
diff --git a/src/datumaro/components/operations.py b/src/datumaro/components/operations.py
index 98bbb963a1..9ae475d889 100644
--- a/src/datumaro/components/operations.py
+++ b/src/datumaro/components/operations.py
@@ -5,6 +5,7 @@
 import hashlib
 import logging as log
 import warnings
+from collections import defaultdict
 from copy import deepcopy
 from typing import Callable, Dict, Optional, Set, Tuple
 
@@ -225,10 +226,20 @@ def _extractor_stats(subset_name):
 
 
 def compute_ann_statistics(dataset: IDataset):
-    labels = dataset.categories().get(AnnotationType.label, LabelCategories())
+    warnings.warn(
+        "We are planning to change the type of stats['annotations']['labels']['distribution'] "
+        "and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. "
+        "If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.",
+        FutureWarning,
+    )
+    labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())
 
     def get_label(ann):
-        return labels.items[ann.label].name if ann.label is not None else None
+        try:
+            return labels.items[ann.label].name if ann.label is not None else None
+        except IndexError:
+            log.warning(f"annotation({ann}) has undefined label({ann.label})")
+            return ann.label
 
     stats = {
         "images count": 0,
@@ -253,21 +264,26 @@ def get_label(ann):
     }
     label_stat = {
         "count": 0,
-        "distribution": {l.name: [0, 0] for l in labels.items},  # label -> (count, total%)
+        "distribution": defaultdict(lambda: [0, 0]),  # label -> (count, total%)
         "attributes": {},
     }
+
     stats["annotations"]["labels"] = label_stat
     segm_stat = {
         "avg. area": 0,
         "area distribution": [],  # a histogram with 10 bins
         # (min, min+10%), ..., (min+90%, max) -> (count, total%)
-        "pixel distribution": {l.name: [0, 0] for l in labels.items},  # label -> (count, total%)
+        "pixel distribution": defaultdict(lambda: [0, 0]),  # label -> (count, total%)
     }
     stats["annotations"]["segments"] = segm_stat
     segm_areas = []
     pixel_dist = segm_stat["pixel distribution"]
     total_pixels = 0
 
+    for l in labels.items:
+        label_stat["distribution"][l.name] = [0, 0]
+        pixel_dist[l.name] = [0, 0]
+
     for item in dataset:
         if len(item.annotations) == 0:
             stats["unannotated images"].append(item.id)
diff --git a/tests/requirements.py b/tests/requirements.py
index 262b265728..bf2a160c27 100644
--- a/tests/requirements.py
+++ b/tests/requirements.py
@@ -61,6 +61,9 @@ class Requirements:
     DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
     DATUM_BUG_721 = "Explain command cannot find the model"
     DATUM_BUG_873 = "Error using datum stats"
+    DATUM_BUG_1204 = (
+        "Statistics raise an error when there is a label annotation not in the category"
+    )
 
 
 class SkipMessages:
diff --git a/tests/unit/operations/test_statistics.py b/tests/unit/operations/test_statistics.py
index fc76f3f48c..a3a488615b 100644
--- a/tests/unit/operations/test_statistics.py
+++ b/tests/unit/operations/test_statistics.py
@@ -8,11 +8,16 @@
 import numpy as np
 import pytest
 
+from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
 from datumaro.components.dataset import Dataset
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.errors import DatumaroError
 from datumaro.components.media import Image, PointCloud
-from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
+from datumaro.components.operations import (
+    IMAGE_STATS_SCHEMA,
+    compute_ann_statistics,
+    compute_image_statistics,
+)
 
 from tests.requirements import Requirements, mark_requirement
 
@@ -109,3 +114,298 @@ def test_invalid_media_type(
             with pytest.warns(UserWarning, match="only Image media_type is allowed"):
                 actual = compute_image_statistics(fxt_point_cloud_dataset)
             assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]
+
+
+class AnnStatisticsTest:
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_stats(self):
+        dataset = Dataset.from_iterable(
+            [
+                DatasetItem(
+                    id=1,
+                    media=Image.from_numpy(data=np.ones((5, 5, 3))),
+                    annotations=[
+                        Caption("hello"),
+                        Caption("world"),
+                        Label(
+                            2,
+                            attributes={
+                                "x": 1,
+                                "y": "2",
+                            },
+                        ),
+                        Bbox(
+                            1,
+                            2,
+                            2,
+                            2,
+                            label=2,
+                            attributes={
+                                "score": 0.5,
+                            },
+                        ),
+                        Bbox(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 1,
+                                "y": "3",
+                                "occluded": True,
+                            },
+                        ),
+                        Points([1, 2, 2, 0, 1, 1], label=0),
+                        Mask(
+                            label=3,
+                            image=np.array(
+                                [
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 0, 0, 0],
+                                    [0, 0, 0, 0, 0],
+                                ]
+                            ),
+                        ),
+                    ],
+                ),
+                DatasetItem(
+                    id=2,
+                    media=Image.from_numpy(data=np.ones((2, 4, 3))),
+                    annotations=[
+                        Label(
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "2",
+                            },
+                        ),
+                        Bbox(
+                            1,
+                            2,
+                            2,
+                            2,
+                            label=3,
+                            attributes={
+                                "score": 0.5,
+                            },
+                        ),
+                        Bbox(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "3",
+                                "occluded": False,
+                            },
+                        ),
+                        Ellipse(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "3",
+                                "occluded": False,
+                            },
+                        ),
+                    ],
+                ),
+                DatasetItem(id=3),
+                DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
+            ],
+            categories=["label_%s" % i for i in range(4)],
+        )
+
+        expected = {
+            "images count": 4,
+            "annotations count": 11,
+            "unannotated images count": 2,
+            "unannotated images": ["3", "2.2"],
+            "annotations by type": {
+                "label": {
+                    "count": 2,
+                },
+                "polygon": {
+                    "count": 0,
+                },
+                "polyline": {
+                    "count": 0,
+                },
+                "bbox": {
+                    "count": 4,
+                },
+                "mask": {
+                    "count": 1,
+                },
+                "points": {
+                    "count": 1,
+                },
+                "caption": {
+                    "count": 2,
+                },
+                "cuboid_3d": {"count": 0},
+                "super_resolution_annotation": {"count": 0},
+                "depth_annotation": {"count": 0},
+                "ellipse": {"count": 1},
+                "hash_key": {"count": 0},
+                "feature_vector": {"count": 0},
+                "tabular": {"count": 0},
+                "unknown": {"count": 0},
+            },
+            "annotations": {
+                "labels": {
+                    "count": 6,
+                    "distribution": {
+                        "label_0": [1, 1 / 6],
+                        "label_1": [0, 0.0],
+                        "label_2": [3, 3 / 6],
+                        "label_3": [2, 2 / 6],
+                    },
+                    "attributes": {
+                        "x": {
+                            "count": 2,  # annotations with no label are skipped
+                            "values count": 2,
+                            "values present": ["1", "2"],
+                            "distribution": {
+                                "1": [1, 1 / 2],
+                                "2": [1, 1 / 2],
+                            },
+                        },
+                        "y": {
+                            "count": 2,  # annotations with no label are skipped
+                            "values count": 1,
+                            "values present": ["2"],
+                            "distribution": {
+                                "2": [2, 2 / 2],
+                            },
+                        },
+                        # must not include "special" attributes like "occluded"
+                    },
+                },
+                "segments": {
+                    "avg. area": (4 * 2 + 9 * 1) / 3,
+                    "area distribution": [
+                        {"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
+                        {"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
+                        {"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
+                        {"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
+                        {"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
+                        {"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
+                        {"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
+                        {"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
+                        {"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
+                        {"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
+                    ],
+                    "pixel distribution": {
+                        "label_0": [0, 0.0],
+                        "label_1": [0, 0.0],
+                        "label_2": [4, 4 / 17],
+                        "label_3": [13, 13 / 17],
+                    },
+                },
+            },
+        }
+
+        actual = compute_ann_statistics(dataset)
+
+        assert actual == expected
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_stats_with_empty_dataset(self):
+        label_names = ["label_%s" % i for i in range(4)]
+        dataset = Dataset.from_iterable(
+            [
+                DatasetItem(id=1),
+                DatasetItem(id=3),
+            ],
+            categories=label_names,
+        )
+
+        expected = self._get_stats_template(label_names)
+        expected["images count"] = 2
+        expected["unannotated images count"] = 2
+        expected["unannotated images"] = ["1", "3"]
+
+        actual = compute_ann_statistics(dataset)
+        assert actual == expected
+
+    @mark_requirement(Requirements.DATUM_BUG_1204)
+    def test_stats_with_invalid_label(self):
+        label_names = ["label_%s" % i for i in range(3)]
+        dataset = Dataset.from_iterable(
+            iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
+            categories=label_names,
+        )
+
+        expected = self._get_stats_template(label_names)
+        expected["images count"] = 4
+        expected["annotations count"] = 4
+        expected["annotations by type"]["label"]["count"] = 4
+        expected["annotations"]["labels"]["count"] = 4
+        expected["annotations"]["labels"]["distribution"] = {
+            "label_0": [1, 0.25],
+            "label_1": [1, 0.25],
+            "label_2": [1, 0.25],
+            3: [1, 0.25],  # label which does not exist in categories.
+        }
+
+        actual = compute_ann_statistics(dataset)
+
+        assert actual == expected
+
+    @staticmethod
+    def _get_stats_template(label_names: list):
+        return {
+            "images count": 0,
+            "annotations count": 0,
+            "unannotated images count": 0,
+            "unannotated images": [],
+            "annotations by type": {
+                "label": {
+                    "count": 0,
+                },
+                "polygon": {
+                    "count": 0,
+                },
+                "polyline": {
+                    "count": 0,
+                },
+                "bbox": {
+                    "count": 0,
+                },
+                "mask": {
+                    "count": 0,
+                },
+                "points": {
+                    "count": 0,
+                },
+                "caption": {
+                    "count": 0,
+                },
+                "cuboid_3d": {"count": 0},
+                "super_resolution_annotation": {"count": 0},
+                "depth_annotation": {"count": 0},
+                "ellipse": {"count": 0},
+                "hash_key": {"count": 0},
+                "feature_vector": {"count": 0},
+                "tabular": {"count": 0},
+                "unknown": {"count": 0},
+            },
+            "annotations": {
+                "labels": {
+                    "count": 0,
+                    "distribution": {n: [0, 0] for n in label_names},
+                    "attributes": {},
+                },
+                "segments": {
+                    "avg. area": 0.0,
+                    "area distribution": [],
+                    "pixel distribution": {n: [0, 0] for n in label_names},
+                },
+            },
+        }
diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py
index b15399fd26..45d97ab44e 100644
--- a/tests/unit/test_ops.py
+++ b/tests/unit/test_ops.py
@@ -8,7 +8,6 @@
 from datumaro.components.annotation import (
     AnnotationType,
     Bbox,
-    Caption,
     Ellipse,
     Label,
     LabelCategories,
@@ -25,7 +24,7 @@
 from datumaro.components.merge.exact_merge import ExactMerge
 from datumaro.components.merge.intersect_merge import IntersectMerge
 from datumaro.components.merge.union_merge import UnionMerge
-from datumaro.components.operations import compute_ann_statistics, find_unique_images, mean_std
+from datumaro.components.operations import find_unique_images, mean_std
 from datumaro.errors import (
     ConflictingCategoriesError,
     FailedAttrVotingError,
@@ -67,277 +66,6 @@ def test_mean_std(self):
         for estd, astd in zip(expected_std, actual_std):
             assert np.allclose(estd, astd, atol=0.1)
 
-    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
-    def test_stats(self):
-        dataset = Dataset.from_iterable(
-            [
-                DatasetItem(
-                    id=1,
-                    media=Image.from_numpy(data=np.ones((5, 5, 3))),
-                    annotations=[
-                        Caption("hello"),
-                        Caption("world"),
-                        Label(
-                            2,
-                            attributes={
-                                "x": 1,
-                                "y": "2",
-                            },
-                        ),
-                        Bbox(
-                            1,
-                            2,
-                            2,
-                            2,
-                            label=2,
-                            attributes={
-                                "score": 0.5,
-                            },
-                        ),
-                        Bbox(
-                            5,
-                            6,
-                            2,
-                            2,
-                            attributes={
-                                "x": 1,
-                                "y": "3",
-                                "occluded": True,
-                            },
-                        ),
-                        Points([1, 2, 2, 0, 1, 1], label=0),
-                        Mask(
-                            label=3,
-                            image=np.array(
-                                [
-                                    [0, 0, 1, 1, 1],
-                                    [0, 0, 1, 1, 1],
-                                    [0, 0, 1, 1, 1],
-                                    [0, 0, 0, 0, 0],
-                                    [0, 0, 0, 0, 0],
-                                ]
-                            ),
-                        ),
-                    ],
-                ),
-                DatasetItem(
-                    id=2,
-                    media=Image.from_numpy(data=np.ones((2, 4, 3))),
-                    annotations=[
-                        Label(
-                            2,
-                            attributes={
-                                "x": 2,
-                                "y": "2",
-                            },
-                        ),
-                        Bbox(
-                            1,
-                            2,
-                            2,
-                            2,
-                            label=3,
-                            attributes={
-                                "score": 0.5,
-                            },
-                        ),
-                        Bbox(
-                            5,
-                            6,
-                            2,
-                            2,
-                            attributes={
-                                "x": 2,
-                                "y": "3",
-                                "occluded": False,
-                            },
-                        ),
-                        Ellipse(
-                            5,
-                            6,
-                            2,
-                            2,
-                            attributes={
-                                "x": 2,
-                                "y": "3",
-                                "occluded": False,
-                            },
-                        ),
-                    ],
-                ),
-                DatasetItem(id=3),
-                DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
-            ],
-            categories=["label_%s" % i for i in range(4)],
-        )
-
-        expected = {
-            "images count": 4,
-            "annotations count": 11,
-            "unannotated images count": 2,
-            "unannotated images": ["3", "2.2"],
-            "annotations by type": {
-                "label": {
-                    "count": 2,
-                },
-                "polygon": {
-                    "count": 0,
-                },
-                "polyline": {
-                    "count": 0,
-                },
-                "bbox": {
-                    "count": 4,
-                },
-                "mask": {
-                    "count": 1,
-                },
-                "points": {
-                    "count": 1,
-                },
-                "caption": {
-                    "count": 2,
-                },
-                "cuboid_3d": {"count": 0},
-                "super_resolution_annotation": {"count": 0},
-                "depth_annotation": {"count": 0},
-                "ellipse": {"count": 1},
-                "hash_key": {"count": 0},
-                "feature_vector": {"count": 0},
-                "tabular": {"count": 0},
-                "unknown": {"count": 0},
-            },
-            "annotations": {
-                "labels": {
-                    "count": 6,
-                    "distribution": {
-                        "label_0": [1, 1 / 6],
-                        "label_1": [0, 0.0],
-                        "label_2": [3, 3 / 6],
-                        "label_3": [2, 2 / 6],
-                    },
-                    "attributes": {
-                        "x": {
-                            "count": 2,  # annotations with no label are skipped
-                            "values count": 2,
-                            "values present": ["1", "2"],
-                            "distribution": {
-                                "1": [1, 1 / 2],
-                                "2": [1, 1 / 2],
-                            },
-                        },
-                        "y": {
-                            "count": 2,  # annotations with no label are skipped
-                            "values count": 1,
-                            "values present": ["2"],
-                            "distribution": {
-                                "2": [2, 2 / 2],
-                            },
-                        },
-                        # must not include "special" attributes like "occluded"
-                    },
-                },
-                "segments": {
-                    "avg. area": (4 * 2 + 9 * 1) / 3,
-                    "area distribution": [
-                        {"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
-                        {"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
-                        {"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
-                        {"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
-                        {"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
-                        {"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
-                        {"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
-                        {"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
-                        {"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
-                        {"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
-                    ],
-                    "pixel distribution": {
-                        "label_0": [0, 0.0],
-                        "label_1": [0, 0.0],
-                        "label_2": [4, 4 / 17],
-                        "label_3": [13, 13 / 17],
-                    },
-                },
-            },
-        }
-
-        actual = compute_ann_statistics(dataset)
-
-        self.assertEqual(expected, actual)
-
-    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
-    def test_stats_with_empty_dataset(self):
-        dataset = Dataset.from_iterable(
-            [
-                DatasetItem(id=1),
-                DatasetItem(id=3),
-            ],
-            categories=["label_%s" % i for i in range(4)],
-        )
-
-        expected = {
-            "images count": 2,
-            "annotations count": 0,
-            "unannotated images count": 2,
-            "unannotated images": ["1", "3"],
-            "annotations by type": {
-                "label": {
-                    "count": 0,
-                },
-                "polygon": {
-                    "count": 0,
-                },
-                "polyline": {
-                    "count": 0,
-                },
-                "bbox": {
-                    "count": 0,
-                },
-                "mask": {
-                    "count": 0,
-                },
-                "points": {
-                    "count": 0,
-                },
-                "caption": {
-                    "count": 0,
-                },
-                "cuboid_3d": {"count": 0},
-                "super_resolution_annotation": {"count": 0},
-                "depth_annotation": {"count": 0},
-                "ellipse": {"count": 0},
-                "hash_key": {"count": 0},
-                "feature_vector": {"count": 0},
-                "tabular": {"count": 0},
-                "unknown": {"count": 0},
-            },
-            "annotations": {
-                "labels": {
-                    "count": 0,
-                    "distribution": {
-                        "label_0": [0, 0.0],
-                        "label_1": [0, 0.0],
-                        "label_2": [0, 0.0],
-                        "label_3": [0, 0.0],
-                    },
-                    "attributes": {},
-                },
-                "segments": {
-                    "avg. area": 0.0,
-                    "area distribution": [],
-                    "pixel distribution": {
-                        "label_0": [0, 0.0],
-                        "label_1": [0, 0.0],
-                        "label_2": [0, 0.0],
-                        "label_3": [0, 0.0],
-                    },
-                },
-            },
-        }
-
-        actual = compute_ann_statistics(dataset)
-        self.assertEqual(expected, actual)
-
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_unique_image_count(self):
         expected = {