From cfb6832d604797726bece8d84c91823d1992a950 Mon Sep 17 00:00:00 2001 From: Jihyeon Yi Date: Thu, 21 Dec 2023 18:10:40 +0900 Subject: [PATCH] Handling undefined labels at the annotation statistics (#1232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary Regarding issue #1204, modify the `compute_ann_statistics` function to handle undefined labels, and add a corresponding unit test. ### How to test Run the unit tests in `tests/unit/operations/test_statistics.py` (see `test_stats_with_invalid_label`). ### Checklist - [x] I have added unit tests to cover my changes. - [ ] I have added integration tests to cover my changes. - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md). - [ ] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly. ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [ ] I have updated the license header for each file (see an example below). 
```python # Copyright (C) 2023 Intel Corporation # # SPDX-License-Identifier: MIT ``` --- CHANGELOG.md | 2 + src/datumaro/components/operations.py | 24 +- tests/requirements.py | 3 + tests/unit/operations/test_statistics.py | 302 ++++++++++++++++++++++- tests/unit/test_ops.py | 274 +------------------- 5 files changed, 327 insertions(+), 278 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9226cf7156..172daabb27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () - Fix Kinetics data format to have media data () +- Handling undefined labels at the annotation statistics + () ## 16/11/2023 - Release 1.5.1 ### Enhancements diff --git a/src/datumaro/components/operations.py b/src/datumaro/components/operations.py index 98bbb963a1..9ae475d889 100644 --- a/src/datumaro/components/operations.py +++ b/src/datumaro/components/operations.py @@ -5,6 +5,7 @@ import hashlib import logging as log import warnings +from collections import defaultdict from copy import deepcopy from typing import Callable, Dict, Optional, Set, Tuple @@ -225,10 +226,20 @@ def _extractor_stats(subset_name): def compute_ann_statistics(dataset: IDataset): - labels = dataset.categories().get(AnnotationType.label, LabelCategories()) + warnings.warn( + "We are planning to change the type of stats['annotations']['labels']['distribution'] " + "and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. 
" + "If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.", + FutureWarning, + ) + labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories()) def get_label(ann): - return labels.items[ann.label].name if ann.label is not None else None + try: + return labels.items[ann.label].name if ann.label is not None else None + except IndexError: + log.warning(f"annotation({ann}) has undefined label({ann.label})") + return ann.label stats = { "images count": 0, @@ -253,21 +264,26 @@ def get_label(ann): } label_stat = { "count": 0, - "distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%) + "distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%) "attributes": {}, } + stats["annotations"]["labels"] = label_stat segm_stat = { "avg. area": 0, "area distribution": [], # a histogram with 10 bins # (min, min+10%), ..., (min+90%, max) -> (count, total%) - "pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%) + "pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%) } stats["annotations"]["segments"] = segm_stat segm_areas = [] pixel_dist = segm_stat["pixel distribution"] total_pixels = 0 + for l in labels.items: + label_stat["distribution"][l.name] = [0, 0] + pixel_dist[l.name] = [0, 0] + for item in dataset: if len(item.annotations) == 0: stats["unannotated images"].append(item.id) diff --git a/tests/requirements.py b/tests/requirements.py index 262b265728..bf2a160c27 100644 --- a/tests/requirements.py +++ b/tests/requirements.py @@ -61,6 +61,9 @@ class Requirements: DATUM_BUG_618 = "ResizeTransform returns broken image pixels" DATUM_BUG_721 = "Explain command cannot find the model" DATUM_BUG_873 = "Error using datum stats" + DATUM_BUG_1204 = ( + "Statistics raise an error when there is a label annotation not in the category" + ) class SkipMessages: diff --git a/tests/unit/operations/test_statistics.py 
b/tests/unit/operations/test_statistics.py index fc76f3f48c..a3a488615b 100644 --- a/tests/unit/operations/test_statistics.py +++ b/tests/unit/operations/test_statistics.py @@ -8,11 +8,16 @@ import numpy as np import pytest +from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points from datumaro.components.dataset import Dataset from datumaro.components.dataset_base import DatasetItem from datumaro.components.errors import DatumaroError from datumaro.components.media import Image, PointCloud -from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics +from datumaro.components.operations import ( + IMAGE_STATS_SCHEMA, + compute_ann_statistics, + compute_image_statistics, +) from tests.requirements import Requirements, mark_requirement @@ -109,3 +114,298 @@ def test_invalid_media_type( with pytest.warns(UserWarning, match="only Image media_type is allowed"): actual = compute_image_statistics(fxt_point_cloud_dataset) assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"] + + +class AnnStatisticsTest: + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_stats(self): + dataset = Dataset.from_iterable( + [ + DatasetItem( + id=1, + media=Image.from_numpy(data=np.ones((5, 5, 3))), + annotations=[ + Caption("hello"), + Caption("world"), + Label( + 2, + attributes={ + "x": 1, + "y": "2", + }, + ), + Bbox( + 1, + 2, + 2, + 2, + label=2, + attributes={ + "score": 0.5, + }, + ), + Bbox( + 5, + 6, + 2, + 2, + attributes={ + "x": 1, + "y": "3", + "occluded": True, + }, + ), + Points([1, 2, 2, 0, 1, 1], label=0), + Mask( + label=3, + image=np.array( + [ + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ] + ), + ), + ], + ), + DatasetItem( + id=2, + media=Image.from_numpy(data=np.ones((2, 4, 3))), + annotations=[ + Label( + 2, + attributes={ + "x": 2, + "y": "2", + }, + ), + Bbox( + 1, + 2, + 2, + 2, + label=3, + attributes={ + "score": 0.5, + }, + ), + Bbox( + 5, + 
6, + 2, + 2, + attributes={ + "x": 2, + "y": "3", + "occluded": False, + }, + ), + Ellipse( + 5, + 6, + 2, + 2, + attributes={ + "x": 2, + "y": "3", + "occluded": False, + }, + ), + ], + ), + DatasetItem(id=3), + DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))), + ], + categories=["label_%s" % i for i in range(4)], + ) + + expected = { + "images count": 4, + "annotations count": 11, + "unannotated images count": 2, + "unannotated images": ["3", "2.2"], + "annotations by type": { + "label": { + "count": 2, + }, + "polygon": { + "count": 0, + }, + "polyline": { + "count": 0, + }, + "bbox": { + "count": 4, + }, + "mask": { + "count": 1, + }, + "points": { + "count": 1, + }, + "caption": { + "count": 2, + }, + "cuboid_3d": {"count": 0}, + "super_resolution_annotation": {"count": 0}, + "depth_annotation": {"count": 0}, + "ellipse": {"count": 1}, + "hash_key": {"count": 0}, + "feature_vector": {"count": 0}, + "tabular": {"count": 0}, + "unknown": {"count": 0}, + }, + "annotations": { + "labels": { + "count": 6, + "distribution": { + "label_0": [1, 1 / 6], + "label_1": [0, 0.0], + "label_2": [3, 3 / 6], + "label_3": [2, 2 / 6], + }, + "attributes": { + "x": { + "count": 2, # annotations with no label are skipped + "values count": 2, + "values present": ["1", "2"], + "distribution": { + "1": [1, 1 / 2], + "2": [1, 1 / 2], + }, + }, + "y": { + "count": 2, # annotations with no label are skipped + "values count": 1, + "values present": ["2"], + "distribution": { + "2": [2, 2 / 2], + }, + }, + # must not include "special" attributes like "occluded" + }, + }, + "segments": { + "avg. 
area": (4 * 2 + 9 * 1) / 3, + "area distribution": [ + {"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3}, + {"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0}, + {"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0}, + {"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0}, + {"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0}, + {"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0}, + {"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0}, + {"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0}, + {"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0}, + {"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3}, + ], + "pixel distribution": { + "label_0": [0, 0.0], + "label_1": [0, 0.0], + "label_2": [4, 4 / 17], + "label_3": [13, 13 / 17], + }, + }, + }, + } + + actual = compute_ann_statistics(dataset) + + assert actual == expected + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_stats_with_empty_dataset(self): + label_names = ["label_%s" % i for i in range(4)] + dataset = Dataset.from_iterable( + [ + DatasetItem(id=1), + DatasetItem(id=3), + ], + categories=label_names, + ) + + expected = self._get_stats_template(label_names) + expected["images count"] = 2 + expected["unannotated images count"] = 2 + expected["unannotated images"] = ["1", "3"] + + actual = compute_ann_statistics(dataset) + assert actual == expected + + @mark_requirement(Requirements.DATUM_BUG_1204) + def test_stats_with_invalid_label(self): + label_names = ["label_%s" % i for i in range(3)] + dataset = Dataset.from_iterable( + iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)], + categories=label_names, + ) + + expected = self._get_stats_template(label_names) + expected["images count"] = 4 + expected["annotations count"] = 4 + expected["annotations by type"]["label"]["count"] = 4 + expected["annotations"]["labels"]["count"] = 4 + expected["annotations"]["labels"]["distribution"] = { + "label_0": [1, 0.25], + "label_1": [1, 0.25], + "label_2": [1, 0.25], + 
3: [1, 0.25], # label which does not exist in categories. + } + + actual = compute_ann_statistics(dataset) + + assert actual == expected + + @staticmethod + def _get_stats_template(label_names: list): + return { + "images count": 0, + "annotations count": 0, + "unannotated images count": 0, + "unannotated images": [], + "annotations by type": { + "label": { + "count": 0, + }, + "polygon": { + "count": 0, + }, + "polyline": { + "count": 0, + }, + "bbox": { + "count": 0, + }, + "mask": { + "count": 0, + }, + "points": { + "count": 0, + }, + "caption": { + "count": 0, + }, + "cuboid_3d": {"count": 0}, + "super_resolution_annotation": {"count": 0}, + "depth_annotation": {"count": 0}, + "ellipse": {"count": 0}, + "hash_key": {"count": 0}, + "feature_vector": {"count": 0}, + "tabular": {"count": 0}, + "unknown": {"count": 0}, + }, + "annotations": { + "labels": { + "count": 0, + "distribution": {n: [0, 0] for n in label_names}, + "attributes": {}, + }, + "segments": { + "avg. area": 0.0, + "area distribution": [], + "pixel distribution": {n: [0, 0] for n in label_names}, + }, + }, + } diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index b15399fd26..45d97ab44e 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -8,7 +8,6 @@ from datumaro.components.annotation import ( AnnotationType, Bbox, - Caption, Ellipse, Label, LabelCategories, @@ -25,7 +24,7 @@ from datumaro.components.merge.exact_merge import ExactMerge from datumaro.components.merge.intersect_merge import IntersectMerge from datumaro.components.merge.union_merge import UnionMerge -from datumaro.components.operations import compute_ann_statistics, find_unique_images, mean_std +from datumaro.components.operations import find_unique_images, mean_std from datumaro.errors import ( ConflictingCategoriesError, FailedAttrVotingError, @@ -67,277 +66,6 @@ def test_mean_std(self): for estd, astd in zip(expected_std, actual_std): assert np.allclose(estd, astd, atol=0.1) - 
@mark_requirement(Requirements.DATUM_GENERAL_REQ) - def test_stats(self): - dataset = Dataset.from_iterable( - [ - DatasetItem( - id=1, - media=Image.from_numpy(data=np.ones((5, 5, 3))), - annotations=[ - Caption("hello"), - Caption("world"), - Label( - 2, - attributes={ - "x": 1, - "y": "2", - }, - ), - Bbox( - 1, - 2, - 2, - 2, - label=2, - attributes={ - "score": 0.5, - }, - ), - Bbox( - 5, - 6, - 2, - 2, - attributes={ - "x": 1, - "y": "3", - "occluded": True, - }, - ), - Points([1, 2, 2, 0, 1, 1], label=0), - Mask( - label=3, - image=np.array( - [ - [0, 0, 1, 1, 1], - [0, 0, 1, 1, 1], - [0, 0, 1, 1, 1], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - ), - ), - ], - ), - DatasetItem( - id=2, - media=Image.from_numpy(data=np.ones((2, 4, 3))), - annotations=[ - Label( - 2, - attributes={ - "x": 2, - "y": "2", - }, - ), - Bbox( - 1, - 2, - 2, - 2, - label=3, - attributes={ - "score": 0.5, - }, - ), - Bbox( - 5, - 6, - 2, - 2, - attributes={ - "x": 2, - "y": "3", - "occluded": False, - }, - ), - Ellipse( - 5, - 6, - 2, - 2, - attributes={ - "x": 2, - "y": "3", - "occluded": False, - }, - ), - ], - ), - DatasetItem(id=3), - DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))), - ], - categories=["label_%s" % i for i in range(4)], - ) - - expected = { - "images count": 4, - "annotations count": 11, - "unannotated images count": 2, - "unannotated images": ["3", "2.2"], - "annotations by type": { - "label": { - "count": 2, - }, - "polygon": { - "count": 0, - }, - "polyline": { - "count": 0, - }, - "bbox": { - "count": 4, - }, - "mask": { - "count": 1, - }, - "points": { - "count": 1, - }, - "caption": { - "count": 2, - }, - "cuboid_3d": {"count": 0}, - "super_resolution_annotation": {"count": 0}, - "depth_annotation": {"count": 0}, - "ellipse": {"count": 1}, - "hash_key": {"count": 0}, - "feature_vector": {"count": 0}, - "tabular": {"count": 0}, - "unknown": {"count": 0}, - }, - "annotations": { - "labels": { - "count": 6, - "distribution": { - "label_0": 
[1, 1 / 6], - "label_1": [0, 0.0], - "label_2": [3, 3 / 6], - "label_3": [2, 2 / 6], - }, - "attributes": { - "x": { - "count": 2, # annotations with no label are skipped - "values count": 2, - "values present": ["1", "2"], - "distribution": { - "1": [1, 1 / 2], - "2": [1, 1 / 2], - }, - }, - "y": { - "count": 2, # annotations with no label are skipped - "values count": 1, - "values present": ["2"], - "distribution": { - "2": [2, 2 / 2], - }, - }, - # must not include "special" attributes like "occluded" - }, - }, - "segments": { - "avg. area": (4 * 2 + 9 * 1) / 3, - "area distribution": [ - {"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3}, - {"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0}, - {"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0}, - {"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0}, - {"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0}, - {"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0}, - {"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0}, - {"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0}, - {"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0}, - {"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3}, - ], - "pixel distribution": { - "label_0": [0, 0.0], - "label_1": [0, 0.0], - "label_2": [4, 4 / 17], - "label_3": [13, 13 / 17], - }, - }, - }, - } - - actual = compute_ann_statistics(dataset) - - self.assertEqual(expected, actual) - - @mark_requirement(Requirements.DATUM_GENERAL_REQ) - def test_stats_with_empty_dataset(self): - dataset = Dataset.from_iterable( - [ - DatasetItem(id=1), - DatasetItem(id=3), - ], - categories=["label_%s" % i for i in range(4)], - ) - - expected = { - "images count": 2, - "annotations count": 0, - "unannotated images count": 2, - "unannotated images": ["1", "3"], - "annotations by type": { - "label": { - "count": 0, - }, - "polygon": { - "count": 0, - }, - "polyline": { - "count": 0, - }, - "bbox": { - "count": 0, - }, - "mask": { - "count": 0, - }, - "points": { - "count": 0, - }, - "caption": 
{ - "count": 0, - }, - "cuboid_3d": {"count": 0}, - "super_resolution_annotation": {"count": 0}, - "depth_annotation": {"count": 0}, - "ellipse": {"count": 0}, - "hash_key": {"count": 0}, - "feature_vector": {"count": 0}, - "tabular": {"count": 0}, - "unknown": {"count": 0}, - }, - "annotations": { - "labels": { - "count": 0, - "distribution": { - "label_0": [0, 0.0], - "label_1": [0, 0.0], - "label_2": [0, 0.0], - "label_3": [0, 0.0], - }, - "attributes": {}, - }, - "segments": { - "avg. area": 0.0, - "area distribution": [], - "pixel distribution": { - "label_0": [0, 0.0], - "label_1": [0, 0.0], - "label_2": [0, 0.0], - "label_3": [0, 0.0], - }, - }, - }, - } - - actual = compute_ann_statistics(dataset) - self.assertEqual(expected, actual) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_unique_image_count(self): expected = {