Skip to content

Commit

Permalink
Handling undefined labels at the annotation statistics (openvinotoolk…
Browse files Browse the repository at this point in the history
…it#1232)

<!-- Contributing guide:
https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md
-->

### Summary
To address issue openvinotoolkit#1204, modify the `compute_ann_statistics`
function to handle undefined labels, and add a corresponding unit test.

<!--
Resolves openvinotoolkit#111 and openvinotoolkit#222.
Depends on openvinotoolkit#1000 (for series of dependent commits).

This PR introduces this capability to make the project better in this
and that.

- Added this feature
- Removed that feature
- Fixed the problem openvinotoolkit#1234
-->

### How to test
<!-- Describe the testing procedure for reviewers, if changes are
not fully covered by unit tests or manual testing can be complicated.
-->

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```
  • Loading branch information
Jihyeon Yi authored Dec 21, 2023
1 parent cce5fc9 commit cfb6832
Show file tree
Hide file tree
Showing 5 changed files with 327 additions and 278 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1221>)
- Fix Kinetics data format to have media data
(<https://github.com/openvinotoolkit/datumaro/pull/1223>)
- Handle undefined labels in the annotation statistics
(<https://github.com/openvinotoolkit/datumaro/pull/1232>)

## 16/11/2023 - Release 1.5.1
### Enhancements
Expand Down
24 changes: 20 additions & 4 deletions src/datumaro/components/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hashlib
import logging as log
import warnings
from collections import defaultdict
from copy import deepcopy
from typing import Callable, Dict, Optional, Set, Tuple

Expand Down Expand Up @@ -225,10 +226,20 @@ def _extractor_stats(subset_name):


def compute_ann_statistics(dataset: IDataset):
labels = dataset.categories().get(AnnotationType.label, LabelCategories())
warnings.warn(
"We are planning to change the type of stats['annotations']['labels']['distribution'] "
"and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. "
"If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.",
FutureWarning,
)
labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())

def get_label(ann):
    """Map an annotation's label index to the key used in the statistics.

    Returns:
        ``None`` when the annotation has no label, the category name when
        ``ann.label`` is a valid index into ``labels.items``, and the raw
        ``ann.label`` value (after logging a warning) when the index is not
        defined in the label categories.
    """
    index = ann.label
    if index is None:
        return None

    # Check the range explicitly instead of relying on IndexError: a negative
    # index would wrap around and silently report a label from the end of
    # the category list rather than being flagged as undefined.
    if 0 <= index < len(labels.items):
        return labels.items[index].name

    log.warning("annotation(%s) has undefined label(%s)", ann, index)
    return index

stats = {
"images count": 0,
Expand All @@ -253,21 +264,26 @@ def get_label(ann):
}
label_stat = {
"count": 0,
"distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
"attributes": {},
}

stats["annotations"]["labels"] = label_stat
segm_stat = {
"avg. area": 0,
"area distribution": [], # a histogram with 10 bins
# (min, min+10%), ..., (min+90%, max) -> (count, total%)
"pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
}
stats["annotations"]["segments"] = segm_stat
segm_areas = []
pixel_dist = segm_stat["pixel distribution"]
total_pixels = 0

for l in labels.items:
label_stat["distribution"][l.name] = [0, 0]
pixel_dist[l.name] = [0, 0]

for item in dataset:
if len(item.annotations) == 0:
stats["unannotated images"].append(item.id)
Expand Down
3 changes: 3 additions & 0 deletions tests/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class Requirements:
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
DATUM_BUG_721 = "Explain command cannot find the model"
DATUM_BUG_873 = "Error using datum stats"
DATUM_BUG_1204 = (
"Statistics raise an error when there is a label annotation not in the category"
)


class SkipMessages:
Expand Down
302 changes: 301 additions & 1 deletion tests/unit/operations/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@
import numpy as np
import pytest

from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image, PointCloud
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
from datumaro.components.operations import (
IMAGE_STATS_SCHEMA,
compute_ann_statistics,
compute_image_statistics,
)

from tests.requirements import Requirements, mark_requirement

Expand Down Expand Up @@ -109,3 +114,298 @@ def test_invalid_media_type(
with pytest.warns(UserWarning, match="only Image media_type is allowed"):
actual = compute_image_statistics(fxt_point_cloud_dataset)
assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]


class AnnStatisticsTest:
    """Unit tests for ``compute_ann_statistics``."""

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_stats(self):
        """Check the full statistics of a dataset mixing several annotation types."""
        mask_image = np.array(
            [
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 1, 1, 1],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
            ]
        )

        first_item = DatasetItem(
            id=1,
            media=Image.from_numpy(data=np.ones((5, 5, 3))),
            annotations=[
                Caption("hello"),
                Caption("world"),
                Label(2, attributes={"x": 1, "y": "2"}),
                Bbox(1, 2, 2, 2, label=2, attributes={"score": 0.5}),
                Bbox(5, 6, 2, 2, attributes={"x": 1, "y": "3", "occluded": True}),
                Points([1, 2, 2, 0, 1, 1], label=0),
                Mask(label=3, image=mask_image),
            ],
        )
        second_item = DatasetItem(
            id=2,
            media=Image.from_numpy(data=np.ones((2, 4, 3))),
            annotations=[
                Label(2, attributes={"x": 2, "y": "2"}),
                Bbox(1, 2, 2, 2, label=3, attributes={"score": 0.5}),
                Bbox(5, 6, 2, 2, attributes={"x": 2, "y": "3", "occluded": False}),
                Ellipse(5, 6, 2, 2, attributes={"x": 2, "y": "3", "occluded": False}),
            ],
        )
        # The two remaining items carry no annotations at all.
        bare_item = DatasetItem(id=3)
        media_only_item = DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3))))

        dataset = Dataset.from_iterable(
            [first_item, second_item, bare_item, media_only_item],
            categories=[f"label_{i}" for i in range(4)],
        )

        counts_by_type = {
            "label": 2,
            "polygon": 0,
            "polyline": 0,
            "bbox": 4,
            "mask": 1,
            "points": 1,
            "caption": 2,
            "cuboid_3d": 0,
            "super_resolution_annotation": 0,
            "depth_annotation": 0,
            "ellipse": 1,
            "hash_key": 0,
            "feature_vector": 0,
            "tabular": 0,
            "unknown": 0,
        }
        # Ten equal-width bins between the smallest (4) and largest (9) area.
        area_bin_counts = [2, 0, 0, 0, 0, 0, 0, 0, 0, 1]

        expected = {
            "images count": 4,
            "annotations count": 11,
            "unannotated images count": 2,
            "unannotated images": ["3", "2.2"],
            "annotations by type": {
                type_name: {"count": count} for type_name, count in counts_by_type.items()
            },
            "annotations": {
                "labels": {
                    "count": 6,
                    "distribution": {
                        "label_0": [1, 1 / 6],
                        "label_1": [0, 0.0],
                        "label_2": [3, 3 / 6],
                        "label_3": [2, 2 / 6],
                    },
                    "attributes": {
                        "x": {
                            "count": 2,  # annotations with no label are skipped
                            "values count": 2,
                            "values present": ["1", "2"],
                            "distribution": {
                                "1": [1, 1 / 2],
                                "2": [1, 1 / 2],
                            },
                        },
                        "y": {
                            "count": 2,  # annotations with no label are skipped
                            "values count": 1,
                            "values present": ["2"],
                            "distribution": {
                                "2": [2, 2 / 2],
                            },
                        },
                        # must not include "special" attributes like "occluded"
                    },
                },
                "segments": {
                    "avg. area": (4 * 2 + 9 * 1) / 3,
                    "area distribution": [
                        {
                            "min": 4.0 + 0.5 * bin_idx,
                            "max": 4.5 + 0.5 * bin_idx,
                            "count": bin_count,
                            "percent": bin_count / 3,
                        }
                        for bin_idx, bin_count in enumerate(area_bin_counts)
                    ],
                    "pixel distribution": {
                        "label_0": [0, 0.0],
                        "label_1": [0, 0.0],
                        "label_2": [4, 4 / 17],
                        "label_3": [13, 13 / 17],
                    },
                },
            },
        }

        assert compute_ann_statistics(dataset) == expected

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_stats_with_empty_dataset(self):
        """Items without annotations yield the zeroed template plus item counts."""
        label_names = [f"label_{i}" for i in range(4)]
        dataset = Dataset.from_iterable(
            [DatasetItem(id=1), DatasetItem(id=3)],
            categories=label_names,
        )

        expected = self._get_stats_template(label_names)
        expected.update(
            {
                "images count": 2,
                "unannotated images count": 2,
                "unannotated images": ["1", "3"],
            }
        )

        assert compute_ann_statistics(dataset) == expected

    @mark_requirement(Requirements.DATUM_BUG_1204)
    def test_stats_with_invalid_label(self):
        """A label index missing from the categories is counted under its raw index."""
        label_names = [f"label_{i}" for i in range(3)]
        # Label indices 0..3 against only three categories: index 3 is undefined.
        items = [DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)]
        dataset = Dataset.from_iterable(iterable=items, categories=label_names)

        expected = self._get_stats_template(label_names)
        expected["images count"] = 4
        expected["annotations count"] = 4
        expected["annotations by type"]["label"]["count"] = 4

        label_stats = expected["annotations"]["labels"]
        label_stats["count"] = 4
        label_stats["distribution"] = {
            "label_0": [1, 0.25],
            "label_1": [1, 0.25],
            "label_2": [1, 0.25],
            3: [1, 0.25],  # label which does not exist in categories.
        }

        assert compute_ann_statistics(dataset) == expected

    @staticmethod
    def _get_stats_template(label_names: list):
        """Build the all-zero statistics dict expected for the given label names.

        Every call returns freshly created nested containers, so callers may
        mutate the result in place.
        """
        annotation_types = (
            "label",
            "polygon",
            "polyline",
            "bbox",
            "mask",
            "points",
            "caption",
            "cuboid_3d",
            "super_resolution_annotation",
            "depth_annotation",
            "ellipse",
            "hash_key",
            "feature_vector",
            "tabular",
            "unknown",
        )
        labels_section = {
            "count": 0,
            "distribution": {name: [0, 0] for name in label_names},
            "attributes": {},
        }
        segments_section = {
            "avg. area": 0.0,
            "area distribution": [],
            "pixel distribution": {name: [0, 0] for name in label_names},
        }
        return {
            "images count": 0,
            "annotations count": 0,
            "unannotated images count": 0,
            "unannotated images": [],
            "annotations by type": {type_name: {"count": 0} for type_name in annotation_types},
            "annotations": {
                "labels": labels_section,
                "segments": segments_section,
            },
        }
Loading

0 comments on commit cfb6832

Please sign in to comment.