Skip to content

Commit

Permalink
Add TabularValidator (openvinotoolkit#1498)
Browse files Browse the repository at this point in the history
- Add TabularValidator
- Validate annotations based on dataset after `AstypeAnnotations`
- Add unit test for TabularValidator
  • Loading branch information
sooahleex authored May 22, 2024
1 parent 072c8a8 commit 5573852
Show file tree
Hide file tree
Showing 11 changed files with 1,096 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1475>)
- Add AstypeAnnotations Transform
(<https://github.com/openvinotoolkit/datumaro/pull/1484>)
- Add TabularValidator
(<https://github.com/openvinotoolkit/datumaro/pull/1498>)

### Enhancements
- Fix ambiguous COCO format detector
Expand Down
42 changes: 40 additions & 2 deletions docs/source/docs/command-reference/context_free/validate.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
This command inspects annotations with respect to the task type
and stores the results in JSON file.

The task types supported are `classification`, `detection`, and
`segmentation` (the `-t/--task-type` parameter).
The task types supported are `classification`, `detection`, `segmentation` and
`tabular` (the `-t/--task-type` parameter).

The validation result contains
- `annotation statistics` based on the task type
Expand Down Expand Up @@ -82,6 +82,14 @@ Examples:
| InvalidValue | There's invalid (ex. inf, nan) value for bounding box info. | detection |
| FarFromLabelMean | An annotation has an too small or large value than average for a label | detection, segmentation |
| FarFromAttrMean | An annotation has an too small or large value than average for an attribute | detection, segmentation |
| BrokenAnnotation | Some annotations are not defined for an item | tabular |
| EmptyLabel | A value of the label column is not defined for an item | tabular |
| EmptyCaption | A value of the caption column is not defined for an item | tabular |
| FewSamplesInCaption | The number of samples in a caption might be too low | tabular |
| RedundanciesInCaption | Redundancies of a caption for an item | tabular |
| ImbalancedCaptions | There is an imbalance in the caption distribution | tabular |
| ImbalancedDistInCaption | Values of a caption are not evenly distributed (only checked for numeric captions) | tabular |
| FarFromCaptionMean | An annotation has a value that is too small or too large compared to the caption average (only checked for numeric captions) | tabular |

Validation Result Format:

Expand Down Expand Up @@ -146,6 +154,36 @@ Validation Result Format:
# }
'mask_distribution_in_dataset_item': <dict>,
# '<item_key>': <mask/polygon count: int>

## statistics for tabular task
'items_broken_annotation': <list>, # [<item_key>, ]
'label_distribution': {
'defined_labels': <dict>, # <label:str>: <count:int>
'empty_labels': <dict>
# <label:str>: {
# 'count': <int>,
# 'items_with_empty_label': [<item_key>, ]
# }
},
'caption_distribution': {
'defined_captions': <dict>, # <label:str>: <count:int>
'empty_captions': <dict>
# <label:str>: {
# 'count': <int>,
# 'items_with_empty_label': [<item_key>, ]
# }
'redundancies': <dict>
# <label:str>: {
# 'stopword': <dict>,
# 'count': <int>,
# 'items_with_redundancies': [<item_key>, ]
# 'url': <dict>,
# 'count': <int>,
# 'items_with_redundancies': [<item_key>, ]
# }
# }
},

},
'validation_reports': <list>, # [ <validation_error_format>, ]
# validation_error_format = {
Expand Down
3 changes: 3 additions & 0 deletions requirements-core.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,6 @@ scikit-learn

# Stream JSON parser
json-stream

# TabularValidator
nltk
84 changes: 83 additions & 1 deletion src/datumaro/components/errors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

Expand Down Expand Up @@ -540,6 +540,32 @@ def __str__(self):
return f"Item needs '{self.ann_type}' annotation(s), " "but not found."


@define(auto_exc=False)
class BrokenAnnotation(DatasetItemValidationError):
    """Item-level error: the item is missing part of the expected annotation set.

    ``ann_type`` is the annotation type whose entries are incomplete.
    """

    ann_type = field()

    def __str__(self):
        return f"Item needs whole '{self.ann_type}' annotation(s), but missed some."


@define(auto_exc=False)
class EmptyLabel(DatasetItemValidationError):
    """Item-level error: the item has no value for the expected label column."""

    label_name = field()

    def __str__(self):
        template = "Item should have the label '{}' annotation(s), but not found."
        return template.format(self.label_name)


@define(auto_exc=False)
class EmptyCaption(DatasetItemValidationError):
    """Item-level error: the item has no value for the expected caption column."""

    caption_name = field()

    def __str__(self):
        template = "Item should have the caption '{}' annotation(s), but not found."
        return template.format(self.caption_name)


@define(auto_exc=False)
class MultiLabelAnnotations(DatasetItemValidationError):
def __str__(self):
Expand Down Expand Up @@ -633,6 +659,31 @@ def __str__(self):
)


@define(auto_exc=False)
class FewSamplesInCaption(DatasetValidationError):
    """Dataset-level warning: a caption has too few samples.

    ``caption_name`` is the caption column; ``count`` is the observed sample count.
    """

    caption_name = field()
    count = field()

    def __str__(self):
        return (
            f"The number of samples in the caption '{self.caption_name}' might be too low."
            f" Found '{self.count}' samples."
        )


@define(auto_exc=False)
class RedundanciesInCaption(DatasetValidationError):
    """Dataset-level warning: redundant content found in a caption column.

    Attributes:
        caption_name: Name of the caption column.
        redundancy_type: Kind of redundancy detected (presumably "stopword" or
            "url" per the validator docs -- TODO confirm against the validator).
        count: Number of redundant occurrences found.
    """

    caption_name = field()
    redundancy_type = field()
    count = field()

    def __str__(self):
        # Reworded: the previous message ("The number of ... have found 'N'.")
        # was ungrammatical and hard to read.
        return (
            f"Found '{self.count}' '{self.redundancy_type}' redundancies "
            f"in the caption '{self.caption_name}'."
        )


@define(auto_exc=False)
class FewSamplesInAttribute(DatasetValidationError):
label_name = field()
Expand All @@ -655,6 +706,12 @@ def __str__(self):
return "There is an imbalance in the label distribution."


@define(auto_exc=False)
class ImbalancedCaptions(DatasetValidationError):
    """Dataset-level warning: caption values are unevenly distributed."""

    def __str__(self):
        return "There is an imbalance in the caption distribution."


@define(auto_exc=False)
class ImbalancedAttribute(DatasetValidationError):
label_name = field()
Expand All @@ -678,6 +735,14 @@ def __str__(self):
)


@define(auto_exc=False)
class ImbalancedDistInCaption(DatasetValidationError):
    """Dataset-level warning: values of a numeric caption column are unevenly distributed."""

    caption_name = field()

    def __str__(self):
        # Merged the needlessly split literals; the first fragment carried an
        # `f` prefix without any placeholder (ruff F541). Output is unchanged.
        return f"Values are not evenly distributed for '{self.caption_name}' caption."


@define(auto_exc=False)
class ImbalancedDistInAttribute(DatasetValidationError):
label_name = field()
Expand Down Expand Up @@ -737,6 +802,23 @@ def __str__(self):
)


@define(auto_exc=False)
class FarFromCaptionMean(DatasetItemValidationError):
    """Item-level warning: an annotation value deviates strongly from the caption mean.

    Attributes:
        caption_name: Name of the caption column.
        ann_id: Id of the offending annotation.
        prop: The property whose value deviates.
        mean: Mean value of the caption across the dataset.
        val: The observed value.
    """

    caption_name = field()
    ann_id = field()
    prop = field()
    mean = field()
    val = field()

    def __str__(self):
        return (
            f"Annotation '{self.ann_id}' in the item has a value of '{self.prop}' that "
            f"is too far from the caption average. (mean of '{self.caption_name}' "
            f"caption: {self.mean}, got '{self.val}')."
        )


@define(auto_exc=False)
class FarFromAttrMean(DatasetItemValidationError):
label_name = field()
Expand Down
1 change: 1 addition & 0 deletions src/datumaro/components/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class TaskType(Enum):
classification = auto()
detection = auto()
segmentation = auto()
tabular = auto()


class Validator(CliPlugin):
Expand Down
5 changes: 5 additions & 0 deletions src/datumaro/plugins/specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -1968,5 +1968,10 @@
"import_path": "datumaro.plugins.validators.SegmentationValidator",
"plugin_name": "segmentation",
"plugin_type": "Validator"
},
{
"import_path": "datumaro.plugins.validators.TabularValidator",
"plugin_name": "tabular",
"plugin_type": "Validator"
}
]
6 changes: 4 additions & 2 deletions src/datumaro/plugins/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,12 +1536,14 @@ def categories(self):
return self._categories

def transform_item(self, item: DatasetItem):
import pandas as pd

annotations = [
Label(label=self._id_mapping[name + self._sep_token + str(value)])
if self._tabular_cat_types.get(name) == CategoricalDtype() and value is not None
else Caption(value)
else Caption(name + self._sep_token + str(value))
for name, value in item.annotations[0].values.items()
if value is not None
if not pd.isna(value)
]

return self.wrap_item(item, annotations=annotations)
Loading

0 comments on commit 5573852

Please sign in to comment.