Skip to content

Commit

Permalink
Add TabularValidator (openvinotoolkit#1498)
Browse files Browse the repository at this point in the history
- Add TabularValidator
- Validate annotations based on dataset after `AstypeAnnotations`
- Add unit test for TabularValidator
  • Loading branch information
sooahleex authored May 22, 2024
1 parent 072c8a8 commit 5573852
Show file tree
Hide file tree
Showing 11 changed files with 1,096 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1475>)
- Add AstypeAnnotations Transform
(<https://github.com/openvinotoolkit/datumaro/pull/1484>)
- Add TabularValidator
(<https://github.com/openvinotoolkit/datumaro/pull/1498>)

### Enhancements
- Fix ambiguous COCO format detector
Expand Down
42 changes: 40 additions & 2 deletions docs/source/docs/command-reference/context_free/validate.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
This command inspects annotations with respect to the task type
and stores the results in JSON file.

The task types supported are `classification`, `detection`, and
`segmentation` (the `-t/--task-type` parameter).
The task types supported are `classification`, `detection`, `segmentation` and
`tabular` (the `-t/--task-type` parameter).

The validation result contains
- `annotation statistics` based on the task type
Expand Down Expand Up @@ -82,6 +82,14 @@ Examples:
| InvalidValue | There's invalid (ex. inf, nan) value for bounding box info. | detection |
| FarFromLabelMean | An annotation has an too small or large value than average for a label | detection, segmentation |
| FarFromAttrMean | An annotation has an too small or large value than average for an attribute | detection, segmentation |
| BrokenAnnotation | Some annotations are not defined for an item | tabular |
| EmptyLabel | A value of the label column is not defined for an item | tabular |
| EmptyCaption | A value of the caption column is not defined for an item | tabular |
| FewSamplesInCaption | The number of samples in a caption might be too low | tabular |
| RedundanciesInCaption | Redundancies of a caption for an item | tabular |
| ImbalancedCaptions | There is an imbalance in the caption distribution | tabular |
| ImbalancedDistInCaption | Values of a caption are not evenly distributed (only checked for numeric captions) | tabular |
| FarFromCaptionMean | An annotation has a value that is too small or too large compared to the caption average (only checked for numeric captions) | tabular |

Validation Result Format:

Expand Down Expand Up @@ -146,6 +154,36 @@ Validation Result Format:
# }
'mask_distribution_in_dataset_item': <dict>,
# '<item_key>': <mask/polygon count: int>

## statistics for tabular task
'items_broken_annotation': <list>, # [<item_key>, ]
'label_distribution': {
'defined_labels': <dict>, # <label:str>: <count:int>
'empty_labels': <dict>
# <label:str>: {
# 'count': <int>,
# 'items_with_empty_label': [<item_key>, ]
# }
},
'caption_distribution': {
'defined_captions': <dict>, # <label:str>: <count:int>
'empty_captions': <dict>
# <label:str>: {
# 'count': <int>,
# 'items_with_empty_label': [<item_key>, ]
# }
'redundancies': <dict>
# <label:str>: {
# 'stopword': <dict>,
# 'count': <int>,
# 'items_with_redundancies': [<item_key>, ]
# 'url': <dict>,
# 'count': <int>,
# 'items_with_redundancies': [<item_key>, ]
# }
# }
},

},
'validation_reports': <list>, # [ <validation_error_format>, ]
# validation_error_format = {
Expand Down
3 changes: 3 additions & 0 deletions requirements-core.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,6 @@ scikit-learn

# Stream JSON parser
json-stream

# TabularValidator
nltk
84 changes: 83 additions & 1 deletion src/datumaro/components/errors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

Expand Down Expand Up @@ -540,6 +540,32 @@ def __str__(self):
return f"Item needs '{self.ann_type}' annotation(s), " "but not found."


@define(auto_exc=False)
class BrokenAnnotation(DatasetItemValidationError):
    """Item-level error: the item is missing part of the expected annotation set.

    ``ann_type`` is the annotation type whose entries are incomplete.
    """

    ann_type = field()

    def __str__(self):
        return f"Item needs whole '{self.ann_type}' annotation(s), but missed some."


@define(auto_exc=False)
class EmptyLabel(DatasetItemValidationError):
    """Item-level error: the item has no value for the expected label column."""

    label_name = field()

    def __str__(self):
        template = "Item should have the label '{}' annotation(s), but not found."
        return template.format(self.label_name)


@define(auto_exc=False)
class EmptyCaption(DatasetItemValidationError):
    """Item-level error: the item has no value for the expected caption column."""

    caption_name = field()

    def __str__(self):
        template = "Item should have the caption '{}' annotation(s), but not found."
        return template.format(self.caption_name)


@define(auto_exc=False)
class MultiLabelAnnotations(DatasetItemValidationError):
def __str__(self):
Expand Down Expand Up @@ -633,6 +659,31 @@ def __str__(self):
)


@define(auto_exc=False)
class FewSamplesInCaption(DatasetValidationError):
    """Dataset-level warning: a caption has too few samples.

    ``caption_name`` is the caption column; ``count`` is the observed sample count.
    """

    caption_name = field()
    count = field()

    def __str__(self):
        return (
            f"The number of samples in the caption '{self.caption_name}' might be too low."
            f" Found '{self.count}' samples."
        )


@define(auto_exc=False)
class RedundanciesInCaption(DatasetValidationError):
    """Dataset-level warning: redundant content found in a caption column.

    Attributes:
        caption_name: Name of the caption column.
        redundancy_type: Kind of redundancy detected (presumably "stopword" or
            "url" per the validator docs -- TODO confirm against the validator).
        count: Number of redundant occurrences found.
    """

    caption_name = field()
    redundancy_type = field()
    count = field()

    def __str__(self):
        # Reworded: the previous message ("The number of ... have found 'N'.")
        # was ungrammatical and hard to read.
        return (
            f"Found '{self.count}' '{self.redundancy_type}' redundancies "
            f"in the caption '{self.caption_name}'."
        )


@define(auto_exc=False)
class FewSamplesInAttribute(DatasetValidationError):
label_name = field()
Expand All @@ -655,6 +706,12 @@ def __str__(self):
return "There is an imbalance in the label distribution."


@define(auto_exc=False)
class ImbalancedCaptions(DatasetValidationError):
    """Dataset-level warning: caption values are unevenly distributed."""

    def __str__(self):
        return "There is an imbalance in the caption distribution."


@define(auto_exc=False)
class ImbalancedAttribute(DatasetValidationError):
label_name = field()
Expand All @@ -678,6 +735,14 @@ def __str__(self):
)


@define(auto_exc=False)
class ImbalancedDistInCaption(DatasetValidationError):
    """Dataset-level warning: values of a numeric caption column are unevenly distributed."""

    caption_name = field()

    def __str__(self):
        # Merged the needlessly split literals; the first fragment carried an
        # `f` prefix without any placeholder (ruff F541). Output is unchanged.
        return f"Values are not evenly distributed for '{self.caption_name}' caption."


@define(auto_exc=False)
class ImbalancedDistInAttribute(DatasetValidationError):
label_name = field()
Expand Down Expand Up @@ -737,6 +802,23 @@ def __str__(self):
)


@define(auto_exc=False)
class FarFromCaptionMean(DatasetItemValidationError):
    """Item-level warning: an annotation value deviates strongly from the caption mean.

    Attributes:
        caption_name: Name of the caption column.
        ann_id: Id of the offending annotation.
        prop: The property whose value deviates.
        mean: Mean value of the caption across the dataset.
        val: The observed value.
    """

    caption_name = field()
    ann_id = field()
    prop = field()
    mean = field()
    val = field()

    def __str__(self):
        return (
            f"Annotation '{self.ann_id}' in the item has a value of '{self.prop}' that "
            f"is too far from the caption average. (mean of '{self.caption_name}' "
            f"caption: {self.mean}, got '{self.val}')."
        )


@define(auto_exc=False)
class FarFromAttrMean(DatasetItemValidationError):
label_name = field()
Expand Down
1 change: 1 addition & 0 deletions src/datumaro/components/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class TaskType(Enum):
classification = auto()
detection = auto()
segmentation = auto()
tabular = auto()


class Validator(CliPlugin):
Expand Down
5 changes: 5 additions & 0 deletions src/datumaro/plugins/specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -1968,5 +1968,10 @@
"import_path": "datumaro.plugins.validators.SegmentationValidator",
"plugin_name": "segmentation",
"plugin_type": "Validator"
},
{
"import_path": "datumaro.plugins.validators.TabularValidator",
"plugin_name": "tabular",
"plugin_type": "Validator"
}
]
6 changes: 4 additions & 2 deletions src/datumaro/plugins/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,12 +1536,14 @@ def categories(self):
return self._categories

def transform_item(self, item: DatasetItem):
import pandas as pd

annotations = [
Label(label=self._id_mapping[name + self._sep_token + str(value)])
if self._tabular_cat_types.get(name) == CategoricalDtype() and value is not None
else Caption(value)
else Caption(name + self._sep_token + str(value))
for name, value in item.annotations[0].values.items()
if value is not None
if not pd.isna(value)
]

return self.wrap_item(item, annotations=annotations)
Loading

0 comments on commit 5573852

Please sign in to comment.