From 5573852d75b5638a81fec1490569d7d8c374dfb2 Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Wed, 22 May 2024 15:50:43 +0900 Subject: [PATCH] Add TabularValidator (#1498) - Add TabularValidator - Validate annotations based on dataset after `AstypeAnnotations` - Add unit test for TabularValidator --- CHANGELOG.md | 2 + .../context_free/validate.md | 42 +- requirements-core.txt | 3 + src/datumaro/components/errors.py | 84 +++- src/datumaro/components/validator.py | 1 + src/datumaro/plugins/specs.json | 5 + src/datumaro/plugins/transforms.py | 6 +- src/datumaro/plugins/validators.py | 474 +++++++++++++++++- .../assets/tabular_dataset/women_clothing.csv | 121 +++++ tests/unit/test_transforms.py | 4 +- tests/unit/test_validator.py | 363 ++++++++++++++ 11 files changed, 1096 insertions(+), 9 deletions(-) create mode 100644 tests/assets/tabular_dataset/women_clothing.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index ee5c8ec64c..f72122b741 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () - Add AstypeAnnotations Transform () +- Add TabularValidator + () ### Enhancements - Fix ambiguous COCO format detector diff --git a/docs/source/docs/command-reference/context_free/validate.md b/docs/source/docs/command-reference/context_free/validate.md index 130286f34a..a1eaf51e49 100644 --- a/docs/source/docs/command-reference/context_free/validate.md +++ b/docs/source/docs/command-reference/context_free/validate.md @@ -5,8 +5,8 @@ This command inspects annotations with respect to the task type and stores the results in JSON file. -The task types supported are `classification`, `detection`, and -`segmentation` (the `-t/--task-type` parameter). +The task types supported are `classification`, `detection`, `segmentation` and +`tabular` (the `-t/--task-type` parameter). 
The validation result contains - `annotation statistics` based on the task type @@ -82,6 +82,14 @@ Examples: | InvalidValue | There's invalid (ex. inf, nan) value for bounding box info. | detection | | FarFromLabelMean | An annotation has an too small or large value than average for a label | detection, segmentation | | FarFromAttrMean | An annotation has an too small or large value than average for an attribute | detection, segmentation | +| BrokenAnnotation | Some annotations are not defined for an item | tabular | +| EmptyLabel | A value of the label column is not defined for an item | tabular | +| EmptyCaption | A value of the caption column is not defined for an item | tabular | +| FewSamplesInCaption | The number of samples in a caption might be too low | tabular | +| RedundanciesInCaption | Redundancies (stopwords or URLs) are found in a caption for an item | tabular | +| ImbalancedCaptions | There is an imbalance in the caption distribution | tabular | +| ImbalancedDistInCaption | Values are not evenly distributed for a caption (checked only if the caption is numeric) | tabular | +| FarFromCaptionMean | An annotation has a value much smaller or larger than the average for a caption (checked only if the caption is numeric) | tabular | Validation Result Format: @@ -146,6 +154,36 @@ Validation Result Format: # } 'mask_distribution_in_dataset_item': , # '': + + ## statistics for tabular task + 'items_broken_annotation': , # [, ] + 'label_distribution': { + 'defined_labels': , # : + 'empty_labels': + # : { + # 'count': , + # 'items_with_empty_label': [, ] + # } + }, + 'caption_distribution': { + 'defined_captions': , # : + 'empty_captions': + # : { + # 'count': , + # 'items_with_empty_caption': [, ] + # } + 'redundancies': + # : { + # 'stopword': , + # 'count': , + # 'items_with_redundancies': [, ] + # 'url': , + # 'count': , + # 'items_with_redundancies': [, ] + # } + # } + }, + }, 'validation_reports': , # [ , ] # validation_error_format = { diff --git a/requirements-core.txt b/requirements-core.txt index 
b59fc346bf..78a36e601b 100644 --- a/requirements-core.txt +++ b/requirements-core.txt @@ -61,3 +61,6 @@ scikit-learn # Stream JSON parser json-stream + +# TabularValidator +nltk diff --git a/src/datumaro/components/errors.py b/src/datumaro/components/errors.py index 5c456a2675..3a63b8a04b 100644 --- a/src/datumaro/components/errors.py +++ b/src/datumaro/components/errors.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 Intel Corporation +# Copyright (C) 2020-2024 Intel Corporation # # SPDX-License-Identifier: MIT @@ -540,6 +540,32 @@ def __str__(self): return f"Item needs '{self.ann_type}' annotation(s), " "but not found." +@define(auto_exc=False) +class BrokenAnnotation(DatasetItemValidationError): + ann_type = field() + + def __str__(self): + return f"Item needs whole '{self.ann_type}' annotation(s), " "but missed some." + + +@define(auto_exc=False) +class EmptyLabel(DatasetItemValidationError): + label_name = field() + + def __str__(self): + return f"Item should have the label '{self.label_name}' annotation(s), " "but not found." + + +@define(auto_exc=False) +class EmptyCaption(DatasetItemValidationError): + caption_name = field() + + def __str__(self): + return ( + f"Item should have the caption '{self.caption_name}' annotation(s), " "but not found." + ) + + @define(auto_exc=False) class MultiLabelAnnotations(DatasetItemValidationError): def __str__(self): @@ -633,6 +659,31 @@ def __str__(self): ) +@define(auto_exc=False) +class FewSamplesInCaption(DatasetValidationError): + caption_name = field() + count = field() + + def __str__(self): + return ( + f"The number of samples in the caption '{self.caption_name}'" + f" might be too low. Found '{self.count}' samples." 
+ ) + + +@define(auto_exc=False) +class RedundanciesInCaption(DatasetValidationError): + caption_name = field() + redundancy_type = field() + count = field() + + def __str__(self): + return ( + f"The number of '{self.redundancy_type}' redundancy in the caption '{self.caption_name}'" + f" have found '{self.count}'." + ) + + @define(auto_exc=False) class FewSamplesInAttribute(DatasetValidationError): label_name = field() @@ -655,6 +706,12 @@ def __str__(self): return "There is an imbalance in the label distribution." +@define(auto_exc=False) +class ImbalancedCaptions(DatasetValidationError): + def __str__(self): + return "There is an imbalance in the caption distribution." + + @define(auto_exc=False) class ImbalancedAttribute(DatasetValidationError): label_name = field() @@ -678,6 +735,14 @@ def __str__(self): ) +@define(auto_exc=False) +class ImbalancedDistInCaption(DatasetValidationError): + caption_name = field() + + def __str__(self): + return f"Values are not evenly " f"distributed for '{self.caption_name}' caption." + + @define(auto_exc=False) class ImbalancedDistInAttribute(DatasetValidationError): label_name = field() @@ -737,6 +802,23 @@ def __str__(self): ) +@define(auto_exc=False) +class FarFromCaptionMean(DatasetItemValidationError): + caption_name = field() + ann_id = field() + prop = field() + mean = field() + val = field() + + def __str__(self): + return ( + f"Annotation '{self.ann_id}' in " + f"the item has a value of '{self.prop}' that " + "is too far from the caption average. (mean of " + f"'{self.caption_name}' caption: {self.mean}, got '{self.val}')." 
+ ) + + @define(auto_exc=False) class FarFromAttrMean(DatasetItemValidationError): label_name = field() diff --git a/src/datumaro/components/validator.py b/src/datumaro/components/validator.py index bf5c5c6e31..40d2870c1b 100644 --- a/src/datumaro/components/validator.py +++ b/src/datumaro/components/validator.py @@ -19,6 +19,7 @@ class TaskType(Enum): classification = auto() detection = auto() segmentation = auto() + tabular = auto() class Validator(CliPlugin): diff --git a/src/datumaro/plugins/specs.json b/src/datumaro/plugins/specs.json index 68ee3b7a82..7e8220f7a4 100644 --- a/src/datumaro/plugins/specs.json +++ b/src/datumaro/plugins/specs.json @@ -1968,5 +1968,10 @@ "import_path": "datumaro.plugins.validators.SegmentationValidator", "plugin_name": "segmentation", "plugin_type": "Validator" + }, + { + "import_path": "datumaro.plugins.validators.TabularValidator", + "plugin_name": "tabular", + "plugin_type": "Validator" } ] diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py index 299ed6eb05..bf512f71d2 100644 --- a/src/datumaro/plugins/transforms.py +++ b/src/datumaro/plugins/transforms.py @@ -1536,12 +1536,14 @@ def categories(self): return self._categories def transform_item(self, item: DatasetItem): + import pandas as pd + annotations = [ Label(label=self._id_mapping[name + self._sep_token + str(value)]) if self._tabular_cat_types.get(name) == CategoricalDtype() and value is not None - else Caption(value) + else Caption(name + self._sep_token + str(value)) for name, value in item.annotations[0].values.items() - if value is not None + if not pd.isna(value) ] return self.wrap_item(item, annotations=annotations) diff --git a/src/datumaro/plugins/validators.py b/src/datumaro/plugins/validators.py index 7e12c00ce9..a8a5ad33c2 100644 --- a/src/datumaro/plugins/validators.py +++ b/src/datumaro/plugins/validators.py @@ -1,21 +1,29 @@ -# Copyright (C) 2021 Intel Corporation +# Copyright (C) 2021-2024 Intel Corporation # # 
SPDX-License-Identifier: MIT from copy import deepcopy import numpy as np +from pandas.api.types import CategoricalDtype from datumaro.components.annotation import AnnotationType, GroupType, LabelCategories from datumaro.components.cli_plugin import CliPlugin from datumaro.components.errors import ( AttributeDefinedButNotFound, + BrokenAnnotation, + EmptyCaption, + EmptyLabel, FarFromAttrMean, + FarFromCaptionMean, FarFromLabelMean, FewSamplesInAttribute, + FewSamplesInCaption, FewSamplesInLabel, ImbalancedAttribute, + ImbalancedCaptions, ImbalancedDistInAttribute, + ImbalancedDistInCaption, ImbalancedDistInLabel, ImbalancedLabels, InvalidValue, @@ -27,6 +35,7 @@ NegativeLength, OnlyOneAttributeValue, OnlyOneLabel, + RedundanciesInCaption, UndefinedAttribute, UndefinedLabel, ) @@ -64,7 +73,7 @@ class _TaskValidator(Validator, CliPlugin): Attributes ---------- task_type : str or TaskType - task type (ie. classification, detection, segmentation) + task type (ie. classification, detection, segmentation, tabular) """ @classmethod @@ -157,6 +166,9 @@ def __init__( elif self.task_type == TaskType.segmentation: self.ann_types = {AnnotationType.mask, AnnotationType.polygon, AnnotationType.ellipse} self.str_ann_type = "mask or polygon or ellipse" + elif self.task_type == TaskType.tabular: + self.ann_types = {AnnotationType.label, AnnotationType.caption} + self.str_ann_type = "label or caption" if few_samples_thr is None: few_samples_thr = self.DEFAULT_FEW_SAMPLES_THR @@ -1196,3 +1208,461 @@ def generate_reports(self, stats): ) return reports + + +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, List, Optional + +from nltk.corpus import stopwords + + +@dataclass +class TabularValidationStats: + total_ann_count: int = field(default=0) + items_missing_annotation: List[Any] = field(default_factory=list) + + @classmethod + def create_with_dataset(cls, dataset): + instance = cls() + instance.__post_init__(dataset) + return 
instance + + def __post_init__(self, dataset: Optional[Any] = None): + if dataset: + self.label_categories = dataset.categories().get( + AnnotationType.label, LabelCategories() + ) + self.tabular_categories = dataset._tabular_cat_types.items() + self.label_columns = [ + cat for cat, type_ in self.tabular_categories if isinstance(type_, CategoricalDtype) + ] + self.caption_columns = [ + cat + for cat, type_ in self.tabular_categories + if not isinstance(type_, CategoricalDtype) + ] + + self.defined_labels = {cat.name: 0 for cat in self.label_categories} + self.empty_labels = {cat: [] for cat in self.label_columns} + + self.defined_captions = {cat: 0 for cat in self.caption_columns} + self.empty_captions = {cat: [] for cat in self.caption_columns} + self.redundancies = { + cat: {"stopword": [], "url": []} + for cat, type_ in self.tabular_categories + if type_ == str + } + + def to_dict(self): + empty_labels = self._build_empty_labels_dict(self.empty_labels, "items_with_empty_label") + empty_captions = self._build_empty_labels_dict( + self.empty_captions, "items_with_empty_caption" + ) + redundancies = self._build_redundancies_dict(self.redundancies) + + return { + "total_ann_count": self.total_ann_count, + "items_missing_annotation": self.items_missing_annotation, + "items_broken_annotation": self._collect_broken_annotations(), + "label_distribution": { + "defined_labels": self.defined_labels, + "empty_labels": empty_labels, + }, + "caption_distribution": { + "defined_captions": self.defined_captions, + "empty_captions": empty_captions, + "redundancies": redundancies, + }, + } + + def _build_empty_labels_dict(self, empty_dict, key_name): + result = defaultdict(dict) + for label, items in empty_dict.items(): + result[label]["count"] = len(items) + result[label][key_name] = list(items) + return result + + def _build_redundancies_dict(self, redundancies): + result = defaultdict(lambda: defaultdict(dict)) + for caption, items in redundancies.items(): + for key in 
["stopword", "url"]: + result[caption][key]["count"] = len(items[key]) + result[caption][key]["items_with_redundancies"] = list(items[key]) + return result + + def _collect_broken_annotations(self): + broken_annotations = set() + for items in [self.empty_labels, self.empty_captions]: + for key, value in items.items(): + broken_annotations.update(value) + return list(broken_annotations) + + +class TabularValidator(_TaskValidator): + """ + A specific validator class for tabular dataset. + """ + + def __init__( + self, + task_type=TaskType.tabular, + few_samples_thr=None, + imbalance_ratio_thr=None, + far_from_mean_thr=None, + dominance_ratio_thr=None, + topk_bins=None, + ): + super().__init__( + task_type=task_type, + few_samples_thr=few_samples_thr, + imbalance_ratio_thr=imbalance_ratio_thr, + far_from_mean_thr=far_from_mean_thr, + dominance_ratio_thr=dominance_ratio_thr, + topk_bins=topk_bins, + ) + + self.value_template = {"value": deepcopy(self.numerical_stat_template)} + + def _compute_common_statistics(self, dataset): + try: + stop_words = set(stopwords.words("english")) # TODO + except LookupError: + import nltk + + nltk.download("stopwords") + stop_words = set(stopwords.words("english")) # TODO + + stats = TabularValidationStats.create_with_dataset(dataset=dataset) + + filtered_items = [] + for item in dataset: + item_key = (item.id, item.subset) + label_check = {cat: 0 for cat in stats.label_columns} + annotations = [ann for ann in item.annotations if ann.type in self.ann_types] + ann_count = len(annotations) + filtered_items.append((item_key, annotations)) + if ann_count == 0: + stats.items_missing_annotation.append(item_key) + stats.total_ann_count += ann_count + + caption_check = deepcopy(stats.caption_columns) + for ann in annotations: + if ann.type == AnnotationType.caption: + caption_ = ann.caption + for cat in stats.caption_columns: + if cat + ":" in caption_: + stats.defined_captions[cat] += 1 + caption_ = caption_.split(cat + ":")[-1] + 
caption_check.remove(cat) + if any(c in stop_words for c in str(caption_)): + stats.redundancies[cat]["stopword"].append(item_key) + elif any( + "http" in w or "https" in w for w in str(caption_).lower().split() + ): + stats.redundancies[cat]["url"].append(item_key) + + else: + label_name = stats.label_categories[ann.label].name + stats.defined_labels[label_name] += 1 + label_name = label_name.split(":")[0] + label_check[label_name] += 1 + + for cap in caption_check: + stats.empty_captions[cap].append(item_key) + + for label_col, v in label_check.items(): + if v == 0: + stats.empty_labels[label_col].append(item_key) + + return stats.to_dict(), filtered_items + + def _compute_prop_dist(self, caption_columns, stats, update_stats_by_caption): + dist_by_caption = stats["distribution_in_caption"] + dist_in_item = stats["distribution_in_dataset_item"] + + for item_key, annotations in self.items: + ann_count = len(annotations) + dist_in_item[item_key] = ann_count + + for ann in annotations: + if ann.type == AnnotationType.caption: + caption_ = ann.caption + for cat_name, type_ in caption_columns: + if cat_name + ":" in caption_: + caption_value = type_(caption_.split(f"{cat_name}:")[-1]) + update_stats_by_caption(caption_value, dist_by_caption[cat_name]) + + def _update_prop_distributions(self, curr_stats, target_stats): + for prop, val in curr_stats.items(): + prop_stats = target_stats[prop] + prop_stats["distribution"].append(val) + + def _compute_prop_stats_from_dist(self, dist_by_caption): + for stats in dist_by_caption.values(): + prop_stats_list = list(stats.values()) + + for prop_stats in prop_stats_list: + prop_dist = prop_stats.pop("distribution", []) + if len(prop_dist) > 0: + prop_stats["mean"] = np.mean(prop_dist) + prop_stats["stdev"] = np.std(prop_dist) + prop_stats["min"] = np.min(prop_dist) + prop_stats["max"] = np.max(prop_dist) + prop_stats["median"] = np.median(prop_dist) + + counts, bins = np.histogram(prop_dist) + prop_stats["histogram"]["bins"] = 
bins.tolist() + prop_stats["histogram"]["counts"] = counts.tolist() + + def _compute_far_from_mean(self, prop_stats, val, item_key, ann): + def _far_from_mean(val, mean, stdev): + thr = self.far_from_mean_thr + return val > mean + (thr * stdev) or val < mean - (thr * stdev) + + mean = prop_stats["mean"] + stdev = prop_stats["stdev"] + + if _far_from_mean(val, mean, stdev): + items_far_from_mean = prop_stats["items_far_from_mean"] + far_from_mean = items_far_from_mean.setdefault(item_key, {}) + far_from_mean[ann.id] = val + + def compute_statistics(self, dataset): + """ + Computes statistics of the tabular dataset. + + Parameters + ---------- + dataset : IDataset object + + Returns + ------- + stats (dict): A dict object containing statistics of the dataset. + """ + + stats, filtered_items = self._compute_common_statistics(dataset) + + self.items = filtered_items + + num_caption_columns = [ + (cat, type_) + for cat, type_ in dataset._tabular_cat_types.items() + if (type_ is int) or (type_ is float) + ] + + stats["distribution_in_caption"] = { + cap: deepcopy(self.value_template) for cap, _ in num_caption_columns + } + stats["distribution_in_dataset_item"] = {} + + def _update_stats_by_caption(caption_, caption_stats): + caption_has_error = False + + if not caption_has_error: + caption_info = {"value": caption_} + self._update_prop_distributions(caption_info, caption_stats) + + # Collect property distribution + self._compute_prop_dist(num_caption_columns, stats, _update_stats_by_caption) + + # Compute property statistics from distribution + dist_by_caption = stats["distribution_in_caption"] + self._compute_prop_stats_from_dist(dist_by_caption) + + def _update_captions_far_from_mean(caption_columns, item_key, ann): + for cap, type_ in caption_columns: + prop_stats = dist_by_caption[cap]["value"] + if cap + ":" in ann.caption: + val = type_(ann.caption.split(f"{cap}:")[-1]) + self._compute_far_from_mean(prop_stats, val, item_key, ann) + + # Compute far_from_mean from 
property + for item_key, annotations in self.items: + for ann in annotations: + if ann.type == AnnotationType.caption: + _update_captions_far_from_mean(num_caption_columns, item_key, ann) + + return stats + + def generate_reports(self, stats): + """ + Validates the dataset for classification tasks based on its statistics. + + Parameters + ---------- + dataset : IDataset object + stats: Dict object + + Returns + ------- + reports (list): List of validation reports (DatasetValidationError). + """ + reports = [] + + # report for dataset + reports += self._check_missing_label_categories(stats) + + # report for item + reports += self._check_missing_annotation(stats) + + # report for label + reports += self._check_few_samples_in_label(stats) + reports += self._check_imbalanced_labels(stats) + + # report for caption + reports += self._check_few_samples_in_caption(stats) + reports += self._check_redundancies_in_caption(stats) + reports += self._check_imbalanced_captions(stats) + + # report for missing value + reports += self._check_broken_annotation(stats) + reports += self._check_empty_label(stats) + reports += self._check_empty_caption(stats) + + dist_by_caption = stats["distribution_in_caption"] + for caption, caption_stats in dist_by_caption.items(): + reports += self._check_far_from_caption_mean(caption, caption_stats) + reports += self._check_imbalanced_dist_in_caption(caption, caption_stats) + + return reports + + def _check_broken_annotation(self, stats): + validation_reports = [] + + items_broken = stats["items_broken_annotation"] + for item_id, item_subset in items_broken: + validation_reports += self._generate_validation_report( + BrokenAnnotation, Severity.warning, item_id, item_subset, self.str_ann_type + ) + + return validation_reports + + def _check_empty_label(self, stats): + validation_reports = [] + + empty_label_dist = stats["label_distribution"]["empty_labels"] + for label_name, label_stats in empty_label_dist.items(): + for item_id, item_subset in 
label_stats["items_with_empty_label"]: + details = (item_subset, label_name) + validation_reports += self._generate_validation_report( + EmptyLabel, Severity.warning, item_id, *details + ) + + return validation_reports + + def _check_empty_caption(self, stats): + validation_reports = [] + + empty_caption_dist = stats["caption_distribution"]["empty_captions"] + for caption_name, caption_stats in empty_caption_dist.items(): + for item_id, item_subset in caption_stats["items_with_empty_caption"]: + details = (item_subset, caption_name) + validation_reports += self._generate_validation_report( + EmptyCaption, Severity.warning, item_id, *details + ) + + return validation_reports + + def _check_few_samples_in_caption(self, stats): + validation_reports = [] + thr = self.few_samples_thr + + defined_caption_dist = stats["caption_distribution"]["defined_captions"] + captions_with_few_samples = [ + (caption_name, count) + for caption_name, count in defined_caption_dist.items() + if 0 < count <= thr + ] + + for caption_name, count in captions_with_few_samples: + validation_reports += self._generate_validation_report( + FewSamplesInCaption, Severity.info, caption_name, count + ) + + return validation_reports + + def _check_far_from_caption_mean(self, caption_name, caption_stats): + validation_reports = [] + + for prop, prop_stats in caption_stats.items(): + items_far_from_mean = prop_stats["items_far_from_mean"] + if prop_stats["mean"] is not None: + mean = round(prop_stats["mean"], 2) + + for item_dets, anns_far in items_far_from_mean.items(): + item_id, item_subset = item_dets + for ann_id, val in anns_far.items(): + val = round(val, 2) + details = ( + item_subset, + caption_name, + ann_id, + f"{self.str_ann_type} {prop}", + mean, + val, + ) + validation_reports += self._generate_validation_report( + FarFromCaptionMean, Severity.info, item_id, *details + ) + + return validation_reports + + def _check_redundancies_in_caption(self, stats): + validation_reports = [] + + 
redundancies_in_caption_dist = stats["caption_distribution"]["redundancies"] + captions_with_redundancies = [] + for cap_column, cap_stats in redundancies_in_caption_dist.items(): + for redundancy_type, items in cap_stats.items(): + if 0 < items["count"]: + captions_with_redundancies.append((cap_column, redundancy_type, items["count"])) + + for cap_column, redundancy_type, count in captions_with_redundancies: + validation_reports += self._generate_validation_report( + RedundanciesInCaption, Severity.info, cap_column, redundancy_type, count + ) + + return validation_reports + + def _check_imbalanced_captions(self, stats): + validation_reports = [] + thr = self.imbalance_ratio_thr + + defined_caption_dist = stats["caption_distribution"]["defined_captions"] + count_by_caption_labels = [count for _, count in defined_caption_dist.items()] + + if len(defined_caption_dist) == 0: + return validation_reports + + count_max = np.max(count_by_caption_labels) + count_min = np.min(count_by_caption_labels) + balance = count_max / count_min if count_min > 0 else float("inf") + if balance >= thr: + validation_reports += self._generate_validation_report( + ImbalancedCaptions, Severity.info + ) + + return validation_reports + + def _check_imbalanced_dist_in_caption(self, caption_name, caption_stats): + validation_reports = [] + thr = self.dominance_thr + topk_ratio = self.topk_bins_ratio + + for prop, prop_stats in caption_stats.items(): + value_counts = prop_stats["histogram"]["counts"] + n_bucket = len(value_counts) + if n_bucket < 2: + continue + topk = max(1, int(np.around(n_bucket * topk_ratio))) + + if topk > 0: + topk_values = np.sort(value_counts)[-topk:] + ratio = np.sum(topk_values) / np.sum(value_counts) + if ratio >= thr: + validation_reports += self._generate_validation_report( + ImbalancedDistInCaption, Severity.info, caption_name + ) + + return validation_reports diff --git a/tests/assets/tabular_dataset/women_clothing.csv 
b/tests/assets/tabular_dataset/women_clothing.csv new file mode 100644 index 0000000000..e7e389ad5f --- /dev/null +++ b/tests/assets/tabular_dataset/women_clothing.csv @@ -0,0 +1,121 @@ +Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name +,,,,,,, +863,,Simple and stylish and lovely-runs a bit big,"I find that this brand can be a little bit all-over-the-place with sizes. had i tried this on in person i may have bought it in a small instead of a m. despite being a bit big it still looks great and hides my flaws well. i bought a m in another shirt from them (the seamed scoop neck) and that shirt was a full 2 inches smaller in the bust than this. just something to keep in mind. still a great shirt, love the neutral color.",4,1,1,General,Tops,Knits +822,69,,Stunning lace top. This top is absolutely stunning. i purchased the white one. just received and it fits nicely-a little on the big side but i believe it looks better that way. this will look great with everything. it is well constructed and looks very unique. love it!,5,1,2,General,Tops,Blouses +895,43,Huuuuge,,3,0,3,General,Tops,Fine gauge +822,23,Not a fan,The fabric felt cheap and i didn't find it to be a flattering top. for reference i am wearing a medium in the photos and my measurements are 38-30-40.,,0,4,General,Tops,Blouses +863,51,Title,"Runs big and looked unflattering. i am petite, might work on someone taller.",2,0,,General,Tops,Knits +836,66,Excited ... but ...,"I bought this lovely silk/velvet shirt in the "sky" color but it is more on the teal blue side than sky blue, which disappointed me. it is definitely darker than appears in photo. still a luxurious well-made beauty with sassy appeal. it drapes like a snake slithering down your body. it comes with attitude.",5,1,6,,Tops,Blouses +836,47,Beautiful shirt but runs small!,Was so excited to order this beautiful shirt! and the color sky is gorgeous! 
but as another reviewer said--it runs small. it was also way too short. so sadly i will be returning this. i gave it 5 stars because it is beautiful and great quality--it just didn't fit me.,4,1,7,General,Tops,Blouses +836,66,Beautiful,I ordered ivory in xl because this brand tends to run tight if you're well endowed. it's a beautiful color and i love the contrasting plaid lining the inside of the collar and cuffs. haven't decided about keeping it because it looks oversized (but cozy) and it's really long. it almost touches the top of my knees in back. sooooo pretty though.,4,1,8,General,Tops,Blouses +836,23,Perfect fall shirt!,"The shirt is exactly as it is pictured, i have the burnt orange color and love it! i received numerous compliments both paired with jeans and tucked into a fun skirt. it fits true to size, as i normally wear a small in all retailer however it is the slightest bit tight in the shoulders. i would give this top a 5/5.",5,1,9,General,Tops,Blouses +1081,47,Title,"Perfect dress for hot, humid, sticky weather.",5,1,10,General,Dresses,Dresses +861,44,Poor quality,This is so thin and poor quality. especially for the price. it felt like a thin pajama top. the buttons are terrible little shell buttons. this could not have been returned faster.,1,0,11,General Petite,Tops,Knits +1081,44,An almost for me...,"I didn't end up keeping this dress...it just wasn't right for me. it is very cute and fit well but the fabric was very thin (partially lined which is a plus) and the hem line coming up on the sides made it just too short for me. i ordered the petite xs and liked the length in front and back well but the side slit/vent was just too high for my liking. the tie part hit at the perfect spot for me at only 5'4"" (~118#). 
i guess i just didn't ""love"" it and would rather save my funds for something i ca",3,1,12,General,Dresses,Dresses +836,50,"Beautiful, stunning, cozy top!","I read the first review on this and ordered both a small and a medium as i thought this would run small. i have to totally disagree with the reviewer! i find that this top runs true to size or even generous! the sky color is so pretty and this top can be dressed up with some nice heels and a necklace or it can be comfy casual! i usually wear a small in hh brand and this one was true to fit (5""2"", broad shoulders, 120 lb)",5,1,13,General,Tops,Blouses +1081,61,Cute and comfy,"This dress is comfortable and stylish at the same time. it runs true to size. i'm 5'1"" 113 lbs and got the xs petite. wore it once so far for a few hours and got a few compliments on it!",5,1,14,General,Dresses,Dresses +1082,32,Disappointing quality,"The design/shape of the dress are quite flattering, flirty and feminine. but.... there is no way that the dress i received is new. the color is a faded washed out red and there are black stains all over the belt area. there is no tag... the fabric looks droopy and laundered and is not crisp, stiff or new. i am very disappointed by the quality of the item that i received. undoubtedly this one is going back. + +dear retailer - please make sure that you do not send pre-owend clothing articles to",2,0,15,General,Dresses,Dresses +863,52,Cute tee,"This one totally worked for me. love the color, length, and style.",4,1,16,General,Tops,Knits +1072,51,Very vintage feel,"Prior reviewer nailed it with the summary of this dress. it definitely needs heels. i'm 5'4"", 120#, 34d and this is a size 36. i usually wear a size s or 4 with most retailer stuff.",4,1,17,General,Dresses,Dresses +1078,32,Title,"This is the perfect summer dress. it can be dressed up or down. the quality of the linen fabric is very nice. i'm 5'1"" and it hit right below my knees. i found it to run true to size. 
those with a smaller bust might want to go down a size, but the tie in the front can be adjusted. it's lovely.",5,1,18,General Petite,Dresses,Dresses +850,37,Awkward fitting,"First, the fabric is beautiful and lovely for spring and summer. i really wanted to like this top, but the fitting is so awkward for me. i typically where a 0/xs, and sized up in the shirt to a size 2. it was very tight and pulled funny across the chest (size 32/a). i also found the cut at the shoulders very narrow (need a strapless bra) and made it look unflattering overall. had to return this one back to the store.",2,0,19,General,Tops,Blouses +836,54,Cozy casual - perfect for fall,"The color is perfect for fall and into winter. only the inside collar on the photo shows the subtle plaid lining. the lining was an unexpected bonus, and adds even more dimension to the shirt if the sleeves are turned up. +the fit is true to size and the 'stressed' velvet fabric is current for this season. +i'm going to enjoy wearing this shirt.",5,1,20,General,Tops,Blouses +861,33,The perfect striped t,"The perfect striped t-shirt, and the cute little buttons down the front are a wonderful accent with each one being different. i love it!",5,1,21,General Petite,Tops,Knits +966,63,Title,Great look and you can wear this vest with almost everything . i normally wear a small but they only had mediums and it fit fine( i like it tied). this will stay in my closet all year round . it does look like it will wrinkle easily but that doesn't bother me . a must buy !,5,1,22,General,Jackets,Jackets +1196,29,Title,"This is a great pull over dress that can easily be dressed to wear to work, to a bbq, or to happy hour. i have a large chest which sometimes makes this fit look like a tent but this fabric drapes nicely. the only thing i would note is the neckline was slightly higher on me than for the model.",5,1,23,General Petite,Dresses,Dresses +836,24,So beautiful! 
gorgeous orange color!,"After reading the previous reviews, i ordered a size larger. i am so glad i did it! it fits perfectly! i am 5'4""/115/32dd and went with the s regular. so beautiful! i can't wait to wear it!",5,1,24,General,Tops,Blouses +923,50,"Lovely top, not lovely shape","I was so excited to try out this top since it was such a bargain and neutral. unfortunately, the shape is ""a"" line and accentuates the hip area a bit more than i find flattering. so, it will be returned.",3,0,25,General,Tops,Sweaters +1072,34,"Wonderfully made, poorly designed for busty gals","Like others reviewers mentioned on here, this dress is extremely well made. but there were too many cons for me, as well. this would most certainly work on a petite/shorter frame, but for tall, busty, and curvier girls, it just sits wrong on all places unfortunately. the color is lovely along with the nice collar that ties in the back really made this dress stand out. + +however, i'm a 36c and found that the arm/shoulder part so large and bulky that is made me look bigger on top. there was entirel",3,1,26,General,Dresses,Dresses +1133,71,Title,"I read the previous reviews and had hoped that the exclamations about the color being inaccurate were exaggerated- however they are sadly very true. let me be clear that this is a beautiful, comfortable piece of clothing- when you put it on it feels thick and of very good quality. and while the pattern is interesting and pretty, the ""dark orange"" color i was expecting is actually a mud brown. i think it's unfortunate that retailer misrepresented such a key element of the jacket, but i digress. the",4,1,27,General,Jackets,Outerwear +1133,30,Title,"This coat is beautiful! i love the color and the design. it definitely runs a little large. i usually wear a small or a zero and i can wear a small wth a large sweater underneath no problem. i might have wanted an extra small but i just stuck with the small because i needed the coat already. 
the vest keeps it warm, but the sleeves aren't lined so it's cold unless you have a nice sweater underneath it if you're going to be in cold weather.",5,1,28,General,Jackets,Outerwear +966,52,Cute vest but...,"Nice vest, pretty olive color, hangs nice but...its just kinda funny. just hangs and really looks odd when tied. i like the laying piece just wished it had more ""umph"".",4,1,29,General,Jackets,Jackets +863,28,Great top,This top is super comfy and casual. the slit/design in the front gives it more of a stylish look than your average white long sleeve tee. would definitely recommend.,5,1,30,General,Tops,Knits +845,62,Title,"Intrepid soul that i am, i washed it in the machine in cold water on a 25 minute cycle, then put it in the dryer for 8 minutes, shook it out, hung it up, and it is just the same as when it went in. doesn't need to be ironed and didn't shrink. yes, the swing is vast, but that is part of its charm. pair it with a fitted piece on the bottom and no one will think that you are actually needing that much material to cover anything! i think it is fun, something i will use for years, wonderfully comfort",4,1,31,General Petite,Tops,Blouses +1081,39,Just ok,"It's ok, fit doesn't wow me because of my body. chest is too wide, hips look too narrow. drapes across my back fat in an especially non-flattering way. basically made my square-apple body look more square-apple. great part about this dress is that it's comfy and hides the tummy pooch. construction is poorly done...contrasting liner at v-neck is rolling out on one side only and then doing the same at the hem contralaterally. another negative point is dry clean only. boo. i'm 5'3"" 140# 39-28-35 an",3,1,32,General,Dresses,Dresses +861,46,Pernette henley,"In my retailer this was hung over in the pj section, and it really has more of a jammie top feel... soft, thin and stretchy. it's super cozy and comfortable and it drapes really nice. 
mine seems to have stretched out a little, especially the neckline (and the little buttons have a hard time staying closed). i'm happy with it though for what it is - a great, casual day t.",4,1,33,General Petite,Tops,Knits +966,36,Stylish and versatile!,"I love this vest! there are so many ways to style it...open or tied closed, over a dress or with a tshirt and jeans. i get compliments every time i wear it. it's soft and light enough to add interest to an outfit without being too heavy/hot to wear all day.",5,1,34,General,Jackets,Jackets +845,65,You'll smile on a humid summer's day,"This blouse is a perfect creation: perfect-weight cotton, many, many details, and color, all perfectly executed. there's fullness to the body, so this will definitely be easy to wear in hot, humid locations this summer, and it would be easy to pair with many things you have already for a casual yet collected look. there's no mistaking quality when you look at this piece! i bought the medium for my 130#, 34f, size 8 blouse/tee frame. this is the blouse you will have for years, and you will enjoy",5,1,35,General Petite,Tops,Blouses +861,35,Title,Review Text,4,1,36,General Petite,Tops,Knits +966,31,A new staple for my wardrobe,"Love this vest! the color looks a little more brown in the picture than it really is--it's a deep forest green that goes great with jeans or black pants. a great piece for polished but casual style, and the fabric has a nice, soft suede-like finish. + +if only the was *slightly* higher to hit my true natural waist, it would get that fifth star (but sometimes i wear petite sizes, so that might be the issue here--still looks good tied).",4,1,37,General,Jackets,Jackets +836,52,Runs short,I received the sky color in m online. am definetly keeping it as it is beautiful. however i wish they had made it a little less wide and more long. i am 5 3 and could totally size down to a s but even at my height it would be too short. i like crop tops and waist hitting tops a lot. 
but the shortness on this one just looks like it shrunk in the dryer,4,1,38,General,Tops,Blouses +126,34,Feminine alternative to your shapeless puffer,I am obsessed with peplum down coats because the ones you usually see have no shape and are extremely unflattering. i was excited for this to arrive. this is quite nice and it looks more feminine than the other down coats out there. the coat itself runs just true to my shirt size but not what i want for a coat. i will be returning this for a size up because i will want to wear a sweater or at least maybe a thicker shirt under this especially for winter or fall. the fabric is decent and the weigh,4,1,39,Initmates,Intimate,Lounge +1008,31,Love. love. love this skirt!!,"The silhouette and length of this skirt and length are flattering, classic and comfortable! the colors and weight of this skirt make it versatile - could be worn year-round (so long as it's not 100 degrees out - there is a bit of weight to it). it's one of my favorite pieces in my closet. can be styled 20 different ways. pair with a higher-end tee, tank, denim jacket or body-hugging sweater. i sized down.",5,1,40,General,Bottoms,Skirts +829,36,Pretry top,"Perfect for work or going out. i layered this with the reversible tank in medium pink so it would be work appropriate. it did not feel scratchy to me, maybe because i layered it. great buy especially with the discounts. feel like i lucked out.",5,1,41,General,Tops,Blouses +828,39,Title,"It's a pretty top, but it runs very short. the back is also pretty see through so i'm not sure i will keep it. it's called off the shoulder but it is so tight when you try and actually wear it off the shoulder. great design, just not sure i will keep it due to length",4,1,42,General,Tops,Blouses +1126,35,Title,Review Text,5,1,43,General,Jackets,Outerwear +1008,51,Great classic,Ii'm not usually big on lace but this is so unique and versatile that i went for it. looks good dressed up w/a nice top or casual w/a tee. 
can be worn practically all year. love it!,5,1,44,General,Bottoms,Skirts +829,39,"Beautiful, but scratchy","My firned tried this on and her first comment was ""take it off, it is scratchy"", so i didn't bother trying it on. it is, however, beautiful, if you are not sensitive to scratchy material. +fit was true to size.",3,0,45,General,Tops,Blouses +828,39,On the fence,"The stylist had me try this top on off the shoulders, but that was kind of snug, perhaps why it didn't fit right, it is not off the shoulders on this picture. the cut is loose, so it looked big on me, but going smaller would not have worked the way i had it on... may give it another try on. xs was big, so would need to try the xxs next time. but off the shoulders looked nice on the lady who was smaller built next to me! + +colors are nice, light, but appropriate for fall (or summer!) light fabri",3,1,46,General,Tops,Blouses +1008,39,"So pretty, a bit long and bigger than what i read","Some reviewers found this skirt to be on the smaller side, but for me, it was big,,, too bad the smaller size is sold out (well, all peittes are sold out). the length is also pretty long, below the knee, but in the narrow part still...so able to keep it. the colors are great. i brought it in the store to try on in front fo the stylists, adn they all loved it and told me to keep it... winner for sure!",5,1,47,General,Bottoms,Skirts +1011,47,Cute skirt!,I purchased this skirt at retailer store in texas. i fell in love since i saw it. the fabric is nice and the colors are pretty and cheerful for this spring/summer !! and it has pockets on the sides! don't we love pockets? very practical and lightweight,5,1,48,General,Bottoms,Skirts +1008,64,Very pretty,"This is a very pretty skirt and the colors are better in real life. sizing was mostly true to size for me, slightly tight in the waist but i have a thicker waist compared to my hips. 
i'm 5'7 and 138# and ordered a size 4 (i usually size up to a 6 with retailer pencil skirts.) a 6 probably would have fit as well, especially in the waist but didn't want it too big in the hips. length was perfect, to my knees.",5,1,49,General,Bottoms,Skirts +1008,38,A beautiful skirt!,"This skirt is wonderful! the price point seems a bit high for the average office gal, but the quality is impeccable. i really enjoy looking at myself in the mirror when wearing this skirt. it can be dressed up, dressed down, worn during all seasons. the design/pattern is artsy and whimsical without being too ""out-there"" (unless you want it to be, of course!). the fit is true to size, and the delicate details are just lovely.",5,1,50,General,Bottoms,Skirts +1008,36,Gorgeous skirt!,"Like other reviewers noted, the pics don't do this skirt justice. it is truly beautiful with an intricate lace pattern and rich colors. can't wait to wear this to work!",5,1,51,General,Bottoms,Skirts +1008,39,Beautiful,"Love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug.",4,1,52,General,Bottoms,Skirts +829,53,Not impressed...,"Not keeping this one. the fabric is a bit tacky-looking in person, the cut is odd and it's just not me. fit is fine and there are snaps to keep the neckline flat and shaped, the colors are as shown and it is a good length (falls to top of hip). i simply did not like it. too metallic looking maybe...",3,0,53,General,Tops,Blouses +829,36,Fair,The top as with most of ap's tops is well stitched. material is very uncomfortable. if you have large bust it is a little divulging. this may prompt you to wear something underneath to look modest and change the shape of the top!,3,1,54,General,Tops,Blouses +829,44,Great colors but....,"I love the metallic colors of this top and figured i could wear it under a ruched jacket and circle skirt for work. welp, that's out the window. this design is poor. 
for one, this is not a piece for a petite woman with no torso and i don't know how anyone with a longer torso wears t his. this hits above my belly botton on and i got apetite 2. i have no torso. so, without a jacket, i would never wear this. it's very low cut..the back is very low..it's a little loose but i run between a 2 and a 4.",3,1,55,General Petite,Tops,Blouses +829,48,Beautiful colors,This top is so much better in person. i do not agree with some of the other reviews about the fabric being scratchy. it is not and i have sensitive skin. i love this top and have got lots of compliments.,5,1,56,General Petite,Tops,Blouses +1008,37,Title,"This is a lovely pencil skirt that is well-made and really brightens a work day. like a lot skirts with multiple colors, a lot of tops feel like they should match but you have to try a few that really work with the skirt and your look to get the right impact.",5,1,57,General,Bottoms,Skirts +523,43,Australian sizing!!,So disappointed that no where in the limiting did it mention this suit is australian in size!! i ordered the 10 (i'm usually an 8 but that was sold out) and the suit arrives and doesn't even fir over my hips. -- the label clearly says 10 australian us 6! gutted that i have to return this suit because of this sizing issue. the suit looks to be well made and the design is adorable of it only for.,3,0,58,Initmates,Intimate,Swim +829,53,Title,I saw this top online and read the reviews so i passed when i went into the store. when i went in again i thought i would give it a try. so glad i did ! it fits great and is way way prettier in person ! i ended up buying it and so glad i did !,4,1,59,General Petite,Tops,Blouses +829,33,Vibrant metallic,"This is a very pretty top with vibrant metallic colors. i would be perfect for a holiday party or going out to dinner. it wasn't as scratchy as i thought it would be based on the reviews, however, it ran a little big. 
i'm 5.3 and about 130 pounds and i almost always wear a size 6, but the six was too big. i'd recommend sizing down. it was also lower cut than i thought it would be....showed a lot of cleavage (i'm a 34d) so it's not work appropriate. .",4,1,60,General Petite,Tops,Blouses +1126,39,So sad not mine,"Love everything about this beautiful coat except the way it fits on me. it is just perfect in the shoulders but once it flares out at the bottom, i look like a clown costume. if i size down it will not fit in my shoulders. perhaps a tailor can install buckles that match the neckline buckle to the sides of the coat to fold in the flare. on another note the buckle at the neckline feels very hard and fake, not sure if it's real leather.",4,1,61,General,Jackets,Outerwear +829,52,"Scratchy, uncomfortable top","The title says it all....this fabric of this top is both the best and worst part of the design. the colors are vibrant and the combination of materials (shoulder is a knit, sweater-like navy fabric) is interesting. however, that is where the positive comments end on this one. the top is so scratchy,, stiff, and, frankly, uncomfortable. i cannot imagine wanting to wear it. it could benefit from a lining, and that might have solved the problem of scratchy, itchy fabric. + +the stiff fabric of the bo",1,0,62,General Petite,Tops,Blouses +829,64,Yes it's scratchy but it can work out!,"My usual size 6 fits perfectly... yes the metallic fibers on the inside are scratchy. a cami solves that problem. when ordering, i realized i cld not go strapless & wld have to wear a one of my wider strap cami's in navy or black to cover the bra strap area. it was obvious some sort of cover-up swtr or jacket w/b needed & wld cover the strap area anyway. so, i also ordered the 'faux-fur cardi' in the ivory to wear over this top. it's a shrug-like cardi w/ 3/4 slvs. 
comes in the plum also if you",4,1,63,General Petite,Tops,Blouses +1008,52,Classy and cute,The online picture does not do this skirt justice. it's very pretty and unique in design. i think it should be worn with a tighter shirt that is tucked in unlike the picture. this is truly a pretty pencil skirt.,5,1,64,General,Bottoms,Skirts +1126,42,So pretty!,"I bought this and like other reviews, agree that the quality probably could be better, but i still love it enough to keep. the buckle fell apart, but i was able to fix it and the zipper seems a little weak. it has pilled, but the fabric is textured, so i didn't even notice until i read the other reviews and then looked back at my jacket. it is a perfect medium coat for 40's & 50's before i am ready to bust out my ugly down north face for winter. i get compliments every time i wear it! love!!",4,1,65,General,Jackets,Outerwear +829,35,Title,Review Text,3,0,66,General Petite,Tops,Blouses +1020,56,Just as pictured,"I bought a petite, size 2. i am 5'3"", 111 lb - it fit perfect with a tiny bit of room. looks just like the one pictured. length on me was about 1 inch lower than the model. very bright with multiple colors. has a nice stretch. very cute.",5,1,67,General Petite,Bottoms,Skirts +895,55,Great purchase,This will be perfect for the mild fall weather in texas. it's light weight and i love that the top is a little more fitted and the bottom swings out.,5,1,68,General,Tops,Fine gauge +862,40,Title,Review Text,5,1,69,General,Tops,Knits +1104,47,Antoher beautiful maeve dress!,"I love the dresses by maeve and this one is no exception. i was pleasantly surprised upon receiving this dress because i wasn't sure about the print size and colors when i saw it online, but the print is beautiful. it's a very simple but elegant style and the ruffle on the back adds an extra touch of interest. i like that it has enough coverage at the top (front and back), the underarms do not fall very low, and it is lined. 
the a-line skirt will flatter most body types. the fabric and overall q",5,1,70,General,Dresses,Dresses +670,36,Fun,These pants are fun! i use them as sleep pants. i had to size up because the waist band was a little tight for comfort.,5,1,71,Initmates,Intimate,Sleep +329,46,Comfy and adorable!,"I ordered this in xs, i'm 5'4"", around 115lbs, and it fits perfectly. the material is very soft, but not see-through, and the romper is well-made. i wish it came in other colours, i'd buy more!",5,1,72,Initmates,Intimate,Sleep +670,66,Beautiful fabric and style,"I purchased these for something other than sweats to wear for a girls get away weekend. they were so comfortable and flattering, friends told me i could wear them out for dinner. + +the fabric is beautiful, and i loved the way they draped. the angled cut on the legs is very cute, and something a bit different. these could be perfect for a hot summer evening dinner, because the fabric is very light and breezy. dress it up with a tank top and ballet flats and you're good to go.",4,1,73,Initmates,Intimate,Sleep +868,51,Great for summer,"Love this cream sleeveless top....it goes with everything and you can dress it up or down! this will be a go to top all summer long and probably wear thru the fall as well with a layered sweater, if needed. i typically wear small or medium size and got the medium hoping for a little longer length. i am 5'7, 34c, and overall wt. of approx. 128 lbs...it fits very nicely . thank you retailer!",4,1,74,General Petite,Tops,Knits +1020,34,Wonderful but going back,"This skirt is beautiful (especially the color) and looks well made. however, i am very pear shaped and this skirt is just too straight to be flattering on me. i agree that it runs small but even when i sized up it still wasn't flattering. 
i imagine that those who were not bestowed with massive hips will look lovely in this.",5,1,75,General Petite,Bottoms,Skirts +862,30,"Comfy, casual shirt",Happy with this top- slightly thinner material than i was expecting but that'll be fine with our hot summers. got the navy striped one- very pretty and extremely soft. washes well. i do wear an tank underneath though bc the armholes are slightly large and you can see part of my bra.,4,1,76,General,Tops,Knits +895,43,Perfect casual sweater,"Love the fit of this sweater! it almost fits like a sweatshirt and definitely not as long as on the model. it hits me just below the hips( for ref. im 5-3""). im considering getting all the other colors because its so cute and comfortable. could be dressed up or down. runs true to size-im always a small and this fits perfectly roomy.",5,1,77,General,Tops,Fine gauge +895,52,Title,"I love this sweater!! i like sweaters that are narrow on top and taper out so it doesn't look like a sack on me. plus, it is the perfect weight. i wish i would have bought one in a different color because it is so comfortable.",5,1,78,General,Tops,Fine gauge +1094,35,Not for me,"The colors are vivid and perfectly autumnal but the fit is a mess. it was overall too large, the waistline curves up in the front and then falls into small pleats which was maternityish, the waistband was thicker than the dress and sat away from my body and the material was a cheapish poly. had the outer dress been made from the same material as the lining, i would have liked it better. the modesty closure was a plus but the dress was already unraveling when i took it out. #returned",2,0,79,General Petite,Dresses,Dresses +895,41,Perfect for warmer climates,"Love this tunic! i am a curvy gal (with a few extra 'curves' in the middle) and this is a perfect top for accentuating the good and masking the negative. i purchased the pink color and it is a gorgeous peachy- pink, a much deeper color than what is portrayed in the picture. 
i found it true to size. typically i wear a medium or large, depending on the structure and the medium fit well. a little more fitted on top and looser around the waist. i came back to buy another in the ivory but alas, it is",5,1,80,General,Tops,Fine gauge +895,39,Need petite,"Loved the green color, the cut is super flattering, but alas, i do need hte petite, ti was a bit long and i looked a little lost. unfrotuantely, the color is sold out in petite :-(",5,1,81,General,Tops,Fine gauge +1020,39,Fun,"But i thought this was lace or with texture, it is more a pattern on regular fabric. + +colors are nice, there are subtle flowers with bright colors in the pattern, the fit is nice, the 0p fit snugly enough, i don't think there was much give but that was comfortable enough. petite length did end below the knee as shown. i think it would have been nicer shorter but don't want to have to pay extra to ehm so i left that one behind...maybe on sale :-)",4,1,82,General Petite,Bottoms,Skirts +895,50,Completes so many outfits!,"I like this sweater so much i just bought it in a second color! the pleats make the sweater conform to my shape just enough to be flattering. i wore it over three different dresses this week that might have felt too bare for work or cooler weather. i live in a hot climate so this is the right weight for our cooler months. the metallic threads give it a little bit of flair and the grey color goes with everything. i'm 5'7"" size 10-12 and the large fit just right.",5,1,83,General,Tops,Fine gauge +1020,27,Size up!,"Beautiful color,, great quality, and great fit if you size up! i learned my lesson with previous purchases and sized up when i read it runs small. i usually wear a 0/2 and i took a risk and went for the 4. it fits perfectly. i have a 25 in waist.",4,1,84,General Petite,Bottoms,Skirts +895,33,Not as pictured.,"Online, this looks like a great sweater. i ordered an xxsp and found that this sweater is much wider in the middle than pictured. 
in fact, i'm pretty sure they pinned the shirt in the back for the picture to make it appear slimmer. unfortunately, this sweater will not work for me, as i am an hourglass shape and this shirt makes me look 20 pounds heavier.",2,0,85,General,Tops,Fine gauge +895,35,"Fits strange, flimsy material","I was worried about this item when i ordered it because of how it looks in the picture, but i had wishful thinking. i should have gone with my gut! this shirt does not have the same quality as all my other retailer purchases. it is see-through and flimsy. the bottom is just like the picture, it hangs in an odd rumpled way. the top is very flattering though, so it's a shame! if the bottom fit nicer like their other products i could have gotten away with wearing a cami under it to make up for the qu",2,0,86,General,Tops,Fine gauge +1020,34,"Nice color, love the snap buttons",This skirt is a great length and nice piece for fall/winter. i love the color. it fits like a high waisted skirt would which is why i purchased the next size up.,4,1,87,General Petite,Bottoms,Skirts +596,41,Modern comfort,This easel caftan is simply amazing! the silhouette fits all sizes and shapes while providing a unique dress.,5,1,88,General,Trend,Trend +1094,54,"It's gorgeous, but...","Just came today: the print is gorgeous. very bohemian. it's a dressier dress with the chiffon. on the con side, the chiffon is very delicate, snagged easily while trying on. there's raw edges on the waistband, needed trimming fresh out of the package. sz 10 fit in the waist, too small for 36d bust; sz 12 fits bust but is big in the waist. sz 10 ankle length for 5'7"", 12 hits the floor. haven't decided if i'll keep the 12 and try to alter.",4,1,89,General Petite,Dresses,Dresses +1020,41,Runs very small,"My waist measures 28"" and the size 4 is a snug fit. i normally buy size 2 in retailer skirts, but the 2 was unbearably tight. cute skirt though! 
i like that it's machine washable, and the quality looks good.",4,1,90,General Petite,Bottoms,Skirts +895,47,Pretty,I love the fabric and color (i bought the green one). my only complaint is that the base is wider than the picture shows. it looks more fitted on the model. it is more of an a shape (significantly wider at the hips.),4,1,91,General Petite,Tops,Fine gauge +895,52,Nice but too thin,"I was minimally torn over whether to return this but ultimately it's going back because the knit is just too thin. i thought it would be cozy and be of normal sweater weight but it's not. and because it's so light, the swing effect doesn't really come off. nothing special.",2,0,92,General Petite,Tops,Fine gauge +895,46,Title,Review Text,5,1,93,General Petite,Tops,Fine gauge +1020,22,Title,"I love the color of this skirt, and the fabric is wonderful. it was a bit longer than anticipated, but fit well.",5,1,94,General Petite,Bottoms,Skirts +1098,40,Title,"Dress ran very large in every way. beautiful design, lining and quality material. i should have sized down 2 sizes. item is now sold out.",3,1,95,General,Dresses,Dresses +895,62,Lovely and feminine,"Finally a ""swing top"" that doesn't look like a sack on me! i've been wanting to partake of this current style, but everything i've tried so far just looks ""dumpy"" on me. not this top -- it is very feminine and flattering. i am 5'7"" 118 lbs and have a small waist -- and although this is a ""swing"" style, it still shows my waist and doesn't overpower me. it is also a lovely fabric -- especially in the soft pink which i got. i'm thinking of going back for more colors!",5,1,96,General Petite,Tops,Fine gauge +862,26,A great piece,"I bought this shirt in the neutral and white and love it. so many people compliment it. i usually pair it with white pants and cute wedges to dress it up (obviously with a statement necklace too). but the greatest thing about this shirt is the fact that you can also dress it down. 
the material is fabulous but i have not washed it yet so i am not sure if it is going to shrink (which i hope it doesn't because it is one of my favorite pieces). + +unlike the other reviewer i did not think that the",5,1,97,General Petite,Tops,Knits +1020,38,Love everything about this skirt,"Unlike the other reviewers, i did not have any problem with the sizing, fit or length of this skirt. it is a midi skirt so i think it's suppose to be a little bit longer?? for me it fit true to size. for reference i am 5'8"", 135 pounds and the size 6 fit perfectly. i think the color is beautiful and the quality is good.",5,1,98,General Petite,Bottoms,Skirts +895,53,Perfect comfy now top!,"For a now feel, that comfy and well made, this was a great choice!",5,1,99,General,Tops,Fine gauge +1020,49,Poor quality,"This skirt looks exactly as pictured and fits great. i purchased it a few weeks ago and got lots of compliments on it. however, on the third wear, the side zipper split wide open. needless to say, it was returned.",3,0,100,General Petite,Bottoms,Skirts diff --git a/tests/unit/test_transforms.py b/tests/unit/test_transforms.py index edbf1a31bf..7541831f47 100644 --- a/tests/unit/test_transforms.py +++ b/tests/unit/test_transforms.py @@ -1383,13 +1383,13 @@ def test_transform_annotation_type_caption(self): id="0", subset="train", media=TableRow(table=table, index=0), - annotations=[Caption("0.076108")], + annotations=[Caption("nswprice:0.076108")], ), DatasetItem( id="1", subset="train", media=TableRow(table=table, index=1), - annotations=[Caption("0.060376")], + annotations=[Caption("nswprice:0.060376")], ), ], categories={}, diff --git a/tests/unit/test_validator.py b/tests/unit/test_validator.py index 3b98d683ed..8f7cc0f75d 100644 --- a/tests/unit/test_validator.py +++ b/tests/unit/test_validator.py @@ -2,15 +2,18 @@ # # SPDX-License-Identifier: MIT +import os.path as osp from collections import Counter from unittest import TestCase import numpy as np import pytest 
+import datumaro.plugins.transforms as transforms from datumaro.components.annotation import ( AnnotationType, Bbox, + Caption, Ellipse, Label, LabelCategories, @@ -21,12 +24,19 @@ from datumaro.components.environment import Environment from datumaro.components.errors import ( AttributeDefinedButNotFound, + BrokenAnnotation, + EmptyCaption, + EmptyLabel, FarFromAttrMean, + FarFromCaptionMean, FarFromLabelMean, FewSamplesInAttribute, + FewSamplesInCaption, FewSamplesInLabel, ImbalancedAttribute, + ImbalancedCaptions, ImbalancedDistInAttribute, + ImbalancedDistInCaption, ImbalancedDistInLabel, ImbalancedLabels, InvalidValue, @@ -38,6 +48,7 @@ NegativeLength, OnlyOneAttributeValue, OnlyOneLabel, + RedundanciesInCaption, UndefinedAttribute, UndefinedLabel, ) @@ -47,11 +58,14 @@ ClassificationValidator, DetectionValidator, SegmentationValidator, + TabularValidator, _TaskValidator, ) from ..requirements import Requirements, mark_requirement +from tests.utils.assets import get_test_asset_path + class _TestValidatorBase(TestCase): @classmethod @@ -368,6 +382,24 @@ def setUpClass(cls): ], ) + path = osp.join(get_test_asset_path("tabular_dataset"), "women_clothing.csv") + tabular_dataset = Dataset.import_from( + path, + "tabular", + target={ + "input": ["Review Text"], + "output": [ + "Age", + "Title", + "Review Text", + "Rating", + "Positive Feedback Count", + "Division Name", + ], + }, + ) + cls.tabular_dataset = transforms.AstypeAnnotations(tabular_dataset) + class TestBaseValidator(_TestValidatorBase): @classmethod @@ -835,6 +867,255 @@ def test_check_far_from_attr_mean(self): self.assertIsInstance(actual_reports[0], FarFromAttrMean) +class TestTabularValidator(_TestValidatorBase): + @classmethod + def setUpClass(cls): + cls.validator = TabularValidator( + few_samples_thr=1, + imbalance_ratio_thr=50, + far_from_mean_thr=5.0, + dominance_ratio_thr=0.8, + topk_bins=0.1, + ) + + def _update_stats_by_caption(self, caption_, caption_stats): + caption_has_error = False + + if 
not caption_has_error: + caption_info = {"value": caption_} + self.validator._update_prop_distributions(caption_info, caption_stats) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_compute_prop_dist(self): + stats = { + "distribution_in_caption": { + "unittest": { + "value": { + "items_far_from_mean": {}, + "mean": None, + "stdev": None, + "min": None, + "max": None, + "median": None, + "histogram": { + "bins": [], + "counts": [], + }, + "distribution": [], + } + }, + }, + "distribution_in_dataset_item": {}, + } + num_caption_columns = [("unittest", int)] + + self.validator.items = [ + ( + ("1", "train"), + [Caption(id=0, attributes={}, group=0, object_id=-1, caption="unittest:0")], + ) + ] + + self.validator._compute_prop_dist(num_caption_columns, stats, self._update_stats_by_caption) + self.assertEqual(stats["distribution_in_caption"]["unittest"]["value"]["distribution"], [0]) + self.assertEqual(stats["distribution_in_dataset_item"], {("1", "train"): 1}) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_compute_prop_stats_from_dist(self): + dist = range(0, 100) + dist_by_caption = { + "unittest": { + "value": { + "items_far_from_mean": {}, + "mean": None, + "stdev": None, + "min": None, + "max": None, + "median": None, + "histogram": { + "bins": [], + "counts": [], + }, + "distribution": [dist], + } + }, + } + + self.validator._compute_prop_stats_from_dist(dist_by_caption) + self.assertEqual(dist_by_caption["unittest"]["value"]["mean"], np.mean(dist)) + self.assertEqual(dist_by_caption["unittest"]["value"]["stdev"], np.std(dist)) + self.assertEqual(dist_by_caption["unittest"]["value"]["min"], np.min(dist)) + self.assertEqual(dist_by_caption["unittest"]["value"]["max"], np.max(dist)) + self.assertEqual(dist_by_caption["unittest"]["value"]["median"], np.median(dist)) + + counts, bins = np.histogram(dist) + self.assertEqual(dist_by_caption["unittest"]["value"]["histogram"]["bins"], bins.tolist()) + self.assertEqual( + 
dist_by_caption["unittest"]["value"]["histogram"]["counts"], counts.tolist() + ) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_compute_far_from_mean(self): + dist = range(1, 101) + counts, bins = np.histogram(dist) + prop_stats = { + "items_far_from_mean": {}, + "mean": 50, + "stdev": 28, + "min": 1, + "max": 100, + "median": 50, + "histogram": { + "bins": bins.tolist(), + "counts": counts.tolist(), + }, + } + val = 1000 + item_key = ("1", "train") + ann = Caption(id=0, attributes={}, group=0, object_id=-1, caption="unittest:0") + + self.validator._compute_far_from_mean(prop_stats, val, item_key, ann) + self.assertEqual(prop_stats["items_far_from_mean"], {item_key: {0: val}}) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_broken_annotation(self): + stats = {"items_broken_annotation": [(1, "train")]} + + actual_reports = self.validator._check_broken_annotation(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], BrokenAnnotation) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_empty_label(self): + stats = { + "label_distribution": { + "empty_labels": {"unittest": {"count": 1, "items_with_empty_label": [(1, "train")]}} + } + } + + actual_reports = self.validator._check_empty_label(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], EmptyLabel) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_empty_caption(self): + stats = { + "caption_distribution": { + "empty_captions": { + "unittest": {"count": 1, "items_with_empty_caption": [(1, "train")]} + } + } + } + + actual_reports = self.validator._check_empty_caption(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], EmptyCaption) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_few_samples_in_caption(self): + with self.subTest("Few Samples"): + stats = { + "caption_distribution": { + 
"defined_captions": {"unit": self.validator.few_samples_thr} + } + } + + actual_reports = self.validator._check_few_samples_in_caption(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], FewSamplesInCaption) + + with self.subTest("No Few Samples Warning"): + stats = { + "caption_distribution": { + "defined_captions": {"unit": self.validator.few_samples_thr + 1} + } + } + + actual_reports = self.validator._check_few_samples_in_caption(stats) + + self.assertTrue(len(actual_reports) == 0) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_far_from_caption_mean(self): + caption_name = "unittest" + caption_stats = { + "w": { + "items_far_from_mean": {("1", "train"): {1: 100}}, + "mean": 0, + } + } + + actual_reports = self.validator._check_far_from_caption_mean(caption_name, caption_stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], FarFromCaptionMean) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_redundancies_in_caption(self): + stats = { + "caption_distribution": { + "redundancies": { + "unittest": { + "stopword": {"count": 1, "items_with_redundancies": [("1", "train")]} + } + } + } + } + + actual_reports = self.validator._check_redundancies_in_caption(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], RedundanciesInCaption) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_captions(self): + with self.subTest("Imbalance"): + stats = { + "caption_distribution": { + "defined_captions": {"unit": self.validator.imbalance_ratio_thr, "test": 1} + } + } + + actual_reports = self.validator._check_imbalanced_captions(stats) + + self.assertTrue(len(actual_reports) == 1) + self.assertIsInstance(actual_reports[0], ImbalancedCaptions) + + with self.subTest("No Imbalance Warning"): + stats = { + "caption_distribution": { + "defined_captions": {"unit": 
self.validator.imbalance_ratio_thr - 1, "test": 1} + } + } + + actual_reports = self.validator._check_imbalanced_captions(stats) + + self.assertTrue(len(actual_reports) == 0) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_check_imbalanced_dist_in_caption(self): + caption = "unittest" + most = int(self.validator.dominance_thr * 100) + rest = 100 - most + + with self.subTest("Imbalanced"): + caption_stats = {"value": {"histogram": {"counts": [most, rest]}}} + reports = self.validator._check_imbalanced_dist_in_caption(caption, caption_stats) + + self.assertTrue(len(reports) == 1) + self.assertIsInstance(reports[0], ImbalancedDistInCaption) + + with self.subTest("No Imbalanced Warning"): + caption_stats = {"value": {"histogram": {"counts": [most - 1, rest]}}} + reports = self.validator._check_imbalanced_dist_in_caption(caption, caption_stats) + + self.assertTrue(len(reports) == 0) + + class TestValidateAnnotations(_TestValidatorBase): extra_args = { "few_samples_thr": 1, @@ -1099,6 +1380,88 @@ def test_validate_annotations_segmentation(self): self.assertEqual(actual_summary, expected_summary) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_validate_annotations_tabular(self): + validator = TabularValidator(**self.extra_args) + actual_results = validator.validate(self.tabular_dataset) + + with self.subTest("Test of statistics", i=0): + actual_stats = actual_results["statistics"] + self.assertEqual(actual_stats["total_ann_count"], 594) + self.assertEqual(len(actual_stats["items_missing_annotation"]), 1) + self.assertEqual(len(actual_stats["items_broken_annotation"]), 7) + + label_dist = actual_stats["label_distribution"] + self.assertEqual(len(label_dist["defined_labels"]), 8) + empty_labels = label_dist["empty_labels"] + self.assertEqual(len(empty_labels), 2) + self.assertEqual(empty_labels["Rating"]["count"], 2) + self.assertEqual( + empty_labels["Rating"]["items_with_empty_label"][0][0], "0@women_clothing" + ) + 
self.assertEqual(empty_labels["Division Name"]["count"], 2) + self.assertEqual( + empty_labels["Division Name"]["items_with_empty_label"][0][0], "0@women_clothing" + ) + + caption_dist = actual_stats["caption_distribution"] + self.assertEqual(len(caption_dist["defined_captions"]), 4) + self.assertEqual(caption_dist["defined_captions"]["Age"], 99) + self.assertEqual(caption_dist["defined_captions"]["Title"], 99) + self.assertEqual(caption_dist["defined_captions"]["Review Text"], 99) + self.assertEqual(caption_dist["defined_captions"]["Positive Feedback Count"], 99) + empty_captions = caption_dist["empty_captions"] + self.assertEqual(len(empty_captions), 4) + self.assertEqual(empty_captions["Age"]["count"], 2) + self.assertEqual( + empty_captions["Age"]["items_with_empty_caption"][0][0], "0@women_clothing" + ) + self.assertEqual(empty_captions["Title"]["count"], 2) + self.assertEqual( + empty_captions["Title"]["items_with_empty_caption"][0][0], "0@women_clothing" + ) + self.assertEqual(empty_captions["Review Text"]["count"], 2) + self.assertEqual( + empty_captions["Review Text"]["items_with_empty_caption"][0][0], "0@women_clothing" + ) + self.assertEqual(empty_captions["Positive Feedback Count"]["count"], 2) + self.assertEqual( + empty_captions["Positive Feedback Count"]["items_with_empty_caption"][0][0], + "0@women_clothing", + ) + + dist_in_caption = actual_stats["distribution_in_caption"] + self.assertEqual(dist_in_caption["Age"]["value"]["items_far_from_mean"], {}) + pos_dist_in_caption = dist_in_caption["Positive Feedback Count"]["value"] + self.assertEqual(pos_dist_in_caption["items_far_from_mean"], {}) + counts = [i for i in range(1, 101) if i != 5] + self.assertEqual(pos_dist_in_caption["mean"], np.mean(counts)) + self.assertEqual(pos_dist_in_caption["stdev"], np.std(counts)) + self.assertEqual(pos_dist_in_caption["min"], np.min(counts)) + self.assertEqual(pos_dist_in_caption["max"], np.max(counts)) + self.assertEqual(pos_dist_in_caption["median"], 
np.median(counts)) + + dist_item = actual_stats["distribution_in_dataset_item"] + self.assertEqual(sum(dist_item.values()), 594) + + with self.subTest("Test of validation reports", i=1): + actual_reports = actual_results["validation_reports"] + report_types = [r["anomaly_type"] for r in actual_reports] + count_by_type = Counter(report_types) + + self.assertEqual(len(actual_reports), 22) + self.assertEqual(count_by_type["MissingAnnotation"], 1) + self.assertEqual(count_by_type["RedundanciesInCaption"], 2) + self.assertEqual(count_by_type["BrokenAnnotation"], 7) + self.assertEqual(count_by_type["EmptyLabel"], 4) + self.assertEqual(count_by_type["EmptyCaption"], 8) + + with self.subTest("Test of summary", i=2): + actual_summary = actual_results["summary"] + expected_summary = {"errors": 0, "infos": 2, "warnings": 20} + + self.assertEqual(actual_summary, expected_summary) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_validate_invalid_dataset_type(self): with self.assertRaises(TypeError):