Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable Threshold CLI support #250

Merged
merged 6 commits into from
May 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Documentation file for COCO format (<https://github.com/openvinotoolkit/datumaro/pull/241>)
- Documentation file and integration tests for YOLO format (<https://github.com/openvinotoolkit/datumaro/pull/246>)
- Support for Cityscapes dataset format (<https://github.com/openvinotoolkit/datumaro/pull/249>)
- Support for Validator configurable threshold (<https://github.com/openvinotoolkit/datumaro/pull/250>)

### Changed
- LabelMe format saves dataset items with their relative paths by subsets without changing names (<https://github.com/openvinotoolkit/datumaro/pull/200>)
Expand Down
8 changes: 7 additions & 1 deletion datumaro/cli/contexts/project/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,8 @@ def build_validate_parser(parser_ctor=argparse.ArgumentParser):
help="Subset to validate (default: None)")
parser.add_argument('-p', '--project', dest='project_dir', default='.',
help="Directory of the project to validate (default: current dir)")
parser.add_argument('extra_args', nargs=argparse.REMAINDER, default=None,
help="Optional arguments for validator (pass '-- -h' for help)")
parser.set_defaults(command=validate_command)

return parser
Expand All @@ -822,7 +824,11 @@ def validate_command(args):
if subset_name is not None:
dataset = dataset.get_subset(subset_name)
dst_file_name += f'-{subset_name}'
validation_results = validate_annotations(dataset, task_type)

extra_args = {}
from datumaro.components.validator import _Validator
extra_args = _Validator.parse_cmdline(args.extra_args)
validation_results = validate_annotations(dataset, task_type, **extra_args)

def numpy_encoder(obj):
if isinstance(obj, np.generic):
Expand Down
117 changes: 94 additions & 23 deletions datumaro/components/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NegativeLength, InvalidValue, FarFromLabelMean,
FarFromAttrMean, OnlyOneAttributeValue)
from datumaro.components.extractor import AnnotationType, LabelCategories
from datumaro.components.cli_plugin import CliPlugin
from datumaro.util import parse_str_enum_value


Expand All @@ -26,13 +27,7 @@
TaskType = Enum('TaskType', ['classification', 'detection', 'segmentation'])


class _Validator:
DEFAULT_FEW_SAMPLES = 1
DEFAULT_IMBALANCE_RATIO = 50
DEFAULT_FAR_FROM_MEAN = 5.0
DEFAULT_DOMINANCE_RATIO = 0.8
DEFAULT_DOMINANCE_TOPK_BINS = 0.1

class _Validator(CliPlugin):
# statistics templates
numerical_stat_template = {
'items_far_from_mean': {},
Expand Down Expand Up @@ -64,7 +59,50 @@ class _Validator:
Abstract method that must be implemented in a subclass.
"""

def __init__(self, task_type=None):
@classmethod
def build_cmdline_parser(cls, **kwargs):
    """
    Builds the command-line parser for the validator thresholds.

    The base parser is obtained from the parent class'
    build_cmdline_parser(**kwargs); five threshold options are then
    registered on it.

    Parameters
    ---------------
    **kwargs:
        forwarded unchanged to the parent build_cmdline_parser()

    Returns
    ---------------
    parser:
        the parent-built parser with the threshold options added
    """
    parser = super().build_cmdline_parser(**kwargs)

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continuation piece must carry its own separating space.
    parser.add_argument('-fs', '--few_samples_thr', default=1, type=int,
        help="Threshold for giving a warning for minimum number of "
            "samples per class")
    parser.add_argument('-ir', '--imbalance_ratio_thr', default=50, type=int,
        help="Threshold for giving data imbalance warning; "
            "IR(imbalance ratio) = majority/minority")
    parser.add_argument('-m', '--far_from_mean_thr', default=5.0, type=float,
        help="Threshold for giving a warning that data is far from mean; "
            "A constant used to define mean +/- k * standard deviation;")
    parser.add_argument('-dr', '--dominance_ratio_thr', default=0.8, type=float,
        help="Threshold for giving a warning for bounding box imbalance; "
            "Dominance_ratio = ratio of Top-k bin to total in histogram;")
    # argparse %-formats help strings, so a literal percent sign has to be
    # written as '%%'; a bare '%;' makes help rendering raise ValueError.
    parser.add_argument('-k', '--topk_bins', default=0.1, type=float,
        help="Ratio of bins with the highest number of data "
            "to total bins in the histogram; [0, 1]; 0.1 = 10%%;")
    return parser

def __init__(self, task_type, few_samples_thr=None,
imbalance_ratio_thr=None, far_from_mean_thr=None,
dominance_ratio_thr=None, topk_bins=None):
"""
Validator

Parameters
---------------
few_samples_thr: int
minimum number of samples per class
warn user when samples per class is less than threshold
imbalance_ratio_thr: int
ratio of majority attribute to minority attribute
warn user when annotations are unevenly distributed
far_from_mean_thr: float
constant used to define mean +/- m * stddev
warn user when there are too big or small values
dominance_ratio_thr: float
ratio of Top-k bin to total
warn user when dominance ratio is over threshold
topk_bins: float
ratio of selected bins with most item number to total bins
warn user when values are not evenly distributed
"""
self.task_type = parse_str_enum_value(task_type, TaskType,
default=TaskType.classification)

Expand All @@ -78,11 +116,11 @@ def __init__(self, task_type=None):
self.ann_types = {AnnotationType.mask, AnnotationType.polygon}
self.str_ann_type = "mask or polygon"

self.far_from_mean_thr = self.DEFAULT_FAR_FROM_MEAN
self.imbalance_ratio_thr = self.DEFAULT_IMBALANCE_RATIO
self.few_samples_thr = self.DEFAULT_FEW_SAMPLES
self.dominance_thr = self.DEFAULT_DOMINANCE_RATIO
self.topk_bins_ratio = self.DEFAULT_DOMINANCE_TOPK_BINS
self.few_samples_thr = few_samples_thr
self.imbalance_ratio_thr = imbalance_ratio_thr
self.far_from_mean_thr = far_from_mean_thr
self.dominance_thr = dominance_ratio_thr
self.topk_bins_ratio = topk_bins

def _compute_common_statistics(self, dataset):
defined_attr_template = {
Expand Down Expand Up @@ -537,8 +575,13 @@ class ClassificationValidator(_Validator):
A validator class for classification tasks.
"""

def __init__(self):
super().__init__(TaskType.classification)
def __init__(self, few_samples_thr=1, imbalance_ratio_thr=50,
        far_from_mean_thr=5.0, dominance_ratio_thr=0.8, topk_bins=0.1):
    """
    Creates a validator for classification tasks.

    Every threshold is optional so that the zero-argument constructor
    keeps working; the defaults are the same values the validator
    command-line options use.

    Parameters
    ---------------
    few_samples_thr: int
        minimum number of samples per class
    imbalance_ratio_thr: int
        ratio of majority attribute to minority attribute
    far_from_mean_thr: float
        constant used to define mean +/- m * stddev
    dominance_ratio_thr: float
        ratio of Top-k bin to total
    topk_bins: float
        ratio of selected bins with most item number to total bins
    """
    super().__init__(task_type=TaskType.classification,
        few_samples_thr=few_samples_thr,
        imbalance_ratio_thr=imbalance_ratio_thr,
        far_from_mean_thr=far_from_mean_thr,
        dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)

def _check_multi_label_annotations(self, stats):
validation_reports = []
Expand Down Expand Up @@ -636,8 +679,13 @@ class DetectionValidator(_Validator):
"""
A validator class for detection tasks.
"""
def __init__(self):
super().__init__(TaskType.detection)
def __init__(self, few_samples_thr=1, imbalance_ratio_thr=50,
        far_from_mean_thr=5.0, dominance_ratio_thr=0.8, topk_bins=0.1):
    """
    Creates a validator for detection tasks.

    Every threshold is optional so that the zero-argument constructor
    keeps working; the defaults are the same values the validator
    command-line options use.

    Parameters
    ---------------
    few_samples_thr: int
        minimum number of samples per class
    imbalance_ratio_thr: int
        ratio of majority attribute to minority attribute
    far_from_mean_thr: float
        constant used to define mean +/- m * stddev
    dominance_ratio_thr: float
        ratio of Top-k bin to total
    topk_bins: float
        ratio of selected bins with most item number to total bins
    """
    super().__init__(task_type=TaskType.detection,
        few_samples_thr=few_samples_thr,
        imbalance_ratio_thr=imbalance_ratio_thr,
        far_from_mean_thr=far_from_mean_thr,
        dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)

def _check_negative_length(self, stats):
validation_reports = []
Expand Down Expand Up @@ -917,8 +965,13 @@ class SegmentationValidator(_Validator):
A validator class for (instance) segmentation tasks.
"""

def __init__(self):
super().__init__(TaskType.segmentation)
def __init__(self, few_samples_thr=1, imbalance_ratio_thr=50,
        far_from_mean_thr=5.0, dominance_ratio_thr=0.8, topk_bins=0.1):
    """
    Creates a validator for (instance) segmentation tasks.

    Every threshold is optional so that the zero-argument constructor
    keeps working; the defaults are the same values the validator
    command-line options use.

    Parameters
    ---------------
    few_samples_thr: int
        minimum number of samples per class
    imbalance_ratio_thr: int
        ratio of majority attribute to minority attribute
    far_from_mean_thr: float
        constant used to define mean +/- m * stddev
    dominance_ratio_thr: float
        ratio of Top-k bin to total
    topk_bins: float
        ratio of selected bins with most item number to total bins
    """
    super().__init__(task_type=TaskType.segmentation,
        few_samples_thr=few_samples_thr,
        imbalance_ratio_thr=imbalance_ratio_thr,
        far_from_mean_thr=far_from_mean_thr,
        dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)

def compute_statistics(self, dataset):
"""
Expand Down Expand Up @@ -1149,7 +1202,7 @@ def generate_reports(self, stats):
return reports


def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]):
def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType], **extra_args):
"""
Returns the validation results of a dataset based on task type.

Expand All @@ -1167,15 +1220,33 @@ def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]):

"""

few_samples_thr = extra_args['few_samples_thr']
imbalance_ratio_thr = extra_args['imbalance_ratio_thr']
far_from_mean_thr = extra_args['far_from_mean_thr']
dominance_ratio_thr = extra_args['dominance_ratio_thr']
topk_bins = extra_args['topk_bins']

validation_results = {}

task_type = parse_str_enum_value(task_type, TaskType)
if task_type == TaskType.classification:
validator = ClassificationValidator()
validator = ClassificationValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr,
topk_bins=topk_bins)
elif task_type == TaskType.detection:
validator = DetectionValidator()
validator = DetectionValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr,
topk_bins=topk_bins)
elif task_type == TaskType.segmentation:
validator = SegmentationValidator()
validator = SegmentationValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr,
topk_bins=topk_bins)

if not isinstance(dataset, IDataset):
raise TypeError("Invalid dataset type '%s'" % type(dataset))
Expand Down
Loading