Configurable Threshold CLI support #250

Merged 6 commits on May 27, 2021
Changes from 2 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Support for MNIST and MNIST in CSV dataset formats (<https://github.com/openvinotoolkit/datumaro/pull/234>)
- Documentation file for COCO format (<https://github.com/openvinotoolkit/datumaro/pull/241>)
- Documentation file and integration tests for YOLO format (<https://github.com/openvinotoolkit/datumaro/pull/246>)
- Support for Validator configurable threshold (<https://github.com/openvinotoolkit/datumaro/pull/250>)

### Changed
- LabelMe format saves dataset items with their relative paths by subsets without changing names (<https://github.com/openvinotoolkit/datumaro/pull/200>)
10 changes: 8 additions & 2 deletions datumaro/cli/contexts/project/__init__.py
@@ -677,7 +677,7 @@ def transform_command(args):
extra_args = {}
if hasattr(transform, 'parse_cmdline'):
extra_args = transform.parse_cmdline(args.extra_args)

seungyoon-woo (Contributor Author): OK, I will consider making a new plugin type.

seungyoon-woo (Contributor Author), May 26, 2021: Can you give me a detailed guideline on how to make the new plugin type (DatasetValidator)?

zhiltsov-max (Contributor), May 26, 2021: Define a base class and add it to the Environment and CliPlugin classes. Maybe you'd better do this in another PR, to avoid blocking this one.

seungyoon-woo (Contributor Author), May 26, 2021: Do you mean handling the three pieces of requested feedback above in this PR and the new plugin type in an additional PR?

seungyoon-woo (Contributor Author): I will handle the new plugin type in another PR!
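For reference, a minimal sketch of what such a plugin type could look like; this is an assumption about the follow-up work, not code from this PR, and the name DatasetValidator is only illustrative:

    # Hypothetical sketch of the discussed plugin type; not in this PR.
    from datumaro.components.cli_plugin import CliPlugin

    class DatasetValidator(CliPlugin):
        """Hypothetical plugin base; it would be registered in the Environment."""
        def validate(self, dataset):
            raise NotImplementedError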

log.info("Loading the project...")
dataset = project.make_dataset()

@@ -808,6 +808,8 @@ def build_validate_parser(parser_ctor=argparse.ArgumentParser):
help="Subset to validate (default: None)")
parser.add_argument('-p', '--project', dest='project_dir', default='.',
help="Directory of the project to validate (default: current dir)")
parser.add_argument('extra_args', nargs=argparse.REMAINDER, default=None,
help="Optional arguments for validator (pass '-- -h' for help)")
parser.set_defaults(command=validate_command)

return parser
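With the REMAINDER argument above, everything after a bare '--' on the command line is forwarded to the validator's own parser. A hypothetical invocation (the 'datum' entry point and the exact form of the task-type argument are assumptions; threshold values are illustrative):

    # Hypothetical shell usage; '-- -h' prints the validator's threshold options.
    datum validate -t classification -p ./my_project -- -fs 5 -ir 20 -m 3.0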
@@ -822,7 +824,11 @@ def validate_command(args):
if subset_name is not None:
dataset = dataset.get_subset(subset_name)
dst_file_name += f'-{subset_name}'
validation_results = validate_annotations(dataset, task_type)

extra_args = {}
from datumaro.components.validator import _Validator
extra_args = _Validator.parse_cmdline(args.extra_args)
validation_results = validate_annotations(dataset, task_type, **extra_args)

def numpy_encoder(obj):
if isinstance(obj, np.generic):
109 changes: 86 additions & 23 deletions datumaro/components/validator.py
@@ -18,6 +18,7 @@
NegativeLength, InvalidValue, FarFromLabelMean,
FarFromAttrMean, OnlyOneAttributeValue)
from datumaro.components.extractor import AnnotationType, LabelCategories
from datumaro.components.cli_plugin import CliPlugin
from datumaro.util import parse_str_enum_value


@@ -26,13 +27,7 @@
TaskType = Enum('TaskType', ['classification', 'detection', 'segmentation'])


class _Validator:
DEFAULT_FEW_SAMPLES = 1
DEFAULT_IMBALANCE_RATIO = 50
DEFAULT_FAR_FROM_MEAN = 5.0
DEFAULT_DOMINANCE_RATIO = 0.8
DEFAULT_DOMINANCE_TOPK_BINS = 0.1

class _Validator(CliPlugin):
# statistics templates
numerical_stat_template = {
'items_far_from_mean': {},
@@ -64,7 +59,48 @@ class _Validator:
Abstract method that must be implemented in a subclass.
"""

def __init__(self, task_type=None):
@classmethod
def build_cmdline_parser(cls, **kwargs):
parser = super().build_cmdline_parser(**kwargs)
parser.add_argument('-fs', '--few_samples_thr', default=1, type=int,
help="Threshold for warning when the number of samples per class is too low")
parser.add_argument('-ir', '--imbalance_ratio_thr', default=50, type=int,
help="Threshold for giving a data imbalance warning; "
"IR (imbalance ratio) = majority/minority")
parser.add_argument('-m', '--far_from_mean_thr', default=5.0, type=float,
help="Threshold for warning when data is far from the mean; "
"a constant k used to define mean +/- k * standard deviation")
parser.add_argument('-dr', '--dominance_ratio_thr', default=0.8, type=float,
help="Threshold for giving a bounding box imbalance warning; "
"dominance ratio = ratio of the top-k bins to the total in the histogram")
parser.add_argument('-k', '--topk_bins', default=0.1, type=float,
help="Ratio of bins with the highest number of items to the total bins "
"in the histogram; in [0, 1]; 0.1 means 10%%")
return parser
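# Illustrative check, not part of this diff: assuming the CliPlugin parser
# above is plain argparse, flags left unset keep the declared defaults.
ns = _Validator.build_cmdline_parser().parse_args(['-fs', '5', '-ir', '20'])
assert ns.few_samples_thr == 5 and ns.imbalance_ratio_thr == 20
assert ns.far_from_mean_thr == 5.0 and ns.dominance_ratio_thr == 0.8
assert ns.topk_bins == 0.1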

def __init__(self, task_type, few_samples_thr=None,
imbalance_ratio_thr=None, far_from_mean_thr=None,
dominance_ratio_thr=None, topk_bins=None):
"""
Validator

Parameters
---------------
few_samples_thr: int
minimum number of samples per class
warn user when samples per class is less than threshold
imbalance_ratio_thr: int
ratio of majority attribute to minority attribute
warn user when annotations are unevenly distributed
far_from_mean_thr: float
constant used to define mean +/- m * stddev
warn user when there are too big or small values
dominance_ratio_thr: float
ratio of Top-k bin to total
warn user when dominance ratio is over threshold
topk_bins: float
ratio of selected bins with most item number to total bins
warn user when values are not evenly distributed
"""
self.task_type = parse_str_enum_value(task_type, TaskType,
default=TaskType.classification)

@@ -78,11 +114,11 @@ def __init__(self, task_type=None):
self.ann_types = {AnnotationType.mask, AnnotationType.polygon}
self.str_ann_type = "mask or polygon"

self.far_from_mean_thr = self.DEFAULT_FAR_FROM_MEAN
self.imbalance_ratio_thr = self.DEFAULT_IMBALANCE_RATIO
self.few_samples_thr = self.DEFAULT_FEW_SAMPLES
self.dominance_thr = self.DEFAULT_DOMINANCE_RATIO
self.topk_bins_ratio = self.DEFAULT_DOMINANCE_TOPK_BINS
self.few_samples_thr = few_samples_thr
self.imbalance_ratio_thr = imbalance_ratio_thr
self.far_from_mean_thr = far_from_mean_thr
self.dominance_thr = dominance_ratio_thr
self.topk_bins_ratio = topk_bins

def _compute_common_statistics(self, dataset):
defined_attr_template = {
@@ -537,8 +573,12 @@ class ClassificationValidator(_Validator):
A validator class for classification tasks.
"""

def __init__(self):
super().__init__(TaskType.classification)
def __init__(self, few_samples_thr, imbalance_ratio_thr,
far_from_mean_thr, dominance_ratio_thr, topk_bins):
super().__init__(task_type=TaskType.classification, few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)
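# Illustrative only, not part of this diff: the validator can now be built
# with explicit thresholds in place of the removed DEFAULT_* constants.
validator = ClassificationValidator(few_samples_thr=5, imbalance_ratio_thr=20,
    far_from_mean_thr=3.0, dominance_ratio_thr=0.9, topk_bins=0.1)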

def _check_multi_label_annotations(self, stats):
validation_reports = []
@@ -636,8 +676,12 @@ class DetectionValidator(_Validator):
"""
A validator class for detection tasks.
"""
def __init__(self):
super().__init__(TaskType.detection)
def __init__(self, few_samples_thr, imbalance_ratio_thr,
far_from_mean_thr, dominance_ratio_thr, topk_bins):
super().__init__(task_type=TaskType.detection, few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)

def _check_negative_length(self, stats):
validation_reports = []
@@ -917,8 +961,12 @@ class SegmentationValidator(_Validator):
A validator class for (instance) segmentation tasks.
"""

def __init__(self):
super().__init__(TaskType.segmentation)
def __init__(self, few_samples_thr, imbalance_ratio_thr,
far_from_mean_thr, dominance_ratio_thr, topk_bins):
super().__init__(task_type=TaskType.segmentation, few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)
Contributor (suggested change): reflow the super().__init__ call so that few_samples_thr moves onto its own line:

def __init__(self, few_samples_thr, imbalance_ratio_thr,
far_from_mean_thr, dominance_ratio_thr, topk_bins):
super().__init__(task_type=TaskType.segmentation,
few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)


def compute_statistics(self, dataset):
"""
@@ -1149,7 +1197,7 @@ def generate_reports(self, stats):
return reports


def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]):
def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType], **extra_args):
"""
Returns the validation results of a dataset based on task type.

@@ -1167,15 +1215,30 @@ def validate_annotations(dataset: IDataset, task_type: Union[str, TaskType]):

"""

few_samples_thr = extra_args['few_samples_thr']
imbalance_ratio_thr = extra_args['imbalance_ratio_thr']
far_from_mean_thr = extra_args['far_from_mean_thr']
dominance_ratio_thr = extra_args['dominance_ratio_thr']
topk_bins = extra_args['topk_bins']

validation_results = {}

task_type = parse_str_enum_value(task_type, TaskType)
if task_type == TaskType.classification:
validator = ClassificationValidator()
validator = ClassificationValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)
elif task_type == TaskType.detection:
validator = DetectionValidator()
validator = DetectionValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)
elif task_type == TaskType.segmentation:
validator = SegmentationValidator()
validator = SegmentationValidator(few_samples_thr=few_samples_thr,
imbalance_ratio_thr=imbalance_ratio_thr,
far_from_mean_thr=far_from_mean_thr,
dominance_ratio_thr=dominance_ratio_thr, topk_bins=topk_bins)

if not isinstance(dataset, IDataset):
raise TypeError("Invalid dataset type '%s'" % type(dataset))
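End to end, a programmatic call to the updated entry point might look like the sketch below. Note that this implementation indexes extra_args for all five keys, so every threshold must be supplied (the CLI path guarantees that via the parser defaults). The dataset path and format are placeholders.

    # Sketch under the assumptions above; not part of the PR.
    from datumaro.components.dataset import Dataset
    from datumaro.components.validator import validate_annotations

    dataset = Dataset.import_from('./my_dataset', 'coco')  # hypothetical source
    results = validate_annotations(dataset, 'classification',
        few_samples_thr=1, imbalance_ratio_thr=50, far_from_mean_thr=5.0,
        dominance_ratio_thr=0.8, topk_bins=0.1)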