diff --git a/datumaro/plugins/splitter.py b/datumaro/plugins/splitter.py index abc391ab19..786828b3c5 100644 --- a/datumaro/plugins/splitter.py +++ b/datumaro/plugins/splitter.py @@ -4,6 +4,7 @@ import logging as log import numpy as np +import copy from math import gcd from enum import Enum @@ -295,7 +296,7 @@ def _get_sections(dataset_size, ratio): def _group_by_attr(items): """ Args: - items: list of (idx, ann). ann is the annotation from Label object. + items: list of (idx_img, ann). ann is the annotation from Label object. Returns: by_attributes: dict of { combination-of-attrs : list of index } """ @@ -315,17 +316,17 @@ def _is_float(value): # group by attributes by_attributes = dict() - for idx, ann in items: + for idx_img, ann in items: # ignore numeric attributes filtered = {} - for k, v in ann.attributes.items(): - if _is_float(v): + for attr, value in ann.attributes.items(): + if _is_float(value): continue - filtered[k] = v + filtered[attr] = value attributes = tuple(sorted(filtered.items())) if attributes not in by_attributes: by_attributes[attributes] = [] - by_attributes[attributes].append(idx) + by_attributes[attributes].append(idx_img) return by_attributes @@ -344,9 +345,9 @@ def _split_indice(indice): for _, items in datasets.items(): np.random.shuffle(items) by_attributes = self._group_by_attr(items) - attr_names = list(by_attributes.keys()) - np.random.shuffle(attr_names) # add randomness - for attr in attr_names: + attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomness + for attr in attr_combinations: indice = by_attributes[attr] quo = len(indice) // required if quo > 0: @@ -719,17 +720,19 @@ def __init__(self, dataset, splits, task, seed=None): def _group_by_labels(self, dataset): by_labels = dict() unlabeled = [] + for idx, item in enumerate(dataset): - bbox_anns = [a for a in item.annotations if a.type in self.annotation_type] - if len(bbox_anns) == 0: + instance_anns = [a for a in item.annotations if a.type in self.annotation_type] + if len(instance_anns) == 0: unlabeled.append(idx) continue - for ann in bbox_anns: - label = getattr(ann, "label", None) + for instance_ann in instance_anns: + label = getattr(instance_ann, "label", None) if label not in by_labels: - by_labels[label] = [(idx, ann)] + by_labels[label] = [(idx, instance_ann)] else: - by_labels[label].append((idx, ann)) + by_labels[label].append((idx, instance_ann)) + return by_labels, unlabeled def _split_dataset(self): @@ -746,79 +749,80 @@ def _split_dataset(self): for _, items in by_labels.items(): by_attributes = self._group_by_attr(items) # merge groups which have too small samples. - attr_names = list(by_attributes.keys()) - np.random.shuffle(attr_names) # add randomless + attr_combinations = list(by_attributes.keys()) + np.random.shuffle(attr_combinations) # add randomless cluster = [] - minumum = max(required, len(items) * 0.1) # temp solution - for attr in attr_names: + min_cluster = max(required, len(items) * 0.01) # temp solution + for attr in attr_combinations: indice = by_attributes[attr] - if len(indice) >= minumum: + if len(indice) >= min_cluster: by_combinations.append(indice) else: cluster.extend(indice) - if len(cluster) >= minumum: + if len(cluster) >= min_cluster: by_combinations.append(cluster) cluster = [] + if len(cluster) > 0: by_combinations.append(cluster) cluster = [] total = len(self._extractor) - # total number of GT samples per label-attr combinations n_combs = [len(v) for v in by_combinations] # 3-1. initially count per-image GT samples counts_all = {} - for idx in range(total): - if idx not in unlabeled: - counts_all[idx] = dict() + for idx_img in range(total): + if idx_img not in unlabeled: + counts_all[idx_img] = dict() for idx_comb, indice in enumerate(by_combinations): - for idx in indice: - if idx_comb not in counts_all[idx]: - counts_all[idx] = {idx_comb: 1} + for idx_img in indice: + if idx_comb not in counts_all[idx_img]: + counts_all[idx_img][idx_comb] = 1 else: - counts_all[idx][idx_comb] += 1 - - init_scores = {} - for idx, counts in counts_all.items(): - norm_sum = 0.0 - for idx_comb, count in counts.items(): - norm_sum += count / n_combs[idx_comb] - init_scores[idx] = norm_sum + counts_all[idx_img][idx_comb] += 1 by_splits = dict() for sname in self._subsets: by_splits[sname] = [] - target_size = dict() - expected = [] # expected numbers of per split GT samples + target_ins = [] # target instance numbers to be split for sname, ratio in zip(subsets, sratio): - target_size[sname] = (total - len(unlabeled)) * ratio - expected.append([sname, np.array(n_combs) * ratio]) + target_ins.append([sname, np.array(n_combs) * ratio]) + + init_scores = {} + for idx_img, distributions in counts_all.items(): + norm_sum = 0.0 + for idx_comb, dis in distributions.items(): + norm_sum += dis / n_combs[idx_comb] + init_scores[idx_img] = norm_sum - # functions for keep the # of annotations not exceed the expected num + by_scores = dict() + for idx_img, score in init_scores.items(): + if score not in by_scores: + by_scores[score] = [idx_img] + else: + by_scores[score].append(idx_img) + + # functions for keep the # of annotations not exceed the target_ins num def compute_penalty(counts, n_combs): p = 0 for idx_comb, v in counts.items(): - p += max(0, (v / n_combs[idx_comb]) - 1.0) + if n_combs[idx_comb] <= 0: + p += 1 + else: + p += max(0, (v / n_combs[idx_comb]) - 1.0) + return p def update_nc(counts, n_combs): for idx_comb, v in counts.items(): - n_combs[idx_comb] = max(0, n_combs[idx_comb] - v) - if n_combs[idx_comb] == 0: - n_combs[idx_comb] = -1 - - by_scores = dict() - for idx, score in init_scores.items(): - if score not in by_scores: - by_scores[score] = [idx] - else: - by_scores[score].append(idx) + n_combs[idx_comb] = n_combs[idx_comb] - v # 3-2. assign each DatasetItem to a split, one by one + actual_ins = copy.deepcopy(target_ins) for score in sorted(by_scores.keys(), reverse=True): indice = by_scores[score] np.random.shuffle(indice) # add randomness for the same score @@ -827,12 +831,12 @@ def update_nc(counts, n_combs): counts = counts_all[idx] # shuffling split order to add randomness # when two or more splits have the same penalty value - np.random.shuffle(expected) + np.random.shuffle(actual_ins) pp = [] - for sname, nc in expected: - if target_size[sname] <= len(by_splits[sname]): - # the split has enough images, + for sname, nc in actual_ins: + if np.sum(nc) <= 0: + # the split has enough instances, # stop adding more images to this split pp.append(1e08) else: @@ -842,7 +846,7 @@ def update_nc(counts, n_combs): # we push an image to a split with the minimum penalty midx = np.argmin(pp) - sname, nc = expected[midx] + sname, nc = actual_ins[midx] by_splits[sname].append(idx) update_nc(counts, nc) diff --git a/docs/user_manual.md b/docs/user_manual.md index a1602be2ec..df9bb33124 100644 --- a/docs/user_manual.md +++ b/docs/user_manual.md @@ -1037,8 +1037,8 @@ Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1 datum transform -t random_split -- --subset train:.67 --subset test:.33 ``` -Example: split a dataset in task-specific manner. Supported tasks are -classification, detection, re-identification and segmentation. +Example: split a dataset in task-specific manner. The tasks supported are +classification, detection, segmentation and re-identification. ``` bash datum transform -t split -- \ @@ -1081,9 +1081,7 @@ datum transform -t rename -- -e '|pattern|replacement|' datum transform -t rename -- -e '|frame_(\d+)|\\1|' ``` -Example: Sampling dataset items, subset `train` is divided into `sampled`(sampled_subset) and `unsampled` -- `train` has 100 data, and 20 samples are selected. There are `sampled`(20 samples) and 80 `unsampled`(80 datas) subsets. -- Remove `train` subset (if sampled_subset=`train` or unsampled_name=`train`, still remain) +Example: sampling dataset items as many as the number of target samples with sampling method entered by the user, divide into `sampled` and `unsampled` subsets - There are five methods of sampling the m option. - `topk`: Return the k with high uncertainty data - `lowk`: Return the k with low uncertainty data @@ -1101,7 +1099,7 @@ datum transform -t sampler -- \ -k 20 ``` -Example : Control number of outputs to 100 after NDR +Example : control number of outputs to 100 after NDR - There are two methods in NDR e option - `random`: sample from removed data randomly - `similarity`: sample from removed data with ascending diff --git a/tests/test_splitter.py b/tests/test_splitter.py index 4c233f0eb2..a6b778e192 100644 --- a/tests/test_splitter.py +++ b/tests/test_splitter.py @@ -79,7 +79,7 @@ def test_split_for_classification_multi_class_no_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -105,7 +105,7 @@ def test_split_for_classification_single_class_single_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(42, len(actual.get_subset("train"))) self.assertEqual(18, len(actual.get_subset("test"))) @@ -140,7 +140,7 @@ def test_split_for_classification_single_class_multi_attr(self): with self.subTest("zero remainder"): splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(84, len(actual.get_subset("train"))) self.assertEqual(36, len(actual.get_subset("test"))) @@ -165,7 +165,7 @@ def test_split_for_classification_single_class_multi_attr(self): with self.subTest("non-zero remainder"): splits = [("train", 0.95), ("test", 0.05)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(114, len(actual.get_subset("train"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -189,7 +189,7 @@ def test_split_for_classification_multi_label_with_attr(self): task = splitter.SplitTask.classification.name splits = [("train", 0.7), ("test", 0.3)] - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) train = actual.get_subset("train") test = actual.get_subset("test") @@ -243,7 +243,7 @@ def test_split_for_classification_zero_ratio(self): splits = [("train", 0.1), ("val", 0.9), ("test", 0.0)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(1, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) @@ -255,7 +255,7 @@ def test_split_for_classification_unlabeled(self): source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.7), ("test", 0.3)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(7, len(actual.get_subset("train"))) self.assertEqual(3, len(actual.get_subset("test"))) @@ -266,7 +266,7 @@ def test_split_for_classification_unlabeled(self): source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.7), ("test", 0.3)] task = splitter.SplitTask.classification.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(7, len(actual.get_subset("train"))) self.assertEqual(3, len(actual.get_subset("test"))) @@ -405,7 +405,7 @@ def test_split_for_reidentification_rebalance(self): task = splitter.SplitTask.reid.name splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] query = 0.4 / 0.7 - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(350, len(actual.get_subset("train"))) self.assertEqual(140, len(actual.get_subset("val"))) @@ -420,7 +420,7 @@ def test_split_for_reidentification_unlabeled(self): iterable = [DatasetItem(i, annotations=[]) for i in range(10)] source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.6), ("test", 0.4)] - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(10, len(actual.get_subset("not-supported"))) with self.subTest("multi label"): @@ -428,7 +428,7 @@ def test_split_for_reidentification_unlabeled(self): iterable = [DatasetItem(i, annotations=anns) for i in range(10)] source = Dataset.from_iterable(iterable, categories=["a", "b"]) splits = [("train", 0.6), ("test", 0.4)] - actual = splitter.Split(source, task, splits, query) + actual = splitter.Split(source, task, splits, query, seed=100) self.assertEqual(10, len(actual.get_subset("not-supported"))) @@ -827,7 +827,7 @@ def test_split_for_detection(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -858,7 +858,7 @@ def test_split_for_detection_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -898,7 +898,7 @@ def test_no_subset_name_and_count_restriction(self): config = {"label1": {"attrs": None, "counts": 10}} task = splitter.SplitTask.classification.name source = self._generate_dataset(config) - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -912,10 +912,10 @@ def test_no_subset_name_and_count_restriction(self): nimages=10, ) task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) - self.assertEqual(5, len(actual.get_subset("_train"))) + actual = splitter.Split(source, task, splits, seed=21) + self.assertEqual(4, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) - self.assertEqual(1, len(actual.get_subset("valid2"))) + self.assertEqual(2, len(actual.get_subset("valid2"))) self.assertEqual(2, len(actual.get_subset("test*"))) self.assertEqual(1, len(actual.get_subset("test2"))) @@ -926,7 +926,7 @@ def test_no_subset_name_and_count_restriction(self): nimages=10, ) task = splitter.SplitTask.detection.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -938,7 +938,7 @@ def test_no_subset_name_and_count_restriction(self): with_attr=True, nimages=10, ) - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(5, len(actual.get_subset("_train"))) self.assertEqual(1, len(actual.get_subset("valid"))) self.assertEqual(1, len(actual.get_subset("valid2"))) @@ -977,7 +977,7 @@ def test_split_for_segmentation(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -1008,6 +1008,7 @@ def test_split_for_segmentation(self): params.append((dtype, with_attr, 10, 5, 3, 2)) params.append((dtype, with_attr, 10, 7, 0, 3)) + expected = [] for dtype, with_attr, nimages, train, val, test in params: source, _ = self._generate_detection_segmentation_dataset( annotation_type=self._get_append_polygon(dtype), @@ -1029,7 +1030,9 @@ def test_split_for_segmentation(self): test=test, task=task, ): - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=21) + + expected.append([dtype, with_attr, len(actual.get_subset("train")), len(actual.get_subset("val")), len(actual.get_subset("test"))]) self.assertEqual(train, len(actual.get_subset("train"))) self.assertEqual(val, len(actual.get_subset("val"))) @@ -1064,7 +1067,7 @@ def test_split_for_segmentation_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.segmentation.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test"))) @@ -1080,7 +1083,7 @@ def test_split_for_segmentation_with_unlabeled(self): splits = [("train", 0.5), ("val", 0.2), ("test", 0.3)] task = splitter.SplitTask.segmentation.name - actual = splitter.Split(source, task, splits) + actual = splitter.Split(source, task, splits, seed=100) self.assertEqual(10, len(actual.get_subset("train"))) self.assertEqual(4, len(actual.get_subset("val"))) self.assertEqual(6, len(actual.get_subset("test")))