Detection splitter #235

Merged
merged 9 commits on May 10, 2021
118 changes: 61 additions & 57 deletions datumaro/plugins/splitter.py
@@ -4,6 +4,7 @@

import logging as log
import numpy as np
import copy
from math import gcd
from enum import Enum

@@ -295,7 +296,7 @@ def _get_sections(dataset_size, ratio):
def _group_by_attr(items):
"""
Args:
items: list of (idx, ann). ann is the annotation from Label object.
items: list of (idx_img, ann). ann is the annotation from Label object.
Returns:
by_attributes: dict of { combination-of-attrs : list of index }
"""
@@ -315,17 +316,17 @@ def _is_float(value):

# group by attributes
by_attributes = dict()
for idx, ann in items:
for idx_img, ann in items:
# ignore numeric attributes
filtered = {}
for k, v in ann.attributes.items():
if _is_float(v):
for attr, value in ann.attributes.items():
if _is_float(value):
continue
filtered[k] = v
filtered[attr] = value
attributes = tuple(sorted(filtered.items()))
if attributes not in by_attributes:
by_attributes[attributes] = []
by_attributes[attributes].append(idx)
by_attributes[attributes].append(idx_img)

return by_attributes
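
For reference, a minimal standalone sketch of the grouping this helper performs — annotations are bucketed by the combination of their non-numeric attribute values (the `SimpleNamespace` annotations and attribute names below are made up for illustration, not Datumaro objects):

``` python
from types import SimpleNamespace

def group_by_attr(items):
    """Group (idx_img, ann) pairs by the tuple of their non-numeric attributes."""
    def is_float(value):
        try:
            float(value)
            return True
        except (TypeError, ValueError):
            return False

    by_attributes = {}
    for idx_img, ann in items:
        # ignore numeric attributes, keep the rest as a sorted, hashable key
        filtered = {attr: v for attr, v in ann.attributes.items() if not is_float(v)}
        key = tuple(sorted(filtered.items()))
        by_attributes.setdefault(key, []).append(idx_img)
    return by_attributes

items = [
    (0, SimpleNamespace(attributes={"occluded": "no", "scale": 0.5})),
    (1, SimpleNamespace(attributes={"occluded": "yes"})),
    (2, SimpleNamespace(attributes={"occluded": "no", "scale": 0.7})),
]
print(group_by_attr(items))
# {(('occluded', 'no'),): [0, 2], (('occluded', 'yes'),): [1]}
```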

@@ -344,9 +345,9 @@ def _split_indice(indice):
for _, items in datasets.items():
np.random.shuffle(items)
by_attributes = self._group_by_attr(items)
attr_names = list(by_attributes.keys())
np.random.shuffle(attr_names) # add randomness
for attr in attr_names:
attr_combinations = list(by_attributes.keys())
np.random.shuffle(attr_combinations) # add randomness
for attr in attr_combinations:
indice = by_attributes[attr]
quo = len(indice) // required
if quo > 0:
@@ -719,17 +720,19 @@ def __init__(self, dataset, splits, task, seed=None):
def _group_by_labels(self, dataset):
by_labels = dict()
unlabeled = []

for idx, item in enumerate(dataset):
bbox_anns = [a for a in item.annotations if a.type in self.annotation_type]
if len(bbox_anns) == 0:
instance_anns = [a for a in item.annotations if a.type in self.annotation_type]
if len(instance_anns) == 0:
unlabeled.append(idx)
continue
for ann in bbox_anns:
label = getattr(ann, "label", None)
for instance_ann in instance_anns:
label = getattr(instance_ann, "label", None)
if label not in by_labels:
by_labels[label] = [(idx, ann)]
by_labels[label] = [(idx, instance_ann)]
else:
by_labels[label].append((idx, ann))
by_labels[label].append((idx, instance_ann))

return by_labels, unlabeled

def _split_dataset(self):
@@ -746,79 +749,80 @@ def _split_dataset(self):
for _, items in by_labels.items():
by_attributes = self._group_by_attr(items)
# merge groups which have too small samples.
attr_names = list(by_attributes.keys())
np.random.shuffle(attr_names) # add randomless
attr_combinations = list(by_attributes.keys())
np.random.shuffle(attr_combinations) # add randomness
cluster = []
minumum = max(required, len(items) * 0.1) # temp solution
for attr in attr_names:
min_cluster = max(required, len(items) * 0.01) # temp solution
for attr in attr_combinations:
indice = by_attributes[attr]
if len(indice) >= minumum:
if len(indice) >= min_cluster:
by_combinations.append(indice)
else:
cluster.extend(indice)
if len(cluster) >= minumum:
if len(cluster) >= min_cluster:
by_combinations.append(cluster)
cluster = []

if len(cluster) > 0:
by_combinations.append(cluster)
cluster = []
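
To illustrate the merging rule just above: attribute groups that already have at least `min_cluster` samples are kept as-is, while smaller groups are pooled until the pool itself reaches `min_cluster` (a sketch with invented group sizes, not the plugin's exact code):

``` python
def merge_small_groups(by_attributes, required, n_items):
    """Keep large attribute groups; pool small ones into clusters of
    at least max(required, 1% of the items)."""
    min_cluster = max(required, n_items * 0.01)
    by_combinations, cluster = [], []
    for indice in by_attributes.values():
        if len(indice) >= min_cluster:
            by_combinations.append(indice)
        else:
            cluster.extend(indice)
        if len(cluster) >= min_cluster:
            by_combinations.append(cluster)
            cluster = []
    if cluster:  # leftovers form the last (smaller) group
        by_combinations.append(cluster)
    return by_combinations

groups = {"a": list(range(50)), "b": [50, 51], "c": [52], "d": [53, 54, 55]}
print([len(g) for g in merge_small_groups(groups, required=3, n_items=56)])
# [50, 3, 3] -> "b" and "c" are pooled; "a" and "d" are large enough on their own
```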

total = len(self._extractor)

# total number of GT samples per label-attr combination
n_combs = [len(v) for v in by_combinations]

# 3-1. initially count per-image GT samples
counts_all = {}
for idx in range(total):
if idx not in unlabeled:
counts_all[idx] = dict()
for idx_img in range(total):
if idx_img not in unlabeled:
counts_all[idx_img] = dict()

for idx_comb, indice in enumerate(by_combinations):
for idx in indice:
if idx_comb not in counts_all[idx]:
counts_all[idx] = {idx_comb: 1}
for idx_img in indice:
if idx_comb not in counts_all[idx_img]:
counts_all[idx_img][idx_comb] = 1
else:
counts_all[idx][idx_comb] += 1

init_scores = {}
for idx, counts in counts_all.items():
norm_sum = 0.0
for idx_comb, count in counts.items():
norm_sum += count / n_combs[idx_comb]
init_scores[idx] = norm_sum
counts_all[idx_img][idx_comb] += 1

by_splits = dict()
for sname in self._subsets:
by_splits[sname] = []

target_size = dict()
expected = [] # expected numbers of per split GT samples
target_ins = [] # target instance numbers to be split
for sname, ratio in zip(subsets, sratio):
target_size[sname] = (total - len(unlabeled)) * ratio
expected.append([sname, np.array(n_combs) * ratio])
target_ins.append([sname, np.array(n_combs) * ratio])

init_scores = {}
for idx_img, distributions in counts_all.items():
norm_sum = 0.0
for idx_comb, dis in distributions.items():
norm_sum += dis / n_combs[idx_comb]
init_scores[idx_img] = norm_sum
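
A worked example of this score: each image's per-combination instance counts are normalized by the total number of instances in that combination, so images containing rare label-attribute combinations score higher and are assigned first (illustrative numbers only):

``` python
# Two label-attribute combinations with 10 and 2 GT instances overall.
n_combs = [10, 2]

# Per-image counts: {combination index: number of instances in this image}.
counts_all = {
    0: {0: 3},        # only the common combination
    1: {0: 1, 1: 1},  # also touches the rare combination
}

init_scores = {
    idx_img: sum(count / n_combs[idx_comb] for idx_comb, count in counts.items())
    for idx_img, counts in counts_all.items()
}
print(init_scores)  # {0: 0.3, 1: 0.6} -> image 1 is placed first (sorted by score, descending)
```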

# functions for keep the # of annotations not exceed the expected num
by_scores = dict()
for idx_img, score in init_scores.items():
if score not in by_scores:
by_scores[score] = [idx_img]
else:
by_scores[score].append(idx_img)

# helper functions to keep the number of annotations from exceeding the target_ins counts
def compute_penalty(counts, n_combs):
p = 0
for idx_comb, v in counts.items():
p += max(0, (v / n_combs[idx_comb]) - 1.0)
if n_combs[idx_comb] <= 0:
p += 1
else:
p += max(0, (v / n_combs[idx_comb]) - 1.0)

return p

def update_nc(counts, n_combs):
for idx_comb, v in counts.items():
n_combs[idx_comb] = max(0, n_combs[idx_comb] - v)
if n_combs[idx_comb] == 0:
n_combs[idx_comb] = -1

by_scores = dict()
for idx, score in init_scores.items():
if score not in by_scores:
by_scores[score] = [idx]
else:
by_scores[score].append(idx)
n_combs[idx_comb] = n_combs[idx_comb] - v

# 3-2. assign each DatasetItem to a split, one by one
actual_ins = copy.deepcopy(target_ins)
for score in sorted(by_scores.keys(), reverse=True):
indice = by_scores[score]
np.random.shuffle(indice) # add randomness for the same score
@@ -827,12 +831,12 @@ def update_nc(counts, n_combs):
counts = counts_all[idx]
# shuffling split order to add randomness
# when two or more splits have the same penalty value
np.random.shuffle(expected)
np.random.shuffle(actual_ins)

pp = []
for sname, nc in expected:
if target_size[sname] <= len(by_splits[sname]):
# the split has enough images,
for sname, nc in actual_ins:
if np.sum(nc) <= 0:
# the split has enough instances,
# stop adding more images to this split
pp.append(1e08)
else:
Expand All @@ -842,7 +846,7 @@ def update_nc(counts, n_combs):

# we push an image to a split with the minimum penalty
midx = np.argmin(pp)
sname, nc = expected[midx]
sname, nc = actual_ins[midx]
by_splits[sname].append(idx)
update_nc(counts, nc)
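
Putting the helpers together, a self-contained sketch of the penalty-driven assignment (hypothetical numbers, not the plugin's exact code, and the shuffling that breaks ties is omitted): for each image, highest score first, the penalty against a split is `sum(max(0, count_c / remaining_c - 1))` over the image's combinations, plus 1 for every combination that split has already filled; the image goes to the split with the minimum penalty and its counts are subtracted from that split's remaining targets:

``` python
import numpy as np

def compute_penalty(counts, remaining):
    """Penalty for adding an image with these per-combination counts to a split."""
    p = 0.0
    for idx_comb, v in counts.items():
        if remaining[idx_comb] <= 0:
            p += 1  # this combination is already full in the split
        else:
            p += max(0.0, v / remaining[idx_comb] - 1.0)
    return p

def assign(counts_all, init_scores, target_ins):
    by_splits = {sname: [] for sname, _ in target_ins}
    for idx_img in sorted(init_scores, key=init_scores.get, reverse=True):
        counts = counts_all[idx_img]
        penalties = [compute_penalty(counts, remaining) for _, remaining in target_ins]
        sname, remaining = target_ins[int(np.argmin(penalties))]
        by_splits[sname].append(idx_img)
        for idx_comb, v in counts.items():  # consume the split's remaining targets
            remaining[idx_comb] -= v
    return by_splits

# Two combinations; "train" should get ~8 and ~1 instances, "val" ~2 and ~1.
target_ins = [("train", np.array([8.0, 1.0])), ("val", np.array([2.0, 1.0]))]
counts_all = {0: {0: 3}, 1: {0: 1, 1: 1}, 2: {0: 6}}
init_scores = {0: 0.3, 1: 0.6, 2: 0.6}
print(assign(counts_all, init_scores, target_ins))
# {'train': [1, 2], 'val': [0]} with these inputs
```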

10 changes: 4 additions & 6 deletions docs/user_manual.md
@@ -1037,8 +1037,8 @@ Example: split a dataset randomly to `train` and `test` subsets, ratio is 2:1
datum transform -t random_split -- --subset train:.67 --subset test:.33
```

Example: split a dataset in task-specific manner. Supported tasks are
classification, detection, re-identification and segmentation.
Example: split a dataset in a task-specific manner. The supported tasks are
classification, detection, segmentation and re-identification.

``` bash
datum transform -t split -- \
@@ -1081,9 +1081,7 @@ datum transform -t rename -- -e '|pattern|replacement|'
datum transform -t rename -- -e '|frame_(\d+)|\\1|'
```

Example: Sampling dataset items, subset `train` is divided into `sampled`(sampled_subset) and `unsampled`
- `train` has 100 data, and 20 samples are selected. There are `sampled`(20 samples) and 80 `unsampled`(80 datas) subsets.
- Remove `train` subset (if sampled_subset=`train` or unsampled_name=`train`, still remain)
Example: sample a target number of dataset items using a sampling method chosen by the user, dividing the subset into `sampled` and `unsampled`
- There are five sampling methods for the `m` option.
- `topk`: return the k items with the highest uncertainty
- `lowk`: return the k items with the lowest uncertainty
@@ -1101,7 +1099,7 @@ datum transform -t sampler -- \
-k 20
```

Example : Control number of outputs to 100 after NDR
Example: control the number of outputs to 100 after NDR
- There are two methods for the NDR `e` option
- `random`: sample from removed data randomly
- `similarity`: sample from removed data with ascending