diff --git a/docs/COCO.md b/docs/COCO.md index db7dc3117..647292a69 100644 --- a/docs/COCO.md +++ b/docs/COCO.md @@ -172,3 +172,35 @@ subsampled_coco = coco.get_subsampled_coco(subsample_ratio=10) # export subsampled COCO dataset save_json(subsampled_coco.json, "subsampled_coco.json") ``` + +## Get dataset stats: + +```python +from sahi.utils.coco import Coco + +# init Coco object +coco = Coco.from_coco_dict_or_path("coco.json") + +# get dataset stats +coco.stats +{ + 'avg_annotation_area': 2448.405738278109, + 'avg_num_annotations_in_image': 53.037243084530985, + 'max_annotation_area': 328640, + 'max_num_annotations_in_image': 902, + 'min_annotation_area': 3, + 'min_num_annotations_in_image': 1, + 'num_annotations': 343204, + 'num_annotations_per_category': { + 'human': 106396, + 'vehicle': 236808 + }, + 'num_categories': 2, + 'num_images': 6471, + 'num_images_per_category': { + 'human': 5684, + 'vehicle': 6323 + } +} + +``` diff --git a/sahi/utils/coco.py b/sahi/utils/coco.py index 7a36f42c5..601eb8f1d 100644 --- a/sahi/utils/coco.py +++ b/sahi/utils/coco.py @@ -4,7 +4,7 @@ import copy import os -from collections import OrderedDict, defaultdict +from collections import Counter, OrderedDict, defaultdict from dataclasses import dataclass from multiprocessing import Pool from pathlib import Path @@ -737,7 +737,7 @@ def __repr__(self): class Coco: - def __init__(self, name=None, image_dir=None, remapping_dict=None): + def __init__(self, name=None, image_dir=None, remapping_dict=None, ignore_negative_samples=True): """ Creates Coco object. @@ -748,12 +748,16 @@ def __init__(self, name=None, image_dir=None, remapping_dict=None): Base file directory that contains dataset images. Required for dataset merging. remapping_dict: dict {1:0, 2:1} maps category id 1 to 0 and category id 2 to 1 + ignore_negative_samples: bool + If True, ignores images without annotations in all operations. 
""" self.name = name self.image_dir = image_dir self.remapping_dict = remapping_dict + self.ignore_negative_samples = ignore_negative_samples self.categories = [] self.images = [] + self._stats = None def add_categories_from_coco_category_list(self, coco_category_list): """ @@ -998,9 +1002,71 @@ def json(self): return create_coco_dict( images=self.images, categories=self.json_categories, - ignore_negative_samples=True, + ignore_negative_samples=self.ignore_negative_samples, ) + @property + def stats(self): + if not self._stats: + self.calculate_stats() + return self._stats + + def calculate_stats(self): + """ + Iterates over all annotations and calculates total number of + """ + num_annotations = 0 + num_images = len(self.images) + num_categories = len(self.json_categories) + category_name_to_zero = {category["name"]:0 for category in self.json_categories} + num_images_per_category = copy.deepcopy(category_name_to_zero) + num_annotations_per_category = copy.deepcopy(category_name_to_zero) + min_num_annotations_in_image = 1e10 + max_num_annotations_in_image = 0 + total_annotation_area = 0 + min_annotation_area = 1e10 + max_annotation_area = 0 + for image in self.images: + image_contains_category = {} + for annotation in image.annotations: + annotation_area = annotation.area + total_annotation_area += annotation_area + num_annotations_per_category[annotation.category_name] += 1 + image_contains_category[annotation.category_name] = 1 + # update min&max annotation area + if annotation_area>max_annotation_area: + max_annotation_area = annotation_area + if annotation_areamax_num_annotations_in_image: + max_num_annotations_in_image = num_annotations_in_image + if num_annotations_in_image