From 16fa1f9726f80f12c0d5d9b47a5d4e464ecd879a Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 30 Mar 2021 12:16:28 +0300 Subject: [PATCH 1/5] Add saving and parsing of attributes in label categories in datumaro format --- datumaro/plugins/datumaro_format/converter.py | 5 +++++ datumaro/plugins/datumaro_format/extractor.py | 6 ++++-- tests/test_datumaro_format.py | 16 ++++++++++------ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/datumaro/plugins/datumaro_format/converter.py b/datumaro/plugins/datumaro_format/converter.py index 9146fe8451..6e9de7142c 100644 --- a/datumaro/plugins/datumaro_format/converter.py +++ b/datumaro/plugins/datumaro_format/converter.py @@ -181,14 +181,19 @@ def _convert_caption_object(self, obj): }) return converted + def _convert_attribute_categories(self, attributes): + return sorted(attributes) + def _convert_label_categories(self, obj): converted = { 'labels': [], + 'attributes': self._convert_attribute_categories(obj.attributes), } for label in obj.items: converted['labels'].append({ 'name': cast(label.name, str), 'parent': cast(label.parent, str), + 'attributes': self._convert_attribute_categories(label.attributes), }) return converted diff --git a/datumaro/plugins/datumaro_format/extractor.py b/datumaro/plugins/datumaro_format/extractor.py index 7d92399b94..24179ef4e8 100644 --- a/datumaro/plugins/datumaro_format/extractor.py +++ b/datumaro/plugins/datumaro_format/extractor.py @@ -39,9 +39,11 @@ def _load_categories(parsed): parsed_label_cat = parsed['categories'].get(AnnotationType.label.name) if parsed_label_cat: - label_categories = LabelCategories() + label_categories = LabelCategories( + attributes=parsed_label_cat.get('attributes', [])) for item in parsed_label_cat['labels']: - label_categories.add(item['name'], parent=item['parent']) + label_categories.add(item['name'], parent=item['parent'], + attributes=item.get('attributes', [])) categories[AnnotationType.label] = label_categories diff --git a/tests/test_datumaro_format.py b/tests/test_datumaro_format.py index 047950e6d3..b3d3a950b2 100644 --- a/tests/test_datumaro_format.py +++ b/tests/test_datumaro_format.py @@ -3,7 +3,6 @@ import os.path as osp import numpy as np - from unittest import TestCase from datumaro.components.project import Dataset from datumaro.components.extractor import (DatasetItem, @@ -29,9 +28,9 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, @property def test_dataset(self): - label_categories = LabelCategories() + label_categories = LabelCategories(attributes={'a', 'b', 'score'}) for i in range(5): - label_categories.add('cat' + str(i)) + label_categories.add('cat' + str(i), attributes={'x', 'y'}) mask_categories = MaskCategories( generate_colormap(len(label_categories.items))) @@ -52,9 +51,14 @@ def test_dataset(self): Bbox(1, 2, 3, 4, label=4, id=4, z_order=1, attributes={ 'score': 1.0, }), - Bbox(5, 6, 7, 8, id=5, group=5), - Points([1, 2, 2, 0, 1, 1], label=0, id=5, z_order=4), - Mask(label=3, id=5, z_order=2, image=np.ones((2, 3))), + Bbox(5, 6, 7, 8, id=5, group=5, attributes={ + 'a': 1.5, + 'b': 'text', + }), + Points([1, 2, 2, 0, 1, 1], label=0, id=5, z_order=4, + attributes={ 'x': 1, 'y': '2', }), + Mask(label=3, id=5, z_order=2, image=np.ones((2, 3)), + attributes={ 'x': 1, 'y': '2', }), ]), DatasetItem(id=21, subset='train', annotations=[ From 3dda35887232d4bd74270e2a4344e06c4471970b Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 30 Mar 2021 12:18:01 +0300 Subject: [PATCH 2/5] Support common label attributes in cvat format, add an option to ignore undeclared attrs --- datumaro/plugins/cvat_format/converter.py | 60 ++++++++++++++++++----- datumaro/plugins/cvat_format/format.py | 2 + tests/test_cvat_format.py | 51 +++++++++++++------ 3 files changed, 87 insertions(+), 26 deletions(-) diff --git a/datumaro/plugins/cvat_format/converter.py b/datumaro/plugins/cvat_format/converter.py index c0611e7dbb..5a9f64469b 100644 --- a/datumaro/plugins/cvat_format/converter.py +++ b/datumaro/plugins/cvat_format/converter.py @@ -7,6 +7,7 @@ import os import os.path as osp from collections import OrderedDict +from itertools import chain from xml.sax.saxutils import XMLGenerator from datumaro.components.converter import Converter @@ -182,9 +183,9 @@ def _write_item(self, item, index): for ann in item.annotations: if ann.type in {AnnotationType.points, AnnotationType.polyline, AnnotationType.polygon, AnnotationType.bbox}: - self._write_shape(ann) + self._write_shape(ann, item) elif ann.type == AnnotationType.label: - self._write_tag(ann) + self._write_tag(ann, item) else: continue @@ -215,7 +216,7 @@ def _write_meta(self): ("input_type", "text"), ("default_value", ""), ("values", ""), - ])) for attr in label.attributes + ])) for attr in self._get_label_attrs(label) ]) ])) for label in label_cat.items ]), @@ -226,15 +227,27 @@ def _write_meta(self): def _get_label(self, label_id): if label_id is None: return "" - label_cat = self._extractor.categories()[AnnotationType.label] + label_cat = self._extractor.categories().get( + AnnotationType.label, LabelCategories()) return label_cat.items[label_id] - def _write_shape(self, shape): + def _get_label_attrs(self, label): + label_cat = self._extractor.categories().get( + AnnotationType.label, LabelCategories()) + if isinstance(label, int): + label = label_cat[label] + return set(chain(label.attributes, label_cat.attributes)) - \ + self._context._builtin_attrs + + def _write_shape(self, shape, item): if shape.label is None: + log.warning("Item %s: skipping a %s with no label", + item.id, shape.type.name) return + label_name = self._get_label(shape.label).name shape_data = OrderedDict([ - ("label", self._get_label(shape.label).name), + ("label", label_name), ("occluded", str(int(shape.attributes.get('occluded', False)))), ]) @@ -271,13 +284,21 @@ def _write_shape(self, shape): raise NotImplementedError("unknown shape type") for attr_name, attr_value in shape.attributes.items(): + if attr_name in self._context._builtin_attrs: + continue if isinstance(attr_value, bool): attr_value = 'true' if attr_value else 'false' - if attr_name in self._get_label(shape.label).attributes: + if self._context._allow_undeclared_attrs or \ + attr_name in self._get_label_attrs(shape.label): self._writer.add_attribute(OrderedDict([ ("name", str(attr_name)), ("value", str(attr_value)), ])) + else: + log.warning("Item %s: skipping undeclared " + "attribute '%s' for label '%s' " + "(allow with --allow-undeclared-attrs option)", + item.id, attr_name, label_name) if shape.type == AnnotationType.bbox: self._writer.close_box() @@ -290,25 +311,36 @@ def _write_shape(self, shape): else: raise NotImplementedError("unknown shape type") - def _write_tag(self, label): + def _write_tag(self, label, item): if label.label is None: + log.warning("Item %s: skipping a %s with no label", + item.id, label.type.name) return + label_name = self._get_label(label.label).name tag_data = OrderedDict([ - ('label', self._get_label(label.label).name), + ('label', label_name), ]) if label.group: tag_data['group_id'] = str(label.group) self._writer.open_tag(tag_data) for attr_name, attr_value in label.attributes.items(): + if attr_name in self._context._builtin_attrs: + continue if isinstance(attr_value, bool): attr_value = 'true' if attr_value else 'false' - if attr_name in self._get_label(label.label).attributes: + if self._context._allow_undeclared_attrs or \ + attr_name in self._get_label_attrs(label.label): self._writer.add_attribute(OrderedDict([ ("name", str(attr_name)), ("value", str(attr_value)), ])) + else: + log.warning("Item %s: skipping undeclared " + "attribute '%s' for label '%s' " + "(allow with --allow-undeclared-attrs option)", + item.id, attr_name, label_name) self._writer.close_tag() @@ -320,12 +352,18 @@ def build_cmdline_parser(cls, **kwargs): parser = super().build_cmdline_parser(**kwargs) parser.add_argument('--reindex', action='store_true', help="Assign new indices to frames (default: %(default)s)") + parser.add_argument('--allow-undeclared-attrs', action='store_true', + help="Write annotation attributes even if they are not present in " + "the input dataset metainfo (default: %(default)s)") return parser - def __init__(self, extractor, save_dir, reindex=False, **kwargs): + def __init__(self, extractor, save_dir, reindex=False, + allow_undeclared_attrs=False, **kwargs): super().__init__(extractor, save_dir, **kwargs) self._reindex = reindex + self._builtin_attrs = CvatPath.BUILTIN_ATTRS + self._allow_undeclared_attrs = allow_undeclared_attrs def apply(self): self._images_dir = osp.join(self._save_dir, CvatPath.IMAGES_DIR) diff --git a/datumaro/plugins/cvat_format/format.py b/datumaro/plugins/cvat_format/format.py index e5572a89be..9f16ba9838 100644 --- a/datumaro/plugins/cvat_format/format.py +++ b/datumaro/plugins/cvat_format/format.py @@ -7,3 +7,5 @@ class CvatPath: IMAGES_DIR = 'images' IMAGE_EXT = '.jpg' + + BUILTIN_ATTRS = {'occluded', 'outside', 'keyframe', 'track_id'} \ No newline at end of file diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py index ced0930f04..d85c66dea5 100644 --- a/tests/test_cvat_format.py +++ b/tests/test_cvat_format.py @@ -1,8 +1,8 @@ from functools import partial -import numpy as np import os import os.path as osp +import numpy as np from unittest import TestCase from datumaro.components.project import Dataset from datumaro.components.extractor import (DatasetItem, @@ -148,18 +148,17 @@ def _test_save_and_load(self, source_dataset, converter, test_dir, target_dataset=target_dataset, importer_args=importer_args, **kwargs) def test_can_save_and_load(self): - label_categories = LabelCategories() + src_label_cat = LabelCategories(attributes={'occluded', 'common'}) for i in range(10): - label_categories.add(str(i)) - label_categories.items[2].attributes.update(['a1', 'a2', 'empty']) - label_categories.attributes.update(['occluded']) + src_label_cat.add(str(i)) + src_label_cat.items[2].attributes.update(['a1', 'a2', 'empty']) source_dataset = Dataset.from_iterable([ DatasetItem(id=0, subset='s1', image=np.zeros((5, 10, 3)), annotations=[ Polygon([0, 0, 4, 0, 4, 4], label=1, group=4, - attributes={ 'occluded': True}), + attributes={ 'occluded': True, 'common': 't' }), Points([1, 1, 3, 2, 2, 3], label=2, attributes={ 'a1': 'x', 'a2': 42, 'empty': '', @@ -188,16 +187,19 @@ def test_can_save_and_load(self): DatasetItem(id=3, subset='s3', image=Image( path='3.jpg', size=(2, 4))), - ], categories={ - AnnotationType.label: label_categories, - }) + ], categories={ AnnotationType.label: src_label_cat }) + target_label_cat = LabelCategories( + attributes={'occluded'}) # unable to represent a common attribute + for i in range(10): + target_label_cat.add(str(i), attributes={'common'}) + target_label_cat.items[2].attributes.update(['a1', 'a2', 'empty', 'common']) target_dataset = Dataset.from_iterable([ DatasetItem(id=0, subset='s1', image=np.zeros((5, 10, 3)), annotations=[ Polygon([0, 0, 4, 0, 4, 4], label=1, group=4, - attributes={ 'occluded': True }), + attributes={ 'occluded': True, 'common': 't' }), Points([1, 1, 3, 2, 2, 3], label=2, attributes={ 'occluded': False, 'empty': '', @@ -228,15 +230,35 @@ def test_can_save_and_load(self): DatasetItem(id=3, subset='s3', image=Image( path='3.jpg', size=(2, 4)), attributes={'frame': 0}), - ], categories={ - AnnotationType.label: label_categories, - }) + ], categories={ AnnotationType.label: target_label_cat }) with TestDir() as test_dir: self._test_save_and_load(source_dataset, partial(CvatConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + def test_can_allow_undeclared_attrs(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id=0, annotations=[ + Label(2, attributes={ 'x': 4, 'y': 2 }), + Bbox(1, 2, 3, 4, label=1, attributes={ 'x': 1, 'y': 1 }), + ]), + ], categories=[{'name': 'a', 'attributes': {'x'}}]) + + target_label_cat = LabelCategories(attributes={'occluded'}) + target_label_cat.add('a', attributes={'x'}) + target_dataset = Dataset.from_iterable([ + DatasetItem(id=0, annotations=[ + Label(2, attributes={ 'x': 4, 'y': 2 }), + Bbox(1, 2, 3, 4, label=1, attributes={ 'x': 1, 'y': 1 }), + ]), + ], categories={ AnnotationType.label: target_label_cat }) + + with TestDir() as test_dir: + self._test_save_and_load(source_dataset, + partial(CvatConverter.convert, allow_undeclared_attrs=True), + test_dir, target_dataset=target_dataset) + def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), @@ -259,11 +281,10 @@ def test_relative_paths(self): target_dataset=target_dataset, require_images=True) def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): - label_categories = LabelCategories() + label_categories = LabelCategories(attributes={'occluded'}) for i in range(10): label_categories.add(str(i)) label_categories.items[2].attributes.update(['a1', 'a2', 'empty']) - label_categories.attributes.update(['occluded']) source_dataset = Dataset.from_iterable([ DatasetItem(id='кириллица с пробелом', From 7a681b77ab5f8df512def689f57a5d91bcfe79cc Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 30 Mar 2021 12:24:05 +0300 Subject: [PATCH 3/5] Add logging for parsed parameters in plugins --- datumaro/components/cli_plugin.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datumaro/components/cli_plugin.py b/datumaro/components/cli_plugin.py index 0346cddcea..702158aa70 100644 --- a/datumaro/components/cli_plugin.py +++ b/datumaro/components/cli_plugin.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT import argparse +import logging as log from datumaro.cli.util import MultilineFormatter from datumaro.util import to_snake_case @@ -36,7 +37,12 @@ def parse_cmdline(cls, args=None): args = args[1:] parser = cls.build_cmdline_parser() args = parser.parse_args(args) - return vars(args) + args = vars(args) + + log.debug("Parsed parameters: \n\t%s", + '\n\t'.join('%s: %s' % (k, v) for k, v in args.items())) + + return args def remove_plugin_type(s): for t in {'transform', 'extractor', 'converter', 'launcher', 'importer'}: From d30e82c3ccbb250726619919c3e480192263d849 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 30 Mar 2021 13:24:21 +0300 Subject: [PATCH 4/5] fix test --- tests/test_cvat_format.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py index d85c66dea5..5b2c60e130 100644 --- a/tests/test_cvat_format.py +++ b/tests/test_cvat_format.py @@ -240,18 +240,19 @@ def test_can_save_and_load(self): def test_can_allow_undeclared_attrs(self): source_dataset = Dataset.from_iterable([ DatasetItem(id=0, annotations=[ - Label(2, attributes={ 'x': 4, 'y': 2 }), - Bbox(1, 2, 3, 4, label=1, attributes={ 'x': 1, 'y': 1 }), + Label(0, attributes={ 'x': 4, 'y': 2 }), + Bbox(1, 2, 3, 4, label=0, attributes={ 'x': 1, 'y': 1 }), ]), - ], categories=[{'name': 'a', 'attributes': {'x'}}]) + ], categories=[ ('a', '', {'x'}) ]) target_label_cat = LabelCategories(attributes={'occluded'}) target_label_cat.add('a', attributes={'x'}) target_dataset = Dataset.from_iterable([ DatasetItem(id=0, annotations=[ - Label(2, attributes={ 'x': 4, 'y': 2 }), - Bbox(1, 2, 3, 4, label=1, attributes={ 'x': 1, 'y': 1 }), - ]), + Label(0, attributes={ 'x': 4, 'y': 2 }), + Bbox(1, 2, 3, 4, label=0, + attributes={ 'x': 1, 'y': 1, 'occluded': False }), + ], attributes={'frame': 0}), ], categories={ AnnotationType.label: target_label_cat }) with TestDir() as test_dir: From a95561c6c2531abe0e15765ce84adfa5dae2bd30 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 30 Mar 2021 16:46:56 +0300 Subject: [PATCH 5/5] update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77ac97eb79..e77317e1d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Changed -- +- Added an option to allow undeclared annotation attributes in CVAT format export () ### Deprecated - @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Fixed -- +- Added support for label attributes in Datumaro format () ### Security -