diff --git a/.vscode/python.env b/.vscode/python.env new file mode 100644 index 000000000000..a624ab1397b6 --- /dev/null +++ b/.vscode/python.env @@ -0,0 +1 @@ +PYTHONPATH="datumaro/:$PYTHONPATH" \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index f83561156008..209c01e6b26c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -24,5 +24,5 @@ "changeProcessCWD": true } ], - "python.linting.pylintEnabled": true + "python.envFile": "${workspaceFolder}/.vscode/python.env" } diff --git a/datumaro/datumaro/__init__.py b/datumaro/datumaro/__init__.py index fae1306ab665..ea5ad68ed7be 100644 --- a/datumaro/datumaro/__init__.py +++ b/datumaro/datumaro/__init__.py @@ -82,7 +82,11 @@ def main(args=None): set_up_logger(general_args) command = get_command(command_name, general_args) - return command(command_args) + try: + return command(command_args) + except Exception as e: + log.error(e) + raise if __name__ == '__main__': diff --git a/datumaro/datumaro/cli/project/__init__.py b/datumaro/datumaro/cli/project/__init__.py index f3ed7589c6ad..bd43a72db51c 100644 --- a/datumaro/datumaro/cli/project/__init__.py +++ b/datumaro/datumaro/cli/project/__init__.py @@ -26,8 +26,16 @@ def build_create_parser(parser): def create_command(args): project_dir = osp.abspath(args.dst_dir) project_path = make_project_path(project_dir) + + if not args.overwrite and osp.isdir(project_dir) and os.listdir(project_dir): + log.error("Directory '%s' already exists " + "(pass --overwrite to force creation)" % project_dir) + return 1 + os.makedirs(project_dir, exist_ok=args.overwrite) + if not args.overwrite and osp.isfile(project_path): - log.error("Project file '%s' already exists" % (project_path)) + log.error("Project file '%s' already exists " + "(pass --overwrite to force creation)" % project_path) return 1 project_name = args.name @@ -59,7 +67,7 @@ def build_import_parser(parser): parser.add_argument('--overwrite', action='store_true', help="Overwrite existing files in the save directory") parser.add_argument('--copy', action='store_true', - help="Make a deep copy instead of saving source links") + help="Copy the dataset instead of saving source links") # parser.add_argument('extra_args', nargs=argparse.REMAINDER, # help="Additional arguments for importer (pass '-- -h' for help)") return parser @@ -67,8 +75,16 @@ def build_import_parser(parser): def import_command(args): project_dir = osp.abspath(args.dst_dir) project_path = make_project_path(project_dir) + + if not args.overwrite and osp.isdir(project_dir) and os.listdir(project_dir): + log.error("Directory '%s' already exists " + "(pass --overwrite to force creation)" % project_dir) + return 1 + os.makedirs(project_dir, exist_ok=args.overwrite) + if not args.overwrite and osp.isfile(project_path): - log.error("Project file '%s' already exists" % (project_path)) + log.error("Project file '%s' already exists " + "(pass --overwrite to force creation)" % project_path) return 1 project_name = args.name diff --git a/datumaro/datumaro/cli/source/__init__.py b/datumaro/datumaro/cli/source/__init__.py index cd17e2e89caf..605d222b2131 100644 --- a/datumaro/datumaro/cli/source/__init__.py +++ b/datumaro/datumaro/cli/source/__init__.py @@ -26,7 +26,7 @@ def create_command(args): name = args.name if project.env.git.has_submodule(name): - log.fatal("Source '%s' already exists" % (name)) + log.fatal("Submodule '%s' already exists" % (name)) return 1 try: @@ -171,12 +171,21 @@ def remove_command(args): def build_export_parser(parser): parser.add_argument('-n', '--name', required=True, help="Source dataset to be extracted") + parser.add_argument('-e', '--filter', default=None, + help="Filter expression for dataset items. Examples: " + "extract images with width < height: " + "'/item[image/width < image/height]'; " + "extract images with large-area bboxes: " + "'/item[annotation/type=\"bbox\" and annotation/area>2000]'" + ) parser.add_argument('-d', '--dest', dest='dst_dir', required=True, help="Directory to save output") parser.add_argument('-f', '--output-format', required=True, - help="Output format (default: %(default)s)") + help="Output format") parser.add_argument('-p', '--project', dest='project_dir', default='.', help="Directory of the project to operate on (default: current dir)") + parser.add_argument('extra_args', nargs=argparse.REMAINDER, default=None, + help="Additional arguments for converter (pass '-- -h' for help)") return parser def export_command(args): @@ -187,8 +196,10 @@ def export_command(args): source_project = project.make_source_project(args.name) source_project.make_dataset().export( - save_dir=args.dst_dir, - output_format=args.output_format) + save_dir=dst_dir, + output_format=args.output_format, + filter_expr=args.filter, + cmdline_args=args.extra_args) log.info("Source '%s' exported to '%s' as '%s'" % \ (args.name, dst_dir, args.output_format)) diff --git a/datumaro/datumaro/components/converters/ms_coco.py b/datumaro/datumaro/components/converters/ms_coco.py index 3354f2f09f3d..f629f72dabc9 100644 --- a/datumaro/datumaro/components/converters/ms_coco.py +++ b/datumaro/datumaro/components/converters/ms_coco.py @@ -71,7 +71,7 @@ def save_image_info(self, item, filename): 'id': _cast(item.id, int, 0), 'width': int(w), 'height': int(h), - 'file_name': filename, + 'file_name': _cast(filename, str, ''), 'license': 0, 'flickr_url': '', 'coco_url': '', @@ -117,8 +117,8 @@ def save_categories(self, dataset): for idx, cat in enumerate(label_categories.items): self.categories.append({ 'id': 1 + idx, - 'name': cat.name, - 'supercategory': cat.parent, + 'name': _cast(cat.name, str, ''), + 'supercategory': _cast(cat.parent, str, ''), }) def save_annotations(self, item): @@ -282,8 +282,8 @@ def save_categories(self, dataset): cat = { 'id': 1 + idx, - 'name': label_cat.name, - 'supercategory': label_cat.parent, + 'name': _cast(label_cat.name, str, ''), + 'supercategory': _cast(label_cat.parent, str, ''), 'keypoints': [str(l) for l in kp_cat.labels], 'skeleton': [int(i) for i in kp_cat.adjacent], } @@ -339,8 +339,8 @@ def save_categories(self, dataset): for idx, cat in enumerate(label_categories.items): self.categories.append({ 'id': 1 + idx, - 'name': cat.name, - 'supercategory': cat.parent, + 'name': _cast(cat.name, str, ''), + 'supercategory': _cast(cat.parent, str, ''), }) def save_annotations(self, item): diff --git a/datumaro/datumaro/components/converters/voc.py b/datumaro/datumaro/components/converters/voc.py index 2c76be81cca3..d8ba98cf37ab 100644 --- a/datumaro/datumaro/components/converters/voc.py +++ b/datumaro/datumaro/components/converters/voc.py @@ -4,9 +4,10 @@ # SPDX-License-Identifier: MIT from collections import OrderedDict, defaultdict +import logging as log +from lxml import etree as ET import os import os.path as osp -from lxml import etree as ET from datumaro.components.converter import Converter from datumaro.components.extractor import DEFAULT_SUBSET_NAME, AnnotationType @@ -143,6 +144,11 @@ def save_subsets(self): ET.SubElement(root_elem, 'filename').text = \ item_id + VocPath.IMAGE_EXT + source_elem = ET.SubElement(root_elem, 'source') + ET.SubElement(source_elem, 'database').text = 'Unknown' + ET.SubElement(source_elem, 'annotation').text = 'Unknown' + ET.SubElement(source_elem, 'image').text = 'Unknown' + if item.has_image: h, w, c = item.image.shape size_elem = ET.SubElement(root_elem, 'size') @@ -151,8 +157,8 @@ def save_subsets(self): ET.SubElement(size_elem, 'depth').text = str(c) item_segmented = 0 < len(masks) - if item_segmented: - ET.SubElement(root_elem, 'segmented').text = '1' + ET.SubElement(root_elem, 'segmented').text = \ + str(int(item_segmented)) objects_with_parts = [] objects_with_actions = defaultdict(dict) @@ -296,6 +302,12 @@ def save_class_lists(self, subset_name, class_lists): if len(class_lists) == 0: return + label_cat = self._extractor.categories().get(AnnotationType.label, None) + if not label_cat: + log.warn("Unable to save classification task lists " + "as source does not provide class labels. Skipped.") + return + for label in VocLabel: ann_file = osp.join(self._cls_subsets_dir, '%s_%s.txt' % (label.name, subset_name)) @@ -303,7 +315,8 @@ def save_class_lists(self, subset_name, class_lists): for item, item_labels in class_lists.items(): if not item_labels: continue - presented = label.value in item_labels + item_labels = [label_cat.items[l].name for l in item_labels] + presented = label.name in item_labels f.write('%s % d\n' % \ (item, 1 if presented else -1)) diff --git a/datumaro/datumaro/components/extractors/voc.py b/datumaro/datumaro/components/extractors/voc.py index 8aa202d5c3fd..fdbe7b37d178 100644 --- a/datumaro/datumaro/components/extractors/voc.py +++ b/datumaro/datumaro/components/extractors/voc.py @@ -13,8 +13,10 @@ AnnotationType, LabelObject, MaskObject, BboxObject, LabelCategories, MaskCategories ) -from datumaro.components.formats.voc import VocLabel, VocAction, \ - VocBodyPart, VocTask, VocPath, VocColormap, VocInstColormap +from datumaro.components.formats.voc import (VocLabel, VocAction, + VocBodyPart, VocTask, VocPath, VocColormap, VocInstColormap, + VocIgnoredLabel +) from datumaro.util import dir_items from datumaro.util.image import lazy_image from datumaro.util.mask_tools import lazy_mask, invert_colormap @@ -32,13 +34,16 @@ def _make_voc_categories(): categories[AnnotationType.label] = label_categories def label_id(class_index): + if class_index in [0, VocIgnoredLabel]: + return class_index + class_label = VocLabel(class_index).name label_id, _ = label_categories.find(class_label) - return label_id + return label_id + 1 colormap = { label_id(idx): tuple(color) \ for idx, color in VocColormap.items() } mask_categories = MaskCategories(colormap) - mask_categories.inverse_colormap # init inverse colormap + mask_categories.inverse_colormap # force init categories[AnnotationType.mask] = mask_categories return categories diff --git a/datumaro/datumaro/components/formats/voc.py b/datumaro/datumaro/components/formats/voc.py index b7acaffde6a4..5a9652906a4c 100644 --- a/datumaro/datumaro/components/formats/voc.py +++ b/datumaro/datumaro/components/formats/voc.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: MIT +from collections import OrderedDict from enum import Enum import numpy as np @@ -16,28 +17,30 @@ ]) VocLabel = Enum('VocLabel', [ - ('aeroplane', 0), - ('bicycle', 1), - ('bird', 2), - ('boat', 3), - ('bottle', 4), - ('bus', 5), - ('car', 6), - ('cat', 7), - ('chair', 8), - ('cow', 9), - ('diningtable', 10), - ('dog', 11), - ('horse', 12), - ('motorbike', 13), - ('person', 14), - ('pottedplant', 15), - ('sheep', 16), - ('sofa', 17), - ('train', 18), - ('tvmonitor', 19), + ('aeroplane', 1), + ('bicycle', 2), + ('bird', 3), + ('boat', 4), + ('bottle', 5), + ('bus', 6), + ('car', 7), + ('cat', 8), + ('chair', 9), + ('cow', 10), + ('diningtable', 11), + ('dog', 12), + ('horse', 13), + ('motorbike', 14), + ('person', 15), + ('pottedplant', 16), + ('sheep', 17), + ('sofa', 18), + ('train', 19), + ('tvmonitor', 20), ]) +VocIgnoredLabel = 255 + VocPose = Enum('VocPose', [ 'Unspecified', 'Left', @@ -78,11 +81,12 @@ def get_bit(number, index): colormap[:, c] |= get_bit(indices, c) << j indices >>= 3 - return { - id: tuple(color) for id, color in enumerate(colormap) - } + return OrderedDict( + (id, tuple(color)) for id, color in enumerate(colormap) + ) -VocColormap = generate_colormap(len(VocLabel)) +VocColormap = {id: color for id, color in generate_colormap(256).items() + if id in [l.value for l in VocLabel] + [0, VocIgnoredLabel]} VocInstColormap = generate_colormap(256) class VocPath: diff --git a/datumaro/datumaro/components/project.py b/datumaro/datumaro/components/project.py index 0bce665ee0b4..a648f461e593 100644 --- a/datumaro/datumaro/components/project.py +++ b/datumaro/datumaro/components/project.py @@ -652,7 +652,10 @@ def remove_source(self, name): self.env.sources.unregister(name) def get_source(self, name): - return self.config.sources[name] + try: + return self.config.sources[name] + except KeyError: + raise KeyError("Source '%s' is not found" % name) def get_subsets(self): return self.config.subsets @@ -669,7 +672,10 @@ def add_model(self, name, value=Model()): self.env.register_model(name, value) def get_model(self, name): - return self.env.models.get(name) + try: + return self.env.models.get(name) + except KeyError: + raise KeyError("Model '%s' is not found" % name) def remove_model(self, name): self.env.unregister_model(name) diff --git a/datumaro/datumaro/util/image.py b/datumaro/datumaro/util/image.py index 02364de90eaf..2785f9e14b93 100644 --- a/datumaro/datumaro/util/image.py +++ b/datumaro/datumaro/util/image.py @@ -47,6 +47,10 @@ def load_image(path): def save_image(path, image, params=None): if _IMAGE_BACKEND == _IMAGE_BACKENDS.cv2: import cv2 + ext = path[-4:] + if ext.upper() == '.JPG': + params = [ int(cv2.IMWRITE_JPEG_QUALITY), 75 ] + image = image.astype(np.uint8) cv2.imwrite(path, image, params=params) elif _IMAGE_BACKEND == _IMAGE_BACKENDS.PIL: @@ -73,6 +77,7 @@ def encode_image(image, ext, params=None): if ext.upper() == '.JPG': params = [ int(cv2.IMWRITE_JPEG_QUALITY), 75 ] + image = image.astype(np.uint8) success, result = cv2.imencode(ext, image, params=params) if not success: raise Exception("Failed to encode image to '%s' format" % (ext)) @@ -149,4 +154,4 @@ def _get_cache(self): cache = _ImageCache.get_instance() elif cache == False: return None - return cache \ No newline at end of file + return cache diff --git a/datumaro/tests/test_voc_format.py b/datumaro/tests/test_voc_format.py index 841ce7264afb..fac5c27e37ef 100644 --- a/datumaro/tests/test_voc_format.py +++ b/datumaro/tests/test_voc_format.py @@ -34,7 +34,7 @@ class VocTest(TestCase): def test_colormap_generator(self): - reference = [ + reference = np.array([ [ 0, 0, 0], [128, 0, 0], [ 0, 128, 0], @@ -55,7 +55,9 @@ def test_colormap_generator(self): [128, 64, 0], [ 0, 192, 0], [128, 192, 0], - ] + [ 0, 64, 128], + [224, 224, 192], # ignored + ]) self.assertTrue(np.array_equal(reference, list(VOC.VocColormap.values()))) @@ -192,7 +194,7 @@ def test_can_load_voc_cls(self): count += 1 ann = find(item.annotations, lambda x: x.type == AnnotationType.label and \ - x.label == label.value) + get_label(extractor, x.label) == label.name) self.assertFalse(ann is None) self.assertEqual(count, len(item.annotations))