From 3e3bdeba229579734cb2241e1ac1b86376c1d17f Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Fri, 5 Mar 2021 15:46:12 +0300 Subject: [PATCH 01/16] Fix CamVid --- datumaro/plugins/camvid_format.py | 12 ++++++++++-- tests/test_camvid_format.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index ace780148b..2a3c9aaf6c 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -7,7 +7,6 @@ import os.path as osp from collections import OrderedDict from enum import Enum -from glob import glob import numpy as np from datumaro.components.converter import Converter @@ -157,6 +156,15 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: objects = line.split() + if 2 < len(objects): + if len(objects) % 2: + raise Exception("Line %s: image and gt file must have " + "the same name" % line) + else: + mid = int(len(objects) / 2) + objects[0] = ' '.join(objects[i] for i in range(mid)) + objects[1] = ' '.join(objects[i] for i in range(mid, 2 * mid)) + objects = objects[:2] image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, @@ -262,7 +270,7 @@ def save_segm_lists(self, subset_name, segm_list): return ann_file = osp.join(self._save_dir, subset_name + '.txt') - with open(ann_file, 'w') as f: + with open(ann_file, 'w' , encoding='utf-8') as f: for item in segm_list: if segm_list[item]: path_mask = '/%s/%s' % (subset_name + CamvidPath.SEGM_DIR, diff --git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 85e0b6e7d9..1a1c17d429 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -145,6 +145,22 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(CamvidConverter.convert, label_map='camvid'), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='кириллица в имени файла', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0), + Mask(image=np.array([[0, 1, 1, 0, 1]]), label=3), + ] + ), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CamvidConverter.convert, label_map='camvid'), test_dir) + def test_can_save_with_no_masks(self): class TestExtractor(TestExtractorBase): def __iter__(self): From 382e7019ebad0bd9048bf73ccb9b2dba5b120b18 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 11:08:09 +0300 Subject: [PATCH 02/16] add test and fixes to other formats --- datumaro/plugins/imagenet_format.py | 2 +- datumaro/plugins/imagenet_txt_format.py | 20 +++++++--- datumaro/plugins/lfw_format.py | 46 +++++++++++++++++------ datumaro/plugins/vgg_face2_format.py | 8 ++-- datumaro/plugins/voc_format/converter.py | 16 ++++---- datumaro/plugins/voc_format/extractor.py | 10 +++-- datumaro/plugins/widerface_format.py | 4 +- datumaro/plugins/yolo_format/converter.py | 8 ++-- datumaro/plugins/yolo_format/extractor.py | 10 ++--- tests/test_camvid_format.py | 2 +- tests/test_coco_format.py | 10 +++++ tests/test_cvat_format.py | 34 +++++++++++++++++ tests/test_datumaro_format.py | 9 +++++ tests/test_icdar_format.py | 12 ++++++ tests/test_image_dir_format.py | 11 +++++- tests/test_imagenet_format.py | 18 +++++++++ tests/test_imagenet_txt_format.py | 19 ++++++++++ tests/test_labelme_format.py | 
42 +++++++++++++++++++++ tests/test_lfw_format.py | 24 ++++++++++++ tests/test_market1501_format.py | 18 +++++++++ tests/test_mots_format.py | 13 +++++++ tests/test_tfrecord_format.py | 21 +++++++++++ tests/test_vgg_face2_format.py | 16 ++++++++ tests/test_voc_format.py | 14 +++++++ tests/test_widerface_format.py | 20 ++++++++++ tests/test_yolo_format.py | 18 +++++++++ 26 files changed, 378 insertions(+), 47 deletions(-) diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py index 9702262008..829534d700 100644 --- a/datumaro/plugins/imagenet_format.py +++ b/datumaro/plugins/imagenet_format.py @@ -1,4 +1,4 @@ - +#_*_ coding:utf-8 _*_ # Copyright (C) 2020 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py index 36ee68a7c5..f24c7e6bc1 100644 --- a/datumaro/plugins/imagenet_txt_format.py +++ b/datumaro/plugins/imagenet_txt_format.py @@ -49,9 +49,14 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - item = line.split() - item_id = item[0] - label_ids = [int(id) for id in item[1:]] + item = line.split('\"') + if len(item) == 3: + item_id = item[1] + label_ids = [int(id) for id in item[2].split()] + else: + item = line.split() + item_id = item[0] + label_ids = [int(id) for id in item[1:]] anno = [] for label in label_ids: assert 0 <= label and \ @@ -96,10 +101,13 @@ def apply(self): self._save_image(item, osp.join(self._save_dir, ImagenetTxtPath.IMAGE_DIR, self._make_image_filename(item))) - + annotation = '' + for item_id, item_labels in labels.items(): + if 1 < len(item_id.split()): + item_id = '\"' + item_id + '\"' + annotation += '%s %s\n' % (item_id, ' '.join(item_labels)) with open(annotation_file, 'w', encoding='utf-8') as f: - f.writelines(['%s %s\n' % (item_id, ' '.join(labels[item_id])) - for item_id in labels]) + f.write(annotation) labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE) with open(labels_file, 'w', encoding='utf-8') as f: diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py index 3d16a2949f..44370475f4 100644 --- a/datumaro/plugins/lfw_format.py +++ b/datumaro/plugins/lfw_format.py @@ -31,10 +31,14 @@ def _load_items(self, path): images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR) with open(path, encoding='utf-8') as f: for line in f: - pair = line.strip().split() + pair = line.strip().split('\t') if len(pair) == 3: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[0], pair[2]) + if pair[0] == '-': + image1 = pair[1] + image2 = pair[2] + else: + image1 = self.get_image_name(pair[0], pair[1]) + image2 = self.get_image_name(pair[0], pair[2]) if image1 not in items: items[image1] = DatasetItem(id=image1, subset=self._subset, image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT), @@ -47,8 +51,14 @@ def _load_items(self, path): attributes = items[image1].attributes attributes['positive_pairs'].append(image2) elif len(pair) == 4: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[2], pair[3]) + if pair[0] == '-': + image1 = pair[1] + else: + image1 = self.get_image_name(pair[0], pair[1]) + if pair[2] == '-': + image2 = pair[3] + else: + image2 = self.get_image_name(pair[2], pair[3]) if image1 not in items: items[image1] = DatasetItem(id=image1, subset=self._subset, image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT), @@ -102,17 +112,31 @@ def apply(self): 
self._save_image(item, osp.join(self._save_dir, subset_name, LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT)) - person1, num1 = LfwPath.PATTERN.search(item.id).groups() - num1 = int(num1) + search = LfwPath.PATTERN.search(item.id) + if search: + person1, num1 = search.groups() + num1 = int(num1) + else: + person1 = '-' + num1 = item.id if 'positive_pairs' in item.attributes: for pair in item.attributes['positive_pairs']: - num2 = LfwPath.PATTERN.search(pair).groups()[1] - num2 = int(num2) + search = LfwPath.PATTERN.search(pair) + if search: + num2 = search.groups()[1] + num2 = int(num2) + else: + num2 = pair positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) if 'negative_pairs' in item.attributes: for pair in item.attributes['negative_pairs']: - person2, num2 = LfwPath.PATTERN.search(pair).groups() - num2 = int(num2) + search = LfwPath.PATTERN.search(pair) + if search: + person2, num2 = search.groups() + num2 = int(num2) + else: + person2 = '-' + num2 = pair negative_pairs.append('%s\t%s\t%s\t%s' % \ (person1, num1, person2, num2)) diff --git a/datumaro/plugins/vgg_face2_format.py b/datumaro/plugins/vgg_face2_format.py index c38478193b..e4e718606e 100644 --- a/datumaro/plugins/vgg_face2_format.py +++ b/datumaro/plugins/vgg_face2_format.py @@ -68,7 +68,7 @@ def _split_item_path(path): items = {} - with open(path) as content: + with open(path, encoding='utf-8') as content: landmarks_table = list(csv.DictReader(content)) for row in landmarks_table: item_id = row['NAME_ID'] @@ -96,7 +96,7 @@ def _split_item_path(path): bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.BBOXES_FILE + self._subset + '.csv') if osp.isfile(bboxes_path): - with open(bboxes_path) as content: + with open(bboxes_path, encoding='utf-8') as content: bboxes_table = list(csv.DictReader(content)) for row in bboxes_table: item_id = row['NAME_ID'] @@ -224,7 +224,7 @@ def apply(self): landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.LANDMARKS_FILE + subset_name + '.csv') os.makedirs(osp.dirname(landmarks_path), exist_ok=True) - with open(landmarks_path, 'w', newline='') as file: + with open(landmarks_path, 'w', encoding='utf-8', newline='') as file: columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y', 'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y'] writer = csv.DictWriter(file, fieldnames=columns) @@ -235,7 +235,7 @@ def apply(self): bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.BBOXES_FILE + subset_name + '.csv') os.makedirs(osp.dirname(bboxes_path), exist_ok=True) - with open(bboxes_path, 'w', newline='') as file: + with open(bboxes_path, 'w', encoding='utf-8', newline='') as file: columns = ['NAME_ID', 'X', 'Y', 'W', 'H'] writer = csv.DictWriter(file, fieldnames=columns) writer.writeheader() diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index a022d04265..73c9925292 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -296,7 +296,7 @@ def save_subsets(self): VocTask.action_classification}: ann_path = osp.join(self._ann_dir, item.id + '.xml') os.makedirs(osp.dirname(ann_path), exist_ok=True) - with open(ann_path, 'w') as f: + with open(ann_path, 'w', encoding='utf-8') as f: f.write(ET.tostring(root_elem, encoding='unicode', pretty_print=True)) @@ -350,7 +350,7 @@ def save_subsets(self): @staticmethod def _get_filtered_lines(path, patch, subset, items=None): lines = {} - with open(path) as f: + with open(path, encoding='utf-8') as f: 
for line in f: item, text, _ = line.split(maxsplit=1) + ['', ''] if not patch or patch.updated_items.get((item, subset)) != \ @@ -367,7 +367,7 @@ def save_action_lists(self, subset_name, action_list): items = {k: True for k in action_list} if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -392,7 +392,7 @@ def _write_item(f, item, objs, action): if self._patch and osp.isfile(ann_file): lines = self._get_filtered_lines(ann_file, None, subset_name) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in action_list: _write_item(f, item, action_list[item], action) @@ -418,7 +418,7 @@ def _write_item(f, item, item_labels): lines = self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in class_lists: _write_item(f, item, class_lists[item]) @@ -433,7 +433,7 @@ def save_clsdet_lists(self, subset_name, clsdet_list): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -445,7 +445,7 @@ def save_segm_lists(self, subset_name, segm_list): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -465,7 +465,7 @@ def _write_item(f, item, item_layouts): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in layout_list: _write_item(f, item, layout_list[item]) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 655d72b893..b84ac0fd23 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -58,8 +58,8 @@ def _load_categories(dataset_path): @staticmethod def _load_subset_list(subset_path): - with open(subset_path) as f: - return [line.split()[0] for line in f] + with open(subset_path, encoding='utf-8') as f: + return [line.strip() for line in f] class VocClassificationExtractor(_VocExtractor): def __iter__(self): @@ -78,11 +78,13 @@ def _load_annotations(self): anno_files = [s for s in dir_items(task_dir, '.txt') if s.endswith('_' + osp.basename(self._path))] for ann_filename in anno_files: - with open(osp.join(task_dir, ann_filename)) as f: + with open(osp.join(task_dir, ann_filename), encoding='utf-8') as f: label = ann_filename[:ann_filename.rfind('_')] label_id = self._get_label_id(label) for line in f: - item, present = line.split() + objects = line.split() + item = ' '.join(objects[i] for i in range(len(objects) - 1)) + present = objects[-1] if present == '1': annotations[item].append(label_id) diff --git a/datumaro/plugins/widerface_format.py b/datumaro/plugins/widerface_format.py index 87005b66ad..5a968f9c8c 100644 --- a/datumaro/plugins/widerface_format.py +++ b/datumaro/plugins/widerface_format.py @@ -62,7 +62,7 @@ def _load_categories(self): def _load_items(self, path): items = {} - with open(path, 'r') as f: + 
with open(path, 'r', encoding='utf-8') as f: lines = f.readlines() image_ids = [image_id for image_id, line in enumerate(lines) @@ -178,5 +178,5 @@ def apply(self): annotation_path = osp.join(save_dir, WiderFacePath.ANNOTATIONS_DIR, 'wider_face_' + subset_name + '_bbx_gt.txt') os.makedirs(osp.dirname(annotation_path), exist_ok=True) - with open(annotation_path, 'w') as f: + with open(annotation_path, 'w', encoding='utf-8') as f: f.write(wider_annotation) diff --git a/datumaro/plugins/yolo_format/converter.py b/datumaro/plugins/yolo_format/converter.py index 351636b5d8..fb71b8f172 100644 --- a/datumaro/plugins/yolo_format/converter.py +++ b/datumaro/plugins/yolo_format/converter.py @@ -39,7 +39,7 @@ def apply(self): label_categories = extractor.categories()[AnnotationType.label] label_ids = {label.name: idx for idx, label in enumerate(label_categories.items)} - with open(osp.join(save_dir, 'obj.names'), 'w') as f: + with open(osp.join(save_dir, 'obj.names'), 'w', encoding='utf-8') as f: f.writelines('%s\n' % l[0] for l in sorted(label_ids.items(), key=lambda x: x[1])) @@ -88,15 +88,15 @@ def apply(self): annotation_path = osp.join(subset_dir, '%s.txt' % item.id) os.makedirs(osp.dirname(annotation_path), exist_ok=True) - with open(annotation_path, 'w') as f: + with open(annotation_path, 'w', encoding='utf-8') as f: f.write(yolo_annotation) subset_list_name = '%s.txt' % subset_name subset_lists[subset_name] = subset_list_name - with open(osp.join(save_dir, subset_list_name), 'w') as f: + with open(osp.join(save_dir, subset_list_name), 'w', encoding='utf-8') as f: f.writelines('%s\n' % s for s in image_paths.values()) - with open(osp.join(save_dir, 'obj.data'), 'w') as f: + with open(osp.join(save_dir, 'obj.data'), 'w', encoding='utf-8') as f: f.write('classes = %s\n' % len(label_ids)) for subset_name, subset_list_name in subset_lists.items(): diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py index 54774f08cb..5b5e8d20ec 100644 --- a/datumaro/plugins/yolo_format/extractor.py +++ b/datumaro/plugins/yolo_format/extractor.py @@ -52,14 +52,14 @@ def __init__(self, config_path, image_info=None): if isinstance(image_info, str): if not osp.isfile(image_info): raise Exception("Can't read image meta file '%s'" % image_info) - with open(image_info) as f: + with open(image_info, encoding='utf-8') as f: image_info = {} for line in f: image_name, h, w = line.strip().split() image_info[image_name] = (int(h), int(w)) self._image_info = image_info - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config_lines = f.readlines() subsets = OrderedDict() @@ -89,7 +89,7 @@ def __init__(self, config_path, image_info=None): raise Exception("Not found '%s' subset list file" % subset_name) subset = YoloExtractor.Subset(subset_name, self) - with open(list_path, 'r') as f: + with open(list_path, 'r', encoding='utf-8') as f: subset.items = OrderedDict( (self.name_from_path(p), self.localize_path(p)) for p in f @@ -143,7 +143,7 @@ def _get(self, item_id, subset_name): @staticmethod def _parse_annotations(anno_path, image): lines = [] - with open(anno_path, 'r') as f: + with open(anno_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if line: @@ -174,7 +174,7 @@ def _parse_annotations(anno_path, image): def _load_categories(names_path): label_categories = LabelCategories() - with open(names_path, 'r') as f: + with open(names_path, 'r', encoding='utf-8') as f: for label in f: label_categories.add(label.strip()) diff 
--git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 1a1c17d429..dba9d0116d 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -149,7 +149,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): class TestExtractor(TestExtractorBase): def __iter__(self): return iter([ - DatasetItem(id='кириллица в имени файла', + DatasetItem(id='кириллица с пробелом', image=np.ones((1, 5, 3)), annotations=[ Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0), Mask(image=np.array([[0, 1, 1, 0, 1]]), label=3), diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py index c2ee51bd8e..d57133d458 100644 --- a/tests/test_coco_format.py +++ b/tests/test_coco_format.py @@ -424,6 +424,16 @@ def test_can_save_and_load_images(self): self._test_save_and_load(expected_dataset, CocoImageInfoConverter.convert, test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + attributes={'id': 1}), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(expected_dataset, + CocoImageInfoConverter.convert, test_dir) + def test_can_save_and_load_labels(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py index 4caeaeed0d..c23c32a4b1 100644 --- a/tests/test_cvat_format.py +++ b/tests/test_cvat_format.py @@ -237,6 +237,40 @@ def test_can_save_and_load(self): partial(CvatConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + label_categories = LabelCategories() + for i in range(10): + label_categories.add(str(i)) + label_categories.items[2].attributes.update(['a1', 'a2', 'empty']) + label_categories.attributes.update(['occluded']) + + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + subset='s1', image=np.zeros((5, 10, 3)), + annotations=[ + Label(1), + ] + ), + ], categories={ + AnnotationType.label: label_categories, + }) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + subset='s1', image=np.zeros((5, 10, 3)), + annotations=[ + Label(1), + ], attributes={'frame': 0} + ), + ], categories={ + AnnotationType.label: label_categories, + }) + + with TestDir() as test_dir: + self._test_save_and_load(source_dataset, + partial(CvatConverter.convert, save_images=True), test_dir, + target_dataset=target_dataset) + def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), diff --git a/tests/test_datumaro_format.py b/tests/test_datumaro_format.py index fa063fcfda..582bad7fdd 100644 --- a/tests/test_datumaro_format.py +++ b/tests/test_datumaro_format.py @@ -102,6 +102,15 @@ def test_relative_paths(self): self._test_save_and_load(test_dataset, partial(DatumaroConverter.convert, save_images=True), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + test_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(test_dataset, + partial(DatumaroConverter.convert, save_images=True), test_dir) + def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset diff --git a/tests/test_icdar_format.py b/tests/test_icdar_format.py index 
5583446531..69a4c89109 100644 --- a/tests/test_icdar_format.py +++ b/tests/test_icdar_format.py @@ -184,3 +184,15 @@ def test_can_save_and_load_with_no_subsets(self): with TestDir() as test_dir: self._test_save_and_load(expected_dataset, IcdarTextLocalizationConverter.convert, test_dir) + + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 1, 3, 5), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(expected_dataset, + IcdarTextLocalizationConverter.convert, test_dir) diff --git a/tests/test_image_dir_format.py b/tests/test_image_dir_format.py index f7f21b0888..8ff52eb4cc 100644 --- a/tests/test_image_dir_format.py +++ b/tests/test_image_dir_format.py @@ -28,4 +28,13 @@ def test_relative_paths(self): with TestDir() as test_dir: test_save_and_load(self, dataset, ImageDirConverter.convert, - test_dir, importer='image_dir') \ No newline at end of file + test_dir, importer='image_dir') + + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), + ]) + + with TestDir() as test_dir: + test_save_and_load(self, dataset, ImageDirConverter.convert, + test_dir, importer='image_dir') diff --git a/tests/test_imagenet_format.py b/tests/test_imagenet_format.py index 2b4ef79fb1..2b093e2bc2 100644 --- a/tests/test_imagenet_format.py +++ b/tests/test_imagenet_format.py @@ -83,6 +83,24 @@ def test_can_save_and_load_with_multiple_labels(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((8, 8, 3)), + annotations=[Label(0), Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + ImagenetConverter.convert(source_dataset, test_dir, save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'imagenet') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_dataset') diff --git a/tests/test_imagenet_txt_format.py b/tests/test_imagenet_txt_format.py index 4f5dda37c5..a5ad3b778c 100644 --- a/tests/test_imagenet_txt_format.py +++ b/tests/test_imagenet_txt_format.py @@ -90,6 +90,25 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((8, 8, 3)), + annotations=[Label(0), Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + ImagenetTxtConverter.convert(source_dataset, test_dir, save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'imagenet_txt') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_txt_dataset') class ImagenetTxtImporterTest(TestCase): diff --git a/tests/test_labelme_format.py b/tests/test_labelme_format.py index 
244a590b07..fbeae19d57 100644 --- a/tests/test_labelme_format.py +++ b/tests/test_labelme_format.py @@ -87,6 +87,48 @@ def test_can_save_and_load(self): partial(LabelMeConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + image=np.ones((16, 16, 3)), + annotations=[ + Polygon([0, 4, 4, 4, 5, 6], label=3, attributes={ + 'occluded': True, + 'a1': 'qwe', + 'a2': True, + 'a3': 123, + }), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(10)), + }) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + image=np.ones((16, 16, 3)), + annotations=[ + Polygon([0, 4, 4, 4, 5, 6], label=0, id=0, + attributes={ + 'occluded': True, 'username': '', + 'a1': 'qwe', + 'a2': True, + 'a3': 123, + } + ), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable([ + 'label_3']), + }) + + with TestDir() as test_dir: + self._test_save_and_load( + source_dataset, + partial(LabelMeConverter.convert, save_images=True), + test_dir, target_dataset=target_dataset) DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'labelme_dataset') diff --git a/tests/test_lfw_format.py b/tests/test_lfw_format.py index 541cccaa02..975ba84057 100644 --- a/tests/test_lfw_format.py +++ b/tests/test_lfw_format.py @@ -101,6 +101,30 @@ def test_can_save_and_load_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((2, 5, 3)), + attributes = { + 'positive_pairs': [], + 'negative_pairs': ['name0/name0_0002'] + }, + ), + DatasetItem(id='name0/name0_0002', + image=np.ones((2, 5, 3)), + attributes = { + 'positive_pairs': [], + 'negative_pairs': [] + }, + ), + ]) + + with TestDir() as test_dir: + LfwConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'lfw') + + compare_datasets(self, source_dataset, parsed_dataset) + DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'lfw_dataset') class LfwImporterTest(TestCase): diff --git a/tests/test_market1501_format.py b/tests/test_market1501_format.py index d5422acc09..a53ef89d6f 100644 --- a/tests/test_market1501_format.py +++ b/tests/test_market1501_format.py @@ -62,6 +62,24 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((2, 5, 3)), + attributes = { + 'camera_id': 1, + 'person_id': 1, + 'query': True + } + ), + ]) + + with TestDir() as test_dir: + Market1501Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'market1501') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000001_00', diff --git a/tests/test_mots_format.py b/tests/test_mots_format.py index f8358dda3c..dfe613f773 100644 --- a/tests/test_mots_format.py +++ b/tests/test_mots_format.py @@ -66,6 +66,19 @@ def 
test_can_save_masks(self): partial(MotsPngConverter.convert, save_images=True), test_dir, target_dataset=target) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='a', + image=np.ones((5, 1)), annotations=[ + Mask(np.array([[1, 0, 0, 0, 0]]), label=0, + attributes={'track_id': 2}), + ]), + ], categories=['a']) + + with TestDir() as test_dir: + self._test_save_and_load(source, + partial(MotsPngConverter.convert, save_images=True), test_dir) + class MotsImporterTest(TestCase): def test_can_detect(self): self.assertTrue(MotsImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_tfrecord_format.py b/tests/test_tfrecord_format.py index 8b63c71a1b..96e70ee4a8 100644 --- a/tests/test_tfrecord_format.py +++ b/tests/test_tfrecord_format.py @@ -121,6 +121,27 @@ def test_can_save_dataset_with_no_subsets(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + test_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((16, 16, 3)), + annotations=[ + Bbox(2, 1, 4, 4, label=2), + Bbox(4, 2, 8, 4, label=3), + ], + attributes={'source_id': ''} + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + self._test_save_and_load( + test_dataset, + partial(TfDetectionApiConverter.convert, save_images=True), + test_dir) + def test_can_save_dataset_with_image_info(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='1/q.e', diff --git a/tests/test_vgg_face2_format.py b/tests/test_vgg_face2_format.py index 38eb7aacdf..da95614d9a 100644 --- a/tests/test_vgg_face2_format.py +++ b/tests/test_vgg_face2_format.py @@ -71,6 +71,22 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], label=0), + ] + ), + ], categories=['a']) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((8, 8, 3)), diff --git a/tests/test_voc_format.py b/tests/test_voc_format.py index fddafd6470..f936eb6746 100644 --- a/tests/test_voc_format.py +++ b/tests/test_voc_format.py @@ -415,6 +415,20 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(VocConverter.convert, label_map='voc'), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='кириллица с пробелом', annotations=[ + Label(2), + Label(3), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(VocConverter.convert, label_map='voc'), test_dir) + def test_can_save_dataset_with_images(self): class TestExtractor(TestExtractorBase): def __iter__(self): diff --git a/tests/test_widerface_format.py b/tests/test_widerface_format.py index 
03f15e623c..46163554c9 100644 --- a/tests/test_widerface_format.py +++ b/tests/test_widerface_format.py @@ -83,6 +83,26 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 1, 2, 3, label=1, attributes = { + 'blur': '2', 'expression': '0', 'illumination': '0', + 'occluded': '0', 'pose': '2', 'invalid': '0'}), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(i) for i in range(3)), + }) + + with TestDir() as test_dir: + WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'wider_face') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_non_widerface_attributes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), diff --git a/tests/test_yolo_format.py b/tests/test_yolo_format.py index 5c46eb27a0..f21420e11c 100644 --- a/tests/test_yolo_format.py +++ b/tests/test_yolo_format.py @@ -90,6 +90,24 @@ def test_can_load_dataset_with_exact_image_info(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=2), + Bbox(0, 1, 2, 3, label=4), + ]), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(i) for i in range(10)), + }) + + with TestDir() as test_dir: + YoloConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'yolo') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', From 0a09ba063e5328927500917f286fa00472290ac1 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 11:47:52 +0300 Subject: [PATCH 03/16] some fixes --- datumaro/plugins/imagenet_format.py | 1 - datumaro/plugins/voc_format/extractor.py | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py index 829534d700..8dbe6cc6d6 100644 --- a/datumaro/plugins/imagenet_format.py +++ b/datumaro/plugins/imagenet_format.py @@ -1,4 +1,3 @@ -#_*_ coding:utf-8 _*_ # Copyright (C) 2020 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index b84ac0fd23..4be5bec2c1 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -58,8 +58,15 @@ def _load_categories(dataset_path): @staticmethod def _load_subset_list(subset_path): + subset_list = [] with open(subset_path, encoding='utf-8') as f: - return [line.strip() for line in f] + for line in f: + objects = line.split('\"') + if 1 < len(objects): + subset_list.append(objects[1]) + else: + subset_list.append(line.split()[0]) + return subset_list class VocClassificationExtractor(_VocExtractor): def __iter__(self): From fac8f111bdf4a0d689aead614a27d2a72e8ad8a9 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 12:27:56 +0300 Subject: 
[PATCH 04/16] fix voc format --- datumaro/plugins/voc_format/converter.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 73c9925292..934fc1ffd1 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -369,6 +369,8 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -435,6 +437,8 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -447,10 +451,14 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): + if 1 < len(item.split()): + item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) From 6c8812841d657a2fc3d8f2fffc07cb3e9faea9d8 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 10:53:13 +0300 Subject: [PATCH 05/16] fix camvid --- datumaro/plugins/camvid_format.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 2a3c9aaf6c..3f5767bfa1 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -7,6 +7,7 @@ import os.path as osp from collections import OrderedDict from enum import Enum +import re import numpy as np from datumaro.components.converter import Converter @@ -57,6 +58,7 @@ class CamvidPath: LABELMAP_FILE = 'label_colors.txt' SEGM_DIR = "annot" IMAGE_EXT = '.png' + PATTERN = re.compile(r'(.+[.]\S+) (.+)?') def parse_label_map(path): @@ -155,16 +157,8 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: + objects = CamvidPath.PATTERN.search(line).groups() objects = line.split() - if 2 < len(objects): - if len(objects) % 2: - raise Exception("Line %s: image and gt file must have " - "the same name" % line) - else: - mid = int(len(objects) / 2) - objects[0] = ' '.join(objects[i] for i in range(mid)) - objects[1] = ' '.join(objects[i] for i in range(mid, 2 * mid)) - objects = objects[:2] image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, From 135b4ae13f9076f7345b3539779bb9e63c0d707f Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 11:22:23 +0300 Subject: [PATCH 06/16] fix voc --- datumaro/plugins/camvid_format.py | 5 ++--- datumaro/plugins/voc_format/converter.py | 16 ++++++++-------- datumaro/plugins/voc_format/extractor.py | 9 ++++----- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 3f5767bfa1..4161b197dd 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -5,9 +5,9 @@ import os import os.path as osp +import re from collections import OrderedDict from enum 
import Enum -import re import numpy as np from datumaro.components.converter import Converter @@ -16,8 +16,7 @@ MaskCategories, SourceExtractor) from datumaro.util import find, str_to_bool from datumaro.util.image import save_image -from datumaro.util.mask_tools import lazy_mask, paint_mask, generate_colormap - +from datumaro.util.mask_tools import generate_colormap, lazy_mask, paint_mask CamvidLabelMap = OrderedDict([ ('Void', (0, 0, 0)), diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 934fc1ffd1..c3aac6b18a 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -369,8 +369,8 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -437,8 +437,8 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -451,14 +451,14 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 4be5bec2c1..23cc14806c 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -61,11 +61,10 @@ def _load_subset_list(subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - objects = line.split('\"') - if 1 < len(objects): - subset_list.append(objects[1]) - else: - subset_list.append(line.split()[0]) + line = line.strip().split() + if 2 < len(line): + line[0] = ' '.join(line[i] for i in range(len(line))) + subset_list.append(line[0]) return subset_list class VocClassificationExtractor(_VocExtractor): From d48d98ed53a21d012b3ab24dea3d474ae61c2868 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 11:56:41 +0300 Subject: [PATCH 07/16] some fixes --- datumaro/plugins/camvid_format.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 4161b197dd..a4a5440466 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -156,14 +156,17 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - objects = CamvidPath.PATTERN.search(line).groups() - objects = line.split() + search = CamvidPath.PATTERN.search(line) + if search: + objects = CamvidPath.PATTERN.search(line).groups() + else: + raise Exception("Line %s: invalid path format" % line) image = objects[0] item_id = 
('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, (image, image[1:])[image[0] == '/']) item_annotations = [] - if 1 < len(objects): + if objects[1] != None: gt = objects[1] gt_path = osp.join(self._dataset_dir, (gt, gt[1:]) [gt[0] == '/']) From 1b42acc23d474671a56ef9e64f668f0478e803d3 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 12:16:39 +0300 Subject: [PATCH 08/16] fix regex in camvid --- datumaro/plugins/camvid_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 9b467e6a18..d1e43a3ca6 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+[.]\S+) (.+)?') + PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?') def parse_label_map(path): From 2d837d63940301fb193ef459da2baaebe73e2904 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Fri, 12 Mar 2021 09:41:14 +0300 Subject: [PATCH 09/16] add exception if unexpected number of quotes --- datumaro/plugins/imagenet_txt_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py index aceee4e5b3..751ce9d11c 100644 --- a/datumaro/plugins/imagenet_txt_format.py +++ b/datumaro/plugins/imagenet_txt_format.py @@ -50,9 +50,13 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: item = line.split('\"') - if len(item) == 3: - item_id = item[1] - label_ids = [int(id) for id in item[2].split()] + if 1 < len(item): + if len(item) == 3: + item_id = item[1] + label_ids = [int(id) for id in item[2].split()] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) else: item = line.split() item_id = item[0] From 385ad9b38542a31ef4f159ca641b4e0d631a44d8 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Sat, 13 Mar 2021 19:35:12 +0300 Subject: [PATCH 10/16] fix voc format --- datumaro/plugins/voc_format/converter.py | 10 ++-------- datumaro/plugins/voc_format/extractor.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 9ddad7c716..bbf4f8a8de 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -370,8 +370,6 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -438,8 +436,6 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -452,14 +448,12 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): - # if 1 < len(item.split()): - # item = '\"' + item + '\"' + if 1 < 
len(item.split()): + item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 23cc14806c..e883a26703 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -61,10 +61,20 @@ def _load_subset_list(subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - line = line.strip().split() - if 2 < len(line): - line[0] = ' '.join(line[i] for i in range(len(line))) - subset_list.append(line[0]) + dirname = osp.basename(osp.dirname(subset_path)) + if dirname == VocPath.TASK_DIR[VocTask.person_layout]: + objects = line.split('\"') + if 1 < len(objects): + if len(objects) == 3: + line = objects[1] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) + else: + line = line.split()[0] + else: + line = line.strip() + subset_list.append(line) return subset_list class VocClassificationExtractor(_VocExtractor): @@ -88,9 +98,7 @@ def _load_annotations(self): label = ann_filename[:ann_filename.rfind('_')] label_id = self._get_label_id(label) for line in f: - objects = line.split() - item = ' '.join(objects[i] for i in range(len(objects) - 1)) - present = objects[-1] + item, present = line.rsplit(maxsplit=1) if present == '1': annotations[item].append(label_id) From 1c5638abcc936327870e5dd5afb1bc06bf1095e5 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 15 Mar 2021 16:06:28 +0300 Subject: [PATCH 11/16] some fixes --- datumaro/plugins/camvid_format.py | 6 +++--- datumaro/plugins/voc_format/extractor.py | 18 +++++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index d1e43a3ca6..40d69334c6 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?') + PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)?\s*') def parse_label_map(path): @@ -160,9 +160,9 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line) + search = CamvidPath.PATTERN.search(line.strip()) if search: - objects = CamvidPath.PATTERN.search(line).groups() + objects = search.groups() else: raise Exception("Line %s: invalid path format" % line) image = objects[0] diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index e883a26703..dcee3523be 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -24,10 +24,11 @@ _inverse_inst_colormap = invert_colormap(VocInstColormap) class _VocExtractor(SourceExtractor): - def __init__(self, path): + def __init__(self, path, task): assert osp.isfile(path), path self._path = path self._dataset_dir = osp.dirname(osp.dirname(osp.dirname(path))) + self._task = task super().__init__(subset=osp.splitext(osp.basename(path))[0]) @@ -56,13 +57,11 @@ def _load_categories(dataset_path): label_map = parse_label_map(label_map_path) return make_voc_categories(label_map) - @staticmethod - def _load_subset_list(subset_path): + def _load_subset_list(self, subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - dirname = osp.basename(osp.dirname(subset_path)) - if dirname 
== VocPath.TASK_DIR[VocTask.person_layout]: + if self._task == VocTask.person_layout: objects = line.split('\"') if 1 < len(objects): if len(objects) == 3: @@ -78,6 +77,9 @@ def _load_subset_list(subset_path): return subset_list class VocClassificationExtractor(_VocExtractor): + def __init__(self, path): + super().__init__(path, VocTask.classification) + def __iter__(self): raw_anns = self._load_annotations() for item_id in self._items: @@ -110,8 +112,7 @@ def _parse_annotations(raw_anns, item_id): class _VocXmlExtractor(_VocExtractor): def __init__(self, path, task): - super().__init__(path) - self._task = task + super().__init__(path, task) def __iter__(self): anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR) @@ -246,6 +247,9 @@ def __init__(self, path): super().__init__(path, task=VocTask.action_classification) class VocSegmentationExtractor(_VocExtractor): + def __init__(self, path): + super().__init__(path, task=VocTask.segmentation) + def __iter__(self): for item_id in self._items: log.debug("Reading item '%s'" % item_id) From be16492346932125c447306dd12ad29bcd2d4f0c Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 15 Mar 2021 18:36:34 +0300 Subject: [PATCH 12/16] fix camvid extractor --- datumaro/plugins/camvid_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 40d69334c6..28b7631ec2 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)?\s*') + PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)\s*') def parse_label_map(path): @@ -160,7 +160,7 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip()) + search = CamvidPath.PATTERN.search(line.strip('\n')) if search: objects = search.groups() else: From dd6eecfe2dead6041f6ccef115ab49dd512bd185 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 11:56:34 +0300 Subject: [PATCH 13/16] update Changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e475ce93e1..dd8fb88a98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Fixed -- +- The ability to work with file names containing Cyrillic and spaces () ### Security - From 68e41dc400a946c270268631c73fa41b709da5e7 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 16:09:00 +0300 Subject: [PATCH 14/16] fix regex --- datumaro/plugins/camvid_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 28b7631ec2..9df865e686 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)\s*') + PATTERN = re.compile(r'(.+\.\S+)?(?:\s*(.+\.\S+))') def parse_label_map(path): @@ -160,9 +160,11 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip('\n')) + search = CamvidPath.PATTERN.search(line.strip()) if search: objects = search.groups() + if not 
objects[0]: + objects = [objects[1], objects[0]] else: raise Exception("Line %s: invalid path format" % line) image = objects[0] From 8b3f93c7ebb43efcf6ac2f510f171176867cd2fb Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 16:46:01 +0300 Subject: [PATCH 15/16] fix regex --- datumaro/plugins/camvid_format.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 9df865e686..e59cf756b1 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)?(?:\s*(.+\.\S+))') + PATTERN = re.compile(r'(.+?\.\S+)(?:\s+(.+\.\S+)?)?') def parse_label_map(path): @@ -163,8 +163,6 @@ def _load_items(self, path): search = CamvidPath.PATTERN.search(line.strip()) if search: objects = search.groups() - if not objects[0]: - objects = [objects[1], objects[0]] else: raise Exception("Line %s: invalid path format" % line) image = objects[0] From 37da6d10bc9a8d921d6c7ff4a4f0095cf114671a Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 17:44:14 +0300 Subject: [PATCH 16/16] fix camvid extractor --- datumaro/plugins/camvid_format.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index e59cf756b1..8577a423ba 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -6,7 +6,6 @@ import logging as log import os import os.path as osp -import re from collections import OrderedDict from enum import Enum @@ -61,7 +60,6 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+?\.\S+)(?:\s+(.+\.\S+)?)?') def parse_label_map(path): @@ -160,17 +158,23 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip()) - if search: - objects = search.groups() + line = line.strip() + objects = line.split('\"') + if 1 < len(objects): + if len(objects) == 5: + objects[0] = objects[1] + objects[1] = objects[3] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) else: - raise Exception("Line %s: invalid path format" % line) + objects = line.split() image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, (image, image[1:])[image[0] == '/']) item_annotations = [] - if objects[1] != None: + if 1 < len(objects): gt = objects[1] gt_path = osp.join(self._dataset_dir, (gt, gt[1:]) [gt[0] == '/']) @@ -271,10 +275,12 @@ def save_segm_lists(self, subset_name, segm_list): ann_file = osp.join(self._save_dir, subset_name + '.txt') with open(ann_file, 'w', encoding='utf-8') as f: for (image_path, mask_path) in segm_list.values(): - f.write('/%s %s\n' % ( - image_path.replace('\\', '/'), - mask_path.replace('\\', '/')) - ) + image_path = '/' + image_path.replace('\\', '/') + mask_path = mask_path.replace('\\', '/') + if 1 < len(image_path.split()) or 1 < len(mask_path.split()): + image_path = '\"' + image_path + '\"' + mask_path = '\"' + mask_path + '\"' + f.write('%s %s\n' % (image_path, mask_path)) def save_label_map(self): path = osp.join(self._save_dir, CamvidPath.LABELMAP_FILE)
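
For reference — a minimal sketch (not part of the patch series) of the quoting convention these changes converge on for the plain-text list files: an item id containing whitespace is wrapped in double quotes on write and recovered by splitting on the quote character on read, while unquoted ids keep the old whitespace-split behaviour. The helper names below are illustrative only.

    # Illustrative sketch only, mirroring the converter/extractor logic above.
    # Function names are hypothetical and not part of the patched code.

    def write_list_entry(item_id):
        # Converters wrap an id in double quotes only when it contains whitespace.
        if 1 < len(item_id.split()):
            item_id = '"' + item_id + '"'
        return '%s\n' % item_id

    def read_list_entry(line):
        # Extractors split on the quote character: a quoted id yields exactly
        # three parts ['', '<id>', '<rest>']; anything else falls back to the
        # old whitespace split.
        parts = line.split('"')
        if 1 < len(parts):
            if len(parts) == 3:
                return parts[1]
            raise Exception("Line %s: unexpected number "
                "of quotes in filename" % line)
        return line.split()[0]

    assert read_list_entry(write_list_entry('кириллица с пробелом')) == 'кириллица с пробелом'
    assert read_list_entry(write_list_entry('name0/name0_0002')) == 'name0/name0_0002'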