Skip to content

Commit

Permalink
Fix dataset formats (Cyrillic and spaces in file names) (#148)
Browse files Browse the repository at this point in the history
* Support non-ASCII characters (e.g. Cyrillic) and spaces in file names across dataset formats

* update Changelog
  • Loading branch information
yasakova-anastasia authored Mar 17, 2021
1 parent a698cac commit ef12d30
Show file tree
Hide file tree
Showing 28 changed files with 442 additions and 59 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-

### Fixed
-
- The ability to work with file names containing Cyrillic and spaces (<https://github.com/openvinotoolkit/datumaro/pull/148>)

### Security
-
Expand Down
27 changes: 19 additions & 8 deletions datumaro/plugins/camvid_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
from datumaro.util import find, str_to_bool
from datumaro.util.annotation_util import make_label_id_mapping
from datumaro.util.image import save_image
from datumaro.util.mask_tools import lazy_mask, paint_mask, generate_colormap

from datumaro.util.mask_tools import generate_colormap, lazy_mask, paint_mask

CamvidLabelMap = OrderedDict([
('Void', (0, 0, 0)),
Expand Down Expand Up @@ -159,7 +158,17 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
objects = line.split()
line = line.strip()
objects = line.split('\"')
if 1 < len(objects):
if len(objects) == 5:
objects[0] = objects[1]
objects[1] = objects[3]
else:
raise Exception("Line %s: unexpected number "
"of quotes in filename" % line)
else:
objects = line.split()
image = objects[0]
item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)]
image_path = osp.join(self._dataset_dir,
Expand Down Expand Up @@ -264,12 +273,14 @@ def save_segm_lists(self, subset_name, segm_list):
return

ann_file = osp.join(self._save_dir, subset_name + '.txt')
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for (image_path, mask_path) in segm_list.values():
f.write('/%s %s\n' % (
image_path.replace('\\', '/'),
mask_path.replace('\\', '/'))
)
image_path = '/' + image_path.replace('\\', '/')
mask_path = mask_path.replace('\\', '/')
if 1 < len(image_path.split()) or 1 < len(mask_path.split()):
image_path = '\"' + image_path + '\"'
mask_path = '\"' + mask_path + '\"'
f.write('%s %s\n' % (image_path, mask_path))

def save_label_map(self):
path = osp.join(self._save_dir, CamvidPath.LABELMAP_FILE)
Expand Down
1 change: 0 additions & 1 deletion datumaro/plugins/imagenet_format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Copyright (C) 2020 Intel Corporation
#
# SPDX-License-Identifier: MIT
Expand Down
24 changes: 19 additions & 5 deletions datumaro/plugins/imagenet_txt_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,18 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
item = line.split('\"')
if 1 < len(item):
if len(item) == 3:
item_id = item[1]
label_ids = [int(id) for id in item[2].split()]
else:
raise Exception("Line %s: unexpected number "
"of quotes in filename" % line)
else:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
anno = []
for label in label_ids:
assert 0 <= label and \
Expand Down Expand Up @@ -95,9 +104,14 @@ def apply(self):
if self._save_images and item.has_image:
self._save_image(item, subdir=ImagenetTxtPath.IMAGE_DIR)

annotation = ''
for item_id, item_labels in labels.items():
if 1 < len(item_id.split()):
item_id = '\"' + item_id + '\"'
annotation += '%s %s\n' % (item_id, ' '.join(item_labels))

with open(annotation_file, 'w', encoding='utf-8') as f:
f.writelines(['%s %s\n' % (item_id, ' '.join(labels[item_id]))
for item_id in labels])
f.write(annotation)

labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE)
with open(labels_file, 'w', encoding='utf-8') as f:
Expand Down
46 changes: 35 additions & 11 deletions datumaro/plugins/lfw_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,14 @@ def _load_items(self, path):
images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
with open(path, encoding='utf-8') as f:
for line in f:
pair = line.strip().split()
pair = line.strip().split('\t')
if len(pair) == 3:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if pair[0] == '-':
image1 = pair[1]
image2 = pair[2]
else:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
Expand All @@ -47,8 +51,14 @@ def _load_items(self, path):
attributes = items[image1].attributes
attributes['positive_pairs'].append(image2)
elif len(pair) == 4:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[2], pair[3])
if pair[0] == '-':
image1 = pair[1]
else:
image1 = self.get_image_name(pair[0], pair[1])
if pair[2] == '-':
image2 = pair[3]
else:
image2 = self.get_image_name(pair[2], pair[3])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
Expand Down Expand Up @@ -102,17 +112,31 @@ def apply(self):
self._save_image(item, osp.join(self._save_dir, subset_name,
LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT))

person1, num1 = LfwPath.PATTERN.search(item.id).groups()
num1 = int(num1)
search = LfwPath.PATTERN.search(item.id)
if search:
person1, num1 = search.groups()
num1 = int(num1)
else:
person1 = '-'
num1 = item.id
if 'positive_pairs' in item.attributes:
for pair in item.attributes['positive_pairs']:
num2 = LfwPath.PATTERN.search(pair).groups()[1]
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))
if 'negative_pairs' in item.attributes:
for pair in item.attributes['negative_pairs']:
person2, num2 = LfwPath.PATTERN.search(pair).groups()
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
person2, num2 = search.groups()
num2 = int(num2)
else:
person2 = '-'
num2 = pair
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))

Expand Down
8 changes: 4 additions & 4 deletions datumaro/plugins/vgg_face2_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _split_item_path(path):

items = {}

with open(path) as content:
with open(path, encoding='utf-8') as content:
landmarks_table = list(csv.DictReader(content))
for row in landmarks_table:
item_id = row['NAME_ID']
Expand Down Expand Up @@ -96,7 +96,7 @@ def _split_item_path(path):
bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + self._subset + '.csv')
if osp.isfile(bboxes_path):
with open(bboxes_path) as content:
with open(bboxes_path, encoding='utf-8') as content:
bboxes_table = list(csv.DictReader(content))
for row in bboxes_table:
item_id = row['NAME_ID']
Expand Down Expand Up @@ -221,7 +221,7 @@ def apply(self):
landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.LANDMARKS_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(landmarks_path), exist_ok=True)
with open(landmarks_path, 'w', newline='') as file:
with open(landmarks_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y',
'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y']
writer = csv.DictWriter(file, fieldnames=columns)
Expand All @@ -232,7 +232,7 @@ def apply(self):
bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(bboxes_path), exist_ok=True)
with open(bboxes_path, 'w', newline='') as file:
with open(bboxes_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'X', 'Y', 'W', 'H']
writer = csv.DictWriter(file, fieldnames=columns)
writer.writeheader()
Expand Down
18 changes: 10 additions & 8 deletions datumaro/plugins/voc_format/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def save_subsets(self):
VocTask.action_classification}:
ann_path = osp.join(self._ann_dir, item.id + '.xml')
os.makedirs(osp.dirname(ann_path), exist_ok=True)
with open(ann_path, 'w') as f:
with open(ann_path, 'w', encoding='utf-8') as f:
f.write(ET.tostring(root_elem,
encoding='unicode', pretty_print=True))

Expand Down Expand Up @@ -351,7 +351,7 @@ def save_subsets(self):
@staticmethod
def _get_filtered_lines(path, patch, subset, items=None):
lines = {}
with open(path) as f:
with open(path, encoding='utf-8') as f:
for line in f:
item, text, _ = line.split(maxsplit=1) + ['', '']
if not patch or patch.updated_items.get((item, subset)) != \
Expand All @@ -368,7 +368,7 @@ def save_action_lists(self, subset_name, action_list):
items = {k: True for k in action_list}
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

Expand All @@ -393,7 +393,7 @@ def _write_item(f, item, objs, action):
if self._patch and osp.isfile(ann_file):
lines = self._get_filtered_lines(ann_file, None, subset_name)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in action_list:
_write_item(f, item, action_list[item], action)
Expand All @@ -419,7 +419,7 @@ def _write_item(f, item, item_labels):
lines = self._get_filtered_lines(ann_file, self._patch,
subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in class_lists:
_write_item(f, item, class_lists[item])
Expand All @@ -434,7 +434,7 @@ def save_clsdet_lists(self, subset_name, clsdet_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

Expand All @@ -446,12 +446,14 @@ def save_segm_lists(self, subset_name, segm_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

def save_layout_lists(self, subset_name, layout_list):
def _write_item(f, item, item_layouts):
if 1 < len(item.split()):
item = '\"' + item + '\"'
if item_layouts:
for obj_id in item_layouts:
f.write('%s % d\n' % (item, 1 + obj_id))
Expand All @@ -466,7 +468,7 @@ def _write_item(f, item, item_layouts):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in layout_list:
_write_item(f, item, layout_list[item])
Expand Down
38 changes: 29 additions & 9 deletions datumaro/plugins/voc_format/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
_inverse_inst_colormap = invert_colormap(VocInstColormap)

class _VocExtractor(SourceExtractor):
def __init__(self, path):
def __init__(self, path, task):
assert osp.isfile(path), path
self._path = path
self._dataset_dir = osp.dirname(osp.dirname(osp.dirname(path)))
self._task = task

super().__init__(subset=osp.splitext(osp.basename(path))[0])

Expand Down Expand Up @@ -56,12 +57,29 @@ def _load_categories(dataset_path):
label_map = parse_label_map(label_map_path)
return make_voc_categories(label_map)

@staticmethod
def _load_subset_list(subset_path):
with open(subset_path) as f:
return [line.split()[0] for line in f]
def _load_subset_list(self, subset_path):
    """Read a VOC subset list file and return the list of item ids in it.

    Each non-empty line names one dataset item. For the person_layout
    task, an item id containing spaces is wrapped in double quotes and
    followed by extra columns; an unquoted layout line uses only its
    first whitespace-separated token. For every other task the whole
    stripped line is the item id (so ids may contain spaces).

    Raises:
        Exception: if a person_layout line contains a number of quote
            characters other than two.
    """
    subset_list = []
    with open(subset_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at EOF);
                # otherwise they would yield a bogus empty item id, or
                # an IndexError in the unquoted person_layout branch.
                continue
            if self._task == VocTask.person_layout:
                objects = line.split('\"')
                if 1 < len(objects):
                    # Quoted form: "<id possibly with spaces>" <columns>
                    # splitting on '"' gives ['', id, trailing columns].
                    if len(objects) == 3:
                        line = objects[1]
                    else:
                        raise Exception("Line %s: unexpected number "
                            "of quotes in filename" % line)
                else:
                    # Unquoted layout line: id is the first token.
                    line = line.split()[0]
            subset_list.append(line)
    return subset_list

class VocClassificationExtractor(_VocExtractor):
def __init__(self, path):
    """Create a classification extractor for the subset list at *path*.

    Delegates to the shared _VocExtractor base, presetting the task to
    VocTask.classification so task-dependent parsing picks the right
    branch.
    """
    super().__init__(path, VocTask.classification)

def __iter__(self):
raw_anns = self._load_annotations()
for item_id in self._items:
Expand All @@ -78,11 +96,11 @@ def _load_annotations(self):
anno_files = [s for s in dir_items(task_dir, '.txt')
if s.endswith('_' + osp.basename(self._path))]
for ann_filename in anno_files:
with open(osp.join(task_dir, ann_filename)) as f:
with open(osp.join(task_dir, ann_filename), encoding='utf-8') as f:
label = ann_filename[:ann_filename.rfind('_')]
label_id = self._get_label_id(label)
for line in f:
item, present = line.split()
item, present = line.rsplit(maxsplit=1)
if present == '1':
annotations[item].append(label_id)

Expand All @@ -94,8 +112,7 @@ def _parse_annotations(raw_anns, item_id):

class _VocXmlExtractor(_VocExtractor):
def __init__(self, path, task):
super().__init__(path)
self._task = task
super().__init__(path, task)

def __iter__(self):
anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR)
Expand Down Expand Up @@ -230,6 +247,9 @@ def __init__(self, path):
super().__init__(path, task=VocTask.action_classification)

class VocSegmentationExtractor(_VocExtractor):
def __init__(self, path):
    """Create a segmentation extractor for the subset list at *path*.

    Delegates to the shared _VocExtractor base, presetting the task to
    VocTask.segmentation so task-dependent parsing picks the right
    branch.
    """
    super().__init__(path, task=VocTask.segmentation)

def __iter__(self):
for item_id in self._items:
log.debug("Reading item '%s'" % item_id)
Expand Down
4 changes: 2 additions & 2 deletions datumaro/plugins/widerface_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _load_categories(self):
def _load_items(self, path):
items = {}

with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines()

image_ids = [image_id for image_id, line in enumerate(lines)
Expand Down Expand Up @@ -178,5 +178,5 @@ def apply(self):
annotation_path = osp.join(save_dir, WiderFacePath.ANNOTATIONS_DIR,
'wider_face_' + subset_name + '_bbx_gt.txt')
os.makedirs(osp.dirname(annotation_path), exist_ok=True)
with open(annotation_path, 'w') as f:
with open(annotation_path, 'w', encoding='utf-8') as f:
f.write(wider_annotation)
Loading

0 comments on commit ef12d30

Please sign in to comment.